/*
 * Copyright (c) 2012, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package jdk.internal.util.xml.impl;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;
import jdk.internal.org.xml.sax.InputSource;
import jdk.internal.org.xml.sax.SAXException;

/**
 * XML non-validating parser engine.
 */
public abstract class Parser {

    // Message handed to panic() at every fatal-error site in this class.
    public final static String FAULT = "";
    protected final static int BUFFSIZE_READER = 512;
    // Initial size of the parser buffer (mBuff) allocated by the constructor.
    protected final static int BUFFSIZE_PARSER = 128;
    /**
     * The end of stream character.
     */
    public final static char EOS = 0xffff;
    private Pair mNoNS; // there is no namespace
    private Pair mXml;  // the xml namespace
    private Map<String, Input> mEnt;  // the entities look up table
    private Map<String, Input> mPEnt; // the parameter entities look up table
    protected boolean mIsSAlone;     // xml decl standalone flag
    protected boolean mIsSAloneSet;  // standalone is explicitly set
    protected boolean mIsNSAware;    // if true - namespace aware mode
    protected int mPh;  // current phase of document processing
    protected final static int PH_BEFORE_DOC = -1;  // before parsing
    protected final static int PH_DOC_START = 0;    // document start
    protected final static int PH_MISC_DTD = 1;     // misc before DTD
    protected final static int PH_DTD = 2;          // DTD
    protected final static int PH_DTD_MISC = 3;     // misc after DTD
    protected final static int PH_DOCELM = 4;       // document's element
    protected final static int PH_DOCELM_MISC = 5;  // misc after element
    protected final static int PH_AFTER_DOC = 6;    // after parsing
    protected int mEvt;  // current event type
    protected final static int EV_NULL = 0;   // unknown
    protected final static int EV_ELM = 1;    // empty element
    protected final static int EV_ELMS = 2;   // start element
    protected final static int EV_ELME = 3;   // end element
    protected final static int EV_TEXT = 4;   // textual content
    protected final static int EV_WSPC = 5;   // white space content
    protected final static int EV_PI = 6;     // processing instruction
    protected final static int EV_CDAT = 7;   // character data
    protected final static int EV_COMM = 8;   // comment
    protected final static int EV_DTD = 9;    // document type definition
    protected final static int EV_ENT = 10;   // skipped entity
    private char mESt; // built-in entity recognizer state
    // mESt values:
    //   0x100   : the initial state
    //   > 0x100 : unrecognized name
    //   < 0x100 : replacement character
    protected char[] mBuff;  // parser buffer
    protected int mBuffIdx;  // index of the last char
    protected Pair mPref;  // stack of prefixes
    protected Pair mElm;   // stack of elements
    // mAttL.chars - element qname
    // mAttL.next  - next element
    // mAttL.list  - list of attributes defined on this element
    // mAttL.list.chars - attribute qname
    // mAttL.list.id    - a char representing attribute's type see below
    // mAttL.list.next  - next attribute defined on the element
    // mAttL.list.list  - default value structure or null
    // mAttL.list.list.chars - "name='value' " chars array for Input
    //
    // Attribute type character values:
    // 'i' - "ID"
    // 'r' - "IDREF"
    // 'R' - "IDREFS"
    // 'n' - "ENTITY"
    // 'N' - "ENTITIES"
    // 't' - "NMTOKEN"
    // 'T' - "NMTOKENS"
    // 'u' - enumeration type
    // 'o' - "NOTATION"
    // 'c' - "CDATA"
    // see also: bkeyword() and atype()
    //
    protected Pair mAttL;  // list of defined attrs by element name
    protected Input mDoc;  // document entity
    protected Input mInp;  // stack of entities
    private char[] mChars; // reading buffer
    private int mChLen;    // current capacity
    private int mChIdx;    // index to the next char
    protected Attrs mAttrs;  // attributes of the curr. element
    private String[] mItems; // attributes array of the curr. element
    private char mAttrIdx;   // attributes counter/index
    private String mUnent;   // unresolved entity name
    private Pair mDltd;      // deleted objects for reuse
    /**
     * Default prefixes
     */
    // NOTE(review): element 0 of each array holds a small number (0, 4, 6)
    // rather than a name character; presumably a length/offset marker used by
    // the qname handling - confirm against qname()/Pair.
    private final static char NONS[];
    private final static char XML[];
    private final static char XMLNS[];

    static {
        NONS = new char[1];
        NONS[0] = (char) 0;

        XML = new char[4];
        XML[0] = (char) 4;
        XML[1] = 'x';
        XML[2] = 'm';
        XML[3] = 'l';

        XMLNS = new char[6];
        XMLNS[0] = (char) 6;
        XMLNS[1] = 'x';
        XMLNS[2] = 'm';
        XMLNS[3] = 'l';
        XMLNS[4] = 'n';
        XMLNS[5] = 's';
    }
    /**
     * ASCII character type array.
     *
     * This array maps an ASCII (7 bit) character to the character type.<br />
     * Possible character type values are:<br /> - ' ' for any kind of white
     * space character;<br /> - 'a' for any lower case alphabetical character
     * value;<br /> - 'A' for any upper case alphabetical character value;<br />
     * - 'd' for any decimal digit character value;<br /> - 'z' for any
     * character less than ' ' except '\t', '\n', '\r';<br /> An ASCII (7 bit)
     * character which does not fall in any category listed above is mapped to
     * itself.
     */
    private static final byte asctyp[];
    /**
     * NMTOKEN character type array.
     *
     * This array maps an ASCII (7 bit) character to the character type.<br />
     * Possible character type values are:<br /> - 0 for underscore ('_') or any
     * lower and upper case alphabetical character value;<br /> - 1 for colon
     * (':') character;<br /> - 2 for dash ('-') and dot ('.') or any decimal
     * digit character value;<br /> - 3 for any kind of white space character<br />
     * An ASCII (7 bit) character which does not fall in any category listed
     * above is mapped to 0xff.
     */
    private static final byte nmttyp[];

    /**
     * Static constructor.
173 * 174 * Sets up the ASCII character type array which is used by 175 * {@link #asctyp asctyp} method and NMTOKEN character type array. 176 */ 177 static { 178 short i = 0; 179 180 asctyp = new byte[0x80]; 181 while (i < ' ') { 182 asctyp[i++] = (byte) 'z'; 183 } 184 asctyp['\t'] = (byte) ' '; 185 asctyp['\r'] = (byte) ' '; 186 asctyp['\n'] = (byte) ' '; 187 while (i < '0') { 188 asctyp[i] = (byte) i++; 189 } 190 while (i <= '9') { 191 asctyp[i++] = (byte) 'd'; 192 } 193 while (i < 'A') { 194 asctyp[i] = (byte) i++; 195 } 196 while (i <= 'Z') { 197 asctyp[i++] = (byte) 'A'; 198 } 199 while (i < 'a') { 200 asctyp[i] = (byte) i++; 201 } 202 while (i <= 'z') { 203 asctyp[i++] = (byte) 'a'; 204 } 205 while (i < 0x80) { 206 asctyp[i] = (byte) i++; 207 } 208 209 nmttyp = new byte[0x80]; 210 for (i = 0; i < '0'; i++) { 211 nmttyp[i] = (byte) 0xff; 212 } 213 while (i <= '9') { 214 nmttyp[i++] = (byte) 2; // digits 215 } 216 while (i < 'A') { 217 nmttyp[i++] = (byte) 0xff; 218 } 219 // skiped upper case alphabetical character are already 0 220 for (i = '['; i < 'a'; i++) { 221 nmttyp[i] = (byte) 0xff; 222 } 223 // skiped lower case alphabetical character are already 0 224 for (i = '{'; i < 0x80; i++) { 225 nmttyp[i] = (byte) 0xff; 226 } 227 nmttyp['_'] = 0; 228 nmttyp[':'] = 1; 229 nmttyp['.'] = 2; 230 nmttyp['-'] = 2; 231 nmttyp[' '] = 3; 232 nmttyp['\t'] = 3; 233 nmttyp['\r'] = 3; 234 nmttyp['\n'] = 3; 235 } 236 237 /** 238 * Constructor. 
239 */ 240 protected Parser() { 241 mPh = PH_BEFORE_DOC; // before parsing 242 243 // Initialize the parser 244 mBuff = new char[BUFFSIZE_PARSER]; 245 mAttrs = new Attrs(); 246 247 // Default namespace 248 mPref = pair(mPref); 249 mPref.name = ""; 250 mPref.value = ""; 251 mPref.chars = NONS; 252 mNoNS = mPref; // no namespace 253 // XML namespace 254 mPref = pair(mPref); 255 mPref.name = "xml"; 256 mPref.value = "http://www.w3.org/XML/1998/namespace"; 257 mPref.chars = XML; 258 mXml = mPref; // XML namespace 259 } 260 261 /** 262 * Initializes parser's internals. Note, current input has to be set before 263 * this method is called. 264 */ 265 protected void init() { 266 mUnent = null; 267 mElm = null; 268 mPref = mXml; 269 mAttL = null; 270 mPEnt = new HashMap<>(); 271 mEnt = new HashMap<>(); 272 mDoc = mInp; // current input is document entity 273 mChars = mInp.chars; // use document entity buffer 274 mPh = PH_DOC_START; // the begining of the document 275 } 276 277 /** 278 * Cleans up parser internal resources. 279 */ 280 protected void cleanup() { 281 // Default attributes 282 while (mAttL != null) { 283 while (mAttL.list != null) { 284 if (mAttL.list.list != null) { 285 del(mAttL.list.list); 286 } 287 mAttL.list = del(mAttL.list); 288 } 289 mAttL = del(mAttL); 290 } 291 // Element stack 292 while (mElm != null) { 293 mElm = del(mElm); 294 } 295 // Namespace prefixes 296 while (mPref != mXml) { 297 mPref = del(mPref); 298 } 299 // Inputs 300 while (mInp != null) { 301 pop(); 302 } 303 // Document reader 304 if ((mDoc != null) && (mDoc.src != null)) { 305 try { 306 mDoc.src.close(); 307 } catch (IOException ioe) { 308 } 309 } 310 mPEnt = null; 311 mEnt = null; 312 mDoc = null; 313 mPh = PH_AFTER_DOC; // before documnet processing 314 } 315 316 /** 317 * Processes a portion of document. This method returns one of EV_* 318 * constants as an identifier of the portion of document have been read. 319 * 320 * @return Identifier of processed document portion. 
     * @exception Exception is parser specific exception from the panic method.
     * @exception IOException
     */
    @SuppressWarnings("fallthrough")
    protected int step() throws Exception {
        mEvt = EV_NULL;
        int st = 0;
        // Loop until one of the branches below assigns an event type.
        while (mEvt == EV_NULL) {
            // Take the next char straight from the reading buffer when one is
            // available, otherwise go through getch().
            char ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch();
            switch (st) {
                case 0:  // all sorts of markup (dispatcher)
                    if (ch != '<') {
                        bkch();
                        mBuffIdx = -1;  // clean parser buffer
                        st = 1;
                        break;
                    }
                    switch (getch()) {
                        case '/':  // the end of the element content
                            mEvt = EV_ELME;
                            if (mElm == null) {
                                panic(FAULT);
                            }
                            // Check element's open/close tags balance
                            mBuffIdx = -1;  // clean parser buffer
                            bname(mIsNSAware);
                            char[] chars = mElm.chars;
                            if (chars.length == (mBuffIdx + 1)) {
                                // The comparison starts at index 1; index 0 is
                                // not compared.
                                for (char i = 1; i <= mBuffIdx; i += 1) {
                                    if (chars[i] != mBuff[i]) {
                                        panic(FAULT);
                                    }
                                }
                            } else {
                                panic(FAULT);
                            }
                            // Skip white spaces before '>'
                            if (wsskip() != '>') {
                                panic(FAULT);
                            }
                            getch();  // read '>'
                            break;

                        case '!':  // a comment or a CDATA
                            // Peek at the char after "<!" and push it back so
                            // the chosen sub-parser sees it again.
                            ch = getch();
                            bkch();
                            switch (ch) {
                                case '-':  // must be a comment
                                    mEvt = EV_COMM;
                                    comm();
                                    break;

                                case '[':  // must be a CDATA section
                                    mEvt = EV_CDAT;
                                    cdat();
                                    break;

                                default:  // must be 'DOCTYPE'
                                    mEvt = EV_DTD;
                                    dtd();
                                    break;
                            }
                            break;

                        case '?':  // processing instruction
                            mEvt = EV_PI;
                            pi();
                            break;

                        default:  // must be the first char of an xml name
                            bkch();
                            // Read an element name and put it on top of the
                            // element stack
                            mElm = pair(mElm);  // add new element to the stack
                            mElm.chars = qname(mIsNSAware);
                            mElm.name = mElm.local();
                            // Take over the flags of the enclosing element, if any
                            mElm.id = (mElm.next != null) ? mElm.next.id : 0;  // flags
                            mElm.num = 0;  // namespace counter
                            // Find the list of defined attributes of the current
                            // element
                            Pair elm = find(mAttL, mElm.chars);
                            mElm.list = (elm != null) ? elm.list : null;
                            // Read attributes till the end of the element tag
                            mAttrIdx = 0;
                            Pair att = pair(null);
                            att.num = 0;  // clear attribute's flags
                            attr(att);    // get all attributes inc. defaults
                            del(att);
                            mElm.value = (mIsNSAware) ? rslv(mElm.chars) : null;
                            // Skip white spaces before '>'
                            switch (wsskip()) {
                                case '>':
                                    getch();  // read '>'
                                    mEvt = EV_ELMS;
                                    break;

                                case '/':
                                    getch();  // read '/'
                                    if (getch() != '>') // read '>'
                                    {
                                        panic(FAULT);
                                    }
                                    mEvt = EV_ELM;
                                    break;

                                default:
                                    panic(FAULT);
                            }
                            break;
                    }
                    break;

                case 1:  // read white space
                    switch (ch) {
                        case ' ':
                        case '\t':
                        case '\n':
                            bappend(ch);
                            break;

                        case '\r':  // EOL processing [#2.11]
                            // "\r\n" and a lone "\r" are both appended as '\n'
                            if (getch() != '\n') {
                                bkch();
                            }
                            bappend('\n');
                            break;

                        case '<':
                            mEvt = EV_WSPC;
                            bkch();
                            bflash_ws();
                            break;

                        default:
                            // Not white space: push the char back and continue
                            // in the text content state.
                            bkch();
                            st = 2;
                            break;
                    }
                    break;

                case 2:  // read the text content of the element
                    switch (ch) {
                        case '&':
                            if (mUnent == null) {
                                // There was no unresolved entity on previous step.
                                if ((mUnent = ent('x')) != null) {
                                    mEvt = EV_TEXT;
                                    bkch();      // move back to ';' after entity name
                                    setch('&');  // parser must be back on next step
                                    bflash();
                                }
                            } else {
                                // There was unresolved entity on previous step.
                                mEvt = EV_ENT;
                                skippedEnt(mUnent);
                                mUnent = null;
                            }
                            break;

                        case '<':
                            mEvt = EV_TEXT;
                            bkch();
                            bflash();
                            break;

                        case '\r':  // EOL processing [#2.11]
                            if (getch() != '\n') {
                                bkch();
                            }
                            bappend('\n');
                            break;

                        case EOS:
                            panic(FAULT);
                            // no break here: control falls into the default
                            // label (covered by @SuppressWarnings("fallthrough"))

                        default:
                            bappend(ch);
                            break;
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }

        return mEvt;
    }

    /**
     * Parses the document type declaration.
     *
     * @exception Exception is parser specific exception from the panic method.
     * @exception IOException
     */
    private void dtd() throws Exception {
        char ch;
        // NOTE(review): 'str' is never read in this method.
        String str = null;
        String name = null;
        Pair psid = null;
        // read 'DOCTYPE'
        if ("DOCTYPE".equals(name(false)) != true) {
            panic(FAULT);
        }
        mPh = PH_DTD;  // DTD
        // st is set to -1 once the closing '>' of the declaration is handled.
        for (short st = 0; st >= 0;) {
            ch = getch();
            switch (st) {
                case 0:  // read the document type name
                    if (chtyp(ch) != ' ') {
                        bkch();
                        name = name(mIsNSAware);
                        wsskip();
                        st = 1;  // read 'PUBLIC' or 'SYSTEM'
                    }
                    break;

                case 1:  // read 'PUBLIC' or 'SYSTEM'
                    switch (chtyp(ch)) {
                        case 'A':
                            bkch();
                            psid = pubsys(' ');
                            st = 2;  // skip spaces before internal subset
                            docType(name, psid.name, psid.value);
                            break;

                        case '[':
                            bkch();
                            st = 2;  // skip spaces before internal subset
                            docType(name, null, null);
                            break;

                        case '>':
                            bkch();
                            st = 3;  // skip spaces after internal subset
                            docType(name, null, null);
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                case 2:  // skip spaces before internal subset
                    switch (chtyp(ch)) {
                        case '[':
                            // Process internal subset
                            dtdsub();
                            st = 3;  // skip spaces after internal subset
                            break;

                        case '>':
                            // There is no internal subset
                            bkch();
                            st = 3;  // skip spaces after internal subset
                            break;

                        case ' ':
                            // skip white spaces
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                case 3:  // skip spaces after internal subset
                    switch (chtyp(ch)) {
                        case '>':
                            // psid is non-null only when an external identifier
                            // was read in state 1.
                            if (psid != null) {
                                // Report the DTD external subset
                                InputSource is = resolveEnt(name, psid.name, psid.value);
                                if (is != null) {
                                    if (mIsSAlone == false) {
                                        // Set the end of DTD external subset char
                                        bkch();
                                        setch(']');
                                        // Set the DTD external subset InputSource
                                        push(new Input(BUFFSIZE_READER));
                                        setinp(is);
                                        mInp.pubid = psid.name;
                                        mInp.sysid = psid.value;
                                        // Parse the DTD external subset
                                        dtdsub();
                                    } else {
                                        // Unresolved DTD external subset
                                        skippedEnt("[dtd]");
                                        // Release reader and stream
                                        if (is.getCharacterStream() != null) {
                                            try {
                                                is.getCharacterStream().close();
                                            } catch (IOException ioe) {
                                                // close failure is ignored
                                            }
                                        }
                                        if (is.getByteStream() != null) {
                                            try {
                                                is.getByteStream().close();
                                            } catch (IOException ioe) {
                                                // close failure is ignored
                                            }
                                        }
                                    }
                                } else {
                                    // Unresolved DTD external subset
                                    skippedEnt("[dtd]");
                                }
                                del(psid);
                            }
                            st = -1;  // end of DTD
                            break;

                        case ' ':
                            // skip white spaces
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }
    }

    /**
     * Parses the document type declaration subset.
     *
     * @exception Exception is parser specific exception from the panic method.
650 * @exception IOException 651 */ 652 private void dtdsub() throws Exception { 653 char ch; 654 for (short st = 0; st >= 0;) { 655 ch = getch(); 656 switch (st) { 657 case 0: // skip white spaces before a declaration 658 switch (chtyp(ch)) { 659 case '<': 660 ch = getch(); 661 switch (ch) { 662 case '?': 663 pi(); 664 break; 665 666 case '!': 667 ch = getch(); 668 bkch(); 669 if (ch == '-') { 670 comm(); 671 break; 672 } 673 // A markup or an entity declaration 674 bntok(); 675 switch (bkeyword()) { 676 case 'n': 677 dtdent(); 678 break; 679 680 case 'a': 681 dtdattl(); // parse attributes declaration 682 break; 683 684 case 'e': 685 dtdelm(); // parse element declaration 686 break; 687 688 case 'o': 689 dtdnot(); // parse notation declaration 690 break; 691 692 default: 693 panic(FAULT); // unsupported markup declaration 694 break; 695 } 696 st = 1; // read the end of declaration 697 break; 698 699 default: 700 panic(FAULT); 701 break; 702 } 703 break; 704 705 case '%': 706 // A parameter entity reference 707 pent(' '); 708 break; 709 710 case ']': 711 // End of DTD subset 712 st = -1; 713 break; 714 715 case ' ': 716 // Skip white spaces 717 break; 718 719 case 'Z': 720 // End of stream 721 if (getch() != ']') { 722 panic(FAULT); 723 } 724 st = -1; 725 break; 726 727 default: 728 panic(FAULT); 729 } 730 break; 731 732 case 1: // read the end of declaration 733 switch (ch) { 734 case '>': // there is no notation 735 st = 0; // skip white spaces before a declaration 736 break; 737 738 case ' ': 739 case '\n': 740 case '\r': 741 case '\t': 742 // Skip white spaces 743 break; 744 745 default: 746 panic(FAULT); 747 break; 748 } 749 break; 750 751 default: 752 panic(FAULT); 753 } 754 } 755 } 756 757 /** 758 * Parses an entity declaration. This method fills the general ( 759 * <code>mEnt</code>) and parameter 760 * ( 761 * <code>mPEnt</code>) entity look up table. 762 * 763 * @exception Exception is parser specific exception form panic method. 
     * @exception IOException
     */
    @SuppressWarnings("fallthrough")
    private void dtdent() throws Exception {
        String str = null;   // entity name
        char[] val = null;   // entity value characters
        Input inp = null;
        Pair ids = null;     // external identifier as returned by pubsys()
        char ch;
        // st is set to -1 at the end of the declaration.
        for (short st = 0; st >= 0;) {
            ch = getch();
            switch (st) {
                case 0:  // skip white spaces before entity name
                    switch (chtyp(ch)) {
                        case ' ':
                            // Skip white spaces
                            break;

                        case '%':
                            // Parameter entity or parameter entity declaration.
                            ch = getch();
                            bkch();
                            if (chtyp(ch) == ' ') {
                                // Parameter entity declaration.
                                wsskip();
                                str = name(false);
                                switch (chtyp(wsskip())) {
                                    case 'A':
                                        // Read the external identifier
                                        ids = pubsys(' ');
                                        if (wsskip() == '>') {
                                            // External parsed entity
                                            // Only the first declaration of a name is kept
                                            if (mPEnt.containsKey(str) == false) {  // [#4.2]
                                                inp = new Input();
                                                inp.pubid = ids.name;
                                                inp.sysid = ids.value;
                                                mPEnt.put(str, inp);
                                            }
                                        } else {
                                            panic(FAULT);
                                        }
                                        del(ids);
                                        st = -1;  // the end of declaration
                                        break;

                                    case '\"':
                                    case '\'':
                                        // Read the parameter entity value
                                        bqstr('d');
                                        // Create the parameter entity value
                                        val = new char[mBuffIdx + 1];
                                        System.arraycopy(mBuff, 1, val, 1, val.length - 1);
                                        // Add surrounding spaces [#4.4.8]
                                        // NOTE(review): only the leading space
                                        // (val[0]) is written here; val has no
                                        // slot for a trailing one.
                                        val[0] = ' ';
                                        // Add the entity to the entity look up table
                                        if (mPEnt.containsKey(str) == false) {  // [#4.2]
                                            inp = new Input(val);
                                            inp.pubid = mInp.pubid;
                                            inp.sysid = mInp.sysid;
                                            inp.xmlenc = mInp.xmlenc;
                                            inp.xmlver = mInp.xmlver;
                                            mPEnt.put(str, inp);
                                        }
                                        st = -1;  // the end of declaration
                                        break;

                                    default:
                                        panic(FAULT);
                                        break;
                                }
                            } else {
                                // Parameter entity reference.
                                pent(' ');
                            }
                            break;

                        default:
                            bkch();
                            str = name(false);
                            st = 1;  // read entity declaration value
                            break;
                    }
                    break;

                case 1:  // read entity declaration value
                    switch (chtyp(ch)) {
                        case '\"':  // internal entity
                        case '\'':
                            bkch();
                            bqstr('d');  // read a string into the buffer
                            if (mEnt.get(str) == null) {
                                // Create general entity value
                                val = new char[mBuffIdx];
                                System.arraycopy(mBuff, 1, val, 0, val.length);
                                // Add the entity to the entity look up table
                                if (mEnt.containsKey(str) == false) {  // [#4.2]
                                    inp = new Input(val);
                                    inp.pubid = mInp.pubid;
                                    inp.sysid = mInp.sysid;
                                    inp.xmlenc = mInp.xmlenc;
                                    inp.xmlver = mInp.xmlver;
                                    mEnt.put(str, inp);
                                }
                            }
                            st = -1;  // the end of declaration
                            break;

                        case 'A':  // external entity
                            bkch();
                            ids = pubsys(' ');
                            switch (wsskip()) {
                                case '>':  // external parsed entity
                                    if (mEnt.containsKey(str) == false) {  // [#4.2]
                                        inp = new Input();
                                        inp.pubid = ids.name;
                                        inp.sysid = ids.value;
                                        mEnt.put(str, inp);
                                    }
                                    break;

                                case 'N':  // external general unparsed entity
                                    if ("NDATA".equals(name(false)) == true) {
                                        wsskip();
                                        unparsedEntDecl(str, ids.name, ids.value, name(false));
                                        break;
                                    }
                                    // any keyword other than NDATA falls
                                    // through to the default label
                                default:
                                    panic(FAULT);
                                    break;
                            }
                            del(ids);
                            st = -1;  // the end of declaration
                            break;

                        case ' ':
                            // Skip white spaces
                            break;

                        default:
                            panic(FAULT);
                            break;
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }
    }

    /**
     * Parses an element declaration.
     *
     * This method parses the declaration up to the closing angle bracket.
     *
     * @exception Exception is parser specific exception from the panic method.
920 * @exception IOException 921 */ 922 @SuppressWarnings("fallthrough") 923 private void dtdelm() throws Exception { 924 // This is stub implementation which skips an element 925 // declaration. 926 wsskip(); 927 name(mIsNSAware); 928 929 char ch; 930 while (true) { 931 ch = getch(); 932 switch (ch) { 933 case '>': 934 bkch(); 935 return; 936 937 case EOS: 938 panic(FAULT); 939 940 default: 941 break; 942 } 943 } 944 } 945 946 /** 947 * Parses an attribute list declaration. 948 * 949 * This method parses the declaration up to the closing angle bracket. 950 * 951 * @exception Exception is parser specific exception form panic method. 952 * @exception IOException 953 */ 954 private void dtdattl() throws Exception { 955 char elmqn[] = null; 956 Pair elm = null; 957 char ch; 958 for (short st = 0; st >= 0;) { 959 ch = getch(); 960 switch (st) { 961 case 0: // read the element name 962 switch (chtyp(ch)) { 963 case 'a': 964 case 'A': 965 case '_': 966 case 'X': 967 case ':': 968 bkch(); 969 // Get the element from the list or add a new one. 970 elmqn = qname(mIsNSAware); 971 elm = find(mAttL, elmqn); 972 if (elm == null) { 973 elm = pair(mAttL); 974 elm.chars = elmqn; 975 mAttL = elm; 976 } 977 st = 1; // read an attribute declaration 978 break; 979 980 case ' ': 981 break; 982 983 case '%': 984 pent(' '); 985 break; 986 987 default: 988 panic(FAULT); 989 break; 990 } 991 break; 992 993 case 1: // read an attribute declaration 994 switch (chtyp(ch)) { 995 case 'a': 996 case 'A': 997 case '_': 998 case 'X': 999 case ':': 1000 bkch(); 1001 dtdatt(elm); 1002 if (wsskip() == '>') { 1003 return; 1004 } 1005 break; 1006 1007 case ' ': 1008 break; 1009 1010 case '%': 1011 pent(' '); 1012 break; 1013 1014 default: 1015 panic(FAULT); 1016 break; 1017 } 1018 break; 1019 1020 default: 1021 panic(FAULT); 1022 break; 1023 } 1024 } 1025 } 1026 1027 /** 1028 * Parses an attribute declaration. 
     *
     * The attribute uses the following fields of the Pair object:
     * <code>chars</code> - characters of the qualified name;
     * <code>id</code> - the type identifier of the attribute;
     * <code>list</code> - a pair which holds the default value
     * (<code>chars</code> field).
     *
     * The declaration is read by a small state machine: 0 - attribute name,
     * 1 - attribute type, 2 - first item of an enumeration or notation list,
     * 3 - following items of the list, 4 - default declaration,
     * 5 - default value. A negative state ends the loop.
     *
     * @param elm An object which represents all defined attributes on an
     * element.
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    @SuppressWarnings("fallthrough")
    private void dtdatt(Pair elm) throws Exception {
        char attqn[] = null;
        Pair att = null;
        char ch;
        for (short st = 0; st >= 0;) {
            ch = getch();
            switch (st) {
                case 0:  // the attribute name
                    switch (chtyp(ch)) {
                        case 'a':
                        case 'A':
                        case '_':
                        case 'X':
                        case ':':
                            bkch();
                            // Get the attribute from the list or add a new one.
                            attqn = qname(mIsNSAware);
                            att = find(elm.list, attqn);
                            if (att == null) {
                                // New attribute declaration
                                att = pair(elm.list);
                                att.chars = attqn;
                                elm.list = att;
                            } else {
                                // Do not override the attribute declaration [#3.3]
                                // The duplicate is parsed into a pair which is
                                // not linked into elm.list.
                                att = pair(null);
                                att.chars = attqn;
                                att.id = 'c';
                            }
                            wsskip();
                            st = 1;
                            break;

                        case '%':
                            pent(' ');
                            break;

                        case ' ':
                            break;

                        default:
                            panic(FAULT);
                            break;
                    }
                    break;

                case 1:  // the attribute type
                    switch (chtyp(ch)) {
                        case '(':
                            att.id = 'u';  // enumeration type
                            st = 2;        // read the first element of the list
                            break;

                        case '%':
                            pent(' ');
                            break;

                        case ' ':
                            break;

                        default:
                            bkch();
                            bntok();  // read type id
                            att.id = bkeyword();
                            switch (att.id) {
                                case 'o':  // NOTATION
                                    if (wsskip() != '(') {
                                        panic(FAULT);
                                    }
                                    // wsskip() only looks ahead; consume the '('
                                    ch = getch();
                                    st = 2;  // read the first element of the list
                                    break;

                                case 'i':  // ID
                                case 'r':  // IDREF
                                case 'R':  // IDREFS
                                case 'n':  // ENTITY
                                case 'N':  // ENTITIES
                                case 't':  // NMTOKEN
                                case 'T':  // NMTOKENS
                                case 'c':  // CDATA
                                    wsskip();
                                    st = 4;  // read default declaration
                                    break;

                                default:
                                    // not an attribute type keyword
                                    panic(FAULT);
                                    break;
                            }
                            break;
                    }
                    break;

                case 2:  // read the first element of the list
                    switch (chtyp(ch)) {
                        case 'a':
                        case 'A':
                        case 'd':
                        case '.':
                        case ':':
                        case '-':
                        case '_':
                        case 'X':
                            bkch();
                            switch (att.id) {
                                case 'u':  // enumeration type
                                    bntok();
                                    break;

                                case 'o':  // NOTATION
                                    mBuffIdx = -1;
                                    bname(false);
                                    break;

                                default:
                                    panic(FAULT);
                                    break;
                            }
                            wsskip();
                            st = 3;  // read next element of the list
                            break;

                        case '%':
                            pent(' ');
                            break;

                        case ' ':
                            break;

                        default:
                            panic(FAULT);
                            break;
                    }
                    break;

                case 3:  // read next element of the list
                    switch (ch) {
                        case ')':
                            wsskip();
                            st = 4;  // read default declaration
                            break;

                        case '|':
                            wsskip();
                            switch (att.id) {
                                case 'u':  // enumeration type
                                    bntok();
                                    break;

                                case 'o':  // NOTATION
                                    mBuffIdx = -1;
                                    bname(false);
                                    break;

                                default:
                                    panic(FAULT);
                                    break;
                            }
                            wsskip();
                            break;

                        case '%':
                            pent(' ');
                            break;

                        default:
                            panic(FAULT);
                            break;
                    }
                    break;

                case 4:  // read default declaration
                    switch (ch) {
                        case '#':
                            bntok();
                            switch (bkeyword()) {
                                case 'F':  // FIXED
                                    // wsskip() leaves the look ahead character
                                    // unread, so state 5 sees the quote again.
                                    switch (wsskip()) {
                                        case '\"':
                                        case '\'':
                                            st = 5;  // read the default value
                                            break;

                                        case EOS:
                                            panic(FAULT);

                                        default:
                                            st = -1;
                                            break;
                                    }
                                    break;

                                case 'Q':  // REQUIRED
                                case 'I':  // IMPLIED
                                    st = -1;
                                    break;

                                default:
                                    panic(FAULT);
                                    break;
                            }
                            break;

                        case '\"':
                        case '\'':
                            bkch();
                            st = 5;  // read the default value
                            break;

                        case ' ':
                        case '\n':
                        case '\r':
                        case '\t':
                            break;

                        case '%':
                            pent(' ');
                            break;

                        default:
                            // no default declaration; give the character back
                            bkch();
                            st = -1;
                            break;
                    }
                    break;

                case 5:  // read the default value
                    switch (ch) {
                        case '\"':
                        case '\'':
                            bkch();
                            bqstr('d');  // the value in the mBuff now
                            att.list = pair(null);
                            // Create a string like "attqname='value' "
                            // att.chars[0] is the colon offset, so the name
                            // itself is att.chars.length - 1 characters long.
                            att.list.chars = new char[att.chars.length + mBuffIdx + 3];
                            System.arraycopy(
                                    att.chars, 1, att.list.chars, 0, att.chars.length - 1);
                            att.list.chars[att.chars.length - 1] = '=';
                            att.list.chars[att.chars.length] = ch;
                            System.arraycopy(
                                    mBuff, 1, att.list.chars, att.chars.length + 1, mBuffIdx);
                            att.list.chars[att.chars.length + mBuffIdx + 1] = ch;
                            att.list.chars[att.chars.length + mBuffIdx + 2] = ' ';
                            st = -1;
                            break;

                        default:
                            panic(FAULT);
                            break;
                    }
                    break;

                default:
                    panic(FAULT);
                    break;
            }
        }
    }

    /**
     * Parses a notation declaration.
     *
     * This method parses the declaration up to the closing angle bracket.
     * The notation name and its identifiers are reported through
     * <code>notDecl</code>.
     *
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    private void dtdnot() throws Exception {
        wsskip();
        String name = name(false);
        wsskip();
        // 'N' lets a public identifier appear without a system identifier
        Pair ids = pubsys('N');
        notDecl(name, ids.name, ids.value);
        del(ids);
    }

    /**
     * Parses an attribute.
     *
     * This recursive method is responsible for prefix addition
     * (<code>mPref</code>) on the way down. The element's start tag end triggers
     * the return process. The method then on its way back resolves prefixes
     * and accumulates attributes.
     *
     * <p><code>att.num</code> carries attribute flags where: 0x1 - attribute is
     * declared in DTD (attribute declaration had been read); 0x2 - attribute's
     * default value is used.</p>
     *
     * @param att An object which represents current attribute.
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    @SuppressWarnings("fallthrough")
    private void attr(Pair att) throws Exception {
        switch (wsskip()) {
            case '/':
            case '>':
                if ((att.num & 0x2) == 0) {  // all attributes have been read
                    att.num |= 0x2;  // set default attribute flag
                    Input inp = mInp;
                    // Go through all attributes defined on current element.
                    for (Pair def = mElm.list; def != null; def = def.next) {
                        if (def.list == null) // no default value
                        {
                            continue;
                        }
                        // Go through all attributes defined on current
                        // element and add defaults.
                        Pair act = find(att.next, def.chars);
                        if (act == null) {
                            // The default is pushed as input text and is read
                            // by the recursive call below.
                            push(new Input(def.list.chars));
                        }
                    }
                    if (mInp != inp) {  // defaults have been added
                        attr(att);
                        return;
                    }
                }
                // Ensure the attribute string array capacity
                mAttrs.setLength(mAttrIdx);
                mItems = mAttrs.mItems;
                return;

            case EOS:
                panic(FAULT);

            default:
                // Read the attribute name and value
                att.chars = qname(mIsNSAware);
                att.name = att.local();
                String type = atype(att);  // sets attribute's type on att.id
                wsskip();
                if (getch() != '=') {
                    panic(FAULT);
                }
                bqstr((char) att.id);   // read the value with normalization.
                String val = new String(mBuff, 1, mBuffIdx);
                Pair next = pair(att);
                next.num = (att.num & ~0x1);  // inherit attribute flags
                // Put a namespace declaration on top of the prefix stack
                if ((mIsNSAware == false) || (isdecl(att, val) == false)) {
                    // An ordinary attribute
                    mAttrIdx++;
                    attr(next);     // recursive call to parse the next attribute
                    mAttrIdx--;
                    // Add the attribute to the attributes string array
                    // Each attribute starts at index mAttrIdx * 8 of mItems.
                    char idx = (char) (mAttrIdx << 3);
                    mItems[idx + 1] = att.qname();  // attr qname
                    mItems[idx + 2] = (mIsNSAware) ? att.name : ""; // attr local name
                    mItems[idx + 3] = val;          // attr value
                    mItems[idx + 4] = type;         // attr type
                    switch (att.num & 0x3) {
                        case 0x0:
                            mItems[idx + 5] = null;
                            break;

                        case 0x1:  // declared attribute
                            mItems[idx + 5] = "d";
                            break;

                        default:  // 0x2, 0x3 - default attribute always declared
                            mItems[idx + 5] = "D";
                            break;
                    }
                    // Resolve the prefix if any and report the attribute
                    // NOTE: The attribute does not accept the default namespace.
                    mItems[idx + 0] = (att.chars[0] != 0) ? rslv(att.chars) : "";
                } else {
                    // A namespace declaration. mPref.name contains prefix and
                    // mPref.value contains namespace URI set by isdecl method.
                    // Report a start of the new mapping
                    newPrefix();
                    // Recursive call to parse the next attribute
                    attr(next);
                    // NOTE: The namespace declaration is not reported.
                }
                del(next);
                break;
        }
    }

    /**
     * Retrieves attribute type.
     *
     * This method sets the type of normalization in the attribute
     * <code>id</code> field and returns the name of attribute type.
     *
     * @param att An object which represents current attribute.
     * @return The name of the attribute type.
     * @exception Exception is parser specific exception from panic method.
1431 */ 1432 private String atype(Pair att) 1433 throws Exception { 1434 Pair attr; 1435 1436 // CDATA-type normalization by default [#3.3.3] 1437 att.id = 'c'; 1438 if (mElm.list == null || (attr = find(mElm.list, att.chars)) == null) { 1439 return "CDATA"; 1440 } 1441 1442 att.num |= 0x1; // attribute is declared 1443 1444 // Non-CDATA normalization except when the attribute type is CDATA. 1445 att.id = 'i'; 1446 switch (attr.id) { 1447 case 'i': 1448 return "ID"; 1449 1450 case 'r': 1451 return "IDREF"; 1452 1453 case 'R': 1454 return "IDREFS"; 1455 1456 case 'n': 1457 return "ENTITY"; 1458 1459 case 'N': 1460 return "ENTITIES"; 1461 1462 case 't': 1463 return "NMTOKEN"; 1464 1465 case 'T': 1466 return "NMTOKENS"; 1467 1468 case 'u': 1469 return "NMTOKEN"; 1470 1471 case 'o': 1472 return "NOTATION"; 1473 1474 case 'c': 1475 att.id = 'c'; 1476 return "CDATA"; 1477 1478 default: 1479 panic(FAULT); 1480 } 1481 return null; 1482 } 1483 1484 /** 1485 * Parses a comment. 1486 * 1487 * The '<!' part is read in dispatcher so the method starts 1488 * with first '-' after '<!'. 1489 * 1490 * @exception Exception is parser specific exception form panic method. 1491 */ 1492 @SuppressWarnings("fallthrough") 1493 private void comm() throws Exception { 1494 if (mPh == PH_DOC_START) { 1495 mPh = PH_MISC_DTD; // misc before DTD 1496 } // '<!' has been already read by dispetcher. 1497 char ch; 1498 mBuffIdx = -1; 1499 for (short st = 0; st >= 0;) { 1500 ch = (mChIdx < mChLen) ? 
mChars[mChIdx++] : getch(); 1501 if (ch == EOS) { 1502 panic(FAULT); 1503 } 1504 switch (st) { 1505 case 0: // first '-' of the comment open 1506 if (ch == '-') { 1507 st = 1; 1508 } else { 1509 panic(FAULT); 1510 } 1511 break; 1512 1513 case 1: // secind '-' of the comment open 1514 if (ch == '-') { 1515 st = 2; 1516 } else { 1517 panic(FAULT); 1518 } 1519 break; 1520 1521 case 2: // skip the comment body 1522 switch (ch) { 1523 case '-': 1524 st = 3; 1525 break; 1526 1527 default: 1528 bappend(ch); 1529 break; 1530 } 1531 break; 1532 1533 case 3: // second '-' of the comment close 1534 switch (ch) { 1535 case '-': 1536 st = 4; 1537 break; 1538 1539 default: 1540 bappend('-'); 1541 bappend(ch); 1542 st = 2; 1543 break; 1544 } 1545 break; 1546 1547 case 4: // '>' of the comment close 1548 if (ch == '>') { 1549 comm(mBuff, mBuffIdx + 1); 1550 st = -1; 1551 break; 1552 } 1553 // else - panic [#2.5 compatibility note] 1554 1555 default: 1556 panic(FAULT); 1557 } 1558 } 1559 } 1560 1561 /** 1562 * Parses a processing instruction. 1563 * 1564 * The '<?' is read in dispatcher so the method starts with 1565 * first character of PI target name after '<?'. 1566 * 1567 * @exception Exception is parser specific exception form panic method. 1568 * @exception IOException 1569 */ 1570 private void pi() throws Exception { 1571 // '<?' has been already read by dispetcher. 
1572 char ch; 1573 String str = null; 1574 mBuffIdx = -1; 1575 for (short st = 0; st >= 0;) { 1576 ch = getch(); 1577 if (ch == EOS) { 1578 panic(FAULT); 1579 } 1580 switch (st) { 1581 case 0: // read the PI target name 1582 switch (chtyp(ch)) { 1583 case 'a': 1584 case 'A': 1585 case '_': 1586 case ':': 1587 case 'X': 1588 bkch(); 1589 str = name(false); 1590 // PI target name may not be empty string [#2.6] 1591 // PI target name 'XML' is reserved [#2.6] 1592 if ((str.length() == 0) 1593 || (mXml.name.equals(str.toLowerCase()) == true)) { 1594 panic(FAULT); 1595 } 1596 // This is processing instruction 1597 if (mPh == PH_DOC_START) // the begining of the document 1598 { 1599 mPh = PH_MISC_DTD; // misc before DTD 1600 } 1601 wsskip(); // skip spaces after the PI target name 1602 st = 1; // accumulate the PI body 1603 mBuffIdx = -1; 1604 break; 1605 1606 default: 1607 panic(FAULT); 1608 } 1609 break; 1610 1611 case 1: // accumulate the PI body 1612 switch (ch) { 1613 case '?': 1614 st = 2; // end of the PI body 1615 break; 1616 1617 default: 1618 bappend(ch); 1619 break; 1620 } 1621 break; 1622 1623 case 2: // end of the PI body 1624 switch (ch) { 1625 case '>': 1626 // PI has been read. 1627 pi(str, new String(mBuff, 0, mBuffIdx + 1)); 1628 st = -1; 1629 break; 1630 1631 case '?': 1632 bappend('?'); 1633 break; 1634 1635 default: 1636 bappend('?'); 1637 bappend(ch); 1638 st = 1; // accumulate the PI body 1639 break; 1640 } 1641 break; 1642 1643 default: 1644 panic(FAULT); 1645 } 1646 } 1647 } 1648 1649 /** 1650 * Parses a character data. 1651 * 1652 * The '<!' part is read in dispatcher so the method starts 1653 * with first '[' after '<!'. 1654 * 1655 * @exception Exception is parser specific exception form panic method. 1656 * @exception IOException 1657 */ 1658 private void cdat() 1659 throws Exception { 1660 // '<!' has been already read by dispetcher. 
1661 char ch; 1662 mBuffIdx = -1; 1663 for (short st = 0; st >= 0;) { 1664 ch = getch(); 1665 switch (st) { 1666 case 0: // the first '[' of the CDATA open 1667 if (ch == '[') { 1668 st = 1; 1669 } else { 1670 panic(FAULT); 1671 } 1672 break; 1673 1674 case 1: // read "CDATA" 1675 if (chtyp(ch) == 'A') { 1676 bappend(ch); 1677 } else { 1678 if ("CDATA".equals( 1679 new String(mBuff, 0, mBuffIdx + 1)) != true) { 1680 panic(FAULT); 1681 } 1682 bkch(); 1683 st = 2; 1684 } 1685 break; 1686 1687 case 2: // the second '[' of the CDATA open 1688 if (ch != '[') { 1689 panic(FAULT); 1690 } 1691 mBuffIdx = -1; 1692 st = 3; 1693 break; 1694 1695 case 3: // read data before the first ']' 1696 if (ch != ']') { 1697 bappend(ch); 1698 } else { 1699 st = 4; 1700 } 1701 break; 1702 1703 case 4: // read the second ']' or continue to read the data 1704 if (ch != ']') { 1705 bappend(']'); 1706 bappend(ch); 1707 st = 3; 1708 } else { 1709 st = 5; 1710 } 1711 break; 1712 1713 case 5: // read '>' or continue to read the data 1714 switch (ch) { 1715 case ']': 1716 bappend(']'); 1717 break; 1718 1719 case '>': 1720 bflash(); 1721 st = -1; 1722 break; 1723 1724 default: 1725 bappend(']'); 1726 bappend(']'); 1727 bappend(ch); 1728 st = 3; 1729 break; 1730 } 1731 break; 1732 1733 default: 1734 panic(FAULT); 1735 } 1736 } 1737 } 1738 1739 /** 1740 * Reads a xml name. 1741 * 1742 * The xml name must conform "Namespaces in XML" specification. Therefore 1743 * the ':' character is not allowed in the name. This method should be used 1744 * for PI and entity names which may not have a namespace according to the 1745 * specification mentioned above. 1746 * 1747 * @param ns The true value turns namespace conformance on. 1748 * @return The name has been read. 1749 * @exception Exception When incorrect character appear in the name. 
1750 * @exception IOException 1751 */ 1752 protected String name(boolean ns) 1753 throws Exception { 1754 mBuffIdx = -1; 1755 bname(ns); 1756 return new String(mBuff, 1, mBuffIdx); 1757 } 1758 1759 /** 1760 * Reads a qualified xml name. 1761 * 1762 * The characters of a qualified name is an array of characters. The first 1763 * (chars[0]) character is the index of the colon character which separates 1764 * the prefix from the local name. If the index is zero, the name does not 1765 * contain separator or the parser works in the namespace unaware mode. The 1766 * length of qualified name is the length of the array minus one. 1767 * 1768 * @param ns The true value turns namespace conformance on. 1769 * @return The characters of a qualified name. 1770 * @exception Exception When incorrect character appear in the name. 1771 * @exception IOException 1772 */ 1773 protected char[] qname(boolean ns) 1774 throws Exception { 1775 mBuffIdx = -1; 1776 bname(ns); 1777 char chars[] = new char[mBuffIdx + 1]; 1778 System.arraycopy(mBuff, 0, chars, 0, mBuffIdx + 1); 1779 return chars; 1780 } 1781 1782 /** 1783 * Reads the public or/and system identifiers. 1784 * 1785 * @param inp The input object. 1786 * @exception Exception is parser specific exception form panic method. 1787 * @exception IOException 1788 */ 1789 private void pubsys(Input inp) 1790 throws Exception { 1791 Pair pair = pubsys(' '); 1792 inp.pubid = pair.name; 1793 inp.sysid = pair.value; 1794 del(pair); 1795 } 1796 1797 /** 1798 * Reads the public or/and system identifiers. 1799 * 1800 * @param flag The 'N' allows public id be without system id. 1801 * @return The public or/and system identifiers pair. 1802 * @exception Exception is parser specific exception form panic method. 
1803 * @exception IOException 1804 */ 1805 @SuppressWarnings("fallthrough") 1806 private Pair pubsys(char flag) throws Exception { 1807 Pair ids = pair(null); 1808 String str = name(false); 1809 if ("PUBLIC".equals(str) == true) { 1810 bqstr('i'); // non-CDATA normalization [#4.2.2] 1811 ids.name = new String(mBuff, 1, mBuffIdx); 1812 switch (wsskip()) { 1813 case '\"': 1814 case '\'': 1815 bqstr(' '); 1816 ids.value = new String(mBuff, 1, mBuffIdx); 1817 break; 1818 1819 case EOS: 1820 panic(FAULT); 1821 1822 default: 1823 if (flag != 'N') // [#4.7] 1824 { 1825 panic(FAULT); 1826 } 1827 ids.value = null; 1828 break; 1829 } 1830 return ids; 1831 } else if ("SYSTEM".equals(str) == true) { 1832 ids.name = null; 1833 bqstr(' '); 1834 ids.value = new String(mBuff, 1, mBuffIdx); 1835 return ids; 1836 } 1837 panic(FAULT); 1838 return null; 1839 } 1840 1841 /** 1842 * Reads an attribute value. 1843 * 1844 * The grammar which this method can read is:<br /> 1845 * <code>eqstr := S "=" qstr</code><br /> 1846 * <code>qstr := S ("'" string "'") | 1847 * ('"' string '"')</code><br /> This method resolves entities 1848 * inside a string unless the parser parses DTD. 1849 * 1850 * @param flag The '=' character forces the method to accept the '=' 1851 * character before quoted string and read the following string as not an 1852 * attribute ('-'), 'c' - CDATA, 'i' - non CDATA, ' ' - no normalization; 1853 * '-' - not an attribute value; 'd' - in DTD context. 1854 * @return The content of the quoted strign as a string. 1855 * @exception Exception is parser specific exception form panic method. 1856 * @exception IOException 1857 */ 1858 protected String eqstr(char flag) throws Exception { 1859 if (flag == '=') { 1860 wsskip(); 1861 if (getch() != '=') { 1862 panic(FAULT); 1863 } 1864 } 1865 bqstr((flag == '=') ? '-' : flag); 1866 return new String(mBuff, 1, mBuffIdx); 1867 } 1868 1869 /** 1870 * Resoves an entity. 
     *
     * This method resolves built-in and character entity references. It
     * also reports external entities to the application. The '&amp;' has
     * been read by the caller. The states are: 0 - first character of the
     * name, 1 - rest of the name, 2 - decimal character reference,
     * 3 - hexadecimal character reference.
     *
     * @param flag The 'x' character forces the method to report a skipped
     * entity; 'i' character - indicates non-CDATA normalization.
     * @return Name of unresolved entity or <code>null</code> if entity had been
     * resolved successfully.
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    @SuppressWarnings("fallthrough")
    private String ent(char flag) throws Exception {
        char ch;
        int idx = mBuffIdx + 1;  // buffer index of the '&' appended below
        Input inp = null;
        String str = null;
        mESt = 0x100;  // reset the built-in entity recognizer
        bappend('&');
        for (short st = 0; st >= 0;) {
            ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch();
            switch (st) {
                case 0:  // the first character of the entity name
                case 1:  // read built-in entity name
                    switch (chtyp(ch)) {
                        case 'd':
                        case '.':
                        case '-':
                            // Not allowed as the first character of a name
                            if (st != 1) {
                                panic(FAULT);
                            }
                        case 'a':
                        case 'A':
                        case '_':
                        case 'X':
                            bappend(ch);
                            eappend(ch);
                            st = 1;
                            break;

                        case ':':
                            if (mIsNSAware != false) {
                                panic(FAULT);
                            }
                            bappend(ch);
                            eappend(ch);
                            st = 1;
                            break;

                        case ';':
                            if (mESt < 0x100) {
                                // The entity is a built-in entity
                                mBuffIdx = idx - 1;
                                bappend(mESt);
                                st = -1;
                                break;
                            } else if (mPh == PH_DTD) {
                                // In DTD entity declaration has to resolve character
                                // entities and include "as is" others. [#4.4.7]
                                bappend(';');
                                st = -1;
                                break;
                            }
                            // Convert an entity name to a string
                            str = new String(mBuff, idx + 1, mBuffIdx - idx);
                            inp = mEnt.get(str);
                            // Restore the buffer offset
                            mBuffIdx = idx - 1;
                            if (inp != null) {
                                if (inp.chars == null) {
                                    // External entity
                                    InputSource is = resolveEnt(str, inp.pubid, inp.sysid);
                                    if (is != null) {
                                        push(new Input(BUFFSIZE_READER));
                                        setinp(is);
                                        mInp.pubid = inp.pubid;
                                        mInp.sysid = inp.sysid;
                                        str = null;  // the entity is resolved
                                    } else {
                                        // Unresolved external entity
                                        if (flag != 'x') {
                                            panic(FAULT);  // unknown entity within markup
                                        }  // str is name of unresolved entity
                                    }
                                } else {
                                    // Internal entity
                                    push(inp);
                                    str = null;  // the entity is resolved
                                }
                            } else {
                                // Unknown or general unparsed entity
                                if (flag != 'x') {
                                    panic(FAULT);  // unknown entity within markup
                                }  // str is name of unresolved entity
                            }
                            st = -1;
                            break;

                        case '#':
                            // '#' is valid only right after the '&'
                            if (st != 0) {
                                panic(FAULT);
                            }
                            st = 2;
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                case 2:  // read character entity
                    switch (chtyp(ch)) {
                        case 'd':
                            bappend(ch);
                            break;

                        case ';':
                            // Convert the character entity to a character
                            try {
                                int i = Integer.parseInt(
                                        new String(mBuff, idx + 1, mBuffIdx - idx), 10);
                                // 0xffff is EOS; larger values do not fit a char
                                if (i >= 0xffff) {
                                    panic(FAULT);
                                }
                                ch = (char) i;
                            } catch (NumberFormatException nfe) {
                                panic(FAULT);
                            }
                            // Restore the buffer offset
                            mBuffIdx = idx - 1;
                            if (ch == ' ' || mInp.next != null) {
                                bappend(ch, flag);
                            } else {
                                bappend(ch);
                            }
                            st = -1;
                            break;

                        case 'a':
                            // If the entity buffer is empty and ch == 'x'
                            if ((mBuffIdx == idx) && (ch == 'x')) {
                                st = 3;
                                break;
                            }
                        default:
                            panic(FAULT);
                    }
                    break;

                case 3:  // read hex character entity
                    switch (chtyp(ch)) {
                        case 'A':
                        case 'a':
                        case 'd':
                            // Letters which are not hex digits are rejected
                            // by Integer.parseInt below.
                            bappend(ch);
                            break;

                        case ';':
                            // Convert the character entity to a character
                            try {
                                int i = Integer.parseInt(
                                        new String(mBuff, idx + 1, mBuffIdx - idx), 16);
                                // 0xffff is EOS; larger values do not fit a char
                                if (i >= 0xffff) {
                                    panic(FAULT);
                                }
                                ch = (char) i;
                            } catch (NumberFormatException nfe) {
                                panic(FAULT);
                            }
                            // Restore the buffer offset
                            mBuffIdx = idx - 1;
                            if (ch == ' ' || mInp.next != null) {
                                bappend(ch, flag);
                            } else {
                                bappend(ch);
                            }
                            st = -1;
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }

        return str;
    }

    /**
     * Resolves a parameter entity.
     *
     * This method resolves a parameter entity references. It is also reports
     * external entities to the application.
     *
     * @param flag The '-' instruct the method to do not set up surrounding
     * spaces [#4.4.8].
     * @exception Exception is parser specific exception from panic method.
2072 * @exception IOException 2073 */ 2074 @SuppressWarnings("fallthrough") 2075 private void pent(char flag) throws Exception { 2076 char ch; 2077 int idx = mBuffIdx + 1; 2078 Input inp = null; 2079 String str = null; 2080 bappend('%'); 2081 if (mPh != PH_DTD) // the DTD internal subset 2082 { 2083 return; // Not Recognized [#4.4.1] 2084 } // Read entity name 2085 bname(false); 2086 str = new String(mBuff, idx + 2, mBuffIdx - idx - 1); 2087 if (getch() != ';') { 2088 panic(FAULT); 2089 } 2090 inp = mPEnt.get(str); 2091 // Restore the buffer offset 2092 mBuffIdx = idx - 1; 2093 if (inp != null) { 2094 if (inp.chars == null) { 2095 // External parameter entity 2096 InputSource is = resolveEnt(str, inp.pubid, inp.sysid); 2097 if (is != null) { 2098 if (flag != '-') { 2099 bappend(' '); // tail space 2100 } 2101 push(new Input(BUFFSIZE_READER)); 2102 // BUG: there is no leading space! [#4.4.8] 2103 setinp(is); 2104 mInp.pubid = inp.pubid; 2105 mInp.sysid = inp.sysid; 2106 } else { 2107 // Unresolved external parameter entity 2108 skippedEnt("%" + str); 2109 } 2110 } else { 2111 // Internal parameter entity 2112 if (flag == '-') { 2113 // No surrounding spaces 2114 inp.chIdx = 1; 2115 } else { 2116 // Insert surrounding spaces 2117 bappend(' '); // tail space 2118 inp.chIdx = 0; 2119 } 2120 push(inp); 2121 } 2122 } else { 2123 // Unknown parameter entity 2124 skippedEnt("%" + str); 2125 } 2126 } 2127 2128 /** 2129 * Recognizes and handles a namespace declaration. 2130 * 2131 * This method identifies a type of namespace declaration if any and puts 2132 * new mapping on top of prefix stack. 2133 * 2134 * @param name The attribute qualified name (<code>name.value</code> is a 2135 * <code>String</code> object which represents the attribute prefix). 2136 * @param value The attribute value. 2137 * @return <code>true</code> if a namespace declaration is recognized. 
2138 */ 2139 private boolean isdecl(Pair name, String value) { 2140 if (name.chars[0] == 0) { 2141 if ("xmlns".equals(name.name) == true) { 2142 // New default namespace declaration 2143 mPref = pair(mPref); 2144 mPref.list = mElm; // prefix owner element 2145 mPref.value = value; 2146 mPref.name = ""; 2147 mPref.chars = NONS; 2148 mElm.num++; // namespace counter 2149 return true; 2150 } 2151 } else { 2152 if (name.eqpref(XMLNS) == true) { 2153 // New prefix declaration 2154 int len = name.name.length(); 2155 mPref = pair(mPref); 2156 mPref.list = mElm; // prefix owner element 2157 mPref.value = value; 2158 mPref.name = name.name; 2159 mPref.chars = new char[len + 1]; 2160 mPref.chars[0] = (char) (len + 1); 2161 name.name.getChars(0, len, mPref.chars, 1); 2162 mElm.num++; // namespace counter 2163 return true; 2164 } 2165 } 2166 return false; 2167 } 2168 2169 /** 2170 * Resolves a prefix. 2171 * 2172 * @return The namespace assigned to the prefix. 2173 * @exception Exception When mapping for specified prefix is not found. 2174 */ 2175 private String rslv(char[] qname) 2176 throws Exception { 2177 for (Pair pref = mPref; pref != null; pref = pref.next) { 2178 if (pref.eqpref(qname) == true) { 2179 return pref.value; 2180 } 2181 } 2182 if (qname[0] == 1) { // QNames like ':local' 2183 for (Pair pref = mPref; pref != null; pref = pref.next) { 2184 if (pref.chars[0] == 0) { 2185 return pref.value; 2186 } 2187 } 2188 } 2189 panic(FAULT); 2190 return null; 2191 } 2192 2193 /** 2194 * Skips xml white space characters. 2195 * 2196 * This method skips white space characters (' ', '\t', '\n', '\r') and 2197 * looks ahead not white space character. 2198 * 2199 * @return The first not white space look ahead character. 2200 * @exception IOException 2201 */ 2202 protected char wsskip() 2203 throws IOException { 2204 char ch; 2205 while (true) { 2206 // Read next character 2207 ch = (mChIdx < mChLen) ? 
mChars[mChIdx++] : getch(); 2208 if (ch < 0x80) { 2209 if (nmttyp[ch] != 3) // [ \t\n\r] 2210 { 2211 break; 2212 } 2213 } else { 2214 break; 2215 } 2216 } 2217 mChIdx--; // bkch(); 2218 return ch; 2219 } 2220 2221 /** 2222 * Reports document type. 2223 * 2224 * @param name The name of the entity. 2225 * @param pubid The public identifier of the entity or <code>null</code>. 2226 * @param sysid The system identifier of the entity or <code>null</code>. 2227 */ 2228 protected abstract void docType(String name, String pubid, String sysid) 2229 throws SAXException; 2230 2231 /** 2232 * Reports a comment. 2233 * 2234 * @param text The comment text starting from first charcater. 2235 * @param length The number of characters in comment. 2236 */ 2237 protected abstract void comm(char[] text, int length); 2238 2239 /** 2240 * Reports a processing instruction. 2241 * 2242 * @param target The processing instruction target name. 2243 * @param body The processing instruction body text. 2244 */ 2245 protected abstract void pi(String target, String body) 2246 throws Exception; 2247 2248 /** 2249 * Reports new namespace prefix. The Namespace prefix ( 2250 * <code>mPref.name</code>) being declared and the Namespace URI ( 2251 * <code>mPref.value</code>) the prefix is mapped to. An empty string is 2252 * used for the default element namespace, which has no prefix. 2253 */ 2254 protected abstract void newPrefix() 2255 throws Exception; 2256 2257 /** 2258 * Reports skipped entity name. 2259 * 2260 * @param name The entity name. 2261 */ 2262 protected abstract void skippedEnt(String name) 2263 throws Exception; 2264 2265 /** 2266 * Returns an 2267 * <code>InputSource</code> for specified entity or 2268 * <code>null</code>. 2269 * 2270 * @param name The name of the entity. 2271 * @param pubid The public identifier of the entity. 2272 * @param sysid The system identifier of the entity. 
     */
    protected abstract InputSource resolveEnt(
            String name, String pubid, String sysid)
            throws Exception;

    /**
     * Reports notation declaration.
     *
     * @param name The notation's name.
     * @param pubid The notation's public identifier, or null if none was given.
     * @param sysid The notation's system identifier, or null if none was given.
     */
    protected abstract void notDecl(String name, String pubid, String sysid)
            throws Exception;

    /**
     * Reports unparsed entity name.
     *
     * @param name The unparsed entity's name.
     * @param pubid The entity's public identifier, or null if none was given.
     * @param sysid The entity's system identifier.
     * @param notation The name of the associated notation.
     */
    protected abstract void unparsedEntDecl(
            String name, String pubid, String sysid, String notation)
            throws Exception;

    /**
     * Notifies the handler about fatal parsing error.
     *
     * @param msg The problem description message.
     */
    protected abstract void panic(String msg)
            throws Exception;

    /**
     * Reads a qualified xml name.
     *
     * This is low level routine which leaves a qName in the buffer. The
     * characters of a qualified name is an array of characters. The first
     * (chars[0]) character is the index of the colon character which separates
     * the prefix from the local name. If the index is zero, the name does not
     * contain separator or the parser works in the namespace unaware mode. The
     * length of qualified name is the length of the array minus one.
     *
     * @param ns The true value turns namespace conformance on.
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    private void bname(boolean ns)
            throws Exception {
        char ch;
        char type;
        mBuffIdx++;  // allocate a char for colon offset
        int bqname = mBuffIdx;    // buffer index of the colon offset slot
        int bcolon = bqname;      // buffer index of the colon; bqname if none
        int bchidx = bqname + 1;  // buffer index for the next name character
        int bstart = bchidx;      // buffer index for the next bcopy call
        int cstart = mChIdx;      // input index of the first char not copied yet
        // States: 0/1 - first char and rest of the prefix,
        // 2/3 - first char and rest of the local name (suffix).
        short st = (short) ((ns == true) ? 0 : 2);
        while (true) {
            // Read next character
            if (mChIdx >= mChLen) {
                // Save the characters accepted so far before the input is
                // read again.
                bcopy(cstart, bstart);
                getch();
                mChIdx--;  // bkch();
                cstart = mChIdx;
                bstart = bchidx;
            }
            ch = mChars[mChIdx++];
            type = (char) 0;  // [X]
            if (ch < 0x80) {
                type = (char) nmttyp[ch];
            } else if (ch == EOS) {
                panic(FAULT);
            }
            // Parse QName
            switch (st) {
                case 0:  // read the first char of the prefix
                case 2:  // read the first char of the suffix
                    switch (type) {
                        case 0:  // [aA_X]
                            bchidx++;  // append char to the buffer
                            st++;      // (st == 0)? 1: 3;
                            break;

                        case 1:  // [:]
                            // The colon is read again in the next state.
                            mChIdx--;  // bkch();
                            st++;      // (st == 0)? 1: 3;
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                case 1:  // read the prefix
                case 3:  // read the suffix
                    switch (type) {
                        case 0:  // [aA_X]
                        case 2:  // [.-d]
                            bchidx++;  // append char to the buffer
                            break;

                        case 1:  // [:]
                            bchidx++;  // append char to the buffer
                            if (ns == true) {
                                if (bcolon != bqname) {
                                    panic(FAULT);  // it must be only one colon
                                }
                                bcolon = bchidx - 1;
                                if (st == 1) {
                                    st = 2;
                                }
                            }
                            break;

                        default:
                            // End of the name: give the character back, copy
                            // the rest and store the colon offset.
                            mChIdx--;  // bkch();
                            bcopy(cstart, bstart);
                            mBuff[bqname] = (char) (bcolon - bqname);
                            return;
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }
    }

    /**
     * Reads a nmtoken.
     *
     * This is low level routine which leaves a nmtoken in the buffer.
2408 * 2409 * @exception Exception is parser specific exception form panic method. 2410 * @exception IOException 2411 */ 2412 @SuppressWarnings("fallthrough") 2413 private void bntok() throws Exception { 2414 char ch; 2415 mBuffIdx = -1; 2416 bappend((char) 0); // default offset to the colon char 2417 while (true) { 2418 ch = getch(); 2419 switch (chtyp(ch)) { 2420 case 'a': 2421 case 'A': 2422 case 'd': 2423 case '.': 2424 case ':': 2425 case '-': 2426 case '_': 2427 case 'X': 2428 bappend(ch); 2429 break; 2430 2431 case 'Z': 2432 panic(FAULT); 2433 2434 default: 2435 bkch(); 2436 return; 2437 } 2438 } 2439 } 2440 2441 /** 2442 * Recognizes a keyword. 2443 * 2444 * This is low level routine which recognizes one of keywords in the buffer. 2445 * Keyword Id ID - i IDREF - r IDREFS - R ENTITY - n ENTITIES - N NMTOKEN - 2446 * t NMTOKENS - T ELEMENT - e ATTLIST - a NOTATION - o CDATA - c REQUIRED - 2447 * Q IMPLIED - I FIXED - F 2448 * 2449 * @return an id of a keyword or '?'. 2450 * @exception Exception is parser specific exception form panic method. 2451 * @exception IOException 2452 */ 2453 private char bkeyword() 2454 throws Exception { 2455 String str = new String(mBuff, 1, mBuffIdx); 2456 switch (str.length()) { 2457 case 2: // ID 2458 return ("ID".equals(str) == true) ? 'i' : '?'; 2459 2460 case 5: // IDREF, CDATA, FIXED 2461 switch (mBuff[1]) { 2462 case 'I': 2463 return ("IDREF".equals(str) == true) ? 'r' : '?'; 2464 case 'C': 2465 return ("CDATA".equals(str) == true) ? 'c' : '?'; 2466 case 'F': 2467 return ("FIXED".equals(str) == true) ? 'F' : '?'; 2468 default: 2469 break; 2470 } 2471 break; 2472 2473 case 6: // IDREFS, ENTITY 2474 switch (mBuff[1]) { 2475 case 'I': 2476 return ("IDREFS".equals(str) == true) ? 'R' : '?'; 2477 case 'E': 2478 return ("ENTITY".equals(str) == true) ? 
'n' : '?'; 2479 default: 2480 break; 2481 } 2482 break; 2483 2484 case 7: // NMTOKEN, IMPLIED, ATTLIST, ELEMENT 2485 switch (mBuff[1]) { 2486 case 'I': 2487 return ("IMPLIED".equals(str) == true) ? 'I' : '?'; 2488 case 'N': 2489 return ("NMTOKEN".equals(str) == true) ? 't' : '?'; 2490 case 'A': 2491 return ("ATTLIST".equals(str) == true) ? 'a' : '?'; 2492 case 'E': 2493 return ("ELEMENT".equals(str) == true) ? 'e' : '?'; 2494 default: 2495 break; 2496 } 2497 break; 2498 2499 case 8: // ENTITIES, NMTOKENS, NOTATION, REQUIRED 2500 switch (mBuff[2]) { 2501 case 'N': 2502 return ("ENTITIES".equals(str) == true) ? 'N' : '?'; 2503 case 'M': 2504 return ("NMTOKENS".equals(str) == true) ? 'T' : '?'; 2505 case 'O': 2506 return ("NOTATION".equals(str) == true) ? 'o' : '?'; 2507 case 'E': 2508 return ("REQUIRED".equals(str) == true) ? 'Q' : '?'; 2509 default: 2510 break; 2511 } 2512 break; 2513 2514 default: 2515 break; 2516 } 2517 return '?'; 2518 } 2519 2520 /** 2521 * Reads a single or double quotted string in to the buffer. 2522 * 2523 * This method resolves entities inside a string unless the parser parses 2524 * DTD. 2525 * 2526 * @param flag 'c' - CDATA, 'i' - non CDATA, ' ' - no normalization; '-' - 2527 * not an attribute value; 'd' - in DTD context. 2528 * @exception Exception is parser specific exception form panic method. 2529 * @exception IOException 2530 */ 2531 @SuppressWarnings("fallthrough") 2532 private void bqstr(char flag) throws Exception { 2533 Input inp = mInp; // remember the original input 2534 mBuffIdx = -1; 2535 bappend((char) 0); // default offset to the colon char 2536 char ch; 2537 for (short st = 0; st >= 0;) { 2538 ch = (mChIdx < mChLen) ? 
mChars[mChIdx++] : getch(); 2539 switch (st) { 2540 case 0: // read a single or double quote 2541 switch (ch) { 2542 case ' ': 2543 case '\n': 2544 case '\r': 2545 case '\t': 2546 break; 2547 2548 case '\'': 2549 st = 2; // read a single quoted string 2550 break; 2551 2552 case '\"': 2553 st = 3; // read a double quoted string 2554 break; 2555 2556 default: 2557 panic(FAULT); 2558 break; 2559 } 2560 break; 2561 2562 case 2: // read a single quoted string 2563 case 3: // read a double quoted string 2564 switch (ch) { 2565 case '\'': 2566 if ((st == 2) && (mInp == inp)) { 2567 st = -1; 2568 } else { 2569 bappend(ch); 2570 } 2571 break; 2572 2573 case '\"': 2574 if ((st == 3) && (mInp == inp)) { 2575 st = -1; 2576 } else { 2577 bappend(ch); 2578 } 2579 break; 2580 2581 case '&': 2582 if (flag != 'd') { 2583 ent(flag); 2584 } else { 2585 bappend(ch); 2586 } 2587 break; 2588 2589 case '%': 2590 if (flag == 'd') { 2591 pent('-'); 2592 } else { 2593 bappend(ch); 2594 } 2595 break; 2596 2597 case '<': 2598 if ((flag == '-') || (flag == 'd')) { 2599 bappend(ch); 2600 } else { 2601 panic(FAULT); 2602 } 2603 break; 2604 2605 case EOS: // EOS before single/double quote 2606 panic(FAULT); 2607 2608 case '\r': // EOL processing [#2.11 & #3.3.3] 2609 if (flag != ' ' && mInp.next == null) { 2610 if (getch() != '\n') { 2611 bkch(); 2612 } 2613 ch = '\n'; 2614 } 2615 default: 2616 bappend(ch, flag); 2617 break; 2618 } 2619 break; 2620 2621 default: 2622 panic(FAULT); 2623 } 2624 } 2625 // There is maximum one space at the end of the string in 2626 // i-mode (non CDATA normalization) and it has to be removed. 2627 if ((flag == 'i') && (mBuff[mBuffIdx] == ' ')) { 2628 mBuffIdx -= 1; 2629 } 2630 } 2631 2632 /** 2633 * Reports characters and empties the parser's buffer. This method is called 2634 * only if parser is going to return control to the main loop. 
This means 2635 * that this method may use parser buffer to report white space without 2636 * copeing characters to temporary buffer. 2637 */ 2638 protected abstract void bflash() 2639 throws Exception; 2640 2641 /** 2642 * Reports white space characters and empties the parser's buffer. This 2643 * method is called only if parser is going to return control to the main 2644 * loop. This means that this method may use parser buffer to report white 2645 * space without copeing characters to temporary buffer. 2646 */ 2647 protected abstract void bflash_ws() 2648 throws Exception; 2649 2650 /** 2651 * Appends a character to parser's buffer with normalization. 2652 * 2653 * @param ch The character to append to the buffer. 2654 * @param mode The normalization mode. 2655 */ 2656 private void bappend(char ch, char mode) { 2657 // This implements attribute value normalization as 2658 // described in the XML specification [#3.3.3]. 2659 switch (mode) { 2660 case 'i': // non CDATA normalization 2661 switch (ch) { 2662 case ' ': 2663 case '\n': 2664 case '\r': 2665 case '\t': 2666 if ((mBuffIdx > 0) && (mBuff[mBuffIdx] != ' ')) { 2667 bappend(' '); 2668 } 2669 return; 2670 2671 default: 2672 break; 2673 } 2674 break; 2675 2676 case 'c': // CDATA normalization 2677 switch (ch) { 2678 case '\n': 2679 case '\r': 2680 case '\t': 2681 ch = ' '; 2682 break; 2683 2684 default: 2685 break; 2686 } 2687 break; 2688 2689 default: // no normalization 2690 break; 2691 } 2692 mBuffIdx++; 2693 if (mBuffIdx < mBuff.length) { 2694 mBuff[mBuffIdx] = ch; 2695 } else { 2696 mBuffIdx--; 2697 bappend(ch); 2698 } 2699 } 2700 2701 /** 2702 * Appends a character to parser's buffer. 2703 * 2704 * @param ch The character to append to the buffer. 
2705 */ 2706 private void bappend(char ch) { 2707 try { 2708 mBuff[++mBuffIdx] = ch; 2709 } catch (Exception exp) { 2710 // Double the buffer size 2711 char buff[] = new char[mBuff.length << 1]; 2712 System.arraycopy(mBuff, 0, buff, 0, mBuff.length); 2713 mBuff = buff; 2714 mBuff[mBuffIdx] = ch; 2715 } 2716 } 2717 2718 /** 2719 * Appends (mChIdx - cidx) characters from character buffer (mChars) to 2720 * parser's buffer (mBuff). 2721 * 2722 * @param cidx The character buffer (mChars) start index. 2723 * @param bidx The parser buffer (mBuff) start index. 2724 */ 2725 private void bcopy(int cidx, int bidx) { 2726 int length = mChIdx - cidx; 2727 if ((bidx + length + 1) >= mBuff.length) { 2728 // Expand the buffer 2729 char buff[] = new char[mBuff.length + length]; 2730 System.arraycopy(mBuff, 0, buff, 0, mBuff.length); 2731 mBuff = buff; 2732 } 2733 System.arraycopy(mChars, cidx, mBuff, bidx, length); 2734 mBuffIdx += length; 2735 } 2736 2737 /** 2738 * Recognizes the built-in entities <i>lt</i>, <i>gt</i>, <i>amp</i>, 2739 * <i>apos</i>, <i>quot</i>. The initial state is 0x100. Any state belowe 2740 * 0x100 is a built-in entity replacement character. 2741 * 2742 * @param ch the next character of an entity name. 2743 */ 2744 @SuppressWarnings("fallthrough") 2745 private void eappend(char ch) { 2746 switch (mESt) { 2747 case 0x100: // "l" or "g" or "a" or "q" 2748 switch (ch) { 2749 case 'l': 2750 mESt = 0x101; 2751 break; 2752 case 'g': 2753 mESt = 0x102; 2754 break; 2755 case 'a': 2756 mESt = 0x103; 2757 break; 2758 case 'q': 2759 mESt = 0x107; 2760 break; 2761 default: 2762 mESt = 0x200; 2763 break; 2764 } 2765 break; 2766 2767 case 0x101: // "lt" 2768 mESt = (ch == 't') ? '<' : (char) 0x200; 2769 break; 2770 2771 case 0x102: // "gt" 2772 mESt = (ch == 't') ? 
'>' : (char) 0x200; 2773 break; 2774 2775 case 0x103: // "am" or "ap" 2776 switch (ch) { 2777 case 'm': 2778 mESt = 0x104; 2779 break; 2780 case 'p': 2781 mESt = 0x105; 2782 break; 2783 default: 2784 mESt = 0x200; 2785 break; 2786 } 2787 break; 2788 2789 case 0x104: // "amp" 2790 mESt = (ch == 'p') ? '&' : (char) 0x200; 2791 break; 2792 2793 case 0x105: // "apo" 2794 mESt = (ch == 'o') ? (char) 0x106 : (char) 0x200; 2795 break; 2796 2797 case 0x106: // "apos" 2798 mESt = (ch == 's') ? '\'' : (char) 0x200; 2799 break; 2800 2801 case 0x107: // "qu" 2802 mESt = (ch == 'u') ? (char) 0x108 : (char) 0x200; 2803 break; 2804 2805 case 0x108: // "quo" 2806 mESt = (ch == 'o') ? (char) 0x109 : (char) 0x200; 2807 break; 2808 2809 case 0x109: // "quot" 2810 mESt = (ch == 't') ? '\"' : (char) 0x200; 2811 break; 2812 2813 case '<': // "lt" 2814 case '>': // "gt" 2815 case '&': // "amp" 2816 case '\'': // "apos" 2817 case '\"': // "quot" 2818 mESt = 0x200; 2819 default: 2820 break; 2821 } 2822 } 2823 2824 /** 2825 * Sets up a new input source on the top of the input stack. Note, the first 2826 * byte returned by the entity's byte stream has to be the first byte in the 2827 * entity. However, the parser does not expect the byte order mask in both 2828 * cases when encoding is provided by the input source. 2829 * 2830 * @param is A new input source to set up. 2831 * @exception IOException If any IO errors occur. 2832 * @exception Exception is parser specific exception form panic method. 2833 */ 2834 protected void setinp(InputSource is) 2835 throws Exception { 2836 Reader reader = null; 2837 mChIdx = 0; 2838 mChLen = 0; 2839 mChars = mInp.chars; 2840 mInp.src = null; 2841 if (mPh < PH_DOC_START) { 2842 mIsSAlone = false; // default [#2.9] 2843 } 2844 mIsSAloneSet = false; 2845 if (is.getCharacterStream() != null) { 2846 // Ignore encoding in the xml text decl. 
2847 reader = is.getCharacterStream(); 2848 xml(reader); 2849 } else if (is.getByteStream() != null) { 2850 String expenc; 2851 if (is.getEncoding() != null) { 2852 // Ignore encoding in the xml text decl. 2853 expenc = is.getEncoding().toUpperCase(); 2854 if (expenc.equals("UTF-16")) { 2855 reader = bom(is.getByteStream(), 'U'); // UTF-16 [#4.3.3] 2856 } else { 2857 reader = enc(expenc, is.getByteStream()); 2858 } 2859 xml(reader); 2860 } else { 2861 // Get encoding from BOM or the xml text decl. 2862 reader = bom(is.getByteStream(), ' '); 2863 /** 2864 * [#4.3.3] requires BOM for UTF-16, however, it's not uncommon 2865 * that it may be missing. A mature technique exists in Xerces 2866 * to further check for possible UTF-16 encoding 2867 */ 2868 if (reader == null) { 2869 reader = utf16(is.getByteStream()); 2870 } 2871 2872 if (reader == null) { 2873 // Encoding is defined by the xml text decl. 2874 reader = enc("UTF-8", is.getByteStream()); 2875 expenc = xml(reader); 2876 if (!expenc.equals("UTF-8")) { 2877 if (expenc.startsWith("UTF-16")) { 2878 panic(FAULT); // UTF-16 must have BOM [#4.3.3] 2879 } 2880 reader = enc(expenc, is.getByteStream()); 2881 } 2882 } else { 2883 // Encoding is defined by the BOM. 2884 xml(reader); 2885 } 2886 } 2887 } else { 2888 // There is no support for public/system identifiers. 2889 panic(FAULT); 2890 } 2891 mInp.src = reader; 2892 mInp.pubid = is.getPublicId(); 2893 mInp.sysid = is.getSystemId(); 2894 } 2895 2896 /** 2897 * Determines the entity encoding. 2898 * 2899 * This method gets encoding from Byte Order Mask [#4.3.3] if any. Note, the 2900 * first byte returned by the entity's byte stream has to be the first byte 2901 * in the entity. Also, there is no support for UCS-4. 2902 * 2903 * @param is A byte stream of the entity. 2904 * @param hint An encoding hint, character U means UTF-16. 2905 * @return a reader constructed from the BOM or UTF-8 by default. 
2906 * @exception Exception is parser specific exception form panic method. 2907 * @exception IOException 2908 */ 2909 private Reader bom(InputStream is, char hint) 2910 throws Exception { 2911 int val = is.read(); 2912 switch (val) { 2913 case 0xef: // UTF-8 2914 if (hint == 'U') // must be UTF-16 2915 { 2916 panic(FAULT); 2917 } 2918 if (is.read() != 0xbb) { 2919 panic(FAULT); 2920 } 2921 if (is.read() != 0xbf) { 2922 panic(FAULT); 2923 } 2924 return new ReaderUTF8(is); 2925 2926 case 0xfe: // UTF-16, big-endian 2927 if (is.read() != 0xff) { 2928 panic(FAULT); 2929 } 2930 return new ReaderUTF16(is, 'b'); 2931 2932 case 0xff: // UTF-16, little-endian 2933 if (is.read() != 0xfe) { 2934 panic(FAULT); 2935 } 2936 return new ReaderUTF16(is, 'l'); 2937 2938 case -1: 2939 mChars[mChIdx++] = EOS; 2940 return new ReaderUTF8(is); 2941 2942 default: 2943 if (hint == 'U') // must be UTF-16 2944 { 2945 panic(FAULT); 2946 } 2947 // Read the rest of UTF-8 character 2948 switch (val & 0xf0) { 2949 case 0xc0: 2950 case 0xd0: 2951 mChars[mChIdx++] = (char) (((val & 0x1f) << 6) | (is.read() & 0x3f)); 2952 break; 2953 2954 case 0xe0: 2955 mChars[mChIdx++] = (char) (((val & 0x0f) << 12) 2956 | ((is.read() & 0x3f) << 6) | (is.read() & 0x3f)); 2957 break; 2958 2959 case 0xf0: // UCS-4 character 2960 throw new UnsupportedEncodingException(); 2961 2962 default: 2963 mChars[mChIdx++] = (char) val; 2964 break; 2965 } 2966 return null; 2967 } 2968 } 2969 2970 2971 /** 2972 * Using a mature technique from Xerces, this method checks further after 2973 * the bom method above to see if the encoding is UTF-16 2974 * 2975 * @param is A byte stream of the entity. 2976 * @return a reader, may be null 2977 * @exception Exception is parser specific exception form panic method. 2978 * @exception IOException 2979 */ 2980 private Reader utf16(InputStream is) 2981 throws Exception { 2982 if (mChIdx != 0) { 2983 //The bom method has read ONE byte into the buffer. 
2984 byte b0 = (byte)mChars[0]; 2985 if (b0 == 0x00 || b0 == 0x3C) { 2986 int b1 = is.read(); 2987 int b2 = is.read(); 2988 int b3 = is.read(); 2989 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { 2990 // UTF-16, big-endian, no BOM 2991 mChars[0] = (char)(b1); 2992 mChars[mChIdx++] = (char)(b3); 2993 return new ReaderUTF16(is, 'b'); 2994 } else if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { 2995 // UTF-16, little-endian, no BOM 2996 mChars[0] = (char)(b0); 2997 mChars[mChIdx++] = (char)(b2); 2998 return new ReaderUTF16(is, 'l'); 2999 } else { 3000 /**not every InputStream supports reset, so we have to remember 3001 * the state for further parsing 3002 **/ 3003 mChars[0] = (char)(b0); 3004 mChars[mChIdx++] = (char)(b1); 3005 mChars[mChIdx++] = (char)(b2); 3006 mChars[mChIdx++] = (char)(b3); 3007 } 3008 3009 } 3010 } 3011 return null; 3012 } 3013 /** 3014 * Parses the xml text declaration. 3015 * 3016 * This method gets encoding from the xml text declaration [#4.3.1] if any. 3017 * The method assumes the buffer (mChars) is big enough to accommodate whole 3018 * xml text declaration. 3019 * 3020 * @param reader is entity reader. 3021 * @return The xml text declaration encoding or default UTF-8 encoding. 3022 * @exception Exception is parser specific exception form panic method. 3023 * @exception IOException 3024 */ 3025 private String xml(Reader reader) 3026 throws Exception { 3027 String str = null; 3028 String enc = "UTF-8"; 3029 char ch; 3030 int val; 3031 short st = 0; 3032 int byteRead = mChIdx; //number of bytes read prior to entering this method 3033 3034 while (st >= 0 && mChIdx < mChars.length) { 3035 if (st < byteRead) { 3036 ch = mChars[st]; 3037 } else { 3038 ch = ((val = reader.read()) >= 0) ? 
(char) val : EOS; 3039 mChars[mChIdx++] = ch; 3040 } 3041 3042 switch (st) { 3043 case 0: // read '<' of xml declaration 3044 switch (ch) { 3045 case '<': 3046 st = 1; 3047 break; 3048 3049 case 0xfeff: // the byte order mask 3050 ch = ((val = reader.read()) >= 0) ? (char) val : EOS; 3051 mChars[mChIdx - 1] = ch; 3052 st = (short) ((ch == '<') ? 1 : -1); 3053 break; 3054 3055 default: 3056 st = -1; 3057 break; 3058 } 3059 break; 3060 3061 case 1: // read '?' of xml declaration [#4.3.1] 3062 st = (short) ((ch == '?') ? 2 : -1); 3063 break; 3064 3065 case 2: // read 'x' of xml declaration [#4.3.1] 3066 st = (short) ((ch == 'x') ? 3 : -1); 3067 break; 3068 3069 case 3: // read 'm' of xml declaration [#4.3.1] 3070 st = (short) ((ch == 'm') ? 4 : -1); 3071 break; 3072 3073 case 4: // read 'l' of xml declaration [#4.3.1] 3074 st = (short) ((ch == 'l') ? 5 : -1); 3075 break; 3076 3077 case 5: // read white space after 'xml' 3078 switch (ch) { 3079 case ' ': 3080 case '\t': 3081 case '\r': 3082 case '\n': 3083 st = 6; 3084 break; 3085 3086 default: 3087 st = -1; 3088 break; 3089 } 3090 break; 3091 3092 case 6: // read content of xml declaration 3093 switch (ch) { 3094 case '?': 3095 st = 7; 3096 break; 3097 3098 case EOS: 3099 st = -2; 3100 break; 3101 3102 default: 3103 break; 3104 } 3105 break; 3106 3107 case 7: // read '>' after '?' of xml declaration 3108 switch (ch) { 3109 case '>': 3110 case EOS: 3111 st = -2; 3112 break; 3113 3114 default: 3115 st = 6; 3116 break; 3117 } 3118 break; 3119 3120 default: 3121 panic(FAULT); 3122 break; 3123 } 3124 } 3125 mChLen = mChIdx; 3126 mChIdx = 0; 3127 // If there is no xml text declaration, the encoding is default. 
3128 if (st == -1) { 3129 return enc; 3130 } 3131 mChIdx = 5; // the first white space after "<?xml" 3132 // Parse the xml text declaration 3133 for (st = 0; st >= 0;) { 3134 ch = getch(); 3135 switch (st) { 3136 case 0: // skip spaces after the xml declaration name 3137 if (chtyp(ch) != ' ') { 3138 bkch(); 3139 st = 1; 3140 } 3141 break; 3142 3143 case 1: // read xml declaration version 3144 case 2: // read xml declaration encoding or standalone 3145 case 3: // read xml declaration standalone 3146 switch (chtyp(ch)) { 3147 case 'a': 3148 case 'A': 3149 case '_': 3150 bkch(); 3151 str = name(false).toLowerCase(); 3152 if ("version".equals(str) == true) { 3153 if (st != 1) { 3154 panic(FAULT); 3155 } 3156 if ("1.0".equals(eqstr('=')) != true) { 3157 panic(FAULT); 3158 } 3159 mInp.xmlver = 0x0100; 3160 st = 2; 3161 } else if ("encoding".equals(str) == true) { 3162 if (st != 2) { 3163 panic(FAULT); 3164 } 3165 mInp.xmlenc = eqstr('=').toUpperCase(); 3166 enc = mInp.xmlenc; 3167 st = 3; 3168 } else if ("standalone".equals(str) == true) { 3169 if ((st == 1) || (mPh >= PH_DOC_START)) // [#4.3.1] 3170 { 3171 panic(FAULT); 3172 } 3173 str = eqstr('=').toLowerCase(); 3174 // Check the 'standalone' value and use it [#5.1] 3175 if (str.equals("yes") == true) { 3176 mIsSAlone = true; 3177 } else if (str.equals("no") == true) { 3178 mIsSAlone = false; 3179 } else { 3180 panic(FAULT); 3181 } 3182 mIsSAloneSet = true; 3183 st = 4; 3184 } else { 3185 panic(FAULT); 3186 } 3187 break; 3188 3189 case ' ': 3190 break; 3191 3192 case '?': 3193 if (st == 1) { 3194 panic(FAULT); 3195 } 3196 bkch(); 3197 st = 4; 3198 break; 3199 3200 default: 3201 panic(FAULT); 3202 } 3203 break; 3204 3205 case 4: // end of xml declaration 3206 switch (chtyp(ch)) { 3207 case '?': 3208 if (getch() != '>') { 3209 panic(FAULT); 3210 } 3211 if (mPh <= PH_DOC_START) { 3212 mPh = PH_MISC_DTD; // misc before DTD 3213 } 3214 st = -1; 3215 break; 3216 3217 case ' ': 3218 break; 3219 3220 default: 3221 
panic(FAULT); 3222 } 3223 break; 3224 3225 default: 3226 panic(FAULT); 3227 } 3228 } 3229 return enc; 3230 } 3231 3232 /** 3233 * Sets up the document reader. 3234 * 3235 * @param name an encoding name. 3236 * @param is the document byte input stream. 3237 * @return a reader constructed from encoding name and input stream. 3238 * @exception UnsupportedEncodingException 3239 */ 3240 private Reader enc(String name, InputStream is) 3241 throws UnsupportedEncodingException { 3242 // DO NOT CLOSE current reader if any! 3243 if (name.equals("UTF-8")) { 3244 return new ReaderUTF8(is); 3245 } else if (name.equals("UTF-16LE")) { 3246 return new ReaderUTF16(is, 'l'); 3247 } else if (name.equals("UTF-16BE")) { 3248 return new ReaderUTF16(is, 'b'); 3249 } else { 3250 return new InputStreamReader(is, name); 3251 } 3252 } 3253 3254 /** 3255 * Sets up current input on the top of the input stack. 3256 * 3257 * @param inp A new input to set up. 3258 */ 3259 protected void push(Input inp) { 3260 mInp.chLen = mChLen; 3261 mInp.chIdx = mChIdx; 3262 inp.next = mInp; 3263 mInp = inp; 3264 mChars = inp.chars; 3265 mChLen = inp.chLen; 3266 mChIdx = inp.chIdx; 3267 } 3268 3269 /** 3270 * Restores previous input on the top of the input stack. 3271 */ 3272 protected void pop() { 3273 if (mInp.src != null) { 3274 try { 3275 mInp.src.close(); 3276 } catch (IOException ioe) { 3277 } 3278 mInp.src = null; 3279 } 3280 mInp = mInp.next; 3281 if (mInp != null) { 3282 mChars = mInp.chars; 3283 mChLen = mInp.chLen; 3284 mChIdx = mInp.chIdx; 3285 } else { 3286 mChars = null; 3287 mChLen = 0; 3288 mChIdx = 0; 3289 } 3290 } 3291 3292 /** 3293 * Maps a character to it's type. 
3294 * 3295 * Possible character type values are:<br /> - ' ' for any kind of white 3296 * space character;<br /> - 'a' for any lower case alphabetical character 3297 * value;<br /> - 'A' for any upper case alphabetical character value;<br /> 3298 * - 'd' for any decimal digit character value;<br /> - 'z' for any 3299 * character less then ' ' except '\t', '\n', '\r';<br /> - 'X' for any not 3300 * ASCII character;<br /> - 'Z' for EOS character.<br /> An ASCII (7 bit) 3301 * character which does not fall in any category listed above is mapped to 3302 * it self. 3303 * 3304 * @param ch The character to map. 3305 * @return The type of character. 3306 */ 3307 protected char chtyp(char ch) { 3308 if (ch < 0x80) { 3309 return (char) asctyp[ch]; 3310 } 3311 return (ch != EOS) ? 'X' : 'Z'; 3312 } 3313 3314 /** 3315 * Retrives the next character in the document. 3316 * 3317 * @return The next character in the document. 3318 */ 3319 protected char getch() 3320 throws IOException { 3321 if (mChIdx >= mChLen) { 3322 if (mInp.src == null) { 3323 pop(); // remove internal entity 3324 return getch(); 3325 } 3326 // Read new portion of the document characters 3327 int Num = mInp.src.read(mChars, 0, mChars.length); 3328 if (Num < 0) { 3329 if (mInp != mDoc) { 3330 pop(); // restore the previous input 3331 return getch(); 3332 } else { 3333 mChars[0] = EOS; 3334 mChLen = 1; 3335 } 3336 } else { 3337 mChLen = Num; 3338 } 3339 mChIdx = 0; 3340 } 3341 return mChars[mChIdx++]; 3342 } 3343 3344 /** 3345 * Puts back the last read character. 3346 * 3347 * This method <strong>MUST NOT</strong> be called more then once after each 3348 * call of {@link #getch getch} method. 3349 */ 3350 protected void bkch() 3351 throws Exception { 3352 if (mChIdx <= 0) { 3353 panic(FAULT); 3354 } 3355 mChIdx--; 3356 } 3357 3358 /** 3359 * Sets the current character. 3360 * 3361 * @param ch The character to set. 
3362 */ 3363 protected void setch(char ch) { 3364 mChars[mChIdx] = ch; 3365 } 3366 3367 /** 3368 * Finds a pair in the pair chain by a qualified name. 3369 * 3370 * @param chain The first element of the chain of pairs. 3371 * @param qname The qualified name. 3372 * @return A pair with the specified qualified name or null. 3373 */ 3374 protected Pair find(Pair chain, char[] qname) { 3375 for (Pair pair = chain; pair != null; pair = pair.next) { 3376 if (pair.eqname(qname) == true) { 3377 return pair; 3378 } 3379 } 3380 return null; 3381 } 3382 3383 /** 3384 * Provedes an instance of a pair. 3385 * 3386 * @param next The reference to a next pair. 3387 * @return An instance of a pair. 3388 */ 3389 protected Pair pair(Pair next) { 3390 Pair pair; 3391 3392 if (mDltd != null) { 3393 pair = mDltd; 3394 mDltd = pair.next; 3395 } else { 3396 pair = new Pair(); 3397 } 3398 pair.next = next; 3399 3400 return pair; 3401 } 3402 3403 /** 3404 * Deletes an instance of a pair. 3405 * 3406 * @param pair The pair to delete. 3407 * @return A reference to the next pair in a chain. 3408 */ 3409 protected Pair del(Pair pair) { 3410 Pair next = pair.next; 3411 3412 pair.name = null; 3413 pair.value = null; 3414 pair.chars = null; 3415 pair.list = null; 3416 pair.next = mDltd; 3417 mDltd = pair; 3418 3419 return next; 3420 } 3421 }