/*
 * Copyright (c) 2012, 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package jdk.internal.util.xml.impl;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;
import jdk.internal.org.xml.sax.InputSource;
import jdk.internal.org.xml.sax.SAXException;

/**
 * XML non-validating parser engine.
 */
public abstract class Parser {

    /** Message constant handed to {@code panic} throughout this class. */
    public final static String FAULT = "";
    /** Size passed to {@code new Input(int)} when an external input is pushed. */
    protected final static int BUFFSIZE_READER = 512;
    /** Initial length of the parser buffer {@code mBuff}. */
    protected final static int BUFFSIZE_PARSER = 128;
    /**
     * The end of stream character.
     */
    public final static char EOS = 0xffff;
    private Pair mNoNS; // there is no namespace
    private Pair mXml;  // the xml namespace
    private Map<String, Input> mEnt;  // the entities look up table
    private Map<String, Input> mPEnt; // the parameter entities look up table
    protected boolean mIsSAlone;     // xml decl standalone flag
    protected boolean mIsSAloneSet;  // standalone is explicitly set
    protected boolean mIsNSAware;    // if true - namespace aware mode
    protected int mPh;  // current phase of document processing
    protected final static int PH_BEFORE_DOC = -1;  // before parsing
    protected final static int PH_DOC_START = 0;    // document start
    protected final static int PH_MISC_DTD = 1;     // misc before DTD
    protected final static int PH_DTD = 2;          // DTD
    protected final static int PH_DTD_MISC = 3;     // misc after DTD
    protected final static int PH_DOCELM = 4;       // document's element
    protected final static int PH_DOCELM_MISC = 5;  // misc after element
    protected final static int PH_AFTER_DOC = 6;    // after parsing
    protected int mEvt;  // current event type
    protected final static int EV_NULL = 0;   // unknown
    protected final static int EV_ELM = 1;    // empty element
    protected final static int EV_ELMS = 2;   // start element
    protected final static int EV_ELME = 3;   // end element
    protected final static int EV_TEXT = 4;   // textual content
    protected final static int EV_WSPC = 5;   // white space content
    protected final static int EV_PI = 6;     // processing instruction
    protected final static int EV_CDAT = 7;   // character data
    protected final static int EV_COMM = 8;   // comment
    protected final static int EV_DTD = 9;    // document type definition
    protected final static int EV_ENT = 10;   // skipped entity
    private char mESt; // built-in entity recognizer state
    // mESt values:
    //   0x100   : the initial state
    //   > 0x100 : unrecognized name
    //   < 0x100 : replacement character
    protected char[] mBuff;  // parser buffer
    protected int mBuffIdx;  // index of the last char
    protected Pair mPref;    // stack of prefixes
    protected Pair mElm;     // stack of elements
    // mAttL.chars - element qname
    // mAttL.next  - next element
    // mAttL.list  - list of attributes defined on this element
    // mAttL.list.chars - attribute qname
    // mAttL.list.id    - a char representing attribute's type see below
    // mAttL.list.next  - next attribute defined on the element
    // mAttL.list.list  - default value structure or null
    // mAttL.list.list.chars - "name='value' " chars array for Input
    //
    // Attribute type character values:
    // 'i' - "ID"
    // 'r' - "IDREF"
    // 'R' - "IDREFS"
    // 'n' - "ENTITY"
    // 'N' - "ENTITIES"
    // 't' - "NMTOKEN"
    // 'T' - "NMTOKENS"
    // 'u' - enumeration type
    // 'o' - "NOTATION"
    // 'c' - "CDATA"
    // see also: bkeyword() and atype()
    //
    protected Pair mAttL;    // list of defined attrs by element name
    protected Input mDoc;    // document entity
    protected Input mInp;    // stack of entities
    private char[] mChars;   // reading buffer
    private int mChLen;      // current capacity
    private int mChIdx;      // index to the next char
    protected Attrs mAttrs;  // attributes of the curr. element
    private String[] mItems; // attributes array of the curr. element
    private char mAttrIdx;   // attributes counter/index
    private String mUnent;   // unresolved entity name
    private Pair mDltd;      // deleted objects for reuse
    /**
     * Default prefixes
     */
    // In each array below, element 0 holds the array's own length and the
    // remaining elements hold the characters of the prefix.
    private final static char NONS[];
    private final static char XML[];
    private final static char XMLNS[];

    static {
        NONS = new char[1];
        NONS[0] = (char) 0;

        XML = new char[4];
        XML[0] = (char) 4;
        XML[1] = 'x';
        XML[2] = 'm';
        XML[3] = 'l';

        XMLNS = new char[6];
        XMLNS[0] = (char) 6;
        XMLNS[1] = 'x';
        XMLNS[2] = 'm';
        XMLNS[3] = 'l';
        XMLNS[4] = 'n';
        XMLNS[5] = 's';
    }
    /**
     * ASCII character type array.
     *
     * This array maps an ASCII (7 bit) character to the character type.<br />
     * Possible character type values are:<br /> - ' ' for any kind of white
     * space character;<br /> - 'a' for any lower case alphabetical character
     * value;<br /> - 'A' for any upper case alphabetical character value;<br />
     * - 'd' for any decimal digit character value;<br /> - 'z' for any
     * character less than ' ' except '\t', '\n', '\r';<br /> An ASCII (7 bit)
     * character which does not fall in any category listed above is mapped to
     * itself.
     */
    private static final byte asctyp[];
    /**
     * NMTOKEN character type array.
     *
     * This array maps an ASCII (7 bit) character to the character type.<br />
     * Possible character type values are:<br /> - 0 for underscore ('_') or any
     * lower and upper case alphabetical character value;<br /> - 1 for colon
     * (':') character;<br /> - 2 for dash ('-') and dot ('.') or any decimal
     * digit character value;<br /> - 3 for any kind of white space character<br
     * /> An ASCII (7 bit) character which does not fall in any category listed
     * above is mapped to 0xff.
     */
    private static final byte nmttyp[];

    /**
     * Static constructor.
     *
     * Sets up the ASCII character type array which is used by
     * {@link #asctyp asctyp} method and NMTOKEN character type array.
     */
    static {
        // A single index walks the 7 bit range once, range by range.
        short i = 0;

        asctyp = new byte[0x80];
        // Everything below ' ' starts out as 'z' ...
        while (i < ' ') {
            asctyp[i++] = (byte) 'z';
        }
        // ... and then tab, CR and LF are re-classified as white space.
        asctyp['\t'] = (byte) ' ';
        asctyp['\r'] = (byte) ' ';
        asctyp['\n'] = (byte) ' ';
        // From ' ' up to (not including) '0': each character maps to itself.
        while (i < '0') {
            asctyp[i] = (byte) i++;
        }
        while (i <= '9') {
            asctyp[i++] = (byte) 'd';
        }
        // Between '9' and 'A': each character maps to itself.
        while (i < 'A') {
            asctyp[i] = (byte) i++;
        }
        while (i <= 'Z') {
            asctyp[i++] = (byte) 'A';
        }
        // Between 'Z' and 'a': each character maps to itself.
        while (i < 'a') {
            asctyp[i] = (byte) i++;
        }
        while (i <= 'z') {
            asctyp[i++] = (byte) 'a';
        }
        // The rest of the 7 bit range: each character maps to itself.
        while (i < 0x80) {
            asctyp[i] = (byte) i++;
        }

        nmttyp = new byte[0x80];
        // Everything below '0' starts out as 0xff; the individual
        // assignments at the end of this block override some entries.
        for (i = 0; i < '0'; i++) {
            nmttyp[i] = (byte) 0xff;
        }
        while (i <= '9') {
            nmttyp[i++] = (byte) 2;  // digits
        }
        while (i < 'A') {
            nmttyp[i++] = (byte) 0xff;
        }
        // skipped upper case alphabetical characters are already 0
        for (i = '['; i < 'a'; i++) {
            nmttyp[i] = (byte) 0xff;
        }
        // skipped lower case alphabetical characters are already 0
        for (i = '{'; i < 0x80; i++) {
            nmttyp[i] = (byte) 0xff;
        }
        // Individual overrides of the range defaults set above.
        nmttyp['_'] = 0;
        nmttyp[':'] = 1;
        nmttyp['.'] = 2;
        nmttyp['-'] = 2;
        nmttyp[' '] = 3;
        nmttyp['\t'] = 3;
        nmttyp['\r'] = 3;
        nmttyp['\n'] = 3;
    }

    /**
     * Constructor.
239 */ 240 protected Parser() { 241 mPh = PH_BEFORE_DOC; // before parsing 242 243 // Initialize the parser 244 mBuff = new char[BUFFSIZE_PARSER]; 245 mAttrs = new Attrs(); 246 247 // Default namespace 248 mPref = pair(mPref); 249 mPref.name = ""; 250 mPref.value = ""; 251 mPref.chars = NONS; 252 mNoNS = mPref; // no namespace 253 // XML namespace 254 mPref = pair(mPref); 255 mPref.name = "xml"; 256 mPref.value = "http://www.w3.org/XML/1998/namespace"; 257 mPref.chars = XML; 258 mXml = mPref; // XML namespace 259 } 260 261 /** 262 * Initializes parser's internals. Note, current input has to be set before 263 * this method is called. 264 */ 265 protected void init() { 266 mUnent = null; 267 mElm = null; 268 mPref = mXml; 269 mAttL = null; 270 mPEnt = new HashMap<>(); 271 mEnt = new HashMap<>(); 272 mDoc = mInp; // current input is document entity 273 mChars = mInp.chars; // use document entity buffer 274 mPh = PH_DOC_START; // the begining of the document 275 } 276 277 /** 278 * Cleans up parser internal resources. 279 */ 280 protected void cleanup() { 281 // Default attributes 282 while (mAttL != null) { 283 while (mAttL.list != null) { 284 if (mAttL.list.list != null) { 285 del(mAttL.list.list); 286 } 287 mAttL.list = del(mAttL.list); 288 } 289 mAttL = del(mAttL); 290 } 291 // Element stack 292 while (mElm != null) { 293 mElm = del(mElm); 294 } 295 // Namespace prefixes 296 while (mPref != mXml) { 297 mPref = del(mPref); 298 } 299 // Inputs 300 while (mInp != null) { 301 pop(); 302 } 303 // Document reader 304 if ((mDoc != null) && (mDoc.src != null)) { 305 try { 306 mDoc.src.close(); 307 } catch (IOException ioe) { 308 } 309 } 310 mPEnt = null; 311 mEnt = null; 312 mDoc = null; 313 mPh = PH_AFTER_DOC; // before documnet processing 314 } 315 316 /** 317 * Processes a portion of document. This method returns one of EV_* 318 * constants as an identifier of the portion of document have been read. 319 * 320 * @return Identifier of processed document portion. 
* @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    @SuppressWarnings("fallthrough")
    protected int step() throws Exception {
        mEvt = EV_NULL;
        // Local scanner state:
        //   0 - markup dispatch, 1 - leading white space, 2 - text content.
        int st = 0;
        // Keep consuming characters until one of the branches sets an event.
        while (mEvt == EV_NULL) {
            // Take the next char straight from the reading buffer when one
            // is available there, otherwise go through getch().
            char ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch();
            switch (st) {
            case 0:     // all sorts of markup (dispatcher)
                if (ch != '<') {
                    // Not markup: put the char back and treat what follows
                    // as character content, starting with white space.
                    bkch();
                    mBuffIdx = -1;  // clean parser buffer
                    st = 1;
                    break;
                }
                switch (getch()) {
                case '/':   // the end of the element content
                    mEvt = EV_ELME;
                    if (mElm == null) {
                        panic(FAULT);
                    }
                    // Check element's open/close tags balance
                    mBuffIdx = -1;  // clean parser buffer
                    bname(mIsNSAware);
                    char[] chars = mElm.chars;
                    // The name on top of the element stack must have the
                    // same length as the name just read into mBuff and must
                    // match it char by char from index 1 on (index 0 is not
                    // compared).
                    if (chars.length == (mBuffIdx + 1)) {
                        for (char i = 1; i <= mBuffIdx; i += 1) {
                            if (chars[i] != mBuff[i]) {
                                panic(FAULT);
                            }
                        }
                    } else {
                        panic(FAULT);
                    }
                    // Skip white spaces before '>'
                    if (wsskip() != '>') {
                        panic(FAULT);
                    }
                    getch();  // read '>'
                    break;

                case '!':   // a comment or a CDATA
                    // Peek at the char after "<!" to pick the handler; the
                    // char is put back before the handler is called.
                    ch = getch();
                    bkch();
                    switch (ch) {
                    case '-':   // must be a comment
                        mEvt = EV_COMM;
                        comm();
                        break;

                    case '[':   // must be a CDATA section
                        mEvt = EV_CDAT;
                        cdat();
                        break;

                    default:    // must be 'DOCTYPE'
                        mEvt = EV_DTD;
                        dtd();
                        break;
                    }
                    break;

                case '?':   // processing instruction
                    mEvt = EV_PI;
                    pi();
                    break;

                default:    // must be the first char of an xml name
                    bkch();
                    // Read an element name and put it on top of the
                    // element stack
                    mElm = pair(mElm);  // add new element to the stack
                    mElm.chars = qname(mIsNSAware);
                    mElm.name = mElm.local();
                    // The new entry starts with the id of the entry below
                    // it on the stack, or 0 when it is the first entry.
                    mElm.id = (mElm.next != null) ? mElm.next.id : 0;  // flags
                    mElm.num = 0;  // namespace counter
                    // Find the list of defined attributes of the current
                    // element
                    Pair elm = find(mAttL, mElm.chars);
                    mElm.list = (elm != null) ? elm.list : null;
                    // Read attributes till the end of the element tag
                    mAttrIdx = 0;
                    Pair att = pair(null);
                    att.num = 0;  // clear attribute's flags
                    attr(att);     // get all attributes inc. defaults
                    del(att);
                    // The element value is resolved only in namespace aware
                    // mode; otherwise it stays null.
                    mElm.value = (mIsNSAware) ? rslv(mElm.chars) : null;
                    // Skip white spaces before '>'
                    switch (wsskip()) {
                    case '>':
                        getch();  // read '>'
                        mEvt = EV_ELMS;
                        break;

                    case '/':
                        getch();  // read '/'
                        if (getch() != '>') // read '>'
                        {
                            panic(FAULT);
                        }
                        mEvt = EV_ELM;
                        break;

                    default:
                        panic(FAULT);
                    }
                    break;
                }
                break;

            case 1:     // read white space
                switch (ch) {
                case ' ':
                case '\t':
                case '\n':
                    bappend(ch);
                    break;

                case '\r':  // EOL processing [#2.11]
                    // "\r\n" and a lone "\r" both end up as one '\n'.
                    if (getch() != '\n') {
                        bkch();
                    }
                    bappend('\n');
                    break;

                case '<':
                    // Only white space was seen before the next markup.
                    mEvt = EV_WSPC;
                    bkch();
                    bflash_ws();
                    break;

                default:
                    // A non white space char: continue as text content.
                    bkch();
                    st = 2;
                    break;
                }
                break;

            case 2:     // read the text content of the element
                switch (ch) {
                case '&':
                    if (mUnent == null) {
                        // There was no unresolved entity on previous step.
                        // A non-null result of ent() is remembered in
                        // mUnent; the text collected so far is reported now
                        // and the '&' is set up to be seen again on the next
                        // call, which then takes the else branch below.
                        if ((mUnent = ent('x')) != null) {
                            mEvt = EV_TEXT;
                            bkch();      // move back to ';' after entity name
                            setch('&');  // parser must be back on next step
                            bflash();
                        }
                    } else {
                        // There was unresolved entity on previous step.
                        mEvt = EV_ENT;
                        skippedEnt(mUnent);
                        mUnent = null;
                    }
                    break;

                case '<':
                    mEvt = EV_TEXT;
                    bkch();
                    bflash();
                    break;

                case '\r':  // EOL processing [#2.11]
                    // "\r\n" and a lone "\r" both end up as one '\n'.
                    if (getch() != '\n') {
                        bkch();
                    }
                    bappend('\n');
                    break;

                case EOS:
                    // No break here: control would fall through to the
                    // default branch (hence the "fallthrough" suppression).
                    // NOTE(review): presumably panic() always throws, which
                    // would make the fall-through unreachable - confirm.
                    panic(FAULT);

                default:
                    bappend(ch);
                    break;
                }
                break;

            default:
                panic(FAULT);
            }
        }

        return mEvt;
    }

    /**
     * Parses the document type declaration.
     *
     * Expects the keyword DOCTYPE next in the input, switches the parser to
     * the DTD phase and then walks through the document type name, the
     * optional external identifier, the optional internal subset and the
     * closing '&gt;'.
     *
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    private void dtd() throws Exception {
        char ch;
        // NOTE(review): 'str' is declared but never used in this method.
        String str = null;
        String name = null;
        // External identifier; stays null when the declaration has none.
        Pair psid = null;
        // read 'DOCTYPE'
        if ("DOCTYPE".equals(name(false)) != true) {
            panic(FAULT);
        }
        mPh = PH_DTD;  // DTD
        // st < 0 ends the loop; it is set to -1 on the closing '>'.
        for (short st = 0; st >= 0;) {
            ch = getch();
            switch (st) {
            case 0:     // read the document type name
                // White space in front of the name is skipped.
                if (chtyp(ch) != ' ') {
                    bkch();
                    name = name(mIsNSAware);
                    wsskip();
                    st = 1;  // read 'PUBLIC' or 'SYSTEM'
                }
                break;

            case 1:     // read 'PUBLIC' or 'SYSTEM'
                // docType() is called exactly once, in whichever branch
                // matches; without an external identifier both ids are null.
                switch (chtyp(ch)) {
                case 'A':
                    bkch();
                    psid = pubsys(' ');
                    st = 2;  // skip spaces before internal subset
                    docType(name, psid.name, psid.value);
                    break;

                case '[':
                    bkch();
                    st = 2;  // skip spaces before internal subset
                    docType(name, null, null);
                    break;

                case '>':
                    bkch();
                    st = 3;  // skip spaces after internal subset
                    docType(name, null, null);
                    break;

                default:
                    panic(FAULT);
                }
                break;

            case 2:     // skip spaces before internal subset
                switch (chtyp(ch)) {
                case '[':
                    // Process internal subset
                    dtdsub();
                    st = 3;  // skip spaces after internal subset
                    break;

                case '>':
                    // There is no internal subset
                    bkch();
                    st = 3;  // skip spaces after internal subset
                    break;

                case ' ':
                    // skip white spaces
                    break;

                default:
                    panic(FAULT);
                }
                break;

            case 3:     // skip spaces after internal subset
                switch (chtyp(ch)) {
                case '>':
                    if (psid != null) {
                        // Report the DTD external subset
                        InputSource is = resolveEnt(name, psid.name, psid.value);
                        if (is != null) {
                            // The external subset is parsed only when the
                            // standalone flag is not set.
                            if (mIsSAlone == false) {
                                // Set the end of DTD external subset char
                                bkch();
                                setch(']');
                                // Set the DTD external subset InputSource
                                push(new Input(BUFFSIZE_READER));
                                setinp(is);
                                mInp.pubid = psid.name;
                                mInp.sysid = psid.value;
                                // Parse the DTD external subset
                                dtdsub();
                            } else {
                                // Unresolved DTD external subset
                                skippedEnt("[dtd]");
                                // Release reader and stream
                                if (is.getCharacterStream() != null) {
                                    try {
                                        is.getCharacterStream().close();
                                    } catch (IOException ioe) {
                                        // a failure to close is ignored
                                    }
                                }
                                if (is.getByteStream() != null) {
                                    try {
                                        is.getByteStream().close();
                                    } catch (IOException ioe) {
                                        // a failure to close is ignored
                                    }
                                }
                            }
                        } else {
                            // Unresolved DTD external subset
                            skippedEnt("[dtd]");
                        }
                        del(psid);
                    }
                    st = -1;  // end of DTD
                    break;

                case ' ':
                    // skip white spaces
                    break;

                default:
                    panic(FAULT);
                }
                break;

            default:
                panic(FAULT);
            }
        }
    }

    /**
     * Parses the document type declaration subset.
     *
     * @exception Exception is parser specific exception from panic method.
650 * @exception IOException 651 */ 652 private void dtdsub() throws Exception { 653 startInternalSub(); // reports the event before parsing the subset 654 655 char ch; 656 for (short st = 0; st >= 0;) { 657 ch = getch(); 658 switch (st) { 659 case 0: // skip white spaces before a declaration 660 switch (chtyp(ch)) { 661 case '<': 662 ch = getch(); 663 switch (ch) { 664 case '?': 665 pi(); 666 break; 667 668 case '!': 669 ch = getch(); 670 bkch(); 671 if (ch == '-') { 672 comm(); 673 break; 674 } 675 // A markup or an entity declaration 676 bntok(); 677 switch (bkeyword()) { 678 case 'n': 679 dtdent(); 680 break; 681 682 case 'a': 683 dtdattl(); // parse attributes declaration 684 break; 685 686 case 'e': 687 dtdelm(); // parse element declaration 688 break; 689 690 case 'o': 691 dtdnot(); // parse notation declaration 692 break; 693 694 default: 695 panic(FAULT); // unsupported markup declaration 696 break; 697 } 698 st = 1; // read the end of declaration 699 break; 700 701 default: 702 panic(FAULT); 703 break; 704 } 705 break; 706 707 case '%': 708 // A parameter entity reference 709 pent(' '); 710 break; 711 712 case ']': 713 // End of DTD subset 714 st = -1; 715 break; 716 717 case ' ': 718 // Skip white spaces 719 break; 720 721 case 'Z': 722 // End of stream 723 if (getch() != ']') { 724 panic(FAULT); 725 } 726 st = -1; 727 break; 728 729 default: 730 panic(FAULT); 731 } 732 break; 733 734 case 1: // read the end of declaration 735 switch (ch) { 736 case '>': // there is no notation 737 st = 0; // skip white spaces before a declaration 738 break; 739 740 case ' ': 741 case '\n': 742 case '\r': 743 case '\t': 744 // Skip white spaces 745 break; 746 747 default: 748 panic(FAULT); 749 break; 750 } 751 break; 752 753 default: 754 panic(FAULT); 755 } 756 } 757 } 758 759 /** 760 * Parses an entity declaration. This method fills the general ( 761 * <code>mEnt</code>) and parameter 762 * ( 763 * <code>mPEnt</code>) entity look up table. 
*
     * An entity name which is already present in the corresponding table is
     * not replaced: every {@code put} below is guarded by a
     * {@code containsKey} check.
     *
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    @SuppressWarnings("fallthrough")
    private void dtdent() throws Exception {
        String str = null;   // entity name
        char[] val = null;   // entity value copied out of mBuff
        Input inp = null;    // table entry being created
        Pair ids = null;     // external identifier returned by pubsys()
        char ch;
        // st < 0 ends the loop; it is set to -1 at the end of a declaration.
        for (short st = 0; st >= 0;) {
            ch = getch();
            switch (st) {
            case 0:     // skip white spaces before entity name
                switch (chtyp(ch)) {
                case ' ':
                    // Skip white spaces
                    break;

                case '%':
                    // Parameter entity or parameter entity declaration.
                    // Peek at the char after '%' and put it back: white
                    // space there means a declaration, anything else a
                    // reference.
                    ch = getch();
                    bkch();
                    if (chtyp(ch) == ' ') {
                        // Parameter entity declaration.
                        wsskip();
                        str = name(false);
                        switch (chtyp(wsskip())) {
                        case 'A':
                            // Read the external identifier
                            ids = pubsys(' ');
                            if (wsskip() == '>') {
                                // External parsed entity
                                if (mPEnt.containsKey(str) == false) {      // [#4.2]
                                    inp = new Input();
                                    inp.pubid = ids.name;
                                    inp.sysid = ids.value;
                                    mPEnt.put(str, inp);
                                }
                            } else {
                                panic(FAULT);
                            }
                            del(ids);
                            st = -1;  // the end of declaration
                            break;

                        case '\"':
                        case '\'':
                            // Read the parameter entity value
                            bqstr('d');
                            // Create the parameter entity value
                            // val gets mBuff[1..mBuffIdx] at the same
                            // indexes; val[0] is filled in right after.
                            val = new char[mBuffIdx + 1];
                            System.arraycopy(mBuff, 1, val, 1, val.length - 1);
                            // Add surrounding spaces [#4.4.8]
                            // NOTE(review): only the leading space is set
                            // here; presumably the trailing one is already
                            // in mBuff after bqstr('d') - confirm.
                            val[0] = ' ';
                            // Add the entity to the entity look up table
                            if (mPEnt.containsKey(str) == false) {      // [#4.2]
                                // The new entry takes its ids, encoding and
                                // version from the current input.
                                inp = new Input(val);
                                inp.pubid = mInp.pubid;
                                inp.sysid = mInp.sysid;
                                inp.xmlenc = mInp.xmlenc;
                                inp.xmlver = mInp.xmlver;
                                mPEnt.put(str, inp);
                            }
                            st = -1;  // the end of declaration
                            break;

                        default:
                            panic(FAULT);
                            break;
                        }
                    } else {
                        // Parameter entity reference.
                        pent(' ');
                    }
                    break;

                default:
                    // General entity: read its name, the value follows.
                    bkch();
                    str = name(false);
                    st = 1;  // read entity declaration value
                    break;
                }
                break;

            case 1:     // read entity declaration value
                switch (chtyp(ch)) {
                case '\"':  // internal entity
                case '\'':
                    bkch();
                    bqstr('d');  // read a string into the buffer
                    if (mEnt.get(str) == null) {
                        // Create general entity value
                        // val gets mBuff[1..mBuffIdx], shifted to index 0.
                        val = new char[mBuffIdx];
                        System.arraycopy(mBuff, 1, val, 0, val.length);
                        // Add the entity to the entity look up table
                        if (mEnt.containsKey(str) == false) {       // [#4.2]
                            // The new entry takes its ids, encoding and
                            // version from the current input.
                            inp = new Input(val);
                            inp.pubid = mInp.pubid;
                            inp.sysid = mInp.sysid;
                            inp.xmlenc = mInp.xmlenc;
                            inp.xmlver = mInp.xmlver;
                            mEnt.put(str, inp);
                        }
                    }
                    st = -1;  // the end of declaration
                    break;

                case 'A':   // external entity
                    bkch();
                    ids = pubsys(' ');
                    switch (wsskip()) {
                    case '>':   // external parsed entity
                        if (mEnt.containsKey(str) == false) {       // [#4.2]
                            inp = new Input();
                            inp.pubid = ids.name;
                            inp.sysid = ids.value;
                            mEnt.put(str, inp);
                        }
                        break;

                    case 'N':   // external general unparsed entity
                        if ("NDATA".equals(name(false)) == true) {
                            wsskip();
                            unparsedEntDecl(str, ids.name, ids.value, name(false));
                            break;
                        }
                        // Deliberately no break: a name other than NDATA
                        // falls through to the default branch.
                    default:
                        panic(FAULT);
                        break;
                    }
                    del(ids);
                    st = -1;  // the end of declaration
                    break;

                case ' ':
                    // Skip white spaces
                    break;

                default:
                    panic(FAULT);
                    break;
                }
                break;

            default:
                panic(FAULT);
            }
        }
    }

    /**
     * Parses an element declaration.
     *
     * This method parses the declaration up to the closing angle bracket.
     *
     * @exception Exception is parser specific exception from panic method.
922 * @exception IOException 923 */ 924 @SuppressWarnings("fallthrough") 925 private void dtdelm() throws Exception { 926 // This is stub implementation which skips an element 927 // declaration. 928 wsskip(); 929 name(mIsNSAware); 930 931 char ch; 932 while (true) { 933 ch = getch(); 934 switch (ch) { 935 case '>': 936 bkch(); 937 return; 938 939 case EOS: 940 panic(FAULT); 941 942 default: 943 break; 944 } 945 } 946 } 947 948 /** 949 * Parses an attribute list declaration. 950 * 951 * This method parses the declaration up to the closing angle bracket. 952 * 953 * @exception Exception is parser specific exception form panic method. 954 * @exception IOException 955 */ 956 private void dtdattl() throws Exception { 957 char elmqn[] = null; 958 Pair elm = null; 959 char ch; 960 for (short st = 0; st >= 0;) { 961 ch = getch(); 962 switch (st) { 963 case 0: // read the element name 964 switch (chtyp(ch)) { 965 case 'a': 966 case 'A': 967 case '_': 968 case 'X': 969 case ':': 970 bkch(); 971 // Get the element from the list or add a new one. 972 elmqn = qname(mIsNSAware); 973 elm = find(mAttL, elmqn); 974 if (elm == null) { 975 elm = pair(mAttL); 976 elm.chars = elmqn; 977 mAttL = elm; 978 } 979 st = 1; // read an attribute declaration 980 break; 981 982 case ' ': 983 break; 984 985 case '%': 986 pent(' '); 987 break; 988 989 default: 990 panic(FAULT); 991 break; 992 } 993 break; 994 995 case 1: // read an attribute declaration 996 switch (chtyp(ch)) { 997 case 'a': 998 case 'A': 999 case '_': 1000 case 'X': 1001 case ':': 1002 bkch(); 1003 dtdatt(elm); 1004 if (wsskip() == '>') { 1005 return; 1006 } 1007 break; 1008 1009 case ' ': 1010 break; 1011 1012 case '%': 1013 pent(' '); 1014 break; 1015 1016 default: 1017 panic(FAULT); 1018 break; 1019 } 1020 break; 1021 1022 default: 1023 panic(FAULT); 1024 break; 1025 } 1026 } 1027 } 1028 1029 /** 1030 * Parses an attribute declaration. 
*
 * The attribute uses the following fields of the Pair object:
 * chars - characters of the qualified name;
 * id - the type identifier of the attribute;
 * list - a pair which holds the default value (chars field).
 *
 * @param elm An object which represents all defined attributes on an
 * element.
 * @exception Exception is parser specific exception from the panic method.
 * @exception IOException
 */
@SuppressWarnings("fallthrough")
private void dtdatt(Pair elm) throws Exception {
    char attqn[] = null;
    Pair att = null;
    char ch;
    // st is the state of the declaration reader; a negative value ends the loop.
    for (short st = 0; st >= 0;) {
        ch = getch();
        switch (st) {
            case 0:     // the attribute name
                switch (chtyp(ch)) {
                    case 'a':
                    case 'A':
                    case '_':
                    case 'X':
                    case ':':
                        bkch();
                        // Get the attribute from the list or add a new one.
                        attqn = qname(mIsNSAware);
                        att = find(elm.list, attqn);
                        if (att == null) {
                            // New attribute declaration
                            att = pair(elm.list);
                            att.chars = attqn;
                            elm.list = att;
                        } else {
                            // Do not override the attribute declaration [#3.3]:
                            // the duplicate is parsed into a pair that is not
                            // linked into elm.list.
                            att = pair(null);
                            att.chars = attqn;
                            att.id = 'c';
                        }
                        wsskip();
                        st = 1;
                        break;

                    case '%':
                        pent(' ');
                        break;

                    case ' ':
                        break;

                    default:
                        panic(FAULT);
                        break;
                }
                break;

            case 1:     // the attribute type
                switch (chtyp(ch)) {
                    case '(':
                        att.id = 'u';  // enumeration type
                        st = 2;        // read the first element of the list
                        break;

                    case '%':
                        pent(' ');
                        break;

                    case ' ':
                        break;

                    default:
                        bkch();
                        bntok();  // read type id
                        att.id = bkeyword();
                        switch (att.id) {
                            case 'o':   // NOTATION
                                if (wsskip() != '(') {
                                    panic(FAULT);
                                }
                                // wsskip() only looked ahead; consume the '(' here
                                ch = getch();
                                st = 2;  // read the first element of the list
                                break;

                            case 'i':   // ID
                            case 'r':   // IDREF
                            case 'R':   // IDREFS
                            case 'n':   // ENTITY
                            case 'N':   // ENTITIES
                            case 't':   // NMTOKEN
                            case 'T':   // NMTOKENS
                            case 'c':   // CDATA
                                wsskip();
                                st = 4;  // read default declaration
                                break;

                            default:
                                panic(FAULT);
                                break;
                        }
                        break;
                }
                break;

            case 2:     // read the first element of the list
                switch (chtyp(ch)) {
                    case 'a':
                    case 'A':
                    case 'd':
                    case '.':
                    case ':':
                    case '-':
                    case '_':
                    case 'X':
                        bkch();
                        switch (att.id) {
                            case 'u':   // enumeration type
                                bntok();
                                break;

                            case 'o':   // NOTATION
                                mBuffIdx = -1;
                                bname(false);
                                break;

                            default:
                                panic(FAULT);
                                break;
                        }
                        wsskip();
                        st = 3;  // read next element of the list
                        break;

                    case '%':
                        pent(' ');
                        break;

                    case ' ':
                        break;

                    default:
                        panic(FAULT);
                        break;
                }
                break;

            case 3:     // read next element of the list
                switch (ch) {
                    case ')':
                        wsskip();
                        st = 4;  // read default declaration
                        break;

                    case '|':
                        wsskip();
                        switch (att.id) {
                            case 'u':   // enumeration type
                                bntok();
                                break;

                            case 'o':   // NOTATION
                                mBuffIdx = -1;
                                bname(false);
                                break;

                            default:
                                panic(FAULT);
                                break;
                        }
                        wsskip();
                        break;

                    case '%':
                        pent(' ');
                        break;

                    default:
                        panic(FAULT);
                        break;
                }
                break;

            case 4:     // read default declaration
                switch (ch) {
                    case '#':
                        bntok();
                        switch (bkeyword()) {
                            case 'F':   // FIXED
                                switch (wsskip()) {
                                    case '\"':
                                    case '\'':
                                        st = 5;  // read the default value
                                        break;

                                    case EOS:
                                        panic(FAULT);
                                        // falls through to default

                                    default:
                                        st = -1;
                                        break;
                                }
                                break;

                            case 'Q':   // REQUIRED
                            case 'I':   // IMPLIED
                                st = -1;
                                break;

                            default:
                                panic(FAULT);
                                break;
                        }
                        break;

                    case '\"':
                    case '\'':
                        bkch();
                        st = 5;  // read the default value
                        break;

                    case ' ':
                    case '\n':
                    case '\r':
                    case '\t':
                        break;

                    case '%':
                        pent(' ');
                        break;

                    default:
                        // Anything else ends this declaration; leave it unread.
                        bkch();
                        st = -1;
                        break;
                }
                break;

            case 5:     // read the default value
                switch (ch) {
                    case '\"':
                    case '\'':
                        bkch();
                        bqstr('d');  // the value in the mBuff now
                        att.list = pair(null);
                        // Create a string like "attqname='value' ".
                        // att.chars[0] and mBuff[0] are skipped by the copies
                        // below (both start at index 1); ch is the quote
                        // character that opened the value.
                        att.list.chars = new char[att.chars.length + mBuffIdx + 3];
                        System.arraycopy(
                            att.chars, 1, att.list.chars, 0, att.chars.length - 1);
                        att.list.chars[att.chars.length - 1] = '=';
                        att.list.chars[att.chars.length] = ch;
                        System.arraycopy(
                            mBuff, 1, att.list.chars, att.chars.length + 1, mBuffIdx);
                        att.list.chars[att.chars.length + mBuffIdx + 1] = ch;
                        att.list.chars[att.chars.length + mBuffIdx + 2] = ' ';
                        st = -1;
                        break;

                    default:
                        panic(FAULT);
                        break;
                }
                break;

            default:
                panic(FAULT);
                break;
        }
    }
}

/**
 * Parses a notation declaration.
 *
 * This method parses the declaration up to the closing angle bracket.
 * The notation name and its identifiers are reported through
 * <code>notDecl</code>.
 *
 * @exception Exception is parser specific exception from the panic method.
 * @exception IOException
 */
private void dtdnot() throws Exception {
    wsskip();
    String name = name(false);
    wsskip();
    // 'N' lets pubsys accept a public id without a system id
    Pair ids = pubsys('N');
    notDecl(name, ids.name, ids.value);
    del(ids);
}

/**
 * Parses an attribute.
 *
 * This recursive method is responsible for prefix addition
 * (<code>mPref</code>) on the way down. The end of the element's start
 * tag triggers the return process. The method then on its way back
 * resolves prefixes and accumulates attributes.
1326 * 1327 * <p><code>att.num</code> carries attribute flags where: 0x1 - attribute is 1328 * declared in DTD (attribute decalration had been read); 0x2 - attribute's 1329 * default value is used.</p> 1330 * 1331 * @param att An object which reprecents current attribute. 1332 * @exception Exception is parser specific exception form panic method. 1333 * @exception IOException 1334 */ 1335 @SuppressWarnings("fallthrough") 1336 private void attr(Pair att) throws Exception { 1337 switch (wsskip()) { 1338 case '/': 1339 case '>': 1340 if ((att.num & 0x2) == 0) { // all attributes have been read 1341 att.num |= 0x2; // set default attribute flag 1342 Input inp = mInp; 1343 // Go through all attributes defined on current element. 1344 for (Pair def = mElm.list; def != null; def = def.next) { 1345 if (def.list == null) // no default value 1346 { 1347 continue; 1348 } 1349 // Go through all attributes defined on current 1350 // element and add defaults. 1351 Pair act = find(att.next, def.chars); 1352 if (act == null) { 1353 push(new Input(def.list.chars)); 1354 } 1355 } 1356 if (mInp != inp) { // defaults have been added 1357 attr(att); 1358 return; 1359 } 1360 } 1361 // Ensure the attribute string array capacity 1362 mAttrs.setLength(mAttrIdx); 1363 mItems = mAttrs.mItems; 1364 return; 1365 1366 case EOS: 1367 panic(FAULT); 1368 1369 default: 1370 // Read the attribute name and value 1371 att.chars = qname(mIsNSAware); 1372 att.name = att.local(); 1373 String type = atype(att); // sets attribute's type on att.id 1374 wsskip(); 1375 if (getch() != '=') { 1376 panic(FAULT); 1377 } 1378 bqstr((char) att.id); // read the value with normalization. 
1379 String val = new String(mBuff, 1, mBuffIdx); 1380 Pair next = pair(att); 1381 next.num = (att.num & ~0x1); // inherit attribute flags 1382 // Put a namespace declaration on top of the prefix stack 1383 if ((mIsNSAware == false) || (isdecl(att, val) == false)) { 1384 // An ordinary attribute 1385 mAttrIdx++; 1386 attr(next); // recursive call to parse the next attribute 1387 mAttrIdx--; 1388 // Add the attribute to the attributes string array 1389 char idx = (char) (mAttrIdx << 3); 1390 mItems[idx + 1] = att.qname(); // attr qname 1391 mItems[idx + 2] = (mIsNSAware) ? att.name : ""; // attr local name 1392 mItems[idx + 3] = val; // attr value 1393 mItems[idx + 4] = type; // attr type 1394 switch (att.num & 0x3) { 1395 case 0x0: 1396 mItems[idx + 5] = null; 1397 break; 1398 1399 case 0x1: // declared attribute 1400 mItems[idx + 5] = "d"; 1401 break; 1402 1403 default: // 0x2, 0x3 - default attribute always declared 1404 mItems[idx + 5] = "D"; 1405 break; 1406 } 1407 // Resolve the prefix if any and report the attribute 1408 // NOTE: The attribute does not accept the default namespace. 1409 mItems[idx + 0] = (att.chars[0] != 0) ? rslv(att.chars) : ""; 1410 } else { 1411 // A namespace declaration. mPref.name contains prefix and 1412 // mPref.value contains namespace URI set by isdecl method. 1413 // Report a start of the new mapping 1414 newPrefix(); 1415 // Recursive call to parse the next attribute 1416 attr(next); 1417 // NOTE: The namespace declaration is not reported. 1418 } 1419 del(next); 1420 break; 1421 } 1422 } 1423 1424 /** 1425 * Retrieves attribute type. 1426 * 1427 * This method sets the type of normalization in the attribute 1428 * <code>id</code> field and returns the name of attribute type. 1429 * 1430 * @param att An object which represents current attribute. 1431 * @return The name of the attribute type. 1432 * @exception Exception is parser specific exception form panic method. 
1433 */ 1434 private String atype(Pair att) 1435 throws Exception { 1436 Pair attr; 1437 1438 // CDATA-type normalization by default [#3.3.3] 1439 att.id = 'c'; 1440 if (mElm.list == null || (attr = find(mElm.list, att.chars)) == null) { 1441 return "CDATA"; 1442 } 1443 1444 att.num |= 0x1; // attribute is declared 1445 1446 // Non-CDATA normalization except when the attribute type is CDATA. 1447 att.id = 'i'; 1448 switch (attr.id) { 1449 case 'i': 1450 return "ID"; 1451 1452 case 'r': 1453 return "IDREF"; 1454 1455 case 'R': 1456 return "IDREFS"; 1457 1458 case 'n': 1459 return "ENTITY"; 1460 1461 case 'N': 1462 return "ENTITIES"; 1463 1464 case 't': 1465 return "NMTOKEN"; 1466 1467 case 'T': 1468 return "NMTOKENS"; 1469 1470 case 'u': 1471 return "NMTOKEN"; 1472 1473 case 'o': 1474 return "NOTATION"; 1475 1476 case 'c': 1477 att.id = 'c'; 1478 return "CDATA"; 1479 1480 default: 1481 panic(FAULT); 1482 } 1483 return null; 1484 } 1485 1486 /** 1487 * Parses a comment. 1488 * 1489 * The '<!' part is read in dispatcher so the method starts 1490 * with first '-' after '<!'. 1491 * 1492 * @exception Exception is parser specific exception form panic method. 1493 */ 1494 @SuppressWarnings("fallthrough") 1495 private void comm() throws Exception { 1496 if (mPh == PH_DOC_START) { 1497 mPh = PH_MISC_DTD; // misc before DTD 1498 } // '<!' has been already read by dispetcher. 1499 char ch; 1500 mBuffIdx = -1; 1501 for (short st = 0; st >= 0;) { 1502 ch = (mChIdx < mChLen) ? 
mChars[mChIdx++] : getch(); 1503 if (ch == EOS) { 1504 panic(FAULT); 1505 } 1506 switch (st) { 1507 case 0: // first '-' of the comment open 1508 if (ch == '-') { 1509 st = 1; 1510 } else { 1511 panic(FAULT); 1512 } 1513 break; 1514 1515 case 1: // secind '-' of the comment open 1516 if (ch == '-') { 1517 st = 2; 1518 } else { 1519 panic(FAULT); 1520 } 1521 break; 1522 1523 case 2: // skip the comment body 1524 switch (ch) { 1525 case '-': 1526 st = 3; 1527 break; 1528 1529 default: 1530 bappend(ch); 1531 break; 1532 } 1533 break; 1534 1535 case 3: // second '-' of the comment close 1536 switch (ch) { 1537 case '-': 1538 st = 4; 1539 break; 1540 1541 default: 1542 bappend('-'); 1543 bappend(ch); 1544 st = 2; 1545 break; 1546 } 1547 break; 1548 1549 case 4: // '>' of the comment close 1550 if (ch == '>') { 1551 comm(mBuff, mBuffIdx + 1); 1552 st = -1; 1553 break; 1554 } 1555 // else - panic [#2.5 compatibility note] 1556 1557 default: 1558 panic(FAULT); 1559 } 1560 } 1561 } 1562 1563 /** 1564 * Parses a processing instruction. 1565 * 1566 * The '<?' is read in dispatcher so the method starts with 1567 * first character of PI target name after '<?'. 1568 * 1569 * @exception Exception is parser specific exception form panic method. 1570 * @exception IOException 1571 */ 1572 private void pi() throws Exception { 1573 // '<?' has been already read by dispetcher. 
1574 char ch; 1575 String str = null; 1576 mBuffIdx = -1; 1577 for (short st = 0; st >= 0;) { 1578 ch = getch(); 1579 if (ch == EOS) { 1580 panic(FAULT); 1581 } 1582 switch (st) { 1583 case 0: // read the PI target name 1584 switch (chtyp(ch)) { 1585 case 'a': 1586 case 'A': 1587 case '_': 1588 case ':': 1589 case 'X': 1590 bkch(); 1591 str = name(false); 1592 // PI target name may not be empty string [#2.6] 1593 // PI target name 'XML' is reserved [#2.6] 1594 if ((str.length() == 0) 1595 || (mXml.name.equals(str.toLowerCase()) == true)) { 1596 panic(FAULT); 1597 } 1598 // This is processing instruction 1599 if (mPh == PH_DOC_START) // the begining of the document 1600 { 1601 mPh = PH_MISC_DTD; // misc before DTD 1602 } 1603 wsskip(); // skip spaces after the PI target name 1604 st = 1; // accumulate the PI body 1605 mBuffIdx = -1; 1606 break; 1607 1608 default: 1609 panic(FAULT); 1610 } 1611 break; 1612 1613 case 1: // accumulate the PI body 1614 switch (ch) { 1615 case '?': 1616 st = 2; // end of the PI body 1617 break; 1618 1619 default: 1620 bappend(ch); 1621 break; 1622 } 1623 break; 1624 1625 case 2: // end of the PI body 1626 switch (ch) { 1627 case '>': 1628 // PI has been read. 1629 pi(str, new String(mBuff, 0, mBuffIdx + 1)); 1630 st = -1; 1631 break; 1632 1633 case '?': 1634 bappend('?'); 1635 break; 1636 1637 default: 1638 bappend('?'); 1639 bappend(ch); 1640 st = 1; // accumulate the PI body 1641 break; 1642 } 1643 break; 1644 1645 default: 1646 panic(FAULT); 1647 } 1648 } 1649 } 1650 1651 /** 1652 * Parses a character data. 1653 * 1654 * The '<!' part is read in dispatcher so the method starts 1655 * with first '[' after '<!'. 1656 * 1657 * @exception Exception is parser specific exception form panic method. 1658 * @exception IOException 1659 */ 1660 private void cdat() 1661 throws Exception { 1662 // '<!' has been already read by dispetcher. 
1663 char ch; 1664 mBuffIdx = -1; 1665 for (short st = 0; st >= 0;) { 1666 ch = getch(); 1667 switch (st) { 1668 case 0: // the first '[' of the CDATA open 1669 if (ch == '[') { 1670 st = 1; 1671 } else { 1672 panic(FAULT); 1673 } 1674 break; 1675 1676 case 1: // read "CDATA" 1677 if (chtyp(ch) == 'A') { 1678 bappend(ch); 1679 } else { 1680 if ("CDATA".equals( 1681 new String(mBuff, 0, mBuffIdx + 1)) != true) { 1682 panic(FAULT); 1683 } 1684 bkch(); 1685 st = 2; 1686 } 1687 break; 1688 1689 case 2: // the second '[' of the CDATA open 1690 if (ch != '[') { 1691 panic(FAULT); 1692 } 1693 mBuffIdx = -1; 1694 st = 3; 1695 break; 1696 1697 case 3: // read data before the first ']' 1698 if (ch != ']') { 1699 bappend(ch); 1700 } else { 1701 st = 4; 1702 } 1703 break; 1704 1705 case 4: // read the second ']' or continue to read the data 1706 if (ch != ']') { 1707 bappend(']'); 1708 bappend(ch); 1709 st = 3; 1710 } else { 1711 st = 5; 1712 } 1713 break; 1714 1715 case 5: // read '>' or continue to read the data 1716 switch (ch) { 1717 case ']': 1718 bappend(']'); 1719 break; 1720 1721 case '>': 1722 bflash(); 1723 st = -1; 1724 break; 1725 1726 default: 1727 bappend(']'); 1728 bappend(']'); 1729 bappend(ch); 1730 st = 3; 1731 break; 1732 } 1733 break; 1734 1735 default: 1736 panic(FAULT); 1737 } 1738 } 1739 } 1740 1741 /** 1742 * Reads a xml name. 1743 * 1744 * The xml name must conform "Namespaces in XML" specification. Therefore 1745 * the ':' character is not allowed in the name. This method should be used 1746 * for PI and entity names which may not have a namespace according to the 1747 * specification mentioned above. 1748 * 1749 * @param ns The true value turns namespace conformance on. 1750 * @return The name has been read. 1751 * @exception Exception When incorrect character appear in the name. 
1752 * @exception IOException 1753 */ 1754 protected String name(boolean ns) 1755 throws Exception { 1756 mBuffIdx = -1; 1757 bname(ns); 1758 return new String(mBuff, 1, mBuffIdx); 1759 } 1760 1761 /** 1762 * Reads a qualified xml name. 1763 * 1764 * The characters of a qualified name is an array of characters. The first 1765 * (chars[0]) character is the index of the colon character which separates 1766 * the prefix from the local name. If the index is zero, the name does not 1767 * contain separator or the parser works in the namespace unaware mode. The 1768 * length of qualified name is the length of the array minus one. 1769 * 1770 * @param ns The true value turns namespace conformance on. 1771 * @return The characters of a qualified name. 1772 * @exception Exception When incorrect character appear in the name. 1773 * @exception IOException 1774 */ 1775 protected char[] qname(boolean ns) 1776 throws Exception { 1777 mBuffIdx = -1; 1778 bname(ns); 1779 char chars[] = new char[mBuffIdx + 1]; 1780 System.arraycopy(mBuff, 0, chars, 0, mBuffIdx + 1); 1781 return chars; 1782 } 1783 1784 /** 1785 * Reads the public or/and system identifiers. 1786 * 1787 * @param inp The input object. 1788 * @exception Exception is parser specific exception form panic method. 1789 * @exception IOException 1790 */ 1791 private void pubsys(Input inp) 1792 throws Exception { 1793 Pair pair = pubsys(' '); 1794 inp.pubid = pair.name; 1795 inp.sysid = pair.value; 1796 del(pair); 1797 } 1798 1799 /** 1800 * Reads the public or/and system identifiers. 1801 * 1802 * @param flag The 'N' allows public id be without system id. 1803 * @return The public or/and system identifiers pair. 1804 * @exception Exception is parser specific exception form panic method. 
1805 * @exception IOException 1806 */ 1807 @SuppressWarnings("fallthrough") 1808 private Pair pubsys(char flag) throws Exception { 1809 Pair ids = pair(null); 1810 String str = name(false); 1811 if ("PUBLIC".equals(str) == true) { 1812 bqstr('i'); // non-CDATA normalization [#4.2.2] 1813 ids.name = new String(mBuff, 1, mBuffIdx); 1814 switch (wsskip()) { 1815 case '\"': 1816 case '\'': 1817 bqstr(' '); 1818 ids.value = new String(mBuff, 1, mBuffIdx); 1819 break; 1820 1821 case EOS: 1822 panic(FAULT); 1823 1824 default: 1825 if (flag != 'N') // [#4.7] 1826 { 1827 panic(FAULT); 1828 } 1829 ids.value = null; 1830 break; 1831 } 1832 return ids; 1833 } else if ("SYSTEM".equals(str) == true) { 1834 ids.name = null; 1835 bqstr(' '); 1836 ids.value = new String(mBuff, 1, mBuffIdx); 1837 return ids; 1838 } 1839 panic(FAULT); 1840 return null; 1841 } 1842 1843 /** 1844 * Reads an attribute value. 1845 * 1846 * The grammar which this method can read is:<br /> 1847 * <code>eqstr := S "=" qstr</code><br /> 1848 * <code>qstr := S ("'" string "'") | 1849 * ('"' string '"')</code><br /> This method resolves entities 1850 * inside a string unless the parser parses DTD. 1851 * 1852 * @param flag The '=' character forces the method to accept the '=' 1853 * character before quoted string and read the following string as not an 1854 * attribute ('-'), 'c' - CDATA, 'i' - non CDATA, ' ' - no normalization; 1855 * '-' - not an attribute value; 'd' - in DTD context. 1856 * @return The content of the quoted strign as a string. 1857 * @exception Exception is parser specific exception form panic method. 1858 * @exception IOException 1859 */ 1860 protected String eqstr(char flag) throws Exception { 1861 if (flag == '=') { 1862 wsskip(); 1863 if (getch() != '=') { 1864 panic(FAULT); 1865 } 1866 } 1867 bqstr((flag == '=') ? '-' : flag); 1868 return new String(mBuff, 1, mBuffIdx); 1869 } 1870 1871 /** 1872 * Resoves an entity. 
*
 * This method resolves built-in and character entity references. It
 * also reports external entities to the application.
 *
 * @param flag The 'x' character forces the method to report a skipped
 * entity; 'i' character - indicates non-CDATA normalization.
 * @return Name of unresolved entity or <code>null</code> if entity had been
 * resolved successfully.
 * @exception Exception is parser specific exception from the panic method.
 * @exception IOException
 */
@SuppressWarnings("fallthrough")
private String ent(char flag) throws Exception {
    char ch;
    // idx is the buffer position of the '&' appended below; the
    // reference text is read from idx + 1 onward.
    int idx = mBuffIdx + 1;
    Input inp = null;
    String str = null;
    mESt = 0x100;  // reset the built-in entity recognizer
    bappend('&');
    for (short st = 0; st >= 0;) {
        ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch();
        switch (st) {
            case 0:     // the first character of the entity name
            case 1:     // read built-in entity name
                switch (chtyp(ch)) {
                    case 'd':
                    case '.':
                    case '-':
                        // not allowed as the first character of a name
                        if (st != 1) {
                            panic(FAULT);
                        }
                        // falls through to append the character
                    case 'a':
                    case 'A':
                    case '_':
                    case 'X':
                        bappend(ch);
                        eappend(ch);
                        st = 1;
                        break;

                    case ':':
                        if (mIsNSAware != false) {
                            panic(FAULT);
                        }
                        bappend(ch);
                        eappend(ch);
                        st = 1;
                        break;

                    case ';':
                        if (mESt < 0x100) {
                            // The entity is a built-in entity
                            mBuffIdx = idx - 1;
                            bappend(mESt);
                            st = -1;
                            break;
                        } else if (mPh == PH_DTD) {
                            // In DTD entity declaration has to resolve character
                            // entities and include "as is" others. [#4.4.7]
                            bappend(';');
                            st = -1;
                            break;
                        }
                        // Convert an entity name to a string
                        str = new String(mBuff, idx + 1, mBuffIdx - idx);
                        inp = mEnt.get(str);
                        // Restore the buffer offset
                        mBuffIdx = idx - 1;
                        if (inp != null) {
                            if (inp.chars == null) {
                                // External entity
                                InputSource is = resolveEnt(str, inp.pubid, inp.sysid);
                                if (is != null) {
                                    push(new Input(BUFFSIZE_READER));
                                    setinp(is);
                                    mInp.pubid = inp.pubid;
                                    mInp.sysid = inp.sysid;
                                    str = null;  // the entity is resolved
                                } else {
                                    // Unresolved external entity
                                    if (flag != 'x') {
                                        panic(FAULT);  // unknown entity within markup
                                    }
                                    // str is name of unresolved entity
                                }
                            } else {
                                // Internal entity
                                push(inp);
                                str = null;  // the entity is resolved
                            }
                        } else {
                            // Unknown or general unparsed entity
                            if (flag != 'x') {
                                panic(FAULT);  // unknown entity within markup
                            }
                            // str is name of unresolved entity
                        }
                        st = -1;
                        break;

                    case '#':
                        // '#' is only valid right after the '&'
                        if (st != 0) {
                            panic(FAULT);
                        }
                        st = 2;
                        break;

                    default:
                        panic(FAULT);
                }
                break;

            case 2:     // read character entity
                switch (chtyp(ch)) {
                    case 'd':
                        bappend(ch);
                        break;

                    case ';':
                        // Convert the character entity to a character
                        try {
                            int i = Integer.parseInt(
                                new String(mBuff, idx + 1, mBuffIdx - idx), 10);
                            // values of 0xffff and above are rejected
                            if (i >= 0xffff) {
                                panic(FAULT);
                            }
                            ch = (char) i;
                        } catch (NumberFormatException nfe) {
                            panic(FAULT);
                        }
                        // Restore the buffer offset
                        mBuffIdx = idx - 1;
                        if (ch == ' ' || mInp.next != null) {
                            bappend(ch, flag);
                        } else {
                            bappend(ch);
                        }
                        st = -1;
                        break;

                    case 'a':
                        // If the entity buffer is empty and ch == 'x'
                        if ((mBuffIdx == idx) && (ch == 'x')) {
                            st = 3;
                            break;
                        }
                        // otherwise falls through to panic
                    default:
                        panic(FAULT);
                }
                break;

            case 3:     // read hex character entity
                switch (chtyp(ch)) {
                    case 'A':
                    case 'a':
                    case 'd':
                        bappend(ch);
                        break;

                    case ';':
                        // Convert the character entity to a character
                        try {
                            int i = Integer.parseInt(
                                new String(mBuff, idx + 1, mBuffIdx - idx), 16);
                            // values of 0xffff and above are rejected
                            if (i >= 0xffff) {
                                panic(FAULT);
                            }
                            ch = (char) i;
                        } catch (NumberFormatException nfe) {
                            panic(FAULT);
                        }
                        // Restore the buffer offset
                        mBuffIdx = idx - 1;
                        if (ch == ' ' || mInp.next != null) {
                            bappend(ch, flag);
                        } else {
                            bappend(ch);
                        }
                        st = -1;
                        break;

                    default:
                        panic(FAULT);
                }
                break;

            default:
                panic(FAULT);
        }
    }

    return str;
}

/**
 * Resolves a parameter entity.
 *
 * This method resolves parameter entity references. It also reports
 * external entities to the application.
 *
 * @param flag The '-' instructs the method not to set up surrounding
 * spaces [#4.4.8].
 * @exception Exception is parser specific exception from the panic method.
2074 * @exception IOException 2075 */ 2076 @SuppressWarnings("fallthrough") 2077 private void pent(char flag) throws Exception { 2078 char ch; 2079 int idx = mBuffIdx + 1; 2080 Input inp = null; 2081 String str = null; 2082 bappend('%'); 2083 if (mPh != PH_DTD) // the DTD internal subset 2084 { 2085 return; // Not Recognized [#4.4.1] 2086 } // Read entity name 2087 bname(false); 2088 str = new String(mBuff, idx + 2, mBuffIdx - idx - 1); 2089 if (getch() != ';') { 2090 panic(FAULT); 2091 } 2092 inp = mPEnt.get(str); 2093 // Restore the buffer offset 2094 mBuffIdx = idx - 1; 2095 if (inp != null) { 2096 if (inp.chars == null) { 2097 // External parameter entity 2098 InputSource is = resolveEnt(str, inp.pubid, inp.sysid); 2099 if (is != null) { 2100 if (flag != '-') { 2101 bappend(' '); // tail space 2102 } 2103 push(new Input(BUFFSIZE_READER)); 2104 // BUG: there is no leading space! [#4.4.8] 2105 setinp(is); 2106 mInp.pubid = inp.pubid; 2107 mInp.sysid = inp.sysid; 2108 } else { 2109 // Unresolved external parameter entity 2110 skippedEnt("%" + str); 2111 } 2112 } else { 2113 // Internal parameter entity 2114 if (flag == '-') { 2115 // No surrounding spaces 2116 inp.chIdx = 1; 2117 } else { 2118 // Insert surrounding spaces 2119 bappend(' '); // tail space 2120 inp.chIdx = 0; 2121 } 2122 push(inp); 2123 } 2124 } else { 2125 // Unknown parameter entity 2126 skippedEnt("%" + str); 2127 } 2128 } 2129 2130 /** 2131 * Recognizes and handles a namespace declaration. 2132 * 2133 * This method identifies a type of namespace declaration if any and puts 2134 * new mapping on top of prefix stack. 2135 * 2136 * @param name The attribute qualified name (<code>name.value</code> is a 2137 * <code>String</code> object which represents the attribute prefix). 2138 * @param value The attribute value. 2139 * @return <code>true</code> if a namespace declaration is recognized. 
2140 */ 2141 private boolean isdecl(Pair name, String value) { 2142 if (name.chars[0] == 0) { 2143 if ("xmlns".equals(name.name) == true) { 2144 // New default namespace declaration 2145 mPref = pair(mPref); 2146 mPref.list = mElm; // prefix owner element 2147 mPref.value = value; 2148 mPref.name = ""; 2149 mPref.chars = NONS; 2150 mElm.num++; // namespace counter 2151 return true; 2152 } 2153 } else { 2154 if (name.eqpref(XMLNS) == true) { 2155 // New prefix declaration 2156 int len = name.name.length(); 2157 mPref = pair(mPref); 2158 mPref.list = mElm; // prefix owner element 2159 mPref.value = value; 2160 mPref.name = name.name; 2161 mPref.chars = new char[len + 1]; 2162 mPref.chars[0] = (char) (len + 1); 2163 name.name.getChars(0, len, mPref.chars, 1); 2164 mElm.num++; // namespace counter 2165 return true; 2166 } 2167 } 2168 return false; 2169 } 2170 2171 /** 2172 * Resolves a prefix. 2173 * 2174 * @return The namespace assigned to the prefix. 2175 * @exception Exception When mapping for specified prefix is not found. 2176 */ 2177 private String rslv(char[] qname) 2178 throws Exception { 2179 for (Pair pref = mPref; pref != null; pref = pref.next) { 2180 if (pref.eqpref(qname) == true) { 2181 return pref.value; 2182 } 2183 } 2184 if (qname[0] == 1) { // QNames like ':local' 2185 for (Pair pref = mPref; pref != null; pref = pref.next) { 2186 if (pref.chars[0] == 0) { 2187 return pref.value; 2188 } 2189 } 2190 } 2191 panic(FAULT); 2192 return null; 2193 } 2194 2195 /** 2196 * Skips xml white space characters. 2197 * 2198 * This method skips white space characters (' ', '\t', '\n', '\r') and 2199 * looks ahead not white space character. 2200 * 2201 * @return The first not white space look ahead character. 2202 * @exception IOException 2203 */ 2204 protected char wsskip() 2205 throws IOException { 2206 char ch; 2207 while (true) { 2208 // Read next character 2209 ch = (mChIdx < mChLen) ? 
mChars[mChIdx++] : getch(); 2210 if (ch < 0x80) { 2211 if (nmttyp[ch] != 3) // [ \t\n\r] 2212 { 2213 break; 2214 } 2215 } else { 2216 break; 2217 } 2218 } 2219 mChIdx--; // bkch(); 2220 return ch; 2221 } 2222 2223 /** 2224 * Reports document type. 2225 * 2226 * @param name The name of the entity. 2227 * @param pubid The public identifier of the entity or <code>null</code>. 2228 * @param sysid The system identifier of the entity or <code>null</code>. 2229 */ 2230 protected abstract void docType(String name, String pubid, String sysid) 2231 throws SAXException; 2232 2233 /** 2234 * Reports the start of DTD internal subset. 2235 * 2236 * @throws SAXException if the receiver throws SAXException 2237 */ 2238 public abstract void startInternalSub () throws SAXException; 2239 2240 /** 2241 * Reports a comment. 2242 * 2243 * @param text The comment text starting from first charcater. 2244 * @param length The number of characters in comment. 2245 */ 2246 protected abstract void comm(char[] text, int length); 2247 2248 /** 2249 * Reports a processing instruction. 2250 * 2251 * @param target The processing instruction target name. 2252 * @param body The processing instruction body text. 2253 */ 2254 protected abstract void pi(String target, String body) 2255 throws Exception; 2256 2257 /** 2258 * Reports new namespace prefix. The Namespace prefix ( 2259 * <code>mPref.name</code>) being declared and the Namespace URI ( 2260 * <code>mPref.value</code>) the prefix is mapped to. An empty string is 2261 * used for the default element namespace, which has no prefix. 2262 */ 2263 protected abstract void newPrefix() 2264 throws Exception; 2265 2266 /** 2267 * Reports skipped entity name. 2268 * 2269 * @param name The entity name. 2270 */ 2271 protected abstract void skippedEnt(String name) 2272 throws Exception; 2273 2274 /** 2275 * Returns an 2276 * <code>InputSource</code> for specified entity or 2277 * <code>null</code>. 2278 * 2279 * @param name The name of the entity. 
* @param pubid The public identifier of the entity.
 * @param sysid The system identifier of the entity.
 * @return The input source of the entity, or <code>null</code> when the
 * entity is not resolved.
 */
protected abstract InputSource resolveEnt(
    String name, String pubid, String sysid)
    throws Exception;

/**
 * Reports notation declaration.
 *
 * @param name The notation's name.
 * @param pubid The notation's public identifier, or null if none was given.
 * @param sysid The notation's system identifier, or null if none was given.
 */
protected abstract void notDecl(String name, String pubid, String sysid)
    throws Exception;

/**
 * Reports unparsed entity name.
 *
 * @param name The unparsed entity's name.
 * @param pubid The entity's public identifier, or null if none was given.
 * @param sysid The entity's system identifier.
 * @param notation The name of the associated notation.
 */
protected abstract void unparsedEntDecl(
    String name, String pubid, String sysid, String notation)
    throws Exception;

/**
 * Notifies the handler about fatal parsing error.
 *
 * NOTE(review): callers in this class follow a panic call with fallback
 * code such as <code>return null</code>, so implementations are
 * presumably expected to throw - confirm against the implementations.
 *
 * @param msg The problem description message.
 */
protected abstract void panic(String msg)
    throws Exception;

/**
 * Reads a qualified xml name.
 *
 * This is a low level routine which leaves a qName in the buffer. The
 * characters of a qualified name is an array of characters. The first
 * (chars[0]) character is the index of the colon character which separates
 * the prefix from the local name. If the index is zero, the name does not
 * contain separator or the parser works in the namespace unaware mode. The
 * length of qualified name is the length of the array minus one.
 *
 * @param ns The true value turns namespace conformance on.
 * @exception Exception is parser specific exception from the panic method.
* @exception IOException
 */
private void bname(boolean ns)
    throws Exception {
    char ch;
    char type;
    mBuffIdx++;  // allocate a char for colon offset
    int bqname = mBuffIdx;     // buffer index of the colon offset slot
    int bcolon = bqname;       // buffer index of the colon; bqname means "no colon"
    int bchidx = bqname + 1;   // buffer index for the next name character
    int bstart = bchidx;       // buffer index passed to the next bcopy call
    int cstart = mChIdx;       // input index passed to the next bcopy call
    // States 0/1 read the prefix, states 2/3 read the suffix (local name);
    // without namespace conformance the whole name is read as a suffix.
    short st = (short) ((ns == true) ? 0 : 2);
    while (true) {
        // Read next character
        if (mChIdx >= mChLen) {
            // The input chunk is exhausted: hand the pending characters to
            // bcopy, then let getch() supply more input and un-read the
            // character it returned.
            bcopy(cstart, bstart);
            getch();
            mChIdx--;  // bkch();
            cstart = mChIdx;
            bstart = bchidx;
        }
        ch = mChars[mChIdx++];
        type = (char) 0;  // [X]
        if (ch < 0x80) {
            type = (char) nmttyp[ch];
        } else if (ch == EOS) {
            panic(FAULT);
        }
        // Parse QName
        switch (st) {
            case 0:     // read the first char of the prefix
            case 2:     // read the first char of the suffix
                switch (type) {
                    case 0:  // [aA_X]
                        bchidx++;  // append char to the buffer
                        st++;      // (st == 0)? 1: 3;
                        break;

                    case 1:  // [:]
                        // leave the colon for the next state to handle
                        mChIdx--;  // bkch();
                        st++;      // (st == 0)? 1: 3;
                        break;

                    default:
                        panic(FAULT);
                }
                break;

            case 1:     // read the prefix
            case 3:     // read the suffix
                switch (type) {
                    case 0:  // [aA_X]
                    case 2:  // [.-d]
                        bchidx++;  // append char to the buffer
                        break;

                    case 1:  // [:]
                        bchidx++;  // append char to the buffer
                        if (ns == true) {
                            if (bcolon != bqname) {
                                panic(FAULT);  // it must be only one colon
                            }
                            bcolon = bchidx - 1;
                            if (st == 1) {
                                st = 2;
                            }
                        }
                        break;

                    default:
                        // End of the name: un-read the terminator, flush the
                        // pending characters and store the colon offset.
                        mChIdx--;  // bkch();
                        bcopy(cstart, bstart);
                        mBuff[bqname] = (char) (bcolon - bqname);
                        return;
                }
                break;

            default:
                panic(FAULT);
        }
    }
}

/**
 * Reads a nmtoken.
 *
 * This is a low level routine which leaves a nmtoken in the buffer.
2417 * 2418 * @exception Exception is parser specific exception form panic method. 2419 * @exception IOException 2420 */ 2421 @SuppressWarnings("fallthrough") 2422 private void bntok() throws Exception { 2423 char ch; 2424 mBuffIdx = -1; 2425 bappend((char) 0); // default offset to the colon char 2426 while (true) { 2427 ch = getch(); 2428 switch (chtyp(ch)) { 2429 case 'a': 2430 case 'A': 2431 case 'd': 2432 case '.': 2433 case ':': 2434 case '-': 2435 case '_': 2436 case 'X': 2437 bappend(ch); 2438 break; 2439 2440 case 'Z': 2441 panic(FAULT); 2442 2443 default: 2444 bkch(); 2445 return; 2446 } 2447 } 2448 } 2449 2450 /** 2451 * Recognizes a keyword. 2452 * 2453 * This is low level routine which recognizes one of keywords in the buffer. 2454 * Keyword Id ID - i IDREF - r IDREFS - R ENTITY - n ENTITIES - N NMTOKEN - 2455 * t NMTOKENS - T ELEMENT - e ATTLIST - a NOTATION - o CDATA - c REQUIRED - 2456 * Q IMPLIED - I FIXED - F 2457 * 2458 * @return an id of a keyword or '?'. 2459 * @exception Exception is parser specific exception form panic method. 2460 * @exception IOException 2461 */ 2462 private char bkeyword() 2463 throws Exception { 2464 String str = new String(mBuff, 1, mBuffIdx); 2465 switch (str.length()) { 2466 case 2: // ID 2467 return ("ID".equals(str) == true) ? 'i' : '?'; 2468 2469 case 5: // IDREF, CDATA, FIXED 2470 switch (mBuff[1]) { 2471 case 'I': 2472 return ("IDREF".equals(str) == true) ? 'r' : '?'; 2473 case 'C': 2474 return ("CDATA".equals(str) == true) ? 'c' : '?'; 2475 case 'F': 2476 return ("FIXED".equals(str) == true) ? 'F' : '?'; 2477 default: 2478 break; 2479 } 2480 break; 2481 2482 case 6: // IDREFS, ENTITY 2483 switch (mBuff[1]) { 2484 case 'I': 2485 return ("IDREFS".equals(str) == true) ? 'R' : '?'; 2486 case 'E': 2487 return ("ENTITY".equals(str) == true) ? 
'n' : '?'; 2488 default: 2489 break; 2490 } 2491 break; 2492 2493 case 7: // NMTOKEN, IMPLIED, ATTLIST, ELEMENT 2494 switch (mBuff[1]) { 2495 case 'I': 2496 return ("IMPLIED".equals(str) == true) ? 'I' : '?'; 2497 case 'N': 2498 return ("NMTOKEN".equals(str) == true) ? 't' : '?'; 2499 case 'A': 2500 return ("ATTLIST".equals(str) == true) ? 'a' : '?'; 2501 case 'E': 2502 return ("ELEMENT".equals(str) == true) ? 'e' : '?'; 2503 default: 2504 break; 2505 } 2506 break; 2507 2508 case 8: // ENTITIES, NMTOKENS, NOTATION, REQUIRED 2509 switch (mBuff[2]) { 2510 case 'N': 2511 return ("ENTITIES".equals(str) == true) ? 'N' : '?'; 2512 case 'M': 2513 return ("NMTOKENS".equals(str) == true) ? 'T' : '?'; 2514 case 'O': 2515 return ("NOTATION".equals(str) == true) ? 'o' : '?'; 2516 case 'E': 2517 return ("REQUIRED".equals(str) == true) ? 'Q' : '?'; 2518 default: 2519 break; 2520 } 2521 break; 2522 2523 default: 2524 break; 2525 } 2526 return '?'; 2527 } 2528 2529 /** 2530 * Reads a single or double quotted string in to the buffer. 2531 * 2532 * This method resolves entities inside a string unless the parser parses 2533 * DTD. 2534 * 2535 * @param flag 'c' - CDATA, 'i' - non CDATA, ' ' - no normalization; '-' - 2536 * not an attribute value; 'd' - in DTD context. 2537 * @exception Exception is parser specific exception form panic method. 2538 * @exception IOException 2539 */ 2540 @SuppressWarnings("fallthrough") 2541 private void bqstr(char flag) throws Exception { 2542 Input inp = mInp; // remember the original input 2543 mBuffIdx = -1; 2544 bappend((char) 0); // default offset to the colon char 2545 char ch; 2546 for (short st = 0; st >= 0;) { 2547 ch = (mChIdx < mChLen) ? 
mChars[mChIdx++] : getch(); 2548 switch (st) { 2549 case 0: // read a single or double quote 2550 switch (ch) { 2551 case ' ': 2552 case '\n': 2553 case '\r': 2554 case '\t': 2555 break; 2556 2557 case '\'': 2558 st = 2; // read a single quoted string 2559 break; 2560 2561 case '\"': 2562 st = 3; // read a double quoted string 2563 break; 2564 2565 default: 2566 panic(FAULT); 2567 break; 2568 } 2569 break; 2570 2571 case 2: // read a single quoted string 2572 case 3: // read a double quoted string 2573 switch (ch) { 2574 case '\'': 2575 if ((st == 2) && (mInp == inp)) { 2576 st = -1; 2577 } else { 2578 bappend(ch); 2579 } 2580 break; 2581 2582 case '\"': 2583 if ((st == 3) && (mInp == inp)) { 2584 st = -1; 2585 } else { 2586 bappend(ch); 2587 } 2588 break; 2589 2590 case '&': 2591 if (flag != 'd') { 2592 ent(flag); 2593 } else { 2594 bappend(ch); 2595 } 2596 break; 2597 2598 case '%': 2599 if (flag == 'd') { 2600 pent('-'); 2601 } else { 2602 bappend(ch); 2603 } 2604 break; 2605 2606 case '<': 2607 if ((flag == '-') || (flag == 'd')) { 2608 bappend(ch); 2609 } else { 2610 panic(FAULT); 2611 } 2612 break; 2613 2614 case EOS: // EOS before single/double quote 2615 panic(FAULT); 2616 2617 case '\r': // EOL processing [#2.11 & #3.3.3] 2618 if (flag != ' ' && mInp.next == null) { 2619 if (getch() != '\n') { 2620 bkch(); 2621 } 2622 ch = '\n'; 2623 } 2624 default: 2625 bappend(ch, flag); 2626 break; 2627 } 2628 break; 2629 2630 default: 2631 panic(FAULT); 2632 } 2633 } 2634 // There is maximum one space at the end of the string in 2635 // i-mode (non CDATA normalization) and it has to be removed. 2636 if ((flag == 'i') && (mBuff[mBuffIdx] == ' ')) { 2637 mBuffIdx -= 1; 2638 } 2639 } 2640 2641 /** 2642 * Reports characters and empties the parser's buffer. This method is called 2643 * only if parser is going to return control to the main loop. 
This means 2644 * that this method may use parser buffer to report white space without 2645 * copeing characters to temporary buffer. 2646 */ 2647 protected abstract void bflash() 2648 throws Exception; 2649 2650 /** 2651 * Reports white space characters and empties the parser's buffer. This 2652 * method is called only if parser is going to return control to the main 2653 * loop. This means that this method may use parser buffer to report white 2654 * space without copeing characters to temporary buffer. 2655 */ 2656 protected abstract void bflash_ws() 2657 throws Exception; 2658 2659 /** 2660 * Appends a character to parser's buffer with normalization. 2661 * 2662 * @param ch The character to append to the buffer. 2663 * @param mode The normalization mode. 2664 */ 2665 private void bappend(char ch, char mode) { 2666 // This implements attribute value normalization as 2667 // described in the XML specification [#3.3.3]. 2668 switch (mode) { 2669 case 'i': // non CDATA normalization 2670 switch (ch) { 2671 case ' ': 2672 case '\n': 2673 case '\r': 2674 case '\t': 2675 if ((mBuffIdx > 0) && (mBuff[mBuffIdx] != ' ')) { 2676 bappend(' '); 2677 } 2678 return; 2679 2680 default: 2681 break; 2682 } 2683 break; 2684 2685 case 'c': // CDATA normalization 2686 switch (ch) { 2687 case '\n': 2688 case '\r': 2689 case '\t': 2690 ch = ' '; 2691 break; 2692 2693 default: 2694 break; 2695 } 2696 break; 2697 2698 default: // no normalization 2699 break; 2700 } 2701 mBuffIdx++; 2702 if (mBuffIdx < mBuff.length) { 2703 mBuff[mBuffIdx] = ch; 2704 } else { 2705 mBuffIdx--; 2706 bappend(ch); 2707 } 2708 } 2709 2710 /** 2711 * Appends a character to parser's buffer. 2712 * 2713 * @param ch The character to append to the buffer. 
2714 */ 2715 private void bappend(char ch) { 2716 try { 2717 mBuff[++mBuffIdx] = ch; 2718 } catch (Exception exp) { 2719 // Double the buffer size 2720 char buff[] = new char[mBuff.length << 1]; 2721 System.arraycopy(mBuff, 0, buff, 0, mBuff.length); 2722 mBuff = buff; 2723 mBuff[mBuffIdx] = ch; 2724 } 2725 } 2726 2727 /** 2728 * Appends (mChIdx - cidx) characters from character buffer (mChars) to 2729 * parser's buffer (mBuff). 2730 * 2731 * @param cidx The character buffer (mChars) start index. 2732 * @param bidx The parser buffer (mBuff) start index. 2733 */ 2734 private void bcopy(int cidx, int bidx) { 2735 int length = mChIdx - cidx; 2736 if ((bidx + length + 1) >= mBuff.length) { 2737 // Expand the buffer 2738 char buff[] = new char[mBuff.length + length]; 2739 System.arraycopy(mBuff, 0, buff, 0, mBuff.length); 2740 mBuff = buff; 2741 } 2742 System.arraycopy(mChars, cidx, mBuff, bidx, length); 2743 mBuffIdx += length; 2744 } 2745 2746 /** 2747 * Recognizes the built-in entities <i>lt</i>, <i>gt</i>, <i>amp</i>, 2748 * <i>apos</i>, <i>quot</i>. The initial state is 0x100. Any state belowe 2749 * 0x100 is a built-in entity replacement character. 2750 * 2751 * @param ch the next character of an entity name. 2752 */ 2753 @SuppressWarnings("fallthrough") 2754 private void eappend(char ch) { 2755 switch (mESt) { 2756 case 0x100: // "l" or "g" or "a" or "q" 2757 switch (ch) { 2758 case 'l': 2759 mESt = 0x101; 2760 break; 2761 case 'g': 2762 mESt = 0x102; 2763 break; 2764 case 'a': 2765 mESt = 0x103; 2766 break; 2767 case 'q': 2768 mESt = 0x107; 2769 break; 2770 default: 2771 mESt = 0x200; 2772 break; 2773 } 2774 break; 2775 2776 case 0x101: // "lt" 2777 mESt = (ch == 't') ? '<' : (char) 0x200; 2778 break; 2779 2780 case 0x102: // "gt" 2781 mESt = (ch == 't') ? 
'>' : (char) 0x200; 2782 break; 2783 2784 case 0x103: // "am" or "ap" 2785 switch (ch) { 2786 case 'm': 2787 mESt = 0x104; 2788 break; 2789 case 'p': 2790 mESt = 0x105; 2791 break; 2792 default: 2793 mESt = 0x200; 2794 break; 2795 } 2796 break; 2797 2798 case 0x104: // "amp" 2799 mESt = (ch == 'p') ? '&' : (char) 0x200; 2800 break; 2801 2802 case 0x105: // "apo" 2803 mESt = (ch == 'o') ? (char) 0x106 : (char) 0x200; 2804 break; 2805 2806 case 0x106: // "apos" 2807 mESt = (ch == 's') ? '\'' : (char) 0x200; 2808 break; 2809 2810 case 0x107: // "qu" 2811 mESt = (ch == 'u') ? (char) 0x108 : (char) 0x200; 2812 break; 2813 2814 case 0x108: // "quo" 2815 mESt = (ch == 'o') ? (char) 0x109 : (char) 0x200; 2816 break; 2817 2818 case 0x109: // "quot" 2819 mESt = (ch == 't') ? '\"' : (char) 0x200; 2820 break; 2821 2822 case '<': // "lt" 2823 case '>': // "gt" 2824 case '&': // "amp" 2825 case '\'': // "apos" 2826 case '\"': // "quot" 2827 mESt = 0x200; 2828 default: 2829 break; 2830 } 2831 } 2832 2833 /** 2834 * Sets up a new input source on the top of the input stack. Note, the first 2835 * byte returned by the entity's byte stream has to be the first byte in the 2836 * entity. However, the parser does not expect the byte order mask in both 2837 * cases when encoding is provided by the input source. 2838 * 2839 * @param is A new input source to set up. 2840 * @exception IOException If any IO errors occur. 2841 * @exception Exception is parser specific exception form panic method. 2842 */ 2843 protected void setinp(InputSource is) 2844 throws Exception { 2845 Reader reader = null; 2846 mChIdx = 0; 2847 mChLen = 0; 2848 mChars = mInp.chars; 2849 mInp.src = null; 2850 if (mPh < PH_DOC_START) { 2851 mIsSAlone = false; // default [#2.9] 2852 } 2853 mIsSAloneSet = false; 2854 if (is.getCharacterStream() != null) { 2855 // Ignore encoding in the xml text decl. 
2856 reader = is.getCharacterStream(); 2857 xml(reader); 2858 } else if (is.getByteStream() != null) { 2859 String expenc; 2860 if (is.getEncoding() != null) { 2861 // Ignore encoding in the xml text decl. 2862 expenc = is.getEncoding().toUpperCase(); 2863 if (expenc.equals("UTF-16")) { 2864 reader = bom(is.getByteStream(), 'U'); // UTF-16 [#4.3.3] 2865 } else { 2866 reader = enc(expenc, is.getByteStream()); 2867 } 2868 xml(reader); 2869 } else { 2870 // Get encoding from BOM or the xml text decl. 2871 reader = bom(is.getByteStream(), ' '); 2872 if (reader == null) { 2873 // Encoding is defined by the xml text decl. 2874 reader = enc("UTF-8", is.getByteStream()); 2875 expenc = xml(reader); 2876 if (expenc.startsWith("UTF-16")) { 2877 panic(FAULT); // UTF-16 must have BOM [#4.3.3] 2878 } 2879 reader = enc(expenc, is.getByteStream()); 2880 } else { 2881 // Encoding is defined by the BOM. 2882 xml(reader); 2883 } 2884 } 2885 } else { 2886 // There is no support for public/system identifiers. 2887 panic(FAULT); 2888 } 2889 mInp.src = reader; 2890 mInp.pubid = is.getPublicId(); 2891 mInp.sysid = is.getSystemId(); 2892 } 2893 2894 /** 2895 * Determines the entity encoding. 2896 * 2897 * This method gets encoding from Byte Order Mask [#4.3.3] if any. Note, the 2898 * first byte returned by the entity's byte stream has to be the first byte 2899 * in the entity. Also, there is no support for UCS-4. 2900 * 2901 * @param is A byte stream of the entity. 2902 * @param hint An encoding hint, character U means UTF-16. 2903 * @return a reader constructed from the BOM or UTF-8 by default. 2904 * @exception Exception is parser specific exception form panic method. 
2905 * @exception IOException 2906 */ 2907 private Reader bom(InputStream is, char hint) 2908 throws Exception { 2909 int val = is.read(); 2910 switch (val) { 2911 case 0xef: // UTF-8 2912 if (hint == 'U') // must be UTF-16 2913 { 2914 panic(FAULT); 2915 } 2916 if (is.read() != 0xbb) { 2917 panic(FAULT); 2918 } 2919 if (is.read() != 0xbf) { 2920 panic(FAULT); 2921 } 2922 return new ReaderUTF8(is); 2923 2924 case 0xfe: // UTF-16, big-endian 2925 if (is.read() != 0xff) { 2926 panic(FAULT); 2927 } 2928 return new ReaderUTF16(is, 'b'); 2929 2930 case 0xff: // UTF-16, little-endian 2931 if (is.read() != 0xfe) { 2932 panic(FAULT); 2933 } 2934 return new ReaderUTF16(is, 'l'); 2935 2936 case -1: 2937 mChars[mChIdx++] = EOS; 2938 return new ReaderUTF8(is); 2939 2940 default: 2941 if (hint == 'U') // must be UTF-16 2942 { 2943 panic(FAULT); 2944 } 2945 // Read the rest of UTF-8 character 2946 switch (val & 0xf0) { 2947 case 0xc0: 2948 case 0xd0: 2949 mChars[mChIdx++] = (char) (((val & 0x1f) << 6) | (is.read() & 0x3f)); 2950 break; 2951 2952 case 0xe0: 2953 mChars[mChIdx++] = (char) (((val & 0x0f) << 12) 2954 | ((is.read() & 0x3f) << 6) | (is.read() & 0x3f)); 2955 break; 2956 2957 case 0xf0: // UCS-4 character 2958 throw new UnsupportedEncodingException(); 2959 2960 default: 2961 mChars[mChIdx++] = (char) val; 2962 break; 2963 } 2964 return null; 2965 } 2966 } 2967 2968 /** 2969 * Parses the xml text declaration. 2970 * 2971 * This method gets encoding from the xml text declaration [#4.3.1] if any. 2972 * The method assumes the buffer (mChars) is big enough to accommodate whole 2973 * xml text declaration. 2974 * 2975 * @param reader is entity reader. 2976 * @return The xml text declaration encoding or default UTF-8 encoding. 2977 * @exception Exception is parser specific exception form panic method. 
2978 * @exception IOException 2979 */ 2980 private String xml(Reader reader) 2981 throws Exception { 2982 String str = null; 2983 String enc = "UTF-8"; 2984 char ch; 2985 int val; 2986 short st; 2987 // Read the xml text declaration into the buffer 2988 if (mChIdx != 0) { 2989 // The bom method have read ONE char into the buffer. 2990 st = (short) ((mChars[0] == '<') ? 1 : -1); 2991 } else { 2992 st = 0; 2993 } 2994 while (st >= 0 && mChIdx < mChars.length) { 2995 ch = ((val = reader.read()) >= 0) ? (char) val : EOS; 2996 mChars[mChIdx++] = ch; 2997 switch (st) { 2998 case 0: // read '<' of xml declaration 2999 switch (ch) { 3000 case '<': 3001 st = 1; 3002 break; 3003 3004 case 0xfeff: // the byte order mask 3005 ch = ((val = reader.read()) >= 0) ? (char) val : EOS; 3006 mChars[mChIdx - 1] = ch; 3007 st = (short) ((ch == '<') ? 1 : -1); 3008 break; 3009 3010 default: 3011 st = -1; 3012 break; 3013 } 3014 break; 3015 3016 case 1: // read '?' of xml declaration [#4.3.1] 3017 st = (short) ((ch == '?') ? 2 : -1); 3018 break; 3019 3020 case 2: // read 'x' of xml declaration [#4.3.1] 3021 st = (short) ((ch == 'x') ? 3 : -1); 3022 break; 3023 3024 case 3: // read 'm' of xml declaration [#4.3.1] 3025 st = (short) ((ch == 'm') ? 4 : -1); 3026 break; 3027 3028 case 4: // read 'l' of xml declaration [#4.3.1] 3029 st = (short) ((ch == 'l') ? 5 : -1); 3030 break; 3031 3032 case 5: // read white space after 'xml' 3033 switch (ch) { 3034 case ' ': 3035 case '\t': 3036 case '\r': 3037 case '\n': 3038 st = 6; 3039 break; 3040 3041 default: 3042 st = -1; 3043 break; 3044 } 3045 break; 3046 3047 case 6: // read content of xml declaration 3048 switch (ch) { 3049 case '?': 3050 st = 7; 3051 break; 3052 3053 case EOS: 3054 st = -2; 3055 break; 3056 3057 default: 3058 break; 3059 } 3060 break; 3061 3062 case 7: // read '>' after '?' 
of xml declaration 3063 switch (ch) { 3064 case '>': 3065 case EOS: 3066 st = -2; 3067 break; 3068 3069 default: 3070 st = 6; 3071 break; 3072 } 3073 break; 3074 3075 default: 3076 panic(FAULT); 3077 break; 3078 } 3079 } 3080 mChLen = mChIdx; 3081 mChIdx = 0; 3082 // If there is no xml text declaration, the encoding is default. 3083 if (st == -1) { 3084 return enc; 3085 } 3086 mChIdx = 5; // the first white space after "<?xml" 3087 // Parse the xml text declaration 3088 for (st = 0; st >= 0;) { 3089 ch = getch(); 3090 switch (st) { 3091 case 0: // skip spaces after the xml declaration name 3092 if (chtyp(ch) != ' ') { 3093 bkch(); 3094 st = 1; 3095 } 3096 break; 3097 3098 case 1: // read xml declaration version 3099 case 2: // read xml declaration encoding or standalone 3100 case 3: // read xml declaration standalone 3101 switch (chtyp(ch)) { 3102 case 'a': 3103 case 'A': 3104 case '_': 3105 bkch(); 3106 str = name(false).toLowerCase(); 3107 if ("version".equals(str) == true) { 3108 if (st != 1) { 3109 panic(FAULT); 3110 } 3111 if ("1.0".equals(eqstr('=')) != true) { 3112 panic(FAULT); 3113 } 3114 mInp.xmlver = 0x0100; 3115 st = 2; 3116 } else if ("encoding".equals(str) == true) { 3117 if (st != 2) { 3118 panic(FAULT); 3119 } 3120 mInp.xmlenc = eqstr('=').toUpperCase(); 3121 enc = mInp.xmlenc; 3122 st = 3; 3123 } else if ("standalone".equals(str) == true) { 3124 if ((st == 1) || (mPh >= PH_DOC_START)) // [#4.3.1] 3125 { 3126 panic(FAULT); 3127 } 3128 str = eqstr('=').toLowerCase(); 3129 // Check the 'standalone' value and use it [#5.1] 3130 if (str.equals("yes") == true) { 3131 mIsSAlone = true; 3132 } else if (str.equals("no") == true) { 3133 mIsSAlone = false; 3134 } else { 3135 panic(FAULT); 3136 } 3137 mIsSAloneSet = true; 3138 st = 4; 3139 } else { 3140 panic(FAULT); 3141 } 3142 break; 3143 3144 case ' ': 3145 break; 3146 3147 case '?': 3148 if (st == 1) { 3149 panic(FAULT); 3150 } 3151 bkch(); 3152 st = 4; 3153 break; 3154 3155 default: 3156 panic(FAULT); 
3157 } 3158 break; 3159 3160 case 4: // end of xml declaration 3161 switch (chtyp(ch)) { 3162 case '?': 3163 if (getch() != '>') { 3164 panic(FAULT); 3165 } 3166 if (mPh <= PH_DOC_START) { 3167 mPh = PH_MISC_DTD; // misc before DTD 3168 } 3169 st = -1; 3170 break; 3171 3172 case ' ': 3173 break; 3174 3175 default: 3176 panic(FAULT); 3177 } 3178 break; 3179 3180 default: 3181 panic(FAULT); 3182 } 3183 } 3184 return enc; 3185 } 3186 3187 /** 3188 * Sets up the document reader. 3189 * 3190 * @param name an encoding name. 3191 * @param is the document byte input stream. 3192 * @return a reader constructed from encoding name and input stream. 3193 * @exception UnsupportedEncodingException 3194 */ 3195 private Reader enc(String name, InputStream is) 3196 throws UnsupportedEncodingException { 3197 // DO NOT CLOSE current reader if any! 3198 if (name.equals("UTF-8")) { 3199 return new ReaderUTF8(is); 3200 } else if (name.equals("UTF-16LE")) { 3201 return new ReaderUTF16(is, 'l'); 3202 } else if (name.equals("UTF-16BE")) { 3203 return new ReaderUTF16(is, 'b'); 3204 } else { 3205 return new InputStreamReader(is, name); 3206 } 3207 } 3208 3209 /** 3210 * Sets up current input on the top of the input stack. 3211 * 3212 * @param inp A new input to set up. 3213 */ 3214 protected void push(Input inp) { 3215 mInp.chLen = mChLen; 3216 mInp.chIdx = mChIdx; 3217 inp.next = mInp; 3218 mInp = inp; 3219 mChars = inp.chars; 3220 mChLen = inp.chLen; 3221 mChIdx = inp.chIdx; 3222 } 3223 3224 /** 3225 * Restores previous input on the top of the input stack. 3226 */ 3227 protected void pop() { 3228 if (mInp.src != null) { 3229 try { 3230 mInp.src.close(); 3231 } catch (IOException ioe) { 3232 } 3233 mInp.src = null; 3234 } 3235 mInp = mInp.next; 3236 if (mInp != null) { 3237 mChars = mInp.chars; 3238 mChLen = mInp.chLen; 3239 mChIdx = mInp.chIdx; 3240 } else { 3241 mChars = null; 3242 mChLen = 0; 3243 mChIdx = 0; 3244 } 3245 } 3246 3247 /** 3248 * Maps a character to it's type. 
3249 * 3250 * Possible character type values are:<br /> - ' ' for any kind of white 3251 * space character;<br /> - 'a' for any lower case alphabetical character 3252 * value;<br /> - 'A' for any upper case alphabetical character value;<br /> 3253 * - 'd' for any decimal digit character value;<br /> - 'z' for any 3254 * character less then ' ' except '\t', '\n', '\r';<br /> - 'X' for any not 3255 * ASCII character;<br /> - 'Z' for EOS character.<br /> An ASCII (7 bit) 3256 * character which does not fall in any category listed above is mapped to 3257 * it self. 3258 * 3259 * @param ch The character to map. 3260 * @return The type of character. 3261 */ 3262 protected char chtyp(char ch) { 3263 if (ch < 0x80) { 3264 return (char) asctyp[ch]; 3265 } 3266 return (ch != EOS) ? 'X' : 'Z'; 3267 } 3268 3269 /** 3270 * Retrives the next character in the document. 3271 * 3272 * @return The next character in the document. 3273 */ 3274 protected char getch() 3275 throws IOException { 3276 if (mChIdx >= mChLen) { 3277 if (mInp.src == null) { 3278 pop(); // remove internal entity 3279 return getch(); 3280 } 3281 // Read new portion of the document characters 3282 int Num = mInp.src.read(mChars, 0, mChars.length); 3283 if (Num < 0) { 3284 if (mInp != mDoc) { 3285 pop(); // restore the previous input 3286 return getch(); 3287 } else { 3288 mChars[0] = EOS; 3289 mChLen = 1; 3290 } 3291 } else { 3292 mChLen = Num; 3293 } 3294 mChIdx = 0; 3295 } 3296 return mChars[mChIdx++]; 3297 } 3298 3299 /** 3300 * Puts back the last read character. 3301 * 3302 * This method <strong>MUST NOT</strong> be called more then once after each 3303 * call of {@link #getch getch} method. 3304 */ 3305 protected void bkch() 3306 throws Exception { 3307 if (mChIdx <= 0) { 3308 panic(FAULT); 3309 } 3310 mChIdx--; 3311 } 3312 3313 /** 3314 * Sets the current character. 3315 * 3316 * @param ch The character to set. 
3317 */ 3318 protected void setch(char ch) { 3319 mChars[mChIdx] = ch; 3320 } 3321 3322 /** 3323 * Finds a pair in the pair chain by a qualified name. 3324 * 3325 * @param chain The first element of the chain of pairs. 3326 * @param qname The qualified name. 3327 * @return A pair with the specified qualified name or null. 3328 */ 3329 protected Pair find(Pair chain, char[] qname) { 3330 for (Pair pair = chain; pair != null; pair = pair.next) { 3331 if (pair.eqname(qname) == true) { 3332 return pair; 3333 } 3334 } 3335 return null; 3336 } 3337 3338 /** 3339 * Provedes an instance of a pair. 3340 * 3341 * @param next The reference to a next pair. 3342 * @return An instance of a pair. 3343 */ 3344 protected Pair pair(Pair next) { 3345 Pair pair; 3346 3347 if (mDltd != null) { 3348 pair = mDltd; 3349 mDltd = pair.next; 3350 } else { 3351 pair = new Pair(); 3352 } 3353 pair.next = next; 3354 3355 return pair; 3356 } 3357 3358 /** 3359 * Deletes an instance of a pair. 3360 * 3361 * @param pair The pair to delete. 3362 * @return A reference to the next pair in a chain. 3363 */ 3364 protected Pair del(Pair pair) { 3365 Pair next = pair.next; 3366 3367 pair.name = null; 3368 pair.value = null; 3369 pair.chars = null; 3370 pair.list = null; 3371 pair.next = mDltd; 3372 mDltd = pair; 3373 3374 return next; 3375 } 3376 }