1 /* 2 * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package jdk.internal.util.xml.impl; 27 28 import java.io.IOException; 29 import java.io.InputStream; 30 import java.io.InputStreamReader; 31 import java.io.Reader; 32 import java.io.UnsupportedEncodingException; 33 import java.util.HashMap; 34 import java.util.Map; 35 import jdk.internal.org.xml.sax.InputSource; 36 import jdk.internal.org.xml.sax.SAXException; 37 38 /** 39 * XML non-validating parser engine. 40 */ 41 public abstract class Parser { 42 43 public final static String FAULT = ""; 44 protected final static int BUFFSIZE_READER = 512; 45 protected final static int BUFFSIZE_PARSER = 128; 46 /** 47 * The end of stream character. 
     */
    public final static char EOS = 0xffff;
    private Pair mNoNS; // there is no namespace
    private Pair mXml; // the xml namespace
    private Map<String, Input> mEnt; // the entities look up table
    private Map<String, Input> mPEnt; // the parameter entities look up table
    protected boolean mIsSAlone; // xml decl standalone flag
    protected boolean mIsSAloneSet; // standalone is explicitly set
    protected boolean mIsNSAware; // if true - namespace aware mode
    protected int mPh; // current phase of document processing
    protected final static int PH_BEFORE_DOC = -1; // before parsing
    protected final static int PH_DOC_START = 0; // document start
    protected final static int PH_MISC_DTD = 1; // misc before DTD
    protected final static int PH_DTD = 2; // DTD
    protected final static int PH_DTD_MISC = 3; // misc after DTD
    protected final static int PH_DOCELM = 4; // document's element
    protected final static int PH_DOCELM_MISC = 5; // misc after element
    protected final static int PH_AFTER_DOC = 6; // after parsing
    protected int mEvt; // current event type
    protected final static int EV_NULL = 0; // unknown
    protected final static int EV_ELM = 1; // empty element
    protected final static int EV_ELMS = 2; // start element
    protected final static int EV_ELME = 3; // end element
    protected final static int EV_TEXT = 4; // textual content
    protected final static int EV_WSPC = 5; // white space content
    protected final static int EV_PI = 6; // processing instruction
    protected final static int EV_CDAT = 7; // character data
    protected final static int EV_COMM = 8; // comment
    protected final static int EV_DTD = 9; // document type definition
    protected final static int EV_ENT = 10; // skipped entity
    private char mESt; // built-in entity recognizer state
    // mESt values:
    //   0x100   : the initial state
    //   > 0x100 : unrecognized name
    //   < 0x100 : replacement character
    protected char[] mBuff; // parser buffer
    protected int mBuffIdx; // index of the last char
    protected Pair mPref; // stack of prefixes
    protected Pair mElm; // stack of elements
    // mAttL.chars - element qname
    // mAttL.next - next element
    // mAttL.list - list of attributes defined on this element
    // mAttL.list.chars - attribute qname
    // mAttL.list.id - a char representing attribute's type see below
    // mAttL.list.next - next attribute defined on the element
    // mAttL.list.list - default value structure or null
    // mAttL.list.list.chars - "name='value' " chars array for Input
    //
    // Attribute type character values:
    // 'i' - "ID"
    // 'r' - "IDREF"
    // 'R' - "IDREFS"
    // 'n' - "ENTITY"
    // 'N' - "ENTITIES"
    // 't' - "NMTOKEN"
    // 'T' - "NMTOKENS"
    // 'u' - enumeration type
    // 'o' - "NOTATION"
    // 'c' - "CDATA"
    // see also: bkeyword() and atype()
    //
    protected Pair mAttL; // list of defined attrs by element name
    protected Input mDoc; // document entity
    protected Input mInp; // stack of entities
    private char[] mChars; // reading buffer
    private int mChLen; // current capacity
    private int mChIdx; // index to the next char
    protected Attrs mAttrs; // attributes of the curr. element
    private String[] mItems; // attributes array of the curr. element
    private char mAttrIdx; // attributes counter/index
    private String mUnent; // unresolved entity name
    private Pair mDltd; // deleted objects for reuse
    /**
     * Default prefixes.
     *
     * In each array the element at index 0 is set to the array's own length
     * and the remaining elements hold the prefix characters.
     */
    private final static char NONS[];
    private final static char XML[];
    private final static char XMLNS[];

    static {
        NONS = new char[1];
        NONS[0] = (char) 0;

        XML = new char[4];
        XML[0] = (char) 4;
        XML[1] = 'x';
        XML[2] = 'm';
        XML[3] = 'l';

        XMLNS = new char[6];
        XMLNS[0] = (char) 6;
        XMLNS[1] = 'x';
        XMLNS[2] = 'm';
        XMLNS[3] = 'l';
        XMLNS[4] = 'n';
        XMLNS[5] = 's';
    }
    /**
     * ASCII character type array.
     *
     * This array maps an ASCII (7 bit) character to the character type.<br />
     * Possible character type values are:<br /> - ' ' for any kind of white
     * space character;<br /> - 'a' for any lower case alphabetical character
     * value;<br /> - 'A' for any upper case alphabetical character value;<br />
     * - 'd' for any decimal digit character value;<br /> - 'z' for any
     * character less than ' ' except '\t', '\n', '\r';<br /> An ASCII (7 bit)
     * character which does not fall in any category listed above is mapped to
     * itself.
     */
    private static final byte asctyp[];
    /**
     * NMTOKEN character type array.
     *
     * This array maps an ASCII (7 bit) character to the character type.<br />
     * Possible character type values are:<br /> - 0 for underscore ('_') or any
     * lower and upper case alphabetical character value;<br /> - 1 for colon
     * (':') character;<br /> - 2 for dash ('-') and dot ('.') or any decimal
     * digit character value;<br /> - 3 for any kind of white space character<br
     * /> An ASCII (7 bit) character which does not fall in any category listed
     * above is mapped to 0xff.
     */
    private static final byte nmttyp[];

    /**
     * Static constructor.
     *
     * Sets up the ASCII character type array ({@link #asctyp asctyp}) and the
     * NMTOKEN character type array ({@link #nmttyp nmttyp}).
     */
    static {
        short i = 0;

        // The loops below walk 'i' upwards through the 7 bit range exactly
        // once; each loop continues where the previous one stopped.
        asctyp = new byte[0x80];
        while (i < ' ') {
            asctyp[i++] = (byte) 'z';
        }
        // Override the three white space control characters set to 'z' above
        asctyp['\t'] = (byte) ' ';
        asctyp['\r'] = (byte) ' ';
        asctyp['\n'] = (byte) ' ';
        while (i < '0') {
            asctyp[i] = (byte) i++; // character maps to itself
        }
        while (i <= '9') {
            asctyp[i++] = (byte) 'd';
        }
        while (i < 'A') {
            asctyp[i] = (byte) i++; // character maps to itself
        }
        while (i <= 'Z') {
            asctyp[i++] = (byte) 'A';
        }
        while (i < 'a') {
            asctyp[i] = (byte) i++; // character maps to itself
        }
        while (i <= 'z') {
            asctyp[i++] = (byte) 'a';
        }
        while (i < 0x80) {
            asctyp[i] = (byte) i++; // character maps to itself
        }

        nmttyp = new byte[0x80];
        for (i = 0; i < '0'; i++) {
            nmttyp[i] = (byte) 0xff;
        }
        while (i <= '9') {
            nmttyp[i++] = (byte) 2; // digits
        }
        while (i < 'A') {
            nmttyp[i++] = (byte) 0xff;
        }
        // skipped upper case alphabetical characters are already 0
        for (i = '['; i < 'a'; i++) {
            nmttyp[i] = (byte) 0xff;
        }
        // skipped lower case alphabetical characters are already 0
        for (i = '{'; i < 0x80; i++) {
            nmttyp[i] = (byte) 0xff;
        }
        // Individual overrides of entries set to 0xff by the loops above
        nmttyp['_'] = 0;
        nmttyp[':'] = 1;
        nmttyp['.'] = 2;
        nmttyp['-'] = 2;
        nmttyp[' '] = 3;
        nmttyp['\t'] = 3;
        nmttyp['\r'] = 3;
        nmttyp['\n'] = 3;
    }

    /**
     * Constructor.
239 */ 240 protected Parser() { 241 mPh = PH_BEFORE_DOC; // before parsing 242 243 // Initialize the parser 244 mBuff = new char[BUFFSIZE_PARSER]; 245 mAttrs = new Attrs(); 246 247 // Default namespace 248 mPref = pair(mPref); 249 mPref.name = ""; 250 mPref.value = ""; 251 mPref.chars = NONS; 252 mNoNS = mPref; // no namespace 253 // XML namespace 254 mPref = pair(mPref); 255 mPref.name = "xml"; 256 mPref.value = "http://www.w3.org/XML/1998/namespace"; 257 mPref.chars = XML; 258 mXml = mPref; // XML namespace 259 } 260 261 /** 262 * Initializes parser's internals. Note, current input has to be set before 263 * this method is called. 264 */ 265 protected void init() { 266 mUnent = null; 267 mElm = null; 268 mPref = mXml; 269 mAttL = null; 270 mPEnt = new HashMap<String, Input>(); 271 mEnt = new HashMap<String, Input>(); 272 mDoc = mInp; // current input is document entity 273 mChars = mInp.chars; // use document entity buffer 274 mPh = PH_DOC_START; // the begining of the document 275 } 276 277 /** 278 * Cleans up parser internal resources. 279 */ 280 protected void cleanup() { 281 // Default attributes 282 while (mAttL != null) { 283 while (mAttL.list != null) { 284 if (mAttL.list.list != null) { 285 del(mAttL.list.list); 286 } 287 mAttL.list = del(mAttL.list); 288 } 289 mAttL = del(mAttL); 290 } 291 // Element stack 292 while (mElm != null) { 293 mElm = del(mElm); 294 } 295 // Namespace prefixes 296 while (mPref != mXml) { 297 mPref = del(mPref); 298 } 299 // Inputs 300 while (mInp != null) { 301 pop(); 302 } 303 // Document reader 304 if ((mDoc != null) && (mDoc.src != null)) { 305 try { 306 mDoc.src.close(); 307 } catch (IOException ioe) { 308 } 309 } 310 mPEnt = null; 311 mEnt = null; 312 mDoc = null; 313 mPh = PH_AFTER_DOC; // before documnet processing 314 } 315 316 /** 317 * Processes a portion of document. This method returns one of EV_* 318 * constants as an identifier of the portion of document have been read. 
     *
     * The method is a small state machine driven by the local <code>st</code>:
     * state 0 dispatches on markup, state 1 accumulates white space and
     * state 2 accumulates text content. The loop ends as soon as
     * <code>mEvt</code> is set to something other than EV_NULL.
     *
     * @return Identifier of processed document portion.
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    protected int step()
        throws Exception {
        mEvt = EV_NULL;
        int st = 0;
        while (mEvt == EV_NULL) {
            // Take the next char from the reading buffer if one is available,
            // otherwise go through getch()
            char ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch();
            switch (st) {
                case 0: // all sorts of markup (dispatcher)
                    if (ch != '<') {
                        // Not markup: push the char back and start
                        // collecting white space
                        bkch();
                        mBuffIdx = -1; // clean parser buffer
                        st = 1;
                        break;
                    }
                    switch (getch()) {
                        case '/': // the end of the element content
                            mEvt = EV_ELME;
                            if (mElm == null) {
                                panic(FAULT);
                            }
                            // Check element's open/close tags balance
                            mBuffIdx = -1; // clean parser buffer
                            bname(mIsNSAware);
                            char[] chars = mElm.chars;
                            if (chars.length == (mBuffIdx + 1)) {
                                // The comparison starts at index 1; index 0
                                // of both arrays is not compared
                                for (char i = 1; i <= mBuffIdx; i += 1) {
                                    if (chars[i] != mBuff[i]) {
                                        panic(FAULT);
                                    }
                                }
                            } else {
                                panic(FAULT);
                            }
                            // Skip white spaces before '>'
                            if (wsskip() != '>') {
                                panic(FAULT);
                            }
                            getch(); // read '>'
                            break;

                        case '!': // a comment or a CDATA
                            // Peek at the next char: it is read and then
                            // pushed back before the handler is called
                            ch = getch();
                            bkch();
                            switch (ch) {
                                case '-': // must be a comment
                                    mEvt = EV_COMM;
                                    comm();
                                    break;

                                case '[': // must be a CDATA section
                                    mEvt = EV_CDAT;
                                    cdat();
                                    break;

                                default: // must be 'DOCTYPE'
                                    mEvt = EV_DTD;
                                    dtd();
                                    break;
                            }
                            break;

                        case '?': // processing instruction
                            mEvt = EV_PI;
                            pi();
                            break;

                        default: // must be the first char of an xml name
                            bkch();
                            // Read an element name and put it on top of the
                            // element stack
                            mElm = pair(mElm); // add new element to the stack
                            mElm.chars = qname(mIsNSAware);
                            mElm.name = mElm.local();
                            // The flags are copied from the parent element,
                            // or 0 for the outermost element
                            mElm.id = (mElm.next != null) ? mElm.next.id : 0; // flags
                            mElm.num = 0; // namespace counter
                            // Find the list of defined attributes of the current
                            // element
                            Pair elm = find(mAttL, mElm.chars);
                            mElm.list = (elm != null) ? elm.list : null;
                            // Read attributes till the end of the element tag
                            mAttrIdx = 0;
                            Pair att = pair(null);
                            att.num = 0; // clear attribute's flags
                            attr(att); // get all attributes inc. defaults
                            del(att);
                            mElm.value = (mIsNSAware) ? rslv(mElm.chars) : null;
                            // Skip white spaces before '>'
                            switch (wsskip()) {
                                case '>':
                                    getch(); // read '>'
                                    mEvt = EV_ELMS;
                                    break;

                                case '/':
                                    getch(); // read '/'
                                    if (getch() != '>') // read '>'
                                    {
                                        panic(FAULT);
                                    }
                                    mEvt = EV_ELM;
                                    break;

                                default:
                                    panic(FAULT);
                            }
                            break;
                    }
                    break;

                case 1: // read white space
                    switch (ch) {
                        case ' ':
                        case '\t':
                        case '\n':
                            bappend(ch);
                            break;

                        case '\r': // EOL processing [#2.11]
                            // "\r\n" and a lone "\r" are both appended as a
                            // single '\n'
                            if (getch() != '\n') {
                                bkch();
                            }
                            bappend('\n');
                            break;

                        case '<':
                            // Only white space was collected before markup
                            mEvt = EV_WSPC;
                            bkch();
                            bflash_ws();
                            break;

                        default:
                            // A non white space char: continue as text
                            bkch();
                            st = 2;
                            break;
                    }
                    break;

                case 2: // read the text content of the element
                    switch (ch) {
                        case '&':
                            if (mUnent == null) {
                                // There was no unresolved entity on previous step.
                                if ((mUnent = ent('x')) != null) {
                                    // ent() returned a name: report the text
                                    // collected so far and leave an '&' in the
                                    // input so the next step gets here again
                                    mEvt = EV_TEXT;
                                    bkch(); // move back to ';' after entity name
                                    setch('&'); // parser must be back on next step
                                    bflash();
                                }
                            } else {
                                // There was unresolved entity on previous step.
                                mEvt = EV_ENT;
                                skippedEnt(mUnent);
                                mUnent = null;
                            }
                            break;

                        case '<':
                            mEvt = EV_TEXT;
                            bkch();
                            bflash();
                            break;

                        case '\r': // EOL processing [#2.11]
                            if (getch() != '\n') {
                                bkch();
                            }
                            bappend('\n');
                            break;

                        case EOS:
                            // NOTE(review): no break here; this relies on
                            // panic() not returning normally - confirm
                            panic(FAULT);

                        default:
                            bappend(ch);
                            break;
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }

        return mEvt;
    }

    /**
     * Parses the document type declaration.
     *
     * The local <code>st</code> drives the parsing: 0 - read the document
     * type name, 1 - read the external identifier if any, 2 - skip spaces
     * before the internal subset, 3 - skip spaces after the internal subset
     * and process the external subset, -1 - done.
     *
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    private void dtd()
        throws Exception {
        char ch;
        String str = null; // NOTE(review): not used anywhere in this method
        String name = null;
        Pair psid = null;
        // read 'DOCTYPE'
        if ("DOCTYPE".equals(name(false)) != true) {
            panic(FAULT);
        }
        mPh = PH_DTD; // DTD
        for (short st = 0; st >= 0;) {
            ch = getch();
            switch (st) {
                case 0: // read the document type name
                    if (chtyp(ch) != ' ') {
                        bkch();
                        name = name(mIsNSAware);
                        wsskip();
                        st = 1; // read 'PUBLIC' or 'SYSTEM'
                    }
                    break;

                case 1: // read 'PUBLIC' or 'SYSTEM'
                    switch (chtyp(ch)) {
                        case 'A':
                            bkch();
                            psid = pubsys(' ');
                            st = 2; // skip spaces before internal subset
                            docType(name, psid.name, psid.value);
                            break;

                        case '[':
                            bkch();
                            st = 2; // skip spaces before internal subset
                            docType(name, null, null);
                            break;

                        case '>':
                            bkch();
                            st = 3; // skip spaces after internal subset
                            docType(name, null, null);
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                case 2: // skip spaces before internal subset
                    switch (chtyp(ch)) {
                        case '[':
                            // Process internal subset
                            dtdsub();
                            st = 3; // skip spaces after internal subset
                            break;

                        case '>':
                            // There is no internal subset
                            bkch();
                            st = 3; // skip spaces after internal subset
                            break;

                        case ' ':
                            // skip white spaces
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                case 3: // skip spaces after internal subset
                    switch (chtyp(ch)) {
                        case '>':
                            if (psid != null) {
                                // Report the DTD external subset
                                InputSource is = resolveEnt(name, psid.name, psid.value);
                                if (is != null) {
                                    if (mIsSAlone == false) {
                                        // Set the end of DTD external subset char
                                        bkch();
                                        setch(']');
                                        // Set the DTD external subset InputSource
                                        push(new Input(BUFFSIZE_READER));
                                        setinp(is);
                                        mInp.pubid = psid.name;
                                        mInp.sysid = psid.value;
                                        // Parse the DTD external subset
                                        dtdsub();
                                    } else {
                                        // Standalone document: the resolved
                                        // external subset is not read but
                                        // reported as a skipped entity
                                        skippedEnt("[dtd]");
                                        // Release reader and stream
                                        if (is.getCharacterStream() != null) {
                                            try {
                                                is.getCharacterStream().close();
                                            } catch (IOException ioe) {
                                                // a failure to close is ignored
                                            }
                                        }
                                        if (is.getByteStream() != null) {
                                            try {
                                                is.getByteStream().close();
                                            } catch (IOException ioe) {
                                                // a failure to close is ignored
                                            }
                                        }
                                    }
                                } else {
                                    // Unresolved DTD external subset
                                    skippedEnt("[dtd]");
                                }
                                del(psid);
                            }
                            st = -1; // end of DTD
                            break;

                        case ' ':
                            // skip white spaces
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }
    }

    /**
     * Parses the document type declaration subset.
     *
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    private void dtdsub()
        throws Exception {
        char ch;
        // st: 0 - expect a declaration, a parameter entity reference or the
        //         end of the subset;
        //     1 - expect the '>' which ends a markup declaration;
        //    -1 - the subset has been read.
        for (short st = 0; st >= 0;) {
            ch = getch();
            switch (st) {
                case 0: // skip white spaces before a declaration
                    switch (chtyp(ch)) {
                        case '<':
                            ch = getch();
                            switch (ch) {
                                case '?':
                                    pi();
                                    break;

                                case '!':
                                    // Peek at the char after '<!'
                                    ch = getch();
                                    bkch();
                                    if (ch == '-') {
                                        comm();
                                        break;
                                    }
                                    // A markup or an entity declaration
                                    bntok();
                                    switch (bkeyword()) {
                                        case 'n':
                                            dtdent(); // parse entity declaration
                                            break;

                                        case 'a':
                                            dtdattl(); // parse attributes declaration
                                            break;

                                        case 'e':
                                            dtdelm(); // parse element declaration
                                            break;

                                        case 'o':
                                            dtdnot(); // parse notation declaration
                                            break;

                                        default:
                                            panic(FAULT); // unsupported markup declaration
                                            break;
                                    }
                                    st = 1; // read the end of declaration
                                    break;

                                default:
                                    panic(FAULT);
                                    break;
                            }
                            break;

                        case '%':
                            // A parameter entity reference
                            pent(' ');
                            break;

                        case ']':
                            // End of DTD subset
                            st = -1;
                            break;

                        case ' ':
                            // Skip white spaces
                            break;

                        case 'Z':
                            // End of stream
                            // The char which follows must be ']'
                            if (getch() != ']') {
                                panic(FAULT);
                            }
                            st = -1;
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                case 1: // read the end of declaration
                    switch (ch) {
                        case '>': // the declaration is closed
                            st = 0; // skip white spaces before a declaration
                            break;

                        case ' ':
                        case '\n':
                        case '\r':
                        case '\t':
                            // Skip white spaces
                            break;

                        default:
                            panic(FAULT);
                            break;
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }
    }

    /**
     * Parses an entity declaration. This method fills the general (
     * <code>mEnt</code>) and parameter
     * (
     * <code>mPEnt</code>) entity look up table.
     *
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    private void dtdent()
        throws Exception {
        String str = null; // entity name
        char[] val = null; // entity value
        Input inp = null;
        Pair ids = null; // external identifier
        char ch;
        // st: 0 - before the entity name (parameter entity declarations are
        //         handled completely in this state);
        //     1 - read the value of a general entity;
        //    -1 - the end of declaration.
        for (short st = 0; st >= 0;) {
            ch = getch();
            switch (st) {
                case 0: // skip white spaces before entity name
                    switch (chtyp(ch)) {
                        case ' ':
                            // Skip white spaces
                            break;

                        case '%':
                            // Parameter entity or parameter entity declaration.
                            ch = getch();
                            bkch();
                            if (chtyp(ch) == ' ') {
                                // Parameter entity declaration.
                                wsskip();
                                str = name(false);
                                switch (chtyp(wsskip())) {
                                    case 'A':
                                        // Read the external identifier
                                        ids = pubsys(' ');
                                        if (wsskip() == '>') {
                                            // External parsed entity
                                            // Only the first declaration of a name is kept
                                            if (mPEnt.containsKey(str) == false) { // [#4.2]
                                                inp = new Input();
                                                inp.pubid = ids.name;
                                                inp.sysid = ids.value;
                                                mPEnt.put(str, inp);
                                            }
                                        } else {
                                            panic(FAULT);
                                        }
                                        del(ids);
                                        st = -1; // the end of declaration
                                        break;

                                    case '\"':
                                    case '\'':
                                        // Read the parameter entity value
                                        bqstr('d');
                                        // Create the parameter entity value
                                        // mBuff[1..mBuffIdx] is copied to
                                        // val[1..mBuffIdx]; val[0] is set below
                                        val = new char[mBuffIdx + 1];
                                        System.arraycopy(mBuff, 1, val, 1, val.length - 1);
                                        // Add surrounding spaces [#4.4.8]
                                        val[0] = ' ';
                                        // Add the entity to the entity look up table
                                        if (mPEnt.containsKey(str) == false) { // [#4.2]
                                            inp = new Input(val);
                                            inp.pubid = mInp.pubid;
                                            inp.sysid = mInp.sysid;
                                            inp.xmlenc = mInp.xmlenc;
                                            inp.xmlver = mInp.xmlver;
                                            mPEnt.put(str, inp);
                                        }
                                        st = -1; // the end of declaration
                                        break;

                                    default:
                                        panic(FAULT);
                                        break;
                                }
                            } else {
                                // Parameter entity reference.
                                pent(' ');
                            }
                            break;

                        default:
                            bkch();
                            str = name(false);
                            st = 1; // read entity declaration value
                            break;
                    }
                    break;

                case 1: // read entity declaration value
                    switch (chtyp(ch)) {
                        case '\"': // internal entity
                        case '\'':
                            bkch();
                            bqstr('d'); // read a string into the buffer
                            if (mEnt.get(str) == null) {
                                // Create general entity value
                                // mBuff[1..mBuffIdx] is copied to val[0..]
                                val = new char[mBuffIdx];
                                System.arraycopy(mBuff, 1, val, 0, val.length);
                                // Add the entity to the entity look up table
                                if (mEnt.containsKey(str) == false) { // [#4.2]
                                    inp = new Input(val);
                                    inp.pubid = mInp.pubid;
                                    inp.sysid = mInp.sysid;
                                    inp.xmlenc = mInp.xmlenc;
                                    inp.xmlver = mInp.xmlver;
                                    mEnt.put(str, inp);
                                }
                            }
                            st = -1; // the end of declaration
                            break;

                        case 'A': // external entity
                            bkch();
                            ids = pubsys(' ');
                            switch (wsskip()) {
                                case '>': // external parsed entity
                                    if (mEnt.containsKey(str) == false) { // [#4.2]
                                        inp = new Input();
                                        inp.pubid = ids.name;
                                        inp.sysid = ids.value;
                                        mEnt.put(str, inp);
                                    }
                                    break;

                                case 'N': // external general unparsed entity
                                    if ("NDATA".equals(name(false)) == true) {
                                        wsskip();
                                        unparsedEntDecl(str, ids.name, ids.value, name(false));
                                        break;
                                    }
                                    // a name other than "NDATA" falls through
                                    // to the fault below
                                default:
                                    panic(FAULT);
                                    break;
                            }
                            del(ids);
                            st = -1; // the end of declaration
                            break;

                        case ' ':
                            // Skip white spaces
                            break;

                        default:
                            panic(FAULT);
                            break;
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }
    }

    /**
     * Parses an element declaration.
     *
     * This method parses the declaration up to the closing angle bracket.
     *
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    private void dtdelm()
        throws Exception {
        // This is stub implementation which skips an element
        // declaration.
        wsskip();
        name(mIsNSAware);

        char ch;
        while (true) {
            ch = getch();
            switch (ch) {
                case '>':
                    // Leave '>' in the input for the caller
                    bkch();
                    return;

                case EOS:
                    // NOTE(review): no break here; this relies on panic()
                    // not returning normally - confirm
                    panic(FAULT);

                default:
                    break;
            }
        }
    }

    /**
     * Parses an attribute list declaration.
     *
     * This method parses the declaration up to the closing angle bracket.
     *
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    private void dtdattl()
        throws Exception {
        char elmqn[] = null;
        Pair elm = null;
        char ch;
        // st: 0 - read the element name; 1 - read attribute declarations.
        // The method returns from state 1 when '>' follows a declaration.
        for (short st = 0; st >= 0;) {
            ch = getch();
            switch (st) {
                case 0: // read the element name
                    switch (chtyp(ch)) {
                        case 'a':
                        case 'A':
                        case '_':
                        case 'X':
                        case ':':
                            bkch();
                            // Get the element from the list or add a new one.
                            elmqn = qname(mIsNSAware);
                            elm = find(mAttL, elmqn);
                            if (elm == null) {
                                // The new element becomes the head of mAttL
                                elm = pair(mAttL);
                                elm.chars = elmqn;
                                mAttL = elm;
                            }
                            st = 1; // read an attribute declaration
                            break;

                        case ' ':
                            break;

                        case '%':
                            pent(' ');
                            break;

                        default:
                            panic(FAULT);
                            break;
                    }
                    break;

                case 1: // read an attribute declaration
                    switch (chtyp(ch)) {
                        case 'a':
                        case 'A':
                        case '_':
                        case 'X':
                        case ':':
                            bkch();
                            dtdatt(elm);
                            if (wsskip() == '>') {
                                return;
                            }
                            break;

                        case ' ':
                            break;

                        case '%':
                            pent(' ');
                            break;

                        default:
                            panic(FAULT);
                            break;
                    }
                    break;

                default:
                    panic(FAULT);
                    break;
            }
        }
    }

    /**
     * Parses an attribute declaration.
     *
     * The attribute uses the following fields of Pair object: chars - characters
     * of qualified name id - the type identifier of the attribute list - a pair
     * which holds the default value (chars field)
     *
     * The local <code>st</code> drives the parsing: 0 - the attribute name,
     * 1 - the attribute type, 2 - the first element of an enumeration or
     * notation list, 3 - the next element of the list, 4 - the default
     * declaration, 5 - the default value, -1 - done.
     *
     * @param elm An object which represents all defined attributes on an
     * element.
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    private void dtdatt(Pair elm)
        throws Exception {
        char attqn[] = null;
        Pair att = null;
        char ch;
        for (short st = 0; st >= 0;) {
            ch = getch();
            switch (st) {
                case 0: // the attribute name
                    switch (chtyp(ch)) {
                        case 'a':
                        case 'A':
                        case '_':
                        case 'X':
                        case ':':
                            bkch();
                            // Get the attribute from the list or add a new one.
                            attqn = qname(mIsNSAware);
                            att = find(elm.list, attqn);
                            if (att == null) {
                                // New attribute declaration
                                att = pair(elm.list);
                                att.chars = attqn;
                                elm.list = att;
                            } else {
                                // Do not override the attribute declaration [#3.3]
                                // The rest of the declaration is parsed into
                                // a pair which is not linked into elm.list
                                att = pair(null);
                                att.chars = attqn;
                                att.id = 'c';
                            }
                            wsskip();
                            st = 1;
                            break;

                        case '%':
                            pent(' ');
                            break;

                        case ' ':
                            break;

                        default:
                            panic(FAULT);
                            break;
                    }
                    break;

                case 1: // the attribute type
                    switch (chtyp(ch)) {
                        case '(':
                            att.id = 'u'; // enumeration type
                            st = 2; // read the first element of the list
                            break;

                        case '%':
                            pent(' ');
                            break;

                        case ' ':
                            break;

                        default:
                            bkch();
                            bntok(); // read type id
                            att.id = bkeyword();
                            switch (att.id) {
                                case 'o': // NOTATION
                                    if (wsskip() != '(') {
                                        panic(FAULT);
                                    }
                                    ch = getch(); // read '('
                                    st = 2; // read the first element of the list
                                    break;

                                case 'i': // ID
                                case 'r': // IDREF
                                case 'R': // IDREFS
                                case 'n': // ENTITY
                                case 'N': // ENTITIES
                                case 't': // NMTOKEN
                                case 'T': // NMTOKENS
                                case 'c': // CDATA
                                    wsskip();
                                    st = 4; // read default declaration
                                    break;

                                default:
                                    panic(FAULT);
                                    break;
                            }
                            break;
                    }
                    break;

                case 2: // read the first element of the list
                    switch (chtyp(ch)) {
                        case 'a':
                        case 'A':
                        case 'd':
                        case '.':
                        case ':':
                        case '-':
                        case '_':
                        case 'X':
                            bkch();
                            switch (att.id) {
                                case 'u': // enumeration type
                                    bntok();
                                    break;

                                case 'o': // NOTATION
                                    mBuffIdx = -1;
                                    bname(false);
                                    break;

                                default:
                                    panic(FAULT);
                                    break;
                            }
                            wsskip();
                            st = 3; // read next element of the list
                            break;

                        case '%':
                            pent(' ');
                            break;

                        case ' ':
                            break;

                        default:
                            panic(FAULT);
                            break;
                    }
                    break;

                case 3: // read next element of the list
                    switch (ch) {
                        case ')':
                            wsskip();
                            st = 4; // read default declaration
                            break;

                        case '|':
                            wsskip();
                            switch (att.id) {
                                case 'u': // enumeration type
                                    bntok();
                                    break;

                                case 'o': // NOTATION
                                    mBuffIdx = -1;
                                    bname(false);
                                    break;

                                default:
                                    panic(FAULT);
                                    break;
                            }
                            wsskip();
                            break;

                        case '%':
                            pent(' ');
                            break;

                        default:
                            panic(FAULT);
                            break;
                    }
                    break;

                case 4: // read default declaration
                    switch (ch) {
                        case '#':
                            bntok();
                            switch (bkeyword()) {
                                case 'F': // FIXED
                                    switch (wsskip()) {
                                        case '\"':
                                        case '\'':
                                            st = 5; // read the default value
                                            break;

                                        case EOS:
                                            // NOTE(review): no break here; this
                                            // relies on panic() not returning
                                            // normally - confirm
                                            panic(FAULT);

                                        default:
                                            st = -1;
                                            break;
                                    }
                                    break;

                                case 'Q': // REQUIRED
                                case 'I': // IMPLIED
                                    st = -1;
                                    break;

                                default:
                                    panic(FAULT);
                                    break;
                            }
                            break;

                        case '\"':
                        case '\'':
                            bkch();
                            st = 5; // read the default value
                            break;

                        case ' ':
                        case '\n':
                        case '\r':
                        case '\t':
                            break;

                        case '%':
                            pent(' ');
                            break;

                        default:
                            // No default declaration: leave the char in the
                            // input for the caller
                            bkch();
                            st = -1;
                            break;
                    }
                    break;

                case 5: // read the default value
                    switch (ch) {
                        case '\"':
                        case '\'':
                            bkch();
                            bqstr('d'); // the value in the mBuff now
                            att.list = pair(null);
                            // Create a string like "attqname='value' "
                            // Layout: att.chars[1..] (qname), '=', quote,
                            // mBuff[1..mBuffIdx] (value), quote, ' '
                            att.list.chars = new char[att.chars.length + mBuffIdx + 3];
                            System.arraycopy(
                                att.chars, 1, att.list.chars, 0, att.chars.length - 1);
                            att.list.chars[att.chars.length - 1] = '=';
                            att.list.chars[att.chars.length] = ch;
                            System.arraycopy(
                                mBuff, 1, att.list.chars, att.chars.length + 1, mBuffIdx);
                            att.list.chars[att.chars.length + mBuffIdx + 1] = ch;
                            att.list.chars[att.chars.length + mBuffIdx + 2] = ' ';
                            st = -1;
                            break;

                        default:
                            panic(FAULT);
                            break;
                    }
                    break;

                default:
                    panic(FAULT);
                    break;
            }
        }
    }

    /**
     * Parses a notation declaration.
     *
     * This method parses the declaration up to the closing angle bracket.
     *
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    private void dtdnot()
        throws Exception {
        wsskip();
        String name = name(false); // notation name
        wsskip();
        Pair ids = pubsys('N'); // external identifier of the notation
        notDecl(name, ids.name, ids.value);
        del(ids);
    }

    /**
     * Parses an attribute.
     *
     * This recursive method is responsible for prefix addition
     * (
     * <code>mPref</code>) on the way down. The element's start tag end triggers
     * the return process. The method then on its way back resolves prefixes
     * and accumulates attributes.
     *
     * <p><code>att.num</code> carries attribute flags where: 0x1 - attribute is
     * declared in DTD (attribute declaration had been read); 0x2 - attribute's
     * default value is used.</p>
     *
     * @param att An object which represents current attribute.
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    private void attr(Pair att)
        throws Exception {
        switch (wsskip()) {
            case '/':
            case '>':
                if ((att.num & 0x2) == 0) { // all attributes have been read
                    att.num |= 0x2; // set default attribute flag
                    Input inp = mInp; // remembered to detect pushed inputs
                    // Go through all attributes defined on current element.
                    for (Pair def = mElm.list; def != null; def = def.next) {
                        if (def.list == null) // no default value
                        {
                            continue;
                        }
                        // Go through all attributes defined on current
                        // element and add defaults.
                        // A default is pushed as a new input only when the
                        // attribute is not found among those already read
                        Pair act = find(att.next, def.chars);
                        if (act == null) {
                            push(new Input(def.list.chars));
                        }
                    }
                    if (mInp != inp) { // defaults have been added
                        // Parse the pushed default attributes the same way
                        // as the ones read from the tag
                        attr(att);
                        return;
                    }
                }
                // Ensure the attribute string array capacity
                mAttrs.setLength(mAttrIdx);
                mItems = mAttrs.mItems;
                return;

            case EOS:
                // NOTE(review): no break here; this relies on panic() not
                // returning normally - confirm
                panic(FAULT);

            default:
                // Read the attribute name and value
                att.chars = qname(mIsNSAware);
                att.name = att.local();
                String type = atype(att); // sets attribute's type on att.id
                wsskip();
                if (getch() != '=') {
                    panic(FAULT);
                }
                bqstr((char) att.id); // read the value with normalization.
                String val = new String(mBuff, 1, mBuffIdx);
                Pair next = pair(att);
                next.num = (att.num & ~0x1); // inherit attribute flags
                // Put a namespace declaration on top of the prefix stack
                if ((mIsNSAware == false) || (isdecl(att, val) == false)) {
                    // An ordinary attribute
                    mAttrIdx++;
                    attr(next); // recursive call to parse the next attribute
                    mAttrIdx--;
                    // Add the attribute to the attributes string array
                    // Each attribute takes a group of 8 slots in mItems of
                    // which slots 0..5 are written here
                    char idx = (char) (mAttrIdx << 3);
                    mItems[idx + 1] = att.qname(); // attr qname
                    mItems[idx + 2] = (mIsNSAware) ? att.name : ""; // attr local name
                    mItems[idx + 3] = val; // attr value
                    mItems[idx + 4] = type; // attr type
                    switch (att.num & 0x3) {
                        case 0x0:
                            mItems[idx + 5] = null;
                            break;

                        case 0x1: // declared attribute
                            mItems[idx + 5] = "d";
                            break;

                        default: // 0x2, 0x3 - default attribute always declared
                            mItems[idx + 5] = "D";
                            break;
                    }
                    // Resolve the prefix if any and report the attribute
                    // NOTE: The attribute does not accept the default namespace.
                    mItems[idx + 0] = (att.chars[0] != 0) ? rslv(att.chars) : "";
                } else {
                    // A namespace declaration. mPref.name contains prefix and
                    // mPref.value contains namespace URI set by isdecl method.
                    // Report a start of the new mapping
                    newPrefix();
                    // Recursive call to parse the next attribute
                    attr(next);
                    // NOTE: The namespace declaration is not reported.
                }
                del(next);
                break;
        }
    }

    /**
     * Retrieves attribute type.
     *
     * This method sets the type of normalization in the attribute
     * <code>id</code> field and returns the name of attribute type.
     *
     * @param att An object which represents current attribute.
     * @return The name of the attribute type.
     * @exception Exception is parser specific exception from panic method.
1435 */ 1436 private String atype(Pair att) 1437 throws Exception { 1438 Pair attr; 1439 1440 // CDATA-type normalization by default [#3.3.3] 1441 att.id = 'c'; 1442 if (mElm.list == null || (attr = find(mElm.list, att.chars)) == null) { 1443 return "CDATA"; 1444 } 1445 1446 att.num |= 0x1; // attribute is declared 1447 1448 // Non-CDATA normalization except when the attribute type is CDATA. 1449 att.id = 'i'; 1450 switch (attr.id) { 1451 case 'i': 1452 return "ID"; 1453 1454 case 'r': 1455 return "IDREF"; 1456 1457 case 'R': 1458 return "IDREFS"; 1459 1460 case 'n': 1461 return "ENTITY"; 1462 1463 case 'N': 1464 return "ENTITIES"; 1465 1466 case 't': 1467 return "NMTOKEN"; 1468 1469 case 'T': 1470 return "NMTOKENS"; 1471 1472 case 'u': 1473 return "NMTOKEN"; 1474 1475 case 'o': 1476 return "NOTATION"; 1477 1478 case 'c': 1479 att.id = 'c'; 1480 return "CDATA"; 1481 1482 default: 1483 panic(FAULT); 1484 } 1485 return null; 1486 } 1487 1488 /** 1489 * Parses a comment. 1490 * 1491 * The '<!' part is read in dispatcher so the method starts 1492 * with first '-' after '<!'. 1493 * 1494 * @exception Exception is parser specific exception form panic method. 1495 */ 1496 private void comm() 1497 throws Exception { 1498 if (mPh == PH_DOC_START) { 1499 mPh = PH_MISC_DTD; // misc before DTD 1500 } // '<!' has been already read by dispetcher. 1501 char ch; 1502 mBuffIdx = -1; 1503 for (short st = 0; st >= 0;) { 1504 ch = (mChIdx < mChLen) ? 
mChars[mChIdx++] : getch(); 1505 if (ch == EOS) { 1506 panic(FAULT); 1507 } 1508 switch (st) { 1509 case 0: // first '-' of the comment open 1510 if (ch == '-') { 1511 st = 1; 1512 } else { 1513 panic(FAULT); 1514 } 1515 break; 1516 1517 case 1: // secind '-' of the comment open 1518 if (ch == '-') { 1519 st = 2; 1520 } else { 1521 panic(FAULT); 1522 } 1523 break; 1524 1525 case 2: // skip the comment body 1526 switch (ch) { 1527 case '-': 1528 st = 3; 1529 break; 1530 1531 default: 1532 bappend(ch); 1533 break; 1534 } 1535 break; 1536 1537 case 3: // second '-' of the comment close 1538 switch (ch) { 1539 case '-': 1540 st = 4; 1541 break; 1542 1543 default: 1544 bappend('-'); 1545 bappend(ch); 1546 st = 2; 1547 break; 1548 } 1549 break; 1550 1551 case 4: // '>' of the comment close 1552 if (ch == '>') { 1553 comm(mBuff, mBuffIdx + 1); 1554 st = -1; 1555 break; 1556 } 1557 // else - panic [#2.5 compatibility note] 1558 1559 default: 1560 panic(FAULT); 1561 } 1562 } 1563 } 1564 1565 /** 1566 * Parses a processing instruction. 1567 * 1568 * The '<?' is read in dispatcher so the method starts with 1569 * first character of PI target name after '<?'. 1570 * 1571 * @exception Exception is parser specific exception form panic method. 1572 * @exception IOException 1573 */ 1574 private void pi() 1575 throws Exception { 1576 // '<?' has been already read by dispetcher. 
1577 char ch; 1578 String str = null; 1579 mBuffIdx = -1; 1580 for (short st = 0; st >= 0;) { 1581 ch = getch(); 1582 if (ch == EOS) { 1583 panic(FAULT); 1584 } 1585 switch (st) { 1586 case 0: // read the PI target name 1587 switch (chtyp(ch)) { 1588 case 'a': 1589 case 'A': 1590 case '_': 1591 case ':': 1592 case 'X': 1593 bkch(); 1594 str = name(false); 1595 // PI target name may not be empty string [#2.6] 1596 // PI target name 'XML' is reserved [#2.6] 1597 if ((str.length() == 0) 1598 || (mXml.name.equals(str.toLowerCase()) == true)) { 1599 panic(FAULT); 1600 } 1601 // This is processing instruction 1602 if (mPh == PH_DOC_START) // the begining of the document 1603 { 1604 mPh = PH_MISC_DTD; // misc before DTD 1605 } 1606 wsskip(); // skip spaces after the PI target name 1607 st = 1; // accumulate the PI body 1608 mBuffIdx = -1; 1609 break; 1610 1611 default: 1612 panic(FAULT); 1613 } 1614 break; 1615 1616 case 1: // accumulate the PI body 1617 switch (ch) { 1618 case '?': 1619 st = 2; // end of the PI body 1620 break; 1621 1622 default: 1623 bappend(ch); 1624 break; 1625 } 1626 break; 1627 1628 case 2: // end of the PI body 1629 switch (ch) { 1630 case '>': 1631 // PI has been read. 1632 pi(str, new String(mBuff, 0, mBuffIdx + 1)); 1633 st = -1; 1634 break; 1635 1636 case '?': 1637 bappend('?'); 1638 break; 1639 1640 default: 1641 bappend('?'); 1642 bappend(ch); 1643 st = 1; // accumulate the PI body 1644 break; 1645 } 1646 break; 1647 1648 default: 1649 panic(FAULT); 1650 } 1651 } 1652 } 1653 1654 /** 1655 * Parses a character data. 1656 * 1657 * The '<!' part is read in dispatcher so the method starts 1658 * with first '[' after '<!'. 1659 * 1660 * @exception Exception is parser specific exception form panic method. 1661 * @exception IOException 1662 */ 1663 private void cdat() 1664 throws Exception { 1665 // '<!' has been already read by dispetcher. 
1666 char ch; 1667 mBuffIdx = -1; 1668 for (short st = 0; st >= 0;) { 1669 ch = getch(); 1670 switch (st) { 1671 case 0: // the first '[' of the CDATA open 1672 if (ch == '[') { 1673 st = 1; 1674 } else { 1675 panic(FAULT); 1676 } 1677 break; 1678 1679 case 1: // read "CDATA" 1680 if (chtyp(ch) == 'A') { 1681 bappend(ch); 1682 } else { 1683 if ("CDATA".equals( 1684 new String(mBuff, 0, mBuffIdx + 1)) != true) { 1685 panic(FAULT); 1686 } 1687 bkch(); 1688 st = 2; 1689 } 1690 break; 1691 1692 case 2: // the second '[' of the CDATA open 1693 if (ch != '[') { 1694 panic(FAULT); 1695 } 1696 mBuffIdx = -1; 1697 st = 3; 1698 break; 1699 1700 case 3: // read data before the first ']' 1701 if (ch != ']') { 1702 bappend(ch); 1703 } else { 1704 st = 4; 1705 } 1706 break; 1707 1708 case 4: // read the second ']' or continue to read the data 1709 if (ch != ']') { 1710 bappend(']'); 1711 bappend(ch); 1712 st = 3; 1713 } else { 1714 st = 5; 1715 } 1716 break; 1717 1718 case 5: // read '>' or continue to read the data 1719 switch (ch) { 1720 case ']': 1721 bappend(']'); 1722 break; 1723 1724 case '>': 1725 bflash(); 1726 st = -1; 1727 break; 1728 1729 default: 1730 bappend(']'); 1731 bappend(']'); 1732 bappend(ch); 1733 st = 3; 1734 break; 1735 } 1736 break; 1737 1738 default: 1739 panic(FAULT); 1740 } 1741 } 1742 } 1743 1744 /** 1745 * Reads a xml name. 1746 * 1747 * The xml name must conform "Namespaces in XML" specification. Therefore 1748 * the ':' character is not allowed in the name. This method should be used 1749 * for PI and entity names which may not have a namespace according to the 1750 * specification mentioned above. 1751 * 1752 * @param ns The true value turns namespace conformance on. 1753 * @return The name has been read. 1754 * @exception Exception When incorrect character appear in the name. 
1755 * @exception IOException 1756 */ 1757 protected String name(boolean ns) 1758 throws Exception { 1759 mBuffIdx = -1; 1760 bname(ns); 1761 return new String(mBuff, 1, mBuffIdx); 1762 } 1763 1764 /** 1765 * Reads a qualified xml name. 1766 * 1767 * The characters of a qualified name is an array of characters. The first 1768 * (chars[0]) character is the index of the colon character which separates 1769 * the prefix from the local name. If the index is zero, the name does not 1770 * contain separator or the parser works in the namespace unaware mode. The 1771 * length of qualified name is the length of the array minus one. 1772 * 1773 * @param ns The true value turns namespace conformance on. 1774 * @return The characters of a qualified name. 1775 * @exception Exception When incorrect character appear in the name. 1776 * @exception IOException 1777 */ 1778 protected char[] qname(boolean ns) 1779 throws Exception { 1780 mBuffIdx = -1; 1781 bname(ns); 1782 char chars[] = new char[mBuffIdx + 1]; 1783 System.arraycopy(mBuff, 0, chars, 0, mBuffIdx + 1); 1784 return chars; 1785 } 1786 1787 /** 1788 * Reads the public or/and system identifiers. 1789 * 1790 * @param inp The input object. 1791 * @exception Exception is parser specific exception form panic method. 1792 * @exception IOException 1793 */ 1794 private void pubsys(Input inp) 1795 throws Exception { 1796 Pair pair = pubsys(' '); 1797 inp.pubid = pair.name; 1798 inp.sysid = pair.value; 1799 del(pair); 1800 } 1801 1802 /** 1803 * Reads the public or/and system identifiers. 1804 * 1805 * @param flag The 'N' allows public id be without system id. 1806 * @return The public or/and system identifiers pair. 1807 * @exception Exception is parser specific exception form panic method. 
1808 * @exception IOException 1809 */ 1810 private Pair pubsys(char flag) 1811 throws Exception { 1812 Pair ids = pair(null); 1813 String str = name(false); 1814 if ("PUBLIC".equals(str) == true) { 1815 bqstr('i'); // non-CDATA normalization [#4.2.2] 1816 ids.name = new String(mBuff, 1, mBuffIdx); 1817 switch (wsskip()) { 1818 case '\"': 1819 case '\'': 1820 bqstr(' '); 1821 ids.value = new String(mBuff, 1, mBuffIdx); 1822 break; 1823 1824 case EOS: 1825 panic(FAULT); 1826 1827 default: 1828 if (flag != 'N') // [#4.7] 1829 { 1830 panic(FAULT); 1831 } 1832 ids.value = null; 1833 break; 1834 } 1835 return ids; 1836 } else if ("SYSTEM".equals(str) == true) { 1837 ids.name = null; 1838 bqstr(' '); 1839 ids.value = new String(mBuff, 1, mBuffIdx); 1840 return ids; 1841 } 1842 panic(FAULT); 1843 return null; 1844 } 1845 1846 /** 1847 * Reads an attribute value. 1848 * 1849 * The grammar which this method can read is:<br /> 1850 * <code>eqstr := S "=" qstr</code><br /> 1851 * <code>qstr := S ("'" string "'") | 1852 * ('"' string '"')</code><br /> This method resolves entities 1853 * inside a string unless the parser parses DTD. 1854 * 1855 * @param flag The '=' character forces the method to accept the '=' 1856 * character before quoted string and read the following string as not an 1857 * attribute ('-'), 'c' - CDATA, 'i' - non CDATA, ' ' - no normalization; 1858 * '-' - not an attribute value; 'd' - in DTD context. 1859 * @return The content of the quoted strign as a string. 1860 * @exception Exception is parser specific exception form panic method. 1861 * @exception IOException 1862 */ 1863 protected String eqstr(char flag) 1864 throws Exception { 1865 if (flag == '=') { 1866 wsskip(); 1867 if (getch() != '=') { 1868 panic(FAULT); 1869 } 1870 } 1871 bqstr((flag == '=') ? '-' : flag); 1872 return new String(mBuff, 1, mBuffIdx); 1873 } 1874 1875 /** 1876 * Resoves an entity. 1877 * 1878 * This method resolves built-in and character entity references. 
* It also reports external entities to the application.
     *
     * <p>The reference is accumulated in the parser buffer starting at the
     * position recorded on entry; once the closing ';' is read the buffer
     * offset is restored and either the replacement character is appended
     * or a new input is pushed for the entity.</p>
     *
     * @param flag The 'x' character forces the method to report a skipped
     * entity; 'i' character - indicates non-CDATA normalization.
     * @return Name of unresolved entity or <code>null</code> if entity had been
     * resolved successfully.
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    private String ent(char flag)
        throws Exception {
        char ch;
        int idx = mBuffIdx + 1;  // buffer position of the '&'
        Input inp = null;
        String str = null;
        mESt = 0x100;  // reset the built-in entity recognizer
        bappend('&');
        for (short st = 0; st >= 0;) {
            ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch();
            switch (st) {
                case 0:  // the first character of the entity name
                case 1:  // read built-in entity name
                    switch (chtyp(ch)) {
                        case 'd':
                        case '.':
                        case '-':
                            // Not allowed as the first character of a name.
                            if (st != 1) {
                                panic(FAULT);
                            }
                            // deliberate fall-through: append as a name char
                        case 'a':
                        case 'A':
                        case '_':
                        case 'X':
                            bappend(ch);
                            eappend(ch);
                            st = 1;
                            break;

                        case ':':
                            if (mIsNSAware != false) {
                                panic(FAULT);
                            }
                            bappend(ch);
                            eappend(ch);
                            st = 1;
                            break;

                        case ';':
                            if (mESt < 0x100) {
                                // The entity is a built-in entity
                                mBuffIdx = idx - 1;
                                bappend(mESt);
                                st = -1;
                                break;
                            } else if (mPh == PH_DTD) {
                                // In DTD entity declaration has to resolve character
                                // entities and include "as is" others. [#4.4.7]
                                bappend(';');
                                st = -1;
                                break;
                            }
                            // Convert an entity name to a string
                            str = new String(mBuff, idx + 1, mBuffIdx - idx);
                            inp = (Input) mEnt.get(str);
                            // Restore the buffer offset
                            mBuffIdx = idx - 1;
                            if (inp != null) {
                                if (inp.chars == null) {
                                    // External entity
                                    InputSource is = resolveEnt(str, inp.pubid, inp.sysid);
                                    if (is != null) {
                                        push(new Input(BUFFSIZE_READER));
                                        setinp(is);
                                        mInp.pubid = inp.pubid;
                                        mInp.sysid = inp.sysid;
                                        str = null;  // the entity is resolved
                                    } else {
                                        // Unresolved external entity
                                        if (flag != 'x') {
                                            panic(FAULT);  // unknown entity within markup
                                        }  // str is name of unresolved entity
                                    }
                                } else {
                                    // Internal entity
                                    push(inp);
                                    str = null;  // the entity is resolved
                                }
                            } else {
                                // Unknown or general unparsed entity
                                if (flag != 'x') {
                                    panic(FAULT);  // unknown entity within markup
                                }  // str is name of unresolved entity
                            }
                            st = -1;
                            break;

                        case '#':
                            // '#' is valid only right after the '&'.
                            if (st != 0) {
                                panic(FAULT);
                            }
                            st = 2;
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                case 2:  // read character entity
                    switch (chtyp(ch)) {
                        case 'd':
                            bappend(ch);
                            break;

                        case ';':
                            // Convert the character entity to a character
                            try {
                                int i = Integer.parseInt(
                                    new String(mBuff, idx + 1, mBuffIdx - idx), 10);
                                // Values from 0xffff (the EOS marker) upward
                                // are rejected.
                                if (i >= 0xffff) {
                                    panic(FAULT);
                                }
                                ch = (char) i;
                            } catch (NumberFormatException nfe) {
                                panic(FAULT);
                            }
                            // Restore the buffer offset
                            mBuffIdx = idx - 1;
                            if (ch == ' ' || mInp.next != null) {
                                bappend(ch, flag);
                            } else {
                                bappend(ch);
                            }
                            st = -1;
                            break;

                        case 'a':
                            // If the entity buffer is empty and ch == 'x'
                            if ((mBuffIdx == idx) && (ch == 'x')) {
                                st = 3;
                                break;
                            }
                            // deliberate fall-through: any other letter is
                            // an error
                        default:
                            panic(FAULT);
                    }
                    break;

                case 3:  // read hex character entity
                    switch (chtyp(ch)) {
                        case 'A':
                        case 'a':
                        case 'd':
                            // Letters outside a-f/A-F are accepted here and
                            // rejected by the radix-16 parse below.
                            bappend(ch);
                            break;

                        case ';':
                            // Convert the character entity to a character
                            try {
                                int i = Integer.parseInt(
                                    new String(mBuff, idx + 1, mBuffIdx - idx), 16);
                                if (i >= 0xffff) {
                                    panic(FAULT);
                                }
                                ch = (char) i;
                            } catch (NumberFormatException nfe) {
                                panic(FAULT);
                            }
                            // Restore the buffer offset
                            mBuffIdx = idx - 1;
                            if (ch == ' ' || mInp.next != null) {
                                bappend(ch, flag);
                            } else {
                                bappend(ch);
                            }
                            st = -1;
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }

        return str;
    }

    /**
     * Resolves a parameter entity.
     *
     * This method resolves a parameter entity reference. It also reports
     * external entities to the application.
     *
     * @param flag The '-' instructs the method not to set up surrounding
     * spaces [#4.4.8].
     * @exception Exception is parser specific exception from panic method.
2078 * @exception IOException 2079 */ 2080 private void pent(char flag) 2081 throws Exception { 2082 char ch; 2083 int idx = mBuffIdx + 1; 2084 Input inp = null; 2085 String str = null; 2086 bappend('%'); 2087 if (mPh != PH_DTD) // the DTD internal subset 2088 { 2089 return; // Not Recognized [#4.4.1] 2090 } // Read entity name 2091 bname(false); 2092 str = new String(mBuff, idx + 2, mBuffIdx - idx - 1); 2093 if (getch() != ';') { 2094 panic(FAULT); 2095 } 2096 inp = (Input) mPEnt.get(str); 2097 // Restore the buffer offset 2098 mBuffIdx = idx - 1; 2099 if (inp != null) { 2100 if (inp.chars == null) { 2101 // External parameter entity 2102 InputSource is = resolveEnt(str, inp.pubid, inp.sysid); 2103 if (is != null) { 2104 if (flag != '-') { 2105 bappend(' '); // tail space 2106 } 2107 push(new Input(BUFFSIZE_READER)); 2108 // BUG: there is no leading space! [#4.4.8] 2109 setinp(is); 2110 mInp.pubid = inp.pubid; 2111 mInp.sysid = inp.sysid; 2112 } else { 2113 // Unresolved external parameter entity 2114 skippedEnt("%" + str); 2115 } 2116 } else { 2117 // Internal parameter entity 2118 if (flag == '-') { 2119 // No surrounding spaces 2120 inp.chIdx = 1; 2121 } else { 2122 // Insert surrounding spaces 2123 bappend(' '); // tail space 2124 inp.chIdx = 0; 2125 } 2126 push(inp); 2127 } 2128 } else { 2129 // Unknown parameter entity 2130 skippedEnt("%" + str); 2131 } 2132 } 2133 2134 /** 2135 * Recognizes and handles a namespace declaration. 2136 * 2137 * This method identifies a type of namespace declaration if any and puts 2138 * new mapping on top of prefix stack. 2139 * 2140 * @param name The attribute qualified name (<code>name.value</code> is a 2141 * <code>String</code> object which represents the attribute prefix). 2142 * @param value The attribute value. 2143 * @return <code>true</code> if a namespace declaration is recognized. 
2144 */ 2145 private boolean isdecl(Pair name, String value) { 2146 if (name.chars[0] == 0) { 2147 if ("xmlns".equals(name.name) == true) { 2148 // New default namespace declaration 2149 mPref = pair(mPref); 2150 mPref.list = mElm; // prefix owner element 2151 mPref.value = value; 2152 mPref.name = ""; 2153 mPref.chars = NONS; 2154 mElm.num++; // namespace counter 2155 return true; 2156 } 2157 } else { 2158 if (name.eqpref(XMLNS) == true) { 2159 // New prefix declaration 2160 int len = name.name.length(); 2161 mPref = pair(mPref); 2162 mPref.list = mElm; // prefix owner element 2163 mPref.value = value; 2164 mPref.name = name.name; 2165 mPref.chars = new char[len + 1]; 2166 mPref.chars[0] = (char) (len + 1); 2167 name.name.getChars(0, len, mPref.chars, 1); 2168 mElm.num++; // namespace counter 2169 return true; 2170 } 2171 } 2172 return false; 2173 } 2174 2175 /** 2176 * Resolves a prefix. 2177 * 2178 * @return The namespace assigned to the prefix. 2179 * @exception Exception When mapping for specified prefix is not found. 2180 */ 2181 private String rslv(char[] qname) 2182 throws Exception { 2183 for (Pair pref = mPref; pref != null; pref = pref.next) { 2184 if (pref.eqpref(qname) == true) { 2185 return pref.value; 2186 } 2187 } 2188 if (qname[0] == 1) { // QNames like ':local' 2189 for (Pair pref = mPref; pref != null; pref = pref.next) { 2190 if (pref.chars[0] == 0) { 2191 return pref.value; 2192 } 2193 } 2194 } 2195 panic(FAULT); 2196 return null; 2197 } 2198 2199 /** 2200 * Skips xml white space characters. 2201 * 2202 * This method skips white space characters (' ', '\t', '\n', '\r') and 2203 * looks ahead not white space character. 2204 * 2205 * @return The first not white space look ahead character. 2206 * @exception IOException 2207 */ 2208 protected char wsskip() 2209 throws IOException { 2210 char ch; 2211 while (true) { 2212 // Read next character 2213 ch = (mChIdx < mChLen) ? 
mChars[mChIdx++] : getch(); 2214 if (ch < 0x80) { 2215 if (nmttyp[ch] != 3) // [ \t\n\r] 2216 { 2217 break; 2218 } 2219 } else { 2220 break; 2221 } 2222 } 2223 mChIdx--; // bkch(); 2224 return ch; 2225 } 2226 2227 /** 2228 * Reports document type. 2229 * 2230 * @param name The name of the entity. 2231 * @param pubid The public identifier of the entity or <code>null</code>. 2232 * @param sysid The system identifier of the entity or <code>null</code>. 2233 */ 2234 protected abstract void docType(String name, String pubid, String sysid) 2235 throws SAXException; 2236 2237 /** 2238 * Reports a comment. 2239 * 2240 * @param text The comment text starting from first charcater. 2241 * @param length The number of characters in comment. 2242 */ 2243 protected abstract void comm(char[] text, int length); 2244 2245 /** 2246 * Reports a processing instruction. 2247 * 2248 * @param target The processing instruction target name. 2249 * @param body The processing instruction body text. 2250 */ 2251 protected abstract void pi(String target, String body) 2252 throws Exception; 2253 2254 /** 2255 * Reports new namespace prefix. The Namespace prefix ( 2256 * <code>mPref.name</code>) being declared and the Namespace URI ( 2257 * <code>mPref.value</code>) the prefix is mapped to. An empty string is 2258 * used for the default element namespace, which has no prefix. 2259 */ 2260 protected abstract void newPrefix() 2261 throws Exception; 2262 2263 /** 2264 * Reports skipped entity name. 2265 * 2266 * @param name The entity name. 2267 */ 2268 protected abstract void skippedEnt(String name) 2269 throws Exception; 2270 2271 /** 2272 * Returns an 2273 * <code>InputSource</code> for specified entity or 2274 * <code>null</code>. 2275 * 2276 * @param name The name of the entity. 2277 * @param pubid The public identifier of the entity. 2278 * @param sysid The system identifier of the entity. 
*/
    protected abstract InputSource resolveEnt(
        String name, String pubid, String sysid)
        throws Exception;

    /**
     * Reports notation declaration.
     *
     * @param name The notation's name.
     * @param pubid The notation's public identifier, or null if none was given.
     * @param sysid The notation's system identifier, or null if none was given.
     */
    protected abstract void notDecl(String name, String pubid, String sysid)
        throws Exception;

    /**
     * Reports unparsed entity name.
     *
     * @param name The unparsed entity's name.
     * @param pubid The entity's public identifier, or null if none was given.
     * @param sysid The entity's system identifier.
     * @param notation The name of the associated notation.
     */
    protected abstract void unparsedEntDecl(
        String name, String pubid, String sysid, String notation)
        throws Exception;

    /**
     * Notifies the handler about fatal parsing error.
     *
     * @param msg The problem description message.
     */
    protected abstract void panic(String msg)
        throws Exception;

    /**
     * Reads a qualified xml name.
     *
     * This is low level routine which leaves a qName in the buffer. The
     * characters of a qualified name is an array of characters. The first
     * (chars[0]) character is the index of the colon character which separates
     * the prefix from the local name. If the index is zero, the name does not
     * contain separator or the parser works in the namespace unaware mode. The
     * length of qualified name is the length of the array minus one.
     *
     * @param ns The true value turns namespace conformance on.
     * @exception Exception is parser specific exception from panic method.
* @exception IOException
     */
    private void bname(boolean ns)
        throws Exception {
        char ch;
        char type;
        mBuffIdx++;  // allocate a char for colon offset
        int bqname = mBuffIdx;    // buffer index of the colon offset char
        int bcolon = bqname;      // buffer index of the colon (bqname if none)
        int bchidx = bqname + 1;  // buffer index for the next name char
        int bstart = bchidx;      // buffer index where the pending copy starts
        int cstart = mChIdx;      // mChars index where the pending copy starts
        // With namespaces the prefix is read first (0), otherwise the whole
        // name is read as a suffix (2).
        short st = (short) ((ns == true) ? 0 : 2);
        while (true) {
            // Read next character
            if (mChIdx >= mChLen) {
                // The character buffer is exhausted: save what has been
                // accepted so far, then refill and unread the new char.
                bcopy(cstart, bstart);
                getch();
                mChIdx--;  // bkch();
                cstart = mChIdx;
                bstart = bchidx;
            }
            ch = mChars[mChIdx++];
            type = (char) 0;  // [X]
            if (ch < 0x80) {
                type = (char) nmttyp[ch];
            } else if (ch == EOS) {
                panic(FAULT);
            }
            // Parse QName
            switch (st) {
                case 0:  // read the first char of the prefix
                case 2:  // read the first char of the suffix
                    switch (type) {
                        case 0:  // [aA_X]
                            bchidx++;  // append char to the buffer
                            st++;  // (st == 0)? 1: 3;
                            break;

                        case 1:  // [:]
                            // The colon is re-read in the next state.
                            mChIdx--;  // bkch();
                            st++;  // (st == 0)? 1: 3;
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                case 1:  // read the prefix
                case 3:  // read the suffix
                    switch (type) {
                        case 0:  // [aA_X]
                        case 2:  // [.-d]
                            bchidx++;  // append char to the buffer
                            break;

                        case 1:  // [:]
                            bchidx++;  // append char to the buffer
                            if (ns == true) {
                                if (bcolon != bqname) {
                                    panic(FAULT);  // it must be only one colon
                                }
                                bcolon = bchidx - 1;
                                if (st == 1) {
                                    st = 2;
                                }
                            }
                            break;

                        default:
                            // End of the name: unread the terminator, copy
                            // the pending chars and store the colon offset.
                            mChIdx--;  // bkch();
                            bcopy(cstart, bstart);
                            mBuff[bqname] = (char) (bcolon - bqname);
                            return;
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }
    }

    /**
     * Reads a nmtoken.
     *
     * This is low level routine which leaves a nmtoken in the buffer.
2414 * 2415 * @exception Exception is parser specific exception form panic method. 2416 * @exception IOException 2417 */ 2418 private void bntok() 2419 throws Exception { 2420 char ch; 2421 mBuffIdx = -1; 2422 bappend((char) 0); // default offset to the colon char 2423 while (true) { 2424 ch = getch(); 2425 switch (chtyp(ch)) { 2426 case 'a': 2427 case 'A': 2428 case 'd': 2429 case '.': 2430 case ':': 2431 case '-': 2432 case '_': 2433 case 'X': 2434 bappend(ch); 2435 break; 2436 2437 case 'Z': 2438 panic(FAULT); 2439 2440 default: 2441 bkch(); 2442 return; 2443 } 2444 } 2445 } 2446 2447 /** 2448 * Recognizes a keyword. 2449 * 2450 * This is low level routine which recognizes one of keywords in the buffer. 2451 * Keyword Id ID - i IDREF - r IDREFS - R ENTITY - n ENTITIES - N NMTOKEN - 2452 * t NMTOKENS - T ELEMENT - e ATTLIST - a NOTATION - o CDATA - c REQUIRED - 2453 * Q IMPLIED - I FIXED - F 2454 * 2455 * @return an id of a keyword or '?'. 2456 * @exception Exception is parser specific exception form panic method. 2457 * @exception IOException 2458 */ 2459 private char bkeyword() 2460 throws Exception { 2461 String str = new String(mBuff, 1, mBuffIdx); 2462 switch (str.length()) { 2463 case 2: // ID 2464 return ("ID".equals(str) == true) ? 'i' : '?'; 2465 2466 case 5: // IDREF, CDATA, FIXED 2467 switch (mBuff[1]) { 2468 case 'I': 2469 return ("IDREF".equals(str) == true) ? 'r' : '?'; 2470 case 'C': 2471 return ("CDATA".equals(str) == true) ? 'c' : '?'; 2472 case 'F': 2473 return ("FIXED".equals(str) == true) ? 'F' : '?'; 2474 default: 2475 break; 2476 } 2477 break; 2478 2479 case 6: // IDREFS, ENTITY 2480 switch (mBuff[1]) { 2481 case 'I': 2482 return ("IDREFS".equals(str) == true) ? 'R' : '?'; 2483 case 'E': 2484 return ("ENTITY".equals(str) == true) ? 'n' : '?'; 2485 default: 2486 break; 2487 } 2488 break; 2489 2490 case 7: // NMTOKEN, IMPLIED, ATTLIST, ELEMENT 2491 switch (mBuff[1]) { 2492 case 'I': 2493 return ("IMPLIED".equals(str) == true) ? 
'I' : '?'; 2494 case 'N': 2495 return ("NMTOKEN".equals(str) == true) ? 't' : '?'; 2496 case 'A': 2497 return ("ATTLIST".equals(str) == true) ? 'a' : '?'; 2498 case 'E': 2499 return ("ELEMENT".equals(str) == true) ? 'e' : '?'; 2500 default: 2501 break; 2502 } 2503 break; 2504 2505 case 8: // ENTITIES, NMTOKENS, NOTATION, REQUIRED 2506 switch (mBuff[2]) { 2507 case 'N': 2508 return ("ENTITIES".equals(str) == true) ? 'N' : '?'; 2509 case 'M': 2510 return ("NMTOKENS".equals(str) == true) ? 'T' : '?'; 2511 case 'O': 2512 return ("NOTATION".equals(str) == true) ? 'o' : '?'; 2513 case 'E': 2514 return ("REQUIRED".equals(str) == true) ? 'Q' : '?'; 2515 default: 2516 break; 2517 } 2518 break; 2519 2520 default: 2521 break; 2522 } 2523 return '?'; 2524 } 2525 2526 /** 2527 * Reads a single or double quotted string in to the buffer. 2528 * 2529 * This method resolves entities inside a string unless the parser parses 2530 * DTD. 2531 * 2532 * @param flag 'c' - CDATA, 'i' - non CDATA, ' ' - no normalization; '-' - 2533 * not an attribute value; 'd' - in DTD context. 2534 * @exception Exception is parser specific exception form panic method. 2535 * @exception IOException 2536 */ 2537 private void bqstr(char flag) 2538 throws Exception { 2539 Input inp = mInp; // remember the original input 2540 mBuffIdx = -1; 2541 bappend((char) 0); // default offset to the colon char 2542 char ch; 2543 for (short st = 0; st >= 0;) { 2544 ch = (mChIdx < mChLen) ? 
mChars[mChIdx++] : getch(); 2545 switch (st) { 2546 case 0: // read a single or double quote 2547 switch (ch) { 2548 case ' ': 2549 case '\n': 2550 case '\r': 2551 case '\t': 2552 break; 2553 2554 case '\'': 2555 st = 2; // read a single quoted string 2556 break; 2557 2558 case '\"': 2559 st = 3; // read a double quoted string 2560 break; 2561 2562 default: 2563 panic(FAULT); 2564 break; 2565 } 2566 break; 2567 2568 case 2: // read a single quoted string 2569 case 3: // read a double quoted string 2570 switch (ch) { 2571 case '\'': 2572 if ((st == 2) && (mInp == inp)) { 2573 st = -1; 2574 } else { 2575 bappend(ch); 2576 } 2577 break; 2578 2579 case '\"': 2580 if ((st == 3) && (mInp == inp)) { 2581 st = -1; 2582 } else { 2583 bappend(ch); 2584 } 2585 break; 2586 2587 case '&': 2588 if (flag != 'd') { 2589 ent(flag); 2590 } else { 2591 bappend(ch); 2592 } 2593 break; 2594 2595 case '%': 2596 if (flag == 'd') { 2597 pent('-'); 2598 } else { 2599 bappend(ch); 2600 } 2601 break; 2602 2603 case '<': 2604 if ((flag == '-') || (flag == 'd')) { 2605 bappend(ch); 2606 } else { 2607 panic(FAULT); 2608 } 2609 break; 2610 2611 case EOS: // EOS before single/double quote 2612 panic(FAULT); 2613 2614 case '\r': // EOL processing [#2.11 & #3.3.3] 2615 if (flag != ' ' && mInp.next == null) { 2616 if (getch() != '\n') { 2617 bkch(); 2618 } 2619 ch = '\n'; 2620 } 2621 default: 2622 bappend(ch, flag); 2623 break; 2624 } 2625 break; 2626 2627 default: 2628 panic(FAULT); 2629 } 2630 } 2631 // There is maximum one space at the end of the string in 2632 // i-mode (non CDATA normalization) and it has to be removed. 2633 if ((flag == 'i') && (mBuff[mBuffIdx] == ' ')) { 2634 mBuffIdx -= 1; 2635 } 2636 } 2637 2638 /** 2639 * Reports characters and empties the parser's buffer. This method is called 2640 * only if parser is going to return control to the main loop. 
This means 2641 * that this method may use parser buffer to report white space without 2642 * copeing characters to temporary buffer. 2643 */ 2644 protected abstract void bflash() 2645 throws Exception; 2646 2647 /** 2648 * Reports white space characters and empties the parser's buffer. This 2649 * method is called only if parser is going to return control to the main 2650 * loop. This means that this method may use parser buffer to report white 2651 * space without copeing characters to temporary buffer. 2652 */ 2653 protected abstract void bflash_ws() 2654 throws Exception; 2655 2656 /** 2657 * Appends a character to parser's buffer with normalization. 2658 * 2659 * @param ch The character to append to the buffer. 2660 * @param mode The normalization mode. 2661 */ 2662 private void bappend(char ch, char mode) { 2663 // This implements attribute value normalization as 2664 // described in the XML specification [#3.3.3]. 2665 switch (mode) { 2666 case 'i': // non CDATA normalization 2667 switch (ch) { 2668 case ' ': 2669 case '\n': 2670 case '\r': 2671 case '\t': 2672 if ((mBuffIdx > 0) && (mBuff[mBuffIdx] != ' ')) { 2673 bappend(' '); 2674 } 2675 return; 2676 2677 default: 2678 break; 2679 } 2680 break; 2681 2682 case 'c': // CDATA normalization 2683 switch (ch) { 2684 case '\n': 2685 case '\r': 2686 case '\t': 2687 ch = ' '; 2688 break; 2689 2690 default: 2691 break; 2692 } 2693 break; 2694 2695 default: // no normalization 2696 break; 2697 } 2698 mBuffIdx++; 2699 if (mBuffIdx < mBuff.length) { 2700 mBuff[mBuffIdx] = ch; 2701 } else { 2702 mBuffIdx--; 2703 bappend(ch); 2704 } 2705 } 2706 2707 /** 2708 * Appends a character to parser's buffer. 2709 * 2710 * @param ch The character to append to the buffer. 
2711 */ 2712 private void bappend(char ch) { 2713 try { 2714 mBuff[++mBuffIdx] = ch; 2715 } catch (Exception exp) { 2716 // Double the buffer size 2717 char buff[] = new char[mBuff.length << 1]; 2718 System.arraycopy(mBuff, 0, buff, 0, mBuff.length); 2719 mBuff = buff; 2720 mBuff[mBuffIdx] = ch; 2721 } 2722 } 2723 2724 /** 2725 * Appends (mChIdx - cidx) characters from character buffer (mChars) to 2726 * parser's buffer (mBuff). 2727 * 2728 * @param cidx The character buffer (mChars) start index. 2729 * @param bidx The parser buffer (mBuff) start index. 2730 */ 2731 private void bcopy(int cidx, int bidx) { 2732 int length = mChIdx - cidx; 2733 if ((bidx + length + 1) >= mBuff.length) { 2734 // Expand the buffer 2735 char buff[] = new char[mBuff.length + length]; 2736 System.arraycopy(mBuff, 0, buff, 0, mBuff.length); 2737 mBuff = buff; 2738 } 2739 System.arraycopy(mChars, cidx, mBuff, bidx, length); 2740 mBuffIdx += length; 2741 } 2742 2743 /** 2744 * Recognizes the built-in entities <i>lt</i>, <i>gt</i>, <i>amp</i>, 2745 * <i>apos</i>, <i>quot</i>. The initial state is 0x100. Any state belowe 2746 * 0x100 is a built-in entity replacement character. 2747 * 2748 * @param ch the next character of an entity name. 2749 */ 2750 private void eappend(char ch) { 2751 switch (mESt) { 2752 case 0x100: // "l" or "g" or "a" or "q" 2753 switch (ch) { 2754 case 'l': 2755 mESt = 0x101; 2756 break; 2757 case 'g': 2758 mESt = 0x102; 2759 break; 2760 case 'a': 2761 mESt = 0x103; 2762 break; 2763 case 'q': 2764 mESt = 0x107; 2765 break; 2766 default: 2767 mESt = 0x200; 2768 break; 2769 } 2770 break; 2771 2772 case 0x101: // "lt" 2773 mESt = (ch == 't') ? '<' : (char) 0x200; 2774 break; 2775 2776 case 0x102: // "gt" 2777 mESt = (ch == 't') ? 
'>' : (char) 0x200; 2778 break; 2779 2780 case 0x103: // "am" or "ap" 2781 switch (ch) { 2782 case 'm': 2783 mESt = 0x104; 2784 break; 2785 case 'p': 2786 mESt = 0x105; 2787 break; 2788 default: 2789 mESt = 0x200; 2790 break; 2791 } 2792 break; 2793 2794 case 0x104: // "amp" 2795 mESt = (ch == 'p') ? '&' : (char) 0x200; 2796 break; 2797 2798 case 0x105: // "apo" 2799 mESt = (ch == 'o') ? (char) 0x106 : (char) 0x200; 2800 break; 2801 2802 case 0x106: // "apos" 2803 mESt = (ch == 's') ? '\'' : (char) 0x200; 2804 break; 2805 2806 case 0x107: // "qu" 2807 mESt = (ch == 'u') ? (char) 0x108 : (char) 0x200; 2808 break; 2809 2810 case 0x108: // "quo" 2811 mESt = (ch == 'o') ? (char) 0x109 : (char) 0x200; 2812 break; 2813 2814 case 0x109: // "quot" 2815 mESt = (ch == 't') ? '\"' : (char) 0x200; 2816 break; 2817 2818 case '<': // "lt" 2819 case '>': // "gt" 2820 case '&': // "amp" 2821 case '\'': // "apos" 2822 case '\"': // "quot" 2823 mESt = 0x200; 2824 default: 2825 break; 2826 } 2827 } 2828 2829 /** 2830 * Sets up a new input source on the top of the input stack. Note, the first 2831 * byte returned by the entity's byte stream has to be the first byte in the 2832 * entity. However, the parser does not expect the byte order mask in both 2833 * cases when encoding is provided by the input source. 2834 * 2835 * @param is A new input source to set up. 2836 * @exception IOException If any IO errors occur. 2837 * @exception Exception is parser specific exception form panic method. 2838 */ 2839 protected void setinp(InputSource is) 2840 throws Exception { 2841 Reader reader = null; 2842 mChIdx = 0; 2843 mChLen = 0; 2844 mChars = mInp.chars; 2845 mInp.src = null; 2846 if (mPh < PH_DOC_START) { 2847 mIsSAlone = false; // default [#2.9] 2848 } 2849 mIsSAloneSet = false; 2850 if (is.getCharacterStream() != null) { 2851 // Ignore encoding in the xml text decl. 
2852 reader = is.getCharacterStream(); 2853 xml(reader); 2854 } else if (is.getByteStream() != null) { 2855 String expenc; 2856 if (is.getEncoding() != null) { 2857 // Ignore encoding in the xml text decl. 2858 expenc = is.getEncoding().toUpperCase(); 2859 if (expenc.equals("UTF-16")) { 2860 reader = bom(is.getByteStream(), 'U'); // UTF-16 [#4.3.3] 2861 } else { 2862 reader = enc(expenc, is.getByteStream()); 2863 } 2864 xml(reader); 2865 } else { 2866 // Get encoding from BOM or the xml text decl. 2867 reader = bom(is.getByteStream(), ' '); 2868 if (reader == null) { 2869 // Encoding is defined by the xml text decl. 2870 reader = enc("UTF-8", is.getByteStream()); 2871 expenc = xml(reader); 2872 if (expenc.startsWith("UTF-16")) { 2873 panic(FAULT); // UTF-16 must have BOM [#4.3.3] 2874 } 2875 reader = enc(expenc, is.getByteStream()); 2876 } else { 2877 // Encoding is defined by the BOM. 2878 xml(reader); 2879 } 2880 } 2881 } else { 2882 // There is no support for public/system identifiers. 2883 panic(FAULT); 2884 } 2885 mInp.src = reader; 2886 mInp.pubid = is.getPublicId(); 2887 mInp.sysid = is.getSystemId(); 2888 } 2889 2890 /** 2891 * Determines the entity encoding. 2892 * 2893 * This method gets encoding from Byte Order Mask [#4.3.3] if any. Note, the 2894 * first byte returned by the entity's byte stream has to be the first byte 2895 * in the entity. Also, there is no support for UCS-4. 2896 * 2897 * @param is A byte stream of the entity. 2898 * @param hint An encoding hint, character U means UTF-16. 2899 * @return a reader constructed from the BOM or UTF-8 by default. 2900 * @exception Exception is parser specific exception form panic method. 
2901 * @exception IOException 2902 */ 2903 private Reader bom(InputStream is, char hint) 2904 throws Exception { 2905 int val = is.read(); 2906 switch (val) { 2907 case 0xef: // UTF-8 2908 if (hint == 'U') // must be UTF-16 2909 { 2910 panic(FAULT); 2911 } 2912 if (is.read() != 0xbb) { 2913 panic(FAULT); 2914 } 2915 if (is.read() != 0xbf) { 2916 panic(FAULT); 2917 } 2918 return new ReaderUTF8(is); 2919 2920 case 0xfe: // UTF-16, big-endian 2921 if (is.read() != 0xff) { 2922 panic(FAULT); 2923 } 2924 return new ReaderUTF16(is, 'b'); 2925 2926 case 0xff: // UTF-16, little-endian 2927 if (is.read() != 0xfe) { 2928 panic(FAULT); 2929 } 2930 return new ReaderUTF16(is, 'l'); 2931 2932 case -1: 2933 mChars[mChIdx++] = EOS; 2934 return new ReaderUTF8(is); 2935 2936 default: 2937 if (hint == 'U') // must be UTF-16 2938 { 2939 panic(FAULT); 2940 } 2941 // Read the rest of UTF-8 character 2942 switch (val & 0xf0) { 2943 case 0xc0: 2944 case 0xd0: 2945 mChars[mChIdx++] = (char) (((val & 0x1f) << 6) | (is.read() & 0x3f)); 2946 break; 2947 2948 case 0xe0: 2949 mChars[mChIdx++] = (char) (((val & 0x0f) << 12) 2950 | ((is.read() & 0x3f) << 6) | (is.read() & 0x3f)); 2951 break; 2952 2953 case 0xf0: // UCS-4 character 2954 throw new UnsupportedEncodingException(); 2955 2956 default: 2957 mChars[mChIdx++] = (char) val; 2958 break; 2959 } 2960 return null; 2961 } 2962 } 2963 2964 /** 2965 * Parses the xml text declaration. 2966 * 2967 * This method gets encoding from the xml text declaration [#4.3.1] if any. 2968 * The method assumes the buffer (mChars) is big enough to accomodate whole 2969 * xml text declaration. 2970 * 2971 * @param reader is entity reader. 2972 * @return The xml text declaration encoding or default UTF-8 encoding. 2973 * @exception Exception is parser specific exception form panic method. 
2974 * @exception IOException 2975 */ 2976 private String xml(Reader reader) 2977 throws Exception { 2978 String str = null; 2979 String enc = "UTF-8"; 2980 char ch; 2981 int val; 2982 short st; 2983 // Read the xml text declaration into the buffer 2984 if (mChIdx != 0) { 2985 // The bom method have read ONE char into the buffer. 2986 st = (short) ((mChars[0] == '<') ? 1 : -1); 2987 } else { 2988 st = 0; 2989 } 2990 while (st >= 0 && mChIdx < mChars.length) { 2991 ch = ((val = reader.read()) >= 0) ? (char) val : EOS; 2992 mChars[mChIdx++] = ch; 2993 switch (st) { 2994 case 0: // read '<' of xml declaration 2995 switch (ch) { 2996 case '<': 2997 st = 1; 2998 break; 2999 3000 case 0xfeff: // the byte order mask 3001 ch = ((val = reader.read()) >= 0) ? (char) val : EOS; 3002 mChars[mChIdx - 1] = ch; 3003 st = (short) ((ch == '<') ? 1 : -1); 3004 break; 3005 3006 default: 3007 st = -1; 3008 break; 3009 } 3010 break; 3011 3012 case 1: // read '?' of xml declaration [#4.3.1] 3013 st = (short) ((ch == '?') ? 2 : -1); 3014 break; 3015 3016 case 2: // read 'x' of xml declaration [#4.3.1] 3017 st = (short) ((ch == 'x') ? 3 : -1); 3018 break; 3019 3020 case 3: // read 'm' of xml declaration [#4.3.1] 3021 st = (short) ((ch == 'm') ? 4 : -1); 3022 break; 3023 3024 case 4: // read 'l' of xml declaration [#4.3.1] 3025 st = (short) ((ch == 'l') ? 5 : -1); 3026 break; 3027 3028 case 5: // read white space after 'xml' 3029 switch (ch) { 3030 case ' ': 3031 case '\t': 3032 case '\r': 3033 case '\n': 3034 st = 6; 3035 break; 3036 3037 default: 3038 st = -1; 3039 break; 3040 } 3041 break; 3042 3043 case 6: // read content of xml declaration 3044 switch (ch) { 3045 case '?': 3046 st = 7; 3047 break; 3048 3049 case EOS: 3050 st = -2; 3051 break; 3052 3053 default: 3054 break; 3055 } 3056 break; 3057 3058 case 7: // read '>' after '?' 
of xml declaration 3059 switch (ch) { 3060 case '>': 3061 case EOS: 3062 st = -2; 3063 break; 3064 3065 default: 3066 st = 6; 3067 break; 3068 } 3069 break; 3070 3071 default: 3072 panic(FAULT); 3073 break; 3074 } 3075 } 3076 mChLen = mChIdx; 3077 mChIdx = 0; 3078 // If there is no xml text declaration, the encoding is default. 3079 if (st == -1) { 3080 return enc; 3081 } 3082 mChIdx = 5; // the first white space after "<?xml" 3083 // Parse the xml text declaration 3084 for (st = 0; st >= 0;) { 3085 ch = getch(); 3086 switch (st) { 3087 case 0: // skip spaces after the xml declaration name 3088 if (chtyp(ch) != ' ') { 3089 bkch(); 3090 st = 1; 3091 } 3092 break; 3093 3094 case 1: // read xml declaration version 3095 case 2: // read xml declaration encoding or standalone 3096 case 3: // read xml declaration standalone 3097 switch (chtyp(ch)) { 3098 case 'a': 3099 case 'A': 3100 case '_': 3101 bkch(); 3102 str = name(false).toLowerCase(); 3103 if ("version".equals(str) == true) { 3104 if (st != 1) { 3105 panic(FAULT); 3106 } 3107 if ("1.0".equals(eqstr('=')) != true) { 3108 panic(FAULT); 3109 } 3110 mInp.xmlver = 0x0100; 3111 st = 2; 3112 } else if ("encoding".equals(str) == true) { 3113 if (st != 2) { 3114 panic(FAULT); 3115 } 3116 mInp.xmlenc = eqstr('=').toUpperCase(); 3117 enc = mInp.xmlenc; 3118 st = 3; 3119 } else if ("standalone".equals(str) == true) { 3120 if ((st == 1) || (mPh >= PH_DOC_START)) // [#4.3.1] 3121 { 3122 panic(FAULT); 3123 } 3124 str = eqstr('=').toLowerCase(); 3125 // Check the 'standalone' value and use it [#5.1] 3126 if (str.equals("yes") == true) { 3127 mIsSAlone = true; 3128 } else if (str.equals("no") == true) { 3129 mIsSAlone = false; 3130 } else { 3131 panic(FAULT); 3132 } 3133 mIsSAloneSet = true; 3134 st = 4; 3135 } else { 3136 panic(FAULT); 3137 } 3138 break; 3139 3140 case ' ': 3141 break; 3142 3143 case '?': 3144 if (st == 1) { 3145 panic(FAULT); 3146 } 3147 bkch(); 3148 st = 4; 3149 break; 3150 3151 default: 3152 panic(FAULT); 
3153 } 3154 break; 3155 3156 case 4: // end of xml declaration 3157 switch (chtyp(ch)) { 3158 case '?': 3159 if (getch() != '>') { 3160 panic(FAULT); 3161 } 3162 if (mPh <= PH_DOC_START) { 3163 mPh = PH_MISC_DTD; // misc before DTD 3164 } 3165 st = -1; 3166 break; 3167 3168 case ' ': 3169 break; 3170 3171 default: 3172 panic(FAULT); 3173 } 3174 break; 3175 3176 default: 3177 panic(FAULT); 3178 } 3179 } 3180 return enc; 3181 } 3182 3183 /** 3184 * Sets up the document reader. 3185 * 3186 * @param name an encoding name. 3187 * @param is the document byte input stream. 3188 * @return a reader constructed from encoding name and input stream. 3189 * @exception UnsupportedEncodingException 3190 */ 3191 private Reader enc(String name, InputStream is) 3192 throws UnsupportedEncodingException { 3193 // DO NOT CLOSE current reader if any! 3194 if (name.equals("UTF-8")) { 3195 return new ReaderUTF8(is); 3196 } else if (name.equals("UTF-16LE")) { 3197 return new ReaderUTF16(is, 'l'); 3198 } else if (name.equals("UTF-16BE")) { 3199 return new ReaderUTF16(is, 'b'); 3200 } else { 3201 return new InputStreamReader(is, name); 3202 } 3203 } 3204 3205 /** 3206 * Sets up current input on the top of the input stack. 3207 * 3208 * @param inp A new input to set up. 3209 */ 3210 protected void push(Input inp) { 3211 mInp.chLen = mChLen; 3212 mInp.chIdx = mChIdx; 3213 inp.next = mInp; 3214 mInp = inp; 3215 mChars = inp.chars; 3216 mChLen = inp.chLen; 3217 mChIdx = inp.chIdx; 3218 } 3219 3220 /** 3221 * Restores previous input on the top of the input stack. 3222 */ 3223 protected void pop() { 3224 if (mInp.src != null) { 3225 try { 3226 mInp.src.close(); 3227 } catch (IOException ioe) { 3228 } 3229 mInp.src = null; 3230 } 3231 mInp = mInp.next; 3232 if (mInp != null) { 3233 mChars = mInp.chars; 3234 mChLen = mInp.chLen; 3235 mChIdx = mInp.chIdx; 3236 } else { 3237 mChars = null; 3238 mChLen = 0; 3239 mChIdx = 0; 3240 } 3241 } 3242 3243 /** 3244 * Maps a character to it's type. 
3245 * 3246 * Possible character type values are:<br /> - ' ' for any kind of white 3247 * space character;<br /> - 'a' for any lower case alphabetical character 3248 * value;<br /> - 'A' for any upper case alphabetical character value;<br /> 3249 * - 'd' for any decimal digit character value;<br /> - 'z' for any 3250 * character less then ' ' except '\t', '\n', '\r';<br /> - 'X' for any not 3251 * ASCII character;<br /> - 'Z' for EOS character.<br /> An ASCII (7 bit) 3252 * character which does not fall in any category listed above is mapped to 3253 * it self. 3254 * 3255 * @param ch The character to map. 3256 * @return The type of character. 3257 */ 3258 protected char chtyp(char ch) { 3259 if (ch < 0x80) { 3260 return (char) asctyp[ch]; 3261 } 3262 return (ch != EOS) ? 'X' : 'Z'; 3263 } 3264 3265 /** 3266 * Retrives the next character in the document. 3267 * 3268 * @return The next character in the document. 3269 */ 3270 protected char getch() 3271 throws IOException { 3272 if (mChIdx >= mChLen) { 3273 if (mInp.src == null) { 3274 pop(); // remove internal entity 3275 return getch(); 3276 } 3277 // Read new portion of the document characters 3278 int Num = mInp.src.read(mChars, 0, mChars.length); 3279 if (Num < 0) { 3280 if (mInp != mDoc) { 3281 pop(); // restore the previous input 3282 return getch(); 3283 } else { 3284 mChars[0] = EOS; 3285 mChLen = 1; 3286 } 3287 } else { 3288 mChLen = Num; 3289 } 3290 mChIdx = 0; 3291 } 3292 return mChars[mChIdx++]; 3293 } 3294 3295 /** 3296 * Puts back the last read character. 3297 * 3298 * This method <strong>MUST NOT</strong> be called more then once after each 3299 * call of {@link #getch getch} method. 3300 */ 3301 protected void bkch() 3302 throws Exception { 3303 if (mChIdx <= 0) { 3304 panic(FAULT); 3305 } 3306 mChIdx--; 3307 } 3308 3309 /** 3310 * Sets the current character. 3311 * 3312 * @param ch The character to set. 
3313 */ 3314 protected void setch(char ch) { 3315 mChars[mChIdx] = ch; 3316 } 3317 3318 /** 3319 * Finds a pair in the pair chain by a qualified name. 3320 * 3321 * @param chain The first element of the chain of pairs. 3322 * @param qname The qualified name. 3323 * @return A pair with the specified qualified name or null. 3324 */ 3325 protected Pair find(Pair chain, char[] qname) { 3326 for (Pair pair = chain; pair != null; pair = pair.next) { 3327 if (pair.eqname(qname) == true) { 3328 return pair; 3329 } 3330 } 3331 return null; 3332 } 3333 3334 /** 3335 * Provedes an instance of a pair. 3336 * 3337 * @param next The reference to a next pair. 3338 * @return An instance of a pair. 3339 */ 3340 protected Pair pair(Pair next) { 3341 Pair pair; 3342 3343 if (mDltd != null) { 3344 pair = mDltd; 3345 mDltd = pair.next; 3346 } else { 3347 pair = new Pair(); 3348 } 3349 pair.next = next; 3350 3351 return pair; 3352 } 3353 3354 /** 3355 * Deletes an instance of a pair. 3356 * 3357 * @param pair The pair to delete. 3358 * @return A reference to the next pair in a chain. 3359 */ 3360 protected Pair del(Pair pair) { 3361 Pair next = pair.next; 3362 3363 pair.name = null; 3364 pair.value = null; 3365 pair.chars = null; 3366 pair.list = null; 3367 pair.next = mDltd; 3368 mDltd = pair; 3369 3370 return next; 3371 } 3372 }