1 /* 2 * Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package javax.swing.text.html.parser; 27 28 import javax.swing.text.SimpleAttributeSet; 29 import javax.swing.text.html.HTML; 30 import javax.swing.text.ChangedCharSetException; 31 import java.io.*; 32 import java.util.Hashtable; 33 import java.util.Properties; 34 import java.util.Vector; 35 import java.util.Enumeration; 36 import java.net.URL; 37 38 import sun.misc.MessageUtils; 39 40 /** 41 * A simple DTD-driven HTML parser. The parser reads an 42 * HTML file from an InputStream and calls various methods 43 * (which should be overridden in a subclass) when tags and 44 * data are encountered. 
 * <p>
 * Unfortunately there are many badly implemented HTML parsers
 * out there, and as a result there are many badly formatted
 * HTML files. This parser attempts to parse most HTML files.
 * This means that the implementation sometimes deviates from
 * the SGML specification in favor of HTML.
 * <p>
 * The parser treats \r and \r\n as \n. Newlines after start tags
 * and before end tags are ignored just as specified in the SGML/HTML
 * specification.
 * <p>
 * The HTML spec does not specify very well how spaces are to be coalesced.
 * Specifically, the following scenarios are not discussed:
 * <p>
 * {@code '<b>blah <i> <strike> foo'} which can be treated as:
 * {@code '<b>blah <i><strike>foo'}
 * <p>as well as:
 * {@code '<p><a href="xx"> <em>Using</em></a></p>'}
 * which appears to be treated as:
 * {@code '<p><a href="xx"><em>Using</em></a></p>'}
 * <p>
 * If <code>strict</code> is false, when a tag that breaks flow
 * (<code>TagElement.breaksFlow</code>) or trailing whitespace is
 * encountered, all whitespace will be ignored until a non-whitespace
 * character is encountered. This appears to give behavior closer to
 * the popular browsers.
 *
 * @see DTD
 * @see TagElement
 * @see SimpleAttributeSet
 * @author Arthur van Hoff
 * @author Sunita Mani
 */
public
class Parser implements DTDConstants {

    /** Buffer of pending character data; only the first {@code textpos} chars are valid. */
    private char text[] = new char[1024];
    /** Number of valid chars in {@code text}. */
    private int textpos = 0;
    /** The tag most recently recorded by {@code handleText(TagElement)} / {@code startTag}. */
    private TagElement last;
    /** True while a single space is pending output in front of the next text flush. */
    private boolean space;

    /** Scratch buffer filled by {@code addString}; only the first {@code strpos} chars are valid. */
    private char str[] = new char[128];
    /** Number of valid chars in {@code str}. */
    private int strpos = 0;

    /**
     * The dtd.
     */
    protected DTD dtd = null;

    /** The current input character; the parse methods treat -1 as end of input. */
    private int ch;
    /** The current line number, as reported by {@code getCurrentLine()}. */
    private int ln;
    /** The reader the document is being read from. */
    private Reader in;

    /** Element of the tag on top of {@code stack}, or null when the stack is empty. */
    private Element recent;
    /** Stack of currently open tags; null when no tag is open. */
    private TagStack stack;
    /** Set by {@code legalElementContext} when the element being checked is to be ignored. */
    private boolean skipTag = false;
    // NOTE(review): lastFormSent is not referenced by any code visible in this
    // part of the file -- confirm its use before documenting it further.
    private TagElement lastFormSent = null;
    /** Attributes collected for the tag currently being processed. */
    private SimpleAttributeSet attributes = new SimpleAttributeSet();

    // State for <html>, <head> and <body>.  Since people like to slap
    // together HTML documents without thinking, occasionally they
    // have multiple instances of these tags.  These booleans track
    // the first sightings of these tags so they can be safely ignored
    // by the parser if repeated.
    private boolean seenHtml = false;
    private boolean seenHead = false;
    private boolean seenBody = false;

    /**
     * The html spec does not specify how spaces are coalesced very well.
     * If strict == false, ignoreSpace is used to try and mimic the behavior
     * of the popular browsers.
     * <p>
     * The problematic scenarios are:
     * {@code '<b>blah <i> <strike> foo'} which can be treated as:
     * {@code '<b>blah <i><strike>foo'}
     * as well as:
     * {@code '<p><a href="xx"> <em>Using</em></a></p>'}
     * which appears to be treated as:
     * {@code '<p><a href="xx"><em>Using</em></a></p>'}
     * <p>
     * When a tag that breaks flow, or trailing whitespace is encountered
     * ignoreSpace is set to true. From then on, all whitespace will be
     * ignored.
     * ignoreSpace will be set back to false the first time a
     * non whitespace character is encountered. This appears to give
     * behavior closer to the popular browsers.
     */
    private boolean ignoreSpace;

    /**
     * This flag determines whether or not the Parser will be strict
     * in enforcing SGML compatibility.  If false, it will be lenient
     * with certain common classes of erroneous HTML constructs.
     * Strict or not, in either case an error will be recorded.
     *
     */
    protected boolean strict = false;


    /** Number of \r\n's encountered. */
    private int crlfCount;
    /** Number of \r's encountered. A \r\n will not increment this. */
    private int crCount;
    /** Number of \n's encountered. A \r\n will not increment this. */
    private int lfCount;

    //
    // To correctly identify the start of a tag/comment/text we need two
    // ivars. Two are needed as handleText isn't invoked until the tag
    // after the text has been parsed, that is the parser parses the text,
    // then a tag, then invokes handleText followed by handleStart.
    //
    /** The start position of the current block. Block is overloaded here,
     * it really means the current start position for the current comment,
     * tag, text. Use getBlockStartPosition to access this. */
    private int currentBlockStartPos;
    /** Start position of the last block. */
    private int lastBlockStartPos;

    /**
     * Array for mapping numeric references in the range
     * 130-159 to displayable Unicode characters. Entry {@code i}
     * is the replacement for reference code {@code 130 + i}; codes whose
     * entry equals the code itself (141-144, 157, 158) are left unmapped.
     */
    private static final char[] cp1252Map = {
        8218,  // 130 -> U+201A single low-9 quotation mark
        402,   // 131 -> U+0192 latin small letter f with hook
        8222,  // 132 -> U+201E double low-9 quotation mark
        8230,  // 133 -> U+2026 horizontal ellipsis
        8224,  // 134 -> U+2020 dagger
        8225,  // 135 -> U+2021 double dagger
        710,   // 136 -> U+02C6 modifier letter circumflex accent
        8240,  // 137 -> U+2030 per mille sign
        352,   // 138 -> U+0160 latin capital letter S with caron
        8249,  // 139 -> U+2039 single left-pointing angle quotation mark
        338,   // 140 -> U+0152 latin capital ligature OE
        141,   // 141 (unmapped)
        142,   // 142 (unmapped)
        143,   // 143 (unmapped)
        144,   // 144 (unmapped)
        8216,  // 145 -> U+2018 left single quotation mark
        8217,  // 146 -> U+2019 right single quotation mark
        8220,  // 147 -> U+201C left double quotation mark
        8221,  // 148 -> U+201D right double quotation mark
        8226,  // 149 -> U+2022 bullet
        8211,  // 150 -> U+2013 en dash
        8212,  // 151 -> U+2014 em dash
        732,   // 152 -> U+02DC small tilde
        8482,  // 153 -> U+2122 trade mark sign
        353,   // 154 -> U+0161 latin small letter s with caron
        8250,  // 155 -> U+203A single right-pointing angle quotation mark
        339,   // 156 -> U+0153 latin small ligature oe
        157,   // 157 (unmapped)
        158,   // 158 (unmapped)
        376    // 159 -> U+0178 latin capital letter Y with diaeresis
    };

    /**
     * Creates parser with the specified {@code dtd}.
     *
     * @param dtd the dtd.
     */
    public Parser(DTD dtd) {
        this.dtd = dtd;
    }


    /**
     * @return the line number of the line currently being parsed
     */
    protected int getCurrentLine() {
        return ln;
    }

    /**
     * Returns the start position of the current block. Block is
     * overloaded here, it really means the current start position for
     * the current comment tag, text, block....
This is provided for 225 * subclassers that wish to know the start of the current block when 226 * called with one of the handleXXX methods. 227 * 228 * @return the start position of the current block 229 */ 230 int getBlockStartPosition() { 231 return Math.max(0, lastBlockStartPos - 1); 232 } 233 234 /** 235 * Makes a TagElement. 236 * 237 * @param elem the element storing the tag definition 238 * @param fictional the value of the flag "{@code fictional}" to be set for the tag 239 * 240 * @return the created {@code TagElement} 241 */ 242 protected TagElement makeTag(Element elem, boolean fictional) { 243 return new TagElement(elem, fictional); 244 } 245 246 /** 247 * Makes a TagElement. 248 * 249 * @param elem the element storing the tag definition 250 * 251 * @return the created {@code TagElement} 252 */ 253 protected TagElement makeTag(Element elem) { 254 return makeTag(elem, false); 255 } 256 257 /** 258 * Returns attributes for the current tag. 259 * 260 * @return {@code SimpleAttributeSet} containing the attributes 261 */ 262 protected SimpleAttributeSet getAttributes() { 263 return attributes; 264 } 265 266 /** 267 * Removes the current attributes. 268 */ 269 protected void flushAttributes() { 270 attributes.removeAttributes(attributes); 271 } 272 273 /** 274 * Called when PCDATA is encountered. 275 * 276 * @param text the section text 277 */ 278 protected void handleText(char text[]) { 279 } 280 281 /** 282 * Called when an HTML title tag is encountered. 283 * 284 * @param text the title text 285 */ 286 protected void handleTitle(char text[]) { 287 // default behavior is to call handleText. Subclasses 288 // can override if necessary. 289 handleText(text); 290 } 291 292 /** 293 * Called when an HTML comment is encountered. 294 * 295 * @param text the comment being handled 296 */ 297 protected void handleComment(char text[]) { 298 } 299 300 /** 301 * Called when the content terminates without closing the HTML comment. 
302 */ 303 protected void handleEOFInComment() { 304 // We've reached EOF. Our recovery strategy is to 305 // see if we have more than one line in the comment; 306 // if so, we pretend that the comment was an unterminated 307 // single line comment, and reparse the lines after the 308 // first line as normal HTML content. 309 310 int commentEndPos = strIndexOf('\n'); 311 if (commentEndPos >= 0) { 312 handleComment(getChars(0, commentEndPos)); 313 try { 314 in.close(); 315 in = new CharArrayReader(getChars(commentEndPos + 1)); 316 ch = '>'; 317 } catch (IOException e) { 318 error("ioexception"); 319 } 320 321 resetStrBuffer(); 322 } else { 323 // no newline, so signal an error 324 error("eof.comment"); 325 } 326 } 327 328 /** 329 * Called when an empty tag is encountered. 330 * 331 * @param tag the tag being handled 332 * @throws ChangedCharSetException if the document charset was changed 333 */ 334 protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException { 335 } 336 337 /** 338 * Called when a start tag is encountered. 339 * 340 * @param tag the tag being handled 341 */ 342 protected void handleStartTag(TagElement tag) { 343 } 344 345 /** 346 * Called when an end tag is encountered. 347 * 348 * @param tag the tag being handled 349 */ 350 protected void handleEndTag(TagElement tag) { 351 } 352 353 /** 354 * An error has occurred. 355 * 356 * @param ln the number of line containing the error 357 * @param msg the error message 358 */ 359 protected void handleError(int ln, String msg) { 360 /* 361 Thread.dumpStack(); 362 System.out.println("**** " + stack); 363 System.out.println("line " + ln + ": error: " + msg); 364 System.out.println(); 365 */ 366 } 367 368 /** 369 * Output text. 
370 */ 371 void handleText(TagElement tag) { 372 if (tag.breaksFlow()) { 373 space = false; 374 if (!strict) { 375 ignoreSpace = true; 376 } 377 } 378 if (textpos == 0) { 379 if ((!space) || (stack == null) || last.breaksFlow() || 380 !stack.advance(dtd.pcdata)) { 381 last = tag; 382 space = false; 383 lastBlockStartPos = currentBlockStartPos; 384 return; 385 } 386 } 387 if (space) { 388 if (!ignoreSpace) { 389 // enlarge buffer if needed 390 if (textpos + 1 > text.length) { 391 char newtext[] = new char[text.length + 200]; 392 System.arraycopy(text, 0, newtext, 0, text.length); 393 text = newtext; 394 } 395 396 // output pending space 397 text[textpos++] = ' '; 398 if (!strict && !tag.getElement().isEmpty()) { 399 ignoreSpace = true; 400 } 401 } 402 space = false; 403 } 404 char newtext[] = new char[textpos]; 405 System.arraycopy(text, 0, newtext, 0, textpos); 406 // Handles cases of bad html where the title tag 407 // was getting lost when we did error recovery. 408 if (tag.getElement().getName().equals("title")) { 409 handleTitle(newtext); 410 } else { 411 handleText(newtext); 412 } 413 lastBlockStartPos = currentBlockStartPos; 414 textpos = 0; 415 last = tag; 416 space = false; 417 } 418 419 /** 420 * Invokes the error handler. 421 * 422 * @param err the error type 423 * @param arg1 the 1st error message argument 424 * @param arg2 the 2nd error message argument 425 * @param arg3 the 3rd error message argument 426 */ 427 protected void error(String err, String arg1, String arg2, 428 String arg3) { 429 handleError(ln, err + " " + arg1 + " " + arg2 + " " + arg3); 430 } 431 432 /** 433 * Invokes the error handler with the 3rd error message argument "?". 
434 * 435 * @param err the error type 436 * @param arg1 the 1st error message argument 437 * @param arg2 the 2nd error message argument 438 */ 439 protected void error(String err, String arg1, String arg2) { 440 error(err, arg1, arg2, "?"); 441 } 442 443 /** 444 * Invokes the error handler with the 2nd and 3rd error message argument "?". 445 * 446 * @param err the error type 447 * @param arg1 the 1st error message argument 448 */ 449 protected void error(String err, String arg1) { 450 error(err, arg1, "?", "?"); 451 } 452 453 /** 454 * Invokes the error handler with the 1st, 2nd and 3rd error message argument "?". 455 * 456 * @param err the error type 457 */ 458 protected void error(String err) { 459 error(err, "?", "?", "?"); 460 } 461 462 463 /** 464 * Handle a start tag. The new tag is pushed 465 * onto the tag stack. The attribute list is 466 * checked for required attributes. 467 * 468 * @param tag the tag 469 * @throws ChangedCharSetException if the document charset was changed 470 */ 471 protected void startTag(TagElement tag) throws ChangedCharSetException { 472 Element elem = tag.getElement(); 473 474 // If the tag is an empty tag and texpos != 0 475 // this implies that there is text before the 476 // start tag that needs to be processed before 477 // handling the tag. 478 // 479 if (!elem.isEmpty() || 480 ((last != null) && !last.breaksFlow()) || 481 (textpos != 0)) { 482 handleText(tag); 483 } else { 484 // this variable gets updated in handleText(). 485 // Since in this case we do not call handleText() 486 // we need to update it here. 487 // 488 last = tag; 489 // Note that we should really check last.breakFlows before 490 // assuming this should be false. 
491 space = false; 492 } 493 lastBlockStartPos = currentBlockStartPos; 494 495 // check required attributes 496 for (AttributeList a = elem.atts ; a != null ; a = a.next) { 497 if ((a.modifier == REQUIRED) && 498 ((attributes.isEmpty()) || 499 ((!attributes.isDefined(a.name)) && 500 (!attributes.isDefined(HTML.getAttributeKey(a.name)))))) { 501 error("req.att ", a.getName(), elem.getName()); 502 } 503 } 504 505 if (elem.isEmpty()) { 506 handleEmptyTag(tag); 507 /* 508 } else if (elem.getName().equals("form")) { 509 handleStartTag(tag); 510 */ 511 } else { 512 recent = elem; 513 stack = new TagStack(tag, stack); 514 handleStartTag(tag); 515 } 516 } 517 518 /** 519 * Handle an end tag. The end tag is popped 520 * from the tag stack. 521 * 522 * @param omitted {@code true} if the tag is no actually present in the 523 * document, but is supposed by the parser 524 */ 525 protected void endTag(boolean omitted) { 526 handleText(stack.tag); 527 528 if (omitted && !stack.elem.omitEnd()) { 529 error("end.missing", stack.elem.getName()); 530 } else if (!stack.terminate()) { 531 error("end.unexpected", stack.elem.getName()); 532 } 533 534 // handle the tag 535 handleEndTag(stack.tag); 536 stack = stack.next; 537 recent = (stack != null) ? stack.elem : null; 538 } 539 540 541 boolean ignoreElement(Element elem) { 542 543 String stackElement = stack.elem.getName(); 544 String elemName = elem.getName(); 545 /* We ignore all elements that are not valid in the context of 546 a table except <td>, <th> (these we handle in 547 legalElementContext()) and #pcdata. We also ignore the 548 <font> tag in the context of <ul> and <ol> We additonally 549 ignore the <meta> and the <style> tag if the body tag has 550 been seen. 
**/ 551 if ((elemName.equals("html") && seenHtml) || 552 (elemName.equals("head") && seenHead) || 553 (elemName.equals("body") && seenBody)) { 554 return true; 555 } 556 if (elemName.equals("dt") || elemName.equals("dd")) { 557 TagStack s = stack; 558 while (s != null && !s.elem.getName().equals("dl")) { 559 s = s.next; 560 } 561 if (s == null) { 562 return true; 563 } 564 } 565 566 if (((stackElement.equals("table")) && 567 (!elemName.equals("#pcdata")) && (!elemName.equals("input"))) || 568 ((elemName.equals("font")) && 569 (stackElement.equals("ul") || stackElement.equals("ol"))) || 570 (elemName.equals("meta") && stack != null) || 571 (elemName.equals("style") && seenBody) || 572 (stackElement.equals("table") && elemName.equals("a"))) { 573 return true; 574 } 575 return false; 576 } 577 578 579 /** 580 * Marks the first time a tag has been seen in a document 581 * 582 * @param elem the element represented by the tag 583 */ 584 585 protected void markFirstTime(Element elem) { 586 String elemName = elem.getName(); 587 if (elemName.equals("html")) { 588 seenHtml = true; 589 } else if (elemName.equals("head")) { 590 seenHead = true; 591 } else if (elemName.equals("body")) { 592 if (buf.length == 1) { 593 // Refer to note in definition of buf for details on this. 594 char[] newBuf = new char[256]; 595 596 newBuf[0] = buf[0]; 597 buf = newBuf; 598 } 599 seenBody = true; 600 } 601 } 602 603 /** 604 * Create a legal content for an element. 
605 */ 606 boolean legalElementContext(Element elem) throws ChangedCharSetException { 607 608 // System.out.println("-- legalContext -- " + elem); 609 610 // Deal with the empty stack 611 if (stack == null) { 612 // System.out.println("-- stack is empty"); 613 if (elem != dtd.html) { 614 // System.out.println("-- pushing html"); 615 startTag(makeTag(dtd.html, true)); 616 return legalElementContext(elem); 617 } 618 return true; 619 } 620 621 // Is it allowed in the current context 622 if (stack.advance(elem)) { 623 // System.out.println("-- legal context"); 624 markFirstTime(elem); 625 return true; 626 } 627 boolean insertTag = false; 628 629 // The use of all error recovery strategies are contingent 630 // on the value of the strict property. 631 // 632 // These are commonly occurring errors. if insertTag is true, 633 // then we want to adopt an error recovery strategy that 634 // involves attempting to insert an additional tag to 635 // legalize the context. The two errors addressed here 636 // are: 637 // 1) when a <td> or <th> is seen soon after a <table> tag. 638 // In this case we insert a <tr>. 639 // 2) when any other tag apart from a <tr> is seen 640 // in the context of a <tr>. In this case we would 641 // like to add a <td>. If a <tr> is seen within a 642 // <tr> context, then we will close out the current 643 // <tr>. 644 // 645 // This insertion strategy is handled later in the method. 646 // The reason for checking this now, is that in other cases 647 // we would like to apply other error recovery strategies for example 648 // ignoring tags. 649 // 650 // In certain cases it is better to ignore a tag than try to 651 // fix the situation. So the first test is to see if this 652 // is what we need to do. 
653 // 654 String stackElemName = stack.elem.getName(); 655 String elemName = elem.getName(); 656 657 658 if (!strict && 659 ((stackElemName.equals("table") && elemName.equals("td")) || 660 (stackElemName.equals("table") && elemName.equals("th")) || 661 (stackElemName.equals("tr") && !elemName.equals("tr")))){ 662 insertTag = true; 663 } 664 665 666 if (!strict && !insertTag && (stack.elem.getName() != elem.getName() || 667 elem.getName().equals("body"))) { 668 if (skipTag = ignoreElement(elem)) { 669 error("tag.ignore", elem.getName()); 670 return skipTag; 671 } 672 } 673 674 // Check for anything after the start of the table besides tr, td, th 675 // or caption, and if those aren't there, insert the <tr> and call 676 // legalElementContext again. 677 if (!strict && stackElemName.equals("table") && 678 !elemName.equals("tr") && !elemName.equals("td") && 679 !elemName.equals("th") && !elemName.equals("caption")) { 680 Element e = dtd.getElement("tr"); 681 TagElement t = makeTag(e, true); 682 legalTagContext(t); 683 startTag(t); 684 error("start.missing", elem.getName()); 685 return legalElementContext(elem); 686 } 687 688 // They try to find a legal context by checking if the current 689 // tag is valid in an enclosing context. If so 690 // close out the tags by outputing end tags and then 691 // insert the current tag. If the tags that are 692 // being closed out do not have an optional end tag 693 // specification in the DTD then an html error is 694 // reported. 695 // 696 if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) { 697 for (TagStack s = stack.next ; s != null ; s = s.next) { 698 if (s.advance(elem)) { 699 while (stack != s) { 700 endTag(true); 701 } 702 return true; 703 } 704 if (!s.terminate() || (strict && !s.elem.omitEnd())) { 705 break; 706 } 707 } 708 } 709 710 // Check if we know what tag is expected next. 711 // If so insert the tag. Report an error if the 712 // tag does not have its start tag spec in the DTD as optional. 
713 // 714 Element next = stack.first(); 715 if (next != null && (!strict || next.omitStart()) && 716 !(next==dtd.head && elem==dtd.pcdata) ) { 717 // System.out.println("-- omitting start tag: " + next); 718 TagElement t = makeTag(next, true); 719 legalTagContext(t); 720 startTag(t); 721 if (!next.omitStart()) { 722 error("start.missing", elem.getName()); 723 } 724 return legalElementContext(elem); 725 } 726 727 728 // Traverse the list of expected elements and determine if adding 729 // any of these elements would make for a legal context. 730 // 731 732 if (!strict) { 733 ContentModel content = stack.contentModel(); 734 Vector<Element> elemVec = new Vector<Element>(); 735 if (content != null) { 736 content.getElements(elemVec); 737 for (Element e : elemVec) { 738 // Ensure that this element has not been included as 739 // part of the exclusions in the DTD. 740 // 741 if (stack.excluded(e.getIndex())) { 742 continue; 743 } 744 745 boolean reqAtts = false; 746 747 for (AttributeList a = e.getAttributes(); a != null ; a = a.next) { 748 if (a.modifier == REQUIRED) { 749 reqAtts = true; 750 break; 751 } 752 } 753 // Ensure that no tag that has required attributes 754 // gets inserted. 755 // 756 if (reqAtts) { 757 continue; 758 } 759 760 ContentModel m = e.getContent(); 761 if (m != null && m.first(elem)) { 762 // System.out.println("-- adding a legal tag: " + e); 763 TagElement t = makeTag(e, true); 764 legalTagContext(t); 765 startTag(t); 766 error("start.missing", e.getName()); 767 return legalElementContext(elem); 768 } 769 } 770 } 771 } 772 773 // Check if the stack can be terminated. If so add the appropriate 774 // end tag. Report an error if the tag being ended does not have its 775 // end tag spec in the DTD as optional. 
776 // 777 if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) { 778 // System.out.println("-- omitting end tag: " + stack.elem); 779 if (!stack.elem.omitEnd()) { 780 error("end.missing", elem.getName()); 781 } 782 783 endTag(true); 784 return legalElementContext(elem); 785 } 786 787 // At this point we know that something is screwed up. 788 return false; 789 } 790 791 /** 792 * Create a legal context for a tag. 793 */ 794 void legalTagContext(TagElement tag) throws ChangedCharSetException { 795 if (legalElementContext(tag.getElement())) { 796 markFirstTime(tag.getElement()); 797 return; 798 } 799 800 // Avoid putting a block tag in a flow tag. 801 if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) { 802 endTag(true); 803 legalTagContext(tag); 804 return; 805 } 806 807 // Avoid putting something wierd in the head of the document. 808 for (TagStack s = stack ; s != null ; s = s.next) { 809 if (s.tag.getElement() == dtd.head) { 810 while (stack != s) { 811 endTag(true); 812 } 813 endTag(true); 814 legalTagContext(tag); 815 return; 816 } 817 } 818 819 // Everything failed 820 error("tag.unexpected", tag.getElement().getName()); 821 } 822 823 /** 824 * Error context. Something went wrong, make sure we are in 825 * the document's body context 826 */ 827 void errorContext() throws ChangedCharSetException { 828 for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) { 829 handleEndTag(stack.tag); 830 } 831 if (stack == null) { 832 legalElementContext(dtd.body); 833 startTag(makeTag(dtd.body, true)); 834 } 835 } 836 837 /** 838 * Add a char to the string buffer. 839 */ 840 void addString(int c) { 841 if (strpos == str.length) { 842 char newstr[] = new char[str.length + 128]; 843 System.arraycopy(str, 0, newstr, 0, str.length); 844 str = newstr; 845 } 846 str[strpos++] = (char)c; 847 } 848 849 /** 850 * Get the string that's been accumulated. 
851 */ 852 String getString(int pos) { 853 char newStr[] = new char[strpos - pos]; 854 System.arraycopy(str, pos, newStr, 0, strpos - pos); 855 strpos = pos; 856 return new String(newStr); 857 } 858 859 char[] getChars(int pos) { 860 char newStr[] = new char[strpos - pos]; 861 System.arraycopy(str, pos, newStr, 0, strpos - pos); 862 strpos = pos; 863 return newStr; 864 } 865 866 char[] getChars(int pos, int endPos) { 867 char newStr[] = new char[endPos - pos]; 868 System.arraycopy(str, pos, newStr, 0, endPos - pos); 869 // REMIND: it's not clear whether this version should set strpos or not 870 // strpos = pos; 871 return newStr; 872 } 873 874 void resetStrBuffer() { 875 strpos = 0; 876 } 877 878 int strIndexOf(char target) { 879 for (int i = 0; i < strpos; i++) { 880 if (str[i] == target) { 881 return i; 882 } 883 } 884 885 return -1; 886 } 887 888 /** 889 * Skip space. 890 * [5] 297:5 891 */ 892 void skipSpace() throws IOException { 893 while (true) { 894 switch (ch) { 895 case '\n': 896 ln++; 897 ch = readCh(); 898 lfCount++; 899 break; 900 901 case '\r': 902 ln++; 903 if ((ch = readCh()) == '\n') { 904 ch = readCh(); 905 crlfCount++; 906 } 907 else { 908 crCount++; 909 } 910 break; 911 case ' ': 912 case '\t': 913 ch = readCh(); 914 break; 915 916 default: 917 return; 918 } 919 } 920 } 921 922 /** 923 * Parse identifier. Uppercase characters are folded 924 * to lowercase when lower is true. Returns falsed if 925 * no identifier is found. 
[55] 346:17 926 */ 927 boolean parseIdentifier(boolean lower) throws IOException { 928 switch (ch) { 929 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 930 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 931 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 932 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 933 case 'Y': case 'Z': 934 if (lower) { 935 ch = 'a' + (ch - 'A'); 936 } 937 break; 938 939 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 940 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 941 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 942 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 943 case 'y': case 'z': 944 break; 945 946 default: 947 return false; 948 } 949 950 while (true) { 951 addString(ch); 952 953 switch (ch = readCh()) { 954 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 955 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 956 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 957 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 958 case 'Y': case 'Z': 959 if (lower) { 960 ch = 'a' + (ch - 'A'); 961 } 962 break; 963 964 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 965 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 966 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 967 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 968 case 'y': case 'z': 969 970 case '0': case '1': case '2': case '3': case '4': 971 case '5': case '6': case '7': case '8': case '9': 972 973 case '.': case '-': 974 975 case '_': // not officially allowed 976 break; 977 978 default: 979 return true; 980 } 981 } 982 } 983 984 /** 985 * Parse an entity reference. 
     * [59] 350:17
     * <p>
     * Called with {@code ch} on the '&amp;'. Handles decimal
     * ({@code &#NNN}) and hexadecimal ({@code &#xHHH}) character
     * references as well as named entities looked up in the DTD. One
     * terminator (';' or a line terminator) after the reference is consumed.
     *
     * @return the replacement characters; for an unknown name, the
     *         original reference text is returned so it is not lost
     */
    private char[] parseEntityReference() throws IOException {
        // Remember where the name starts in the string buffer so it can be
        // extracted (or discarded) afterwards.
        int pos = strpos;

        if ((ch = readCh()) == '#') {
            int n = 0;
            ch = readCh();
            if ((ch >= '0') && (ch <= '9') ||
                    ch == 'x' || ch == 'X') {

                if ((ch >= '0') && (ch <= '9')) {
                    // parse decimal reference
                    // NOTE(review): n is not guarded against int overflow.
                    while ((ch >= '0') && (ch <= '9')) {
                        n = (n * 10) + ch - '0';
                        ch = readCh();
                    }
                } else {
                    // parse hexadecimal reference
                    ch = readCh();
                    char lch = (char) Character.toLowerCase(ch);
                    while ((lch >= '0') && (lch <= '9') ||
                            (lch >= 'a') && (lch <= 'f')) {
                        if (lch >= '0' && lch <= '9') {
                            n = (n * 16) + lch - '0';
                        } else {
                            n = (n * 16) + lch - 'a' + 10;
                        }
                        ch = readCh();
                        lch = (char) Character.toLowerCase(ch);
                    }
                }
                // Consume a single terminator following the number.
                switch (ch) {
                    case '\n':
                        ln++;
                        ch = readCh();
                        lfCount++;
                        break;

                    case '\r':
                        ln++;
                        if ((ch = readCh()) == '\n') {
                            ch = readCh();
                            crlfCount++;
                        }
                        else {
                            crCount++;
                        }
                        break;

                    case ';':
                        ch = readCh();
                        break;
                }
                char data[] = mapNumericReference(n);
                return data;
            }
            // "&#" followed by something that is not a number: try to read
            // it as a name prefixed with '#'.
            addString('#');
            if (!parseIdentifier(false)) {
                error("ident.expected");
                strpos = pos;
                char data[] = {'&', '#'};
                return data;
            }
        } else if (!parseIdentifier(false)) {
            // A lone '&' is passed through as text.
            char data[] = {'&'};
            return data;
        }

        boolean semicolon = false;

        // Consume a single terminator following the name.
        switch (ch) {
          case '\n':
            ln++;
            ch = readCh();
            lfCount++;
            break;

          case '\r':
            ln++;
            if ((ch = readCh()) == '\n') {
                ch = readCh();
                crlfCount++;
            }
            else {
                crCount++;
            }
            break;

          case ';':
            semicolon = true;

            ch = readCh();
            break;
        }

        String nm = getString(pos);
        Entity ent = dtd.getEntity(nm);

        // entities are case sensitive - however if strict
        // is false then we will try to make a match by
        // converting the string to all lowercase.
        //
        if (!strict && (ent == null)) {
            ent = dtd.getEntity(nm.toLowerCase());
        }
        if ((ent == null) || !ent.isGeneral()) {

            if (nm.length() == 0) {
                error("invalid.entref", nm);
                return new char[0];
            }
            /* given that there is not a match restore the entity reference */
            String str = "&" + nm + (semicolon ? ";" : "");

            char b[] = new char[str.length()];
            str.getChars(0, b.length, b, 0);
            return b;
        }
        return ent.getData();
    }

    /**
     * Converts numeric character reference to char array.
     *
     * Normally the code in a reference should be always converted
     * to the Unicode character with the same code, but due to
     * wide usage of Cp1252 charset most browsers map numeric references
     * in the range 130-159 (which are control chars in Unicode set)
     * to displayable characters with other codes.
     *
     * @param c the code of numeric character reference.
     * @return a char array corresponding to the reference code; empty if
     *         {@code Character.toChars} rejects the code.
     */
    private char[] mapNumericReference(int c) {
        char[] data;
        // NOTE(review): 0xffff itself is still inside the BMP; it takes this
        // branch too, which yields the same single char.
        if (c >= 0xffff) { // outside unicode BMP.
            try {
                data = Character.toChars(c);
            } catch (IllegalArgumentException e) {
                data = new char[0];
            }
        } else {
            data = new char[1];
            // Codes 130-159 are remapped through cp1252Map; all others are
            // narrowed to char as they are.
            data[0] = (c < 130 || c > 159) ? (char) c : cp1252Map[c - 130];
        }
        return data;
    }

    /**
     * Parse a comment. [92] 391:7
     * <p>
     * Accumulates the comment text in the string buffer until the
     * comment terminator is found. Reaching end of input inside the
     * comment is delegated to {@code handleEOFInComment()}.
     */
    void parseComment() throws IOException {

        while (true) {
            // c is the character appended to the buffer at the bottom of the
            // loop; ch is advanced inside the switch.
            int c = ch;
            switch (c) {
              case '-':
                  /** Presuming that the start string of a comment "<!--" has
                      already been parsed, the '-' character is valid only as
                      part of a comment termination and further more it must
                      be present in even numbers. Hence if strict is true, we
                      presume the comment has been terminated and return.
                      However if strict is false, then there is no even number
                      requirement and this character can appear anywhere in the
                      comment.  The parser reads on until it sees the following
                      pattern: "-->" or "--!>".
                   **/
                // Non-strict: the previously buffered char is already a '-',
                // so this '-' may complete "-->" or "--!>".
                if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) {
                    if ((ch = readCh()) == '>') {
                        return;
                    }
                    if (ch == '!') {
                        if ((ch = readCh()) == '>') {
                            return;
                        } else {
                            /* to account for extra read()'s that happened */
                            addString('-');
                            addString('!');
                            continue;
                        }
                    }
                    break;
                }

                if ((ch = readCh()) == '-') {
                    ch = readCh();
                    if (strict || ch == '>') {
                        return;
                    }
                    if (ch == '!') {
                        if ((ch = readCh()) == '>') {
                            return;
                        } else {
                            /* to account for extra read()'s that happened */
                            addString('-');
                            addString('!');
                            continue;
                        }
                    }
                    /* to account for the extra read() */
                    addString('-');
                }
                break;

              case -1:
                  handleEOFInComment();
                  return;

              case '\n':
                ln++;
                ch = readCh();
                lfCount++;
                break;

              case '>':
                ch = readCh();
                break;

              case '\r':
                ln++;
                if ((ch = readCh()) == '\n') {
                    ch = readCh();
                    crlfCount++;
                }
                else {
                    crCount++;
                }
                // \r and \r\n are stored as \n
                c = '\n';
                break;
              default:
                ch = readCh();
                break;
            }

            addString(c);
        }
    }

    /**
     * Parse literal content.
[46] 343:1 and [47] 344:1 1226 */ 1227 void parseLiteral(boolean replace) throws IOException { 1228 while (true) { 1229 int c = ch; 1230 switch (c) { 1231 case -1: 1232 error("eof.literal", stack.elem.getName()); 1233 endTag(true); 1234 return; 1235 1236 case '>': 1237 ch = readCh(); 1238 int i = textpos - (stack.elem.name.length() + 2), j = 0; 1239 1240 // match end tag 1241 if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) { 1242 while ((++i < textpos) && 1243 (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++))); 1244 if (i == textpos) { 1245 textpos -= (stack.elem.name.length() + 2); 1246 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1247 textpos--; 1248 } 1249 endTag(false); 1250 return; 1251 } 1252 } 1253 break; 1254 1255 case '&': 1256 char data[] = parseEntityReference(); 1257 if (textpos + data.length > text.length) { 1258 char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)]; 1259 System.arraycopy(text, 0, newtext, 0, text.length); 1260 text = newtext; 1261 } 1262 System.arraycopy(data, 0, text, textpos, data.length); 1263 textpos += data.length; 1264 continue; 1265 1266 case '\n': 1267 ln++; 1268 ch = readCh(); 1269 lfCount++; 1270 break; 1271 1272 case '\r': 1273 ln++; 1274 if ((ch = readCh()) == '\n') { 1275 ch = readCh(); 1276 crlfCount++; 1277 } 1278 else { 1279 crCount++; 1280 } 1281 c = '\n'; 1282 break; 1283 default: 1284 ch = readCh(); 1285 break; 1286 } 1287 1288 // output character 1289 if (textpos == text.length) { 1290 char newtext[] = new char[text.length + 128]; 1291 System.arraycopy(text, 0, newtext, 0, text.length); 1292 text = newtext; 1293 } 1294 text[textpos++] = (char)c; 1295 } 1296 } 1297 1298 /** 1299 * Parse attribute value. 
[33] 331:1 1300 */ 1301 @SuppressWarnings("fallthrough") 1302 String parseAttributeValue(boolean lower) throws IOException { 1303 int delim = -1; 1304 1305 // Check for a delimiter 1306 switch(ch) { 1307 case '\'': 1308 case '"': 1309 delim = ch; 1310 ch = readCh(); 1311 break; 1312 } 1313 1314 // Parse the rest of the value 1315 while (true) { 1316 int c = ch; 1317 1318 switch (c) { 1319 case '\n': 1320 ln++; 1321 ch = readCh(); 1322 lfCount++; 1323 if (delim < 0) { 1324 return getString(0); 1325 } 1326 break; 1327 1328 case '\r': 1329 ln++; 1330 1331 if ((ch = readCh()) == '\n') { 1332 ch = readCh(); 1333 crlfCount++; 1334 } 1335 else { 1336 crCount++; 1337 } 1338 if (delim < 0) { 1339 return getString(0); 1340 } 1341 break; 1342 1343 case '\t': 1344 if (delim < 0) 1345 c = ' '; 1346 // Fall through 1347 case ' ': 1348 ch = readCh(); 1349 if (delim < 0) { 1350 return getString(0); 1351 } 1352 break; 1353 1354 case '>': 1355 case '<': 1356 if (delim < 0) { 1357 return getString(0); 1358 } 1359 ch = readCh(); 1360 break; 1361 1362 case '\'': 1363 case '"': 1364 ch = readCh(); 1365 if (c == delim) { 1366 return getString(0); 1367 } else if (delim == -1) { 1368 error("attvalerr"); 1369 if (strict || ch == ' ') { 1370 return getString(0); 1371 } else { 1372 continue; 1373 } 1374 } 1375 break; 1376 1377 case '=': 1378 if (delim < 0) { 1379 /* In SGML a construct like <img src=/cgi-bin/foo?x=1> 1380 is considered invalid since an = sign can only be contained 1381 in an attributes value if the string is quoted. 1382 */ 1383 error("attvalerr"); 1384 /* If strict is true then we return with the string we have thus far. 1385 Otherwise we accept the = sign as part of the attribute's value and 1386 process the rest of the img tag. 
*/ 1387 if (strict) { 1388 return getString(0); 1389 } 1390 } 1391 ch = readCh(); 1392 break; 1393 1394 case '&': 1395 if (strict && delim < 0) { 1396 ch = readCh(); 1397 break; 1398 } 1399 1400 char data[] = parseEntityReference(); 1401 for (int i = 0 ; i < data.length ; i++) { 1402 c = data[i]; 1403 addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c); 1404 } 1405 continue; 1406 1407 case -1: 1408 return getString(0); 1409 1410 default: 1411 if (lower && (c >= 'A') && (c <= 'Z')) { 1412 c = 'a' + c - 'A'; 1413 } 1414 ch = readCh(); 1415 break; 1416 } 1417 addString(c); 1418 } 1419 } 1420 1421 1422 /** 1423 * Parse attribute specification List. [31] 327:17 1424 */ 1425 void parseAttributeSpecificationList(Element elem) throws IOException { 1426 1427 while (true) { 1428 skipSpace(); 1429 1430 switch (ch) { 1431 case '/': 1432 case '>': 1433 case '<': 1434 case -1: 1435 return; 1436 1437 case '-': 1438 if ((ch = readCh()) == '-') { 1439 ch = readCh(); 1440 parseComment(); 1441 strpos = 0; 1442 } else { 1443 error("invalid.tagchar", "-", elem.getName()); 1444 ch = readCh(); 1445 } 1446 continue; 1447 } 1448 1449 AttributeList att; 1450 String attname; 1451 String attvalue; 1452 1453 if (parseIdentifier(true)) { 1454 attname = getString(0); 1455 skipSpace(); 1456 if (ch == '=') { 1457 ch = readCh(); 1458 skipSpace(); 1459 att = elem.getAttribute(attname); 1460 // Bug ID 4102750 1461 // Load the NAME of an Attribute Case Sensitive 1462 // The case of the NAME must be intact 1463 // MG 021898 1464 attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME)); 1465 // attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION)); 1466 } else { 1467 attvalue = attname; 1468 att = elem.getAttributeByValue(attvalue); 1469 if (att == null) { 1470 att = elem.getAttribute(attname); 1471 if (att != null) { 1472 attvalue = att.getValue(); 1473 } 1474 else { 1475 // Make it null 
so that NULL_ATTRIBUTE_VALUE is 1476 // used 1477 attvalue = null; 1478 } 1479 } 1480 } 1481 } else if (!strict && ch == ',') { // allows for comma separated attribute-value pairs 1482 ch = readCh(); 1483 continue; 1484 } else if (!strict && ch == '"') { // allows for quoted attributes 1485 ch = readCh(); 1486 skipSpace(); 1487 if (parseIdentifier(true)) { 1488 attname = getString(0); 1489 if (ch == '"') { 1490 ch = readCh(); 1491 } 1492 skipSpace(); 1493 if (ch == '=') { 1494 ch = readCh(); 1495 skipSpace(); 1496 att = elem.getAttribute(attname); 1497 attvalue = parseAttributeValue((att != null) && 1498 (att.type != CDATA) && 1499 (att.type != NOTATION)); 1500 } else { 1501 attvalue = attname; 1502 att = elem.getAttributeByValue(attvalue); 1503 if (att == null) { 1504 att = elem.getAttribute(attname); 1505 if (att != null) { 1506 attvalue = att.getValue(); 1507 } 1508 } 1509 } 1510 } else { 1511 char str[] = {(char)ch}; 1512 error("invalid.tagchar", new String(str), elem.getName()); 1513 ch = readCh(); 1514 continue; 1515 } 1516 } else if (!strict && (attributes.isEmpty()) && (ch == '=')) { 1517 ch = readCh(); 1518 skipSpace(); 1519 attname = elem.getName(); 1520 att = elem.getAttribute(attname); 1521 attvalue = parseAttributeValue((att != null) && 1522 (att.type != CDATA) && 1523 (att.type != NOTATION)); 1524 } else if (!strict && (ch == '=')) { 1525 ch = readCh(); 1526 skipSpace(); 1527 attvalue = parseAttributeValue(true); 1528 error("attvalerr"); 1529 return; 1530 } else { 1531 char str[] = {(char)ch}; 1532 error("invalid.tagchar", new String(str), elem.getName()); 1533 if (!strict) { 1534 ch = readCh(); 1535 continue; 1536 } else { 1537 return; 1538 } 1539 } 1540 1541 if (att != null) { 1542 attname = att.getName(); 1543 } else { 1544 error("invalid.tagatt", attname, elem.getName()); 1545 } 1546 1547 // Check out the value 1548 if (attributes.isDefined(attname)) { 1549 error("multi.tagatt", attname, elem.getName()); 1550 } 1551 if (attvalue == null) { 1552 
attvalue = ((att != null) && (att.value != null)) ? att.value : 1553 HTML.NULL_ATTRIBUTE_VALUE; 1554 } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) { 1555 error("invalid.tagattval", attname, elem.getName()); 1556 } 1557 HTML.Attribute attkey = HTML.getAttributeKey(attname); 1558 if (attkey == null) { 1559 attributes.addAttribute(attname, attvalue); 1560 } else { 1561 attributes.addAttribute(attkey, attvalue); 1562 } 1563 } 1564 } 1565 1566 /** 1567 * Parses the Document Type Declaration markup declaration. 1568 * Currently ignores it. 1569 * 1570 * @return the string representation of the markup declaration 1571 * @throws IOException if an I/O error occurs 1572 */ 1573 public String parseDTDMarkup() throws IOException { 1574 1575 StringBuilder strBuff = new StringBuilder(); 1576 ch = readCh(); 1577 while(true) { 1578 switch (ch) { 1579 case '>': 1580 ch = readCh(); 1581 return strBuff.toString(); 1582 case -1: 1583 error("invalid.markup"); 1584 return strBuff.toString(); 1585 case '\n': 1586 ln++; 1587 ch = readCh(); 1588 lfCount++; 1589 break; 1590 case '"': 1591 ch = readCh(); 1592 break; 1593 case '\r': 1594 ln++; 1595 if ((ch = readCh()) == '\n') { 1596 ch = readCh(); 1597 crlfCount++; 1598 } 1599 else { 1600 crCount++; 1601 } 1602 break; 1603 default: 1604 strBuff.append((char)(ch & 0xFF)); 1605 ch = readCh(); 1606 break; 1607 } 1608 } 1609 } 1610 1611 /** 1612 * Parse markup declarations. 1613 * Currently only handles the Document Type Declaration markup. 1614 * Returns true if it is a markup declaration false otherwise. 
1615 * 1616 * @param strBuff the markup declaration 1617 * @return {@code true} if this is a valid markup declaration; 1618 * otherwise {@code false} 1619 * @throws IOException if an I/O error occurs 1620 */ 1621 protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException { 1622 1623 /* Currently handles only the DOCTYPE */ 1624 if ((strBuff.length() == "DOCTYPE".length()) && 1625 (strBuff.toString().toUpperCase().equals("DOCTYPE"))) { 1626 parseDTDMarkup(); 1627 return true; 1628 } 1629 return false; 1630 } 1631 1632 /** 1633 * Parse an invalid tag. 1634 */ 1635 void parseInvalidTag() throws IOException { 1636 // ignore all data upto the close bracket '>' 1637 while (true) { 1638 skipSpace(); 1639 switch (ch) { 1640 case '>': 1641 case -1: 1642 ch = readCh(); 1643 return; 1644 case '<': 1645 return; 1646 default: 1647 ch = readCh(); 1648 1649 } 1650 } 1651 } 1652 1653 /** 1654 * Parse a start or end tag. 1655 */ 1656 @SuppressWarnings("fallthrough") 1657 void parseTag() throws IOException { 1658 Element elem; 1659 boolean net = false; 1660 boolean warned = false; 1661 boolean unknown = false; 1662 1663 switch (ch = readCh()) { 1664 case '!': 1665 switch (ch = readCh()) { 1666 case '-': 1667 // Parse comment. 
[92] 391:7 1668 while (true) { 1669 if (ch == '-') { 1670 if (!strict || ((ch = readCh()) == '-')) { 1671 ch = readCh(); 1672 if (!strict && ch == '-') { 1673 ch = readCh(); 1674 } 1675 // send over any text you might see 1676 // before parsing and sending the 1677 // comment 1678 if (textpos != 0) { 1679 char newtext[] = new char[textpos]; 1680 System.arraycopy(text, 0, newtext, 0, textpos); 1681 handleText(newtext); 1682 lastBlockStartPos = currentBlockStartPos; 1683 textpos = 0; 1684 } 1685 parseComment(); 1686 last = makeTag(dtd.getElement("comment"), true); 1687 handleComment(getChars(0)); 1688 continue; 1689 } else if (!warned) { 1690 warned = true; 1691 error("invalid.commentchar", "-"); 1692 } 1693 } 1694 skipSpace(); 1695 switch (ch) { 1696 case '-': 1697 continue; 1698 case '>': 1699 ch = readCh(); 1700 return; 1701 case -1: 1702 return; 1703 default: 1704 ch = readCh(); 1705 if (!warned) { 1706 warned = true; 1707 error("invalid.commentchar", 1708 String.valueOf((char)ch)); 1709 } 1710 break; 1711 } 1712 } 1713 1714 default: 1715 // deal with marked sections 1716 StringBuffer strBuff = new StringBuffer(); 1717 while (true) { 1718 strBuff.append((char)ch); 1719 if (parseMarkupDeclarations(strBuff)) { 1720 return; 1721 } 1722 switch(ch) { 1723 case '>': 1724 ch = readCh(); 1725 // Fall through 1726 case -1: 1727 error("invalid.markup"); 1728 return; 1729 case '\n': 1730 ln++; 1731 ch = readCh(); 1732 lfCount++; 1733 break; 1734 case '\r': 1735 ln++; 1736 if ((ch = readCh()) == '\n') { 1737 ch = readCh(); 1738 crlfCount++; 1739 } 1740 else { 1741 crCount++; 1742 } 1743 break; 1744 1745 default: 1746 ch = readCh(); 1747 break; 1748 } 1749 } 1750 } 1751 1752 case '/': 1753 // parse end tag [19] 317:4 1754 switch (ch = readCh()) { 1755 case '>': 1756 ch = readCh(); 1757 // Fall through 1758 case '<': 1759 // empty end tag. 
either </> or </< 1760 if (recent == null) { 1761 error("invalid.shortend"); 1762 return; 1763 } 1764 elem = recent; 1765 break; 1766 1767 default: 1768 if (!parseIdentifier(true)) { 1769 error("expected.endtagname"); 1770 return; 1771 } 1772 skipSpace(); 1773 switch (ch) { 1774 case '>': 1775 ch = readCh(); 1776 break; 1777 case '<': 1778 break; 1779 1780 default: 1781 error("expected", "'>'"); 1782 while ((ch != -1) && (ch != '\n') && (ch != '>')) { 1783 ch = readCh(); 1784 } 1785 if (ch == '>') { 1786 ch = readCh(); 1787 } 1788 break; 1789 } 1790 String elemStr = getString(0); 1791 if (!dtd.elementExists(elemStr)) { 1792 error("end.unrecognized", elemStr); 1793 // Ignore RE before end tag 1794 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1795 textpos--; 1796 } 1797 elem = dtd.getElement("unknown"); 1798 elem.name = elemStr; 1799 unknown = true; 1800 } else { 1801 elem = dtd.getElement(elemStr); 1802 } 1803 break; 1804 } 1805 1806 1807 // If the stack is null, we're seeing end tags without any begin 1808 // tags. Ignore them. 1809 1810 if (stack == null) { 1811 error("end.extra.tag", elem.getName()); 1812 return; 1813 } 1814 1815 // Ignore RE before end tag 1816 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1817 // In a pre tag, if there are blank lines 1818 // we do not want to remove the newline 1819 // before the end tag. Hence this code. 1820 // 1821 if (stack.pre) { 1822 if ((textpos > 1) && (text[textpos-2] != '\n')) { 1823 textpos--; 1824 } 1825 } else { 1826 textpos--; 1827 } 1828 } 1829 1830 // If the end tag is a form, since we did not put it 1831 // on the tag stack, there is no corresponding start 1832 // start tag to find. Hence do not touch the tag stack. 1833 // 1834 1835 /* 1836 if (!strict && elem.getName().equals("form")) { 1837 if (lastFormSent != null) { 1838 handleEndTag(lastFormSent); 1839 return; 1840 } else { 1841 // do nothing. 
1842 return; 1843 } 1844 } 1845 */ 1846 1847 if (unknown) { 1848 // we will not see a corresponding start tag 1849 // on the stack. If we are seeing an 1850 // end tag, lets send this on as an empty 1851 // tag with the end tag attribute set to 1852 // true. 1853 TagElement t = makeTag(elem); 1854 handleText(t); 1855 attributes.addAttribute(HTML.Attribute.ENDTAG, "true"); 1856 handleEmptyTag(makeTag(elem)); 1857 unknown = false; 1858 return; 1859 } 1860 1861 // find the corresponding start tag 1862 1863 // A commonly occurring error appears to be the insertion 1864 // of extra end tags in a table. The intent here is ignore 1865 // such extra end tags. 1866 // 1867 if (!strict) { 1868 String stackElem = stack.elem.getName(); 1869 1870 if (stackElem.equals("table")) { 1871 // If it is not a valid end tag ignore it and return 1872 // 1873 if (!elem.getName().equals(stackElem)) { 1874 error("tag.ignore", elem.getName()); 1875 return; 1876 } 1877 } 1878 1879 1880 1881 if (stackElem.equals("tr") || 1882 stackElem.equals("td")) { 1883 if ((!elem.getName().equals("table")) && 1884 (!elem.getName().equals(stackElem))) { 1885 error("tag.ignore", elem.getName()); 1886 return; 1887 } 1888 } 1889 } 1890 TagStack sp = stack; 1891 1892 while ((sp != null) && (elem != sp.elem)) { 1893 sp = sp.next; 1894 } 1895 if (sp == null) { 1896 error("unmatched.endtag", elem.getName()); 1897 return; 1898 } 1899 1900 // People put font ending tags in the darndest places. 1901 // Don't close other contexts based on them being between 1902 // a font tag and the corresponding end tag. Instead, 1903 // ignore the end tag like it doesn't exist and allow the end 1904 // of the document to close us out. 
1905 String elemName = elem.getName(); 1906 if (stack != sp && 1907 (elemName.equals("font") || 1908 elemName.equals("center"))) { 1909 1910 // Since closing out a center tag can have real wierd 1911 // effects on the formatting, make sure that tags 1912 // for which omitting an end tag is legimitate 1913 // get closed out. 1914 // 1915 if (elemName.equals("center")) { 1916 while(stack.elem.omitEnd() && stack != sp) { 1917 endTag(true); 1918 } 1919 if (stack.elem == elem) { 1920 endTag(false); 1921 } 1922 } 1923 return; 1924 } 1925 // People do the same thing with center tags. In this 1926 // case we would like to close off the center tag but 1927 // not necessarily all enclosing tags. 1928 1929 1930 1931 // end tags 1932 while (stack != sp) { 1933 endTag(true); 1934 } 1935 1936 endTag(false); 1937 return; 1938 1939 case -1: 1940 error("eof"); 1941 return; 1942 } 1943 1944 // start tag [14] 314:1 1945 if (!parseIdentifier(true)) { 1946 elem = recent; 1947 if ((ch != '>') || (elem == null)) { 1948 error("expected.tagname"); 1949 return; 1950 } 1951 } else { 1952 String elemStr = getString(0); 1953 1954 if (elemStr.equals("image")) { 1955 elemStr = "img"; 1956 } 1957 1958 /* determine if this element is part of the dtd. 
*/ 1959 1960 if (!dtd.elementExists(elemStr)) { 1961 // parseInvalidTag(); 1962 error("tag.unrecognized ", elemStr); 1963 elem = dtd.getElement("unknown"); 1964 elem.name = elemStr; 1965 unknown = true; 1966 } else { 1967 elem = dtd.getElement(elemStr); 1968 } 1969 } 1970 1971 // Parse attributes 1972 parseAttributeSpecificationList(elem); 1973 1974 switch (ch) { 1975 case '/': 1976 net = true; 1977 // Fall through 1978 case '>': 1979 ch = readCh(); 1980 if (ch == '>' && net) { 1981 ch = readCh(); 1982 } 1983 case '<': 1984 break; 1985 1986 default: 1987 error("expected", "'>'"); 1988 break; 1989 } 1990 1991 if (!strict) { 1992 if (elem.getName().equals("script")) { 1993 error("javascript.unsupported"); 1994 } 1995 } 1996 1997 // ignore RE after start tag 1998 // 1999 if (!elem.isEmpty()) { 2000 if (ch == '\n') { 2001 ln++; 2002 lfCount++; 2003 ch = readCh(); 2004 } else if (ch == '\r') { 2005 ln++; 2006 if ((ch = readCh()) == '\n') { 2007 ch = readCh(); 2008 crlfCount++; 2009 } 2010 else { 2011 crCount++; 2012 } 2013 } 2014 } 2015 2016 // ensure a legal context for the tag 2017 TagElement tag = makeTag(elem, false); 2018 2019 2020 /** In dealing with forms, we have decided to treat 2021 them as legal in any context. Also, even though 2022 they do have a start and an end tag, we will 2023 not put this tag on the stack. This is to deal 2024 several pages in the web oasis that choose to 2025 start and end forms in any possible location. **/ 2026 2027 /* 2028 if (!strict && elem.getName().equals("form")) { 2029 if (lastFormSent == null) { 2030 lastFormSent = tag; 2031 } else { 2032 handleEndTag(lastFormSent); 2033 lastFormSent = tag; 2034 } 2035 } else { 2036 */ 2037 // Smlly, if a tag is unknown, we will apply 2038 // no legalTagContext logic to it. 2039 // 2040 if (!unknown) { 2041 legalTagContext(tag); 2042 2043 // If skip tag is true, this implies that 2044 // the tag was illegal and that the error 2045 // recovery strategy adopted is to ignore 2046 // the tag. 
2047 if (!strict && skipTag) { 2048 skipTag = false; 2049 return; 2050 } 2051 } 2052 /* 2053 } 2054 */ 2055 2056 startTag(tag); 2057 2058 if (!elem.isEmpty()) { 2059 switch (elem.getType()) { 2060 case CDATA: 2061 parseLiteral(false); 2062 break; 2063 case RCDATA: 2064 parseLiteral(true); 2065 break; 2066 default: 2067 if (stack != null) { 2068 stack.net = net; 2069 } 2070 break; 2071 } 2072 } 2073 } 2074 2075 private static final String START_COMMENT = "<!--"; 2076 private static final String END_COMMENT = "-->"; 2077 private static final char[] SCRIPT_END_TAG = "</script>".toCharArray(); 2078 private static final char[] SCRIPT_END_TAG_UPPER_CASE = 2079 "</SCRIPT>".toCharArray(); 2080 2081 void parseScript() throws IOException { 2082 char[] charsToAdd = new char[SCRIPT_END_TAG.length]; 2083 boolean insideComment = false; 2084 2085 /* Here, ch should be the first character after <script> */ 2086 while (true) { 2087 int i = 0; 2088 while (!insideComment && i < SCRIPT_END_TAG.length 2089 && (SCRIPT_END_TAG[i] == ch 2090 || SCRIPT_END_TAG_UPPER_CASE[i] == ch)) { 2091 charsToAdd[i] = (char) ch; 2092 ch = readCh(); 2093 i++; 2094 } 2095 if (i == SCRIPT_END_TAG.length) { 2096 2097 /* '</script>' tag detected */ 2098 /* Here, ch == the first character after </script> */ 2099 return; 2100 } else { 2101 2102 /* To account for extra read()'s that happened */ 2103 for (int j = 0; j < i; j++) { 2104 addString(charsToAdd[j]); 2105 } 2106 2107 switch (ch) { 2108 case -1: 2109 error("eof.script"); 2110 return; 2111 case '\n': 2112 ln++; 2113 ch = readCh(); 2114 lfCount++; 2115 addString('\n'); 2116 break; 2117 case '\r': 2118 ln++; 2119 if ((ch = readCh()) == '\n') { 2120 ch = readCh(); 2121 crlfCount++; 2122 } else { 2123 crCount++; 2124 } 2125 addString('\n'); 2126 break; 2127 default: 2128 addString(ch); 2129 String str = new String(getChars(0, strpos)); 2130 if (!insideComment && str.endsWith(START_COMMENT)) { 2131 insideComment = true; 2132 } 2133 if (insideComment && 
str.endsWith(END_COMMENT)) { 2134 insideComment = false; 2135 } 2136 ch = readCh(); 2137 break; 2138 } // switch 2139 } 2140 } // while 2141 } 2142 2143 /** 2144 * Parse Content. [24] 320:1 2145 */ 2146 void parseContent() throws IOException { 2147 Thread curThread = Thread.currentThread(); 2148 2149 for (;;) { 2150 if (curThread.isInterrupted()) { 2151 curThread.interrupt(); // resignal the interrupt 2152 break; 2153 } 2154 2155 int c = ch; 2156 currentBlockStartPos = currentPosition; 2157 2158 if (recent == dtd.script) { // means: if after starting <script> tag 2159 2160 /* Here, ch has to be the first character after <script> */ 2161 parseScript(); 2162 last = makeTag(dtd.getElement("comment"), true); 2163 2164 /* Remove leading and trailing HTML comment declarations */ 2165 String str = new String(getChars(0)).trim(); 2166 int minLength = START_COMMENT.length() + END_COMMENT.length(); 2167 if (str.startsWith(START_COMMENT) && str.endsWith(END_COMMENT) 2168 && str.length() >= (minLength)) { 2169 str = str.substring(START_COMMENT.length(), 2170 str.length() - END_COMMENT.length()); 2171 } 2172 2173 /* Handle resulting chars as comment */ 2174 handleComment(str.toCharArray()); 2175 endTag(false); 2176 lastBlockStartPos = currentPosition; 2177 2178 continue; 2179 } else { 2180 switch (c) { 2181 case '<': 2182 parseTag(); 2183 lastBlockStartPos = currentPosition; 2184 continue; 2185 2186 case '/': 2187 ch = readCh(); 2188 if ((stack != null) && stack.net) { 2189 // null end tag. 
2190 endTag(false); 2191 continue; 2192 } else if (textpos == 0) { 2193 if (!legalElementContext(dtd.pcdata)) { 2194 error("unexpected.pcdata"); 2195 } 2196 if (last.breaksFlow()) { 2197 space = false; 2198 } 2199 } 2200 break; 2201 2202 case -1: 2203 return; 2204 2205 case '&': 2206 if (textpos == 0) { 2207 if (!legalElementContext(dtd.pcdata)) { 2208 error("unexpected.pcdata"); 2209 } 2210 if (last.breaksFlow()) { 2211 space = false; 2212 } 2213 } 2214 char data[] = parseEntityReference(); 2215 if (textpos + data.length + 1 > text.length) { 2216 char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)]; 2217 System.arraycopy(text, 0, newtext, 0, text.length); 2218 text = newtext; 2219 } 2220 if (space) { 2221 space = false; 2222 text[textpos++] = ' '; 2223 } 2224 System.arraycopy(data, 0, text, textpos, data.length); 2225 textpos += data.length; 2226 ignoreSpace = false; 2227 continue; 2228 2229 case '\n': 2230 ln++; 2231 lfCount++; 2232 ch = readCh(); 2233 if ((stack != null) && stack.pre) { 2234 break; 2235 } 2236 if (textpos == 0) { 2237 lastBlockStartPos = currentPosition; 2238 } 2239 if (!ignoreSpace) { 2240 space = true; 2241 } 2242 continue; 2243 2244 case '\r': 2245 ln++; 2246 c = '\n'; 2247 if ((ch = readCh()) == '\n') { 2248 ch = readCh(); 2249 crlfCount++; 2250 } 2251 else { 2252 crCount++; 2253 } 2254 if ((stack != null) && stack.pre) { 2255 break; 2256 } 2257 if (textpos == 0) { 2258 lastBlockStartPos = currentPosition; 2259 } 2260 if (!ignoreSpace) { 2261 space = true; 2262 } 2263 continue; 2264 2265 2266 case '\t': 2267 case ' ': 2268 ch = readCh(); 2269 if ((stack != null) && stack.pre) { 2270 break; 2271 } 2272 if (textpos == 0) { 2273 lastBlockStartPos = currentPosition; 2274 } 2275 if (!ignoreSpace) { 2276 space = true; 2277 } 2278 continue; 2279 2280 default: 2281 if (textpos == 0) { 2282 if (!legalElementContext(dtd.pcdata)) { 2283 error("unexpected.pcdata"); 2284 } 2285 if (last.breaksFlow()) { 2286 space = false; 
2287 } 2288 } 2289 ch = readCh(); 2290 break; 2291 } 2292 } 2293 2294 // enlarge buffer if needed 2295 if (textpos + 2 > text.length) { 2296 char newtext[] = new char[text.length + 128]; 2297 System.arraycopy(text, 0, newtext, 0, text.length); 2298 text = newtext; 2299 } 2300 2301 // output pending space 2302 if (space) { 2303 if (textpos == 0) { 2304 lastBlockStartPos--; 2305 } 2306 text[textpos++] = ' '; 2307 space = false; 2308 } 2309 text[textpos++] = (char)c; 2310 ignoreSpace = false; 2311 } 2312 } 2313 2314 /** 2315 * Returns the end of line string. This will return the end of line 2316 * string that has been encountered the most, one of \r, \n or \r\n. 2317 */ 2318 String getEndOfLineString() { 2319 if (crlfCount >= crCount) { 2320 if (lfCount >= crlfCount) { 2321 return "\n"; 2322 } 2323 else { 2324 return "\r\n"; 2325 } 2326 } 2327 else { 2328 if (crCount > lfCount) { 2329 return "\r"; 2330 } 2331 else { 2332 return "\n"; 2333 } 2334 } 2335 } 2336 2337 /** 2338 * Parse an HTML stream, given a DTD. 2339 * 2340 * @param in the reader to read the source from 2341 * @throws IOException if an I/O error occurs 2342 */ 2343 public synchronized void parse(Reader in) throws IOException { 2344 this.in = in; 2345 2346 this.ln = 1; 2347 2348 seenHtml = false; 2349 seenHead = false; 2350 seenBody = false; 2351 2352 crCount = lfCount = crlfCount = 0; 2353 2354 try { 2355 ch = readCh(); 2356 text = new char[1024]; 2357 str = new char[128]; 2358 2359 parseContent(); 2360 // NOTE: interruption may have occurred. Control flows out 2361 // of here normally. 
2362 while (stack != null) { 2363 endTag(true); 2364 } 2365 in.close(); 2366 } catch (IOException e) { 2367 errorContext(); 2368 error("ioexception"); 2369 throw e; 2370 } catch (Exception e) { 2371 errorContext(); 2372 error("exception", e.getClass().getName(), e.getMessage()); 2373 e.printStackTrace(); 2374 } catch (ThreadDeath e) { 2375 errorContext(); 2376 error("terminated"); 2377 e.printStackTrace(); 2378 throw e; 2379 } finally { 2380 for (; stack != null ; stack = stack.next) { 2381 handleEndTag(stack.tag); 2382 } 2383 2384 text = null; 2385 str = null; 2386 } 2387 2388 } 2389 2390 2391 /* 2392 * Input cache. This is much faster than calling down to a synchronized 2393 * method of BufferedReader for each byte. Measurements done 5/30/97 2394 * show that there's no point in having a bigger buffer: Increasing 2395 * the buffer to 8192 had no measurable impact for a program discarding 2396 * one character at a time (reading from an http URL to a local machine). 2397 * NOTE: If the current encoding is bogus, and we read too much 2398 * (past the content-type) we may suffer a MalformedInputException. For 2399 * this reason the initial size is 1 and when the body is encountered the 2400 * size is adjusted to 256. 2401 */ 2402 private char buf[] = new char[1]; 2403 private int pos; 2404 private int len; 2405 /* 2406 tracks position relative to the beginning of the 2407 document. 2408 */ 2409 private int currentPosition; 2410 2411 2412 private final int readCh() throws IOException { 2413 2414 if (pos >= len) { 2415 2416 // This loop allows us to ignore interrupts if the flag 2417 // says so 2418 for (;;) { 2419 try { 2420 len = in.read(buf); 2421 break; 2422 } catch (InterruptedIOException ex) { 2423 throw ex; 2424 } 2425 } 2426 2427 if (len <= 0) { 2428 return -1; // eof 2429 } 2430 pos = 0; 2431 } 2432 ++currentPosition; 2433 2434 return buf[pos++]; 2435 } 2436 2437 2438 /** 2439 * Returns the current position. 
*
     * @return the current position
     */
    protected int getCurrentPos() {
        return this.currentPosition;
    }
}