1 /* 2 * Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package javax.swing.text.html.parser; 27 28 import javax.swing.text.SimpleAttributeSet; 29 import javax.swing.text.html.HTML; 30 import javax.swing.text.ChangedCharSetException; 31 import java.io.*; 32 import java.util.Hashtable; 33 import java.util.Properties; 34 import java.util.Vector; 35 import java.util.Enumeration; 36 import java.net.URL; 37 38 import sun.misc.MessageUtils; 39 40 /** 41 * A simple DTD-driven HTML parser. The parser reads an 42 * HTML file from an InputStream and calls various methods 43 * (which should be overridden in a subclass) when tags and 44 * data are encountered. 45 * <p> 46 * Unfortunately there are many badly implemented HTML parsers 47 * out there, and as a result there are many badly formatted 48 * HTML files. This parser attempts to parse most HTML files. 49 * This means that the implementation sometimes deviates from 50 * the SGML specification in favor of HTML. 51 * <p> 52 * The parser treats \r and \r\n as \n. Newlines after starttags 53 * and before end tags are ignored just as specified in the SGML/HTML 54 * specification. 55 * <p> 56 * The html spec does not specify how spaces are to be coalesced very well. 57 * Specifically, the following scenarios are not discussed (note that a 58 * space should be used here, but I am using &nbsp to force the space to 59 * be displayed): 60 * <p> 61 * '<b>blah <i> <strike> foo' which can be treated as: 62 * '<b>blah <i><strike>foo' 63 * <p>as well as: 64 * '<p><a href="xx"> <em>Using</em></a></p>' 65 * which appears to be treated as: 66 * '<p><a href="xx"><em>Using</em></a></p>' 67 * <p> 68 * If <code>strict</code> is false, when a tag that breaks flow, 69 * (<code>TagElement.breaksFlows</code>) or trailing whitespace is 70 * encountered, all whitespace will be ignored until a non whitespace 71 * character is encountered. This appears to give behavior closer to 72 * the popular browsers. 73 * 74 * @see DTD 75 * @see TagElement 76 * @see SimpleAttributeSet 77 * @author Arthur van Hoff 78 * @author Sunita Mani 79 */ 80 public 81 class Parser implements DTDConstants { 82 83 private char text[] = new char[1024]; 84 private int textpos = 0; 85 private TagElement last; 86 private boolean space; 87 88 private char str[] = new char[128]; 89 private int strpos = 0; 90 91 protected DTD dtd = null; 92 93 private int ch; 94 private int ln; 95 private Reader in; 96 97 private Element recent; 98 private TagStack stack; 99 private boolean skipTag = false; 100 private TagElement lastFormSent = null; 101 private SimpleAttributeSet attributes = new SimpleAttributeSet(); 102 103 // State for <html>, <head> and <body>. Since people like to slap 104 // together HTML documents without thinking, occasionally they 105 // have multiple instances of these tags. These booleans track 106 // the first sightings of these tags so they can be safely ignored 107 // by the parser if repeated. 108 private boolean seenHtml = false; 109 private boolean seenHead = false; 110 private boolean seenBody = false; 111 112 /** 113 * The html spec does not specify how spaces are coalesced very well. 114 * If strict == false, ignoreSpace is used to try and mimic the behavior 115 * of the popular browsers. 116 * <p> 117 * The problematic scenarios are: 118 * '<b>blah <i> <strike> foo' which can be treated as: 119 * '<b>blah <i><strike>foo' 120 * as well as: 121 * '<p><a href="xx"> <em>Using</em></a></p>' 122 * which appears to be treated as: 123 * '<p><a href="xx"><em>Using</em></a></p>' 124 * <p> 125 * When a tag that breaks flow, or trailing whitespace is encountered 126 * ignoreSpace is set to true. From then on, all whitespace will be 127 * ignored. 128 * ignoreSpace will be set back to false the first time a 129 * non whitespace character is encountered. This appears to give 130 * behavior closer to the popular browsers. 131 */ 132 private boolean ignoreSpace; 133 134 /** 135 * This flag determines whether or not the Parser will be strict 136 * in enforcing SGML compatibility. If false, it will be lenient 137 * with certain common classes of erroneous HTML constructs. 138 * Strict or not, in either case an error will be recorded. 139 * 140 */ 141 protected boolean strict = false; 142 143 144 /** Number of \r\n's encountered. */ 145 private int crlfCount; 146 /** Number of \r's encountered. A \r\n will not increment this. */ 147 private int crCount; 148 /** Number of \n's encountered. A \r\n will not increment this. */ 149 private int lfCount; 150 151 // 152 // To correctly identify the start of a tag/comment/text we need two 153 // ivars. Two are needed as handleText isn't invoked until the tag 154 // after the text has been parsed, that is the parser parses the text, 155 // then a tag, then invokes handleText followed by handleStart. 156 // 157 /** The start position of the current block. Block is overloaded here, 158 * it really means the current start position for the current comment, 159 * tag, text. Use getBlockStartPosition to access this. */ 160 private int currentBlockStartPos; 161 /** Start position of the last block. */ 162 private int lastBlockStartPos; 163 164 /** 165 * array for mapping numeric references in range 166 * 130-159 to displayable Unicode characters. 167 */ 168 private static final char[] cp1252Map = { 169 8218, // 170 402, // 171 8222, // 172 8230, // 173 8224, // 174 8225, // 175 710, // 176 8240, // 177 352, // 178 8249, // 179 338, // 180 141, // 181 142, // 182 143, // 183 144, // 184 8216, // 185 8217, // 186 8220, // 187 8221, // 188 8226, // 189 8211, // 190 8212, // 191 732, // 192 8482, // 193 353, // 194 8250, // 195 339, // 196 157, // 197 158, // 198 376 // 199 }; 200 201 public Parser(DTD dtd) { 202 this.dtd = dtd; 203 } 204 205 206 /** 207 * @return the line number of the line currently being parsed 208 */ 209 protected int getCurrentLine() { 210 return ln; 211 } 212 213 /** 214 * Returns the start position of the current block. Block is 215 * overloaded here, it really means the current start position for 216 * the current comment tag, text, block.... This is provided for 217 * subclassers that wish to know the start of the current block when 218 * called with one of the handleXXX methods. 219 */ 220 int getBlockStartPosition() { 221 return Math.max(0, lastBlockStartPos - 1); 222 } 223 224 /** 225 * Makes a TagElement. 226 */ 227 protected TagElement makeTag(Element elem, boolean fictional) { 228 return new TagElement(elem, fictional); 229 } 230 231 protected TagElement makeTag(Element elem) { 232 return makeTag(elem, false); 233 } 234 235 protected SimpleAttributeSet getAttributes() { 236 return attributes; 237 } 238 239 protected void flushAttributes() { 240 attributes.removeAttributes(attributes); 241 } 242 243 /** 244 * Called when PCDATA is encountered. 245 */ 246 protected void handleText(char text[]) { 247 } 248 249 /** 250 * Called when an HTML title tag is encountered. 251 */ 252 protected void handleTitle(char text[]) { 253 // default behavior is to call handleText. Subclasses 254 // can override if necessary. 255 handleText(text); 256 } 257 258 /** 259 * Called when an HTML comment is encountered. 260 */ 261 protected void handleComment(char text[]) { 262 } 263 264 protected void handleEOFInComment() { 265 // We've reached EOF. Our recovery strategy is to 266 // see if we have more than one line in the comment; 267 // if so, we pretend that the comment was an unterminated 268 // single line comment, and reparse the lines after the 269 // first line as normal HTML content. 270 271 int commentEndPos = strIndexOf('\n'); 272 if (commentEndPos >= 0) { 273 handleComment(getChars(0, commentEndPos)); 274 try { 275 in.close(); 276 in = new CharArrayReader(getChars(commentEndPos + 1)); 277 ch = '>'; 278 } catch (IOException e) { 279 error("ioexception"); 280 } 281 282 resetStrBuffer(); 283 } else { 284 // no newline, so signal an error 285 error("eof.comment"); 286 } 287 } 288 289 /** 290 * Called when an empty tag is encountered. 291 */ 292 protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException { 293 } 294 295 /** 296 * Called when a start tag is encountered. 297 */ 298 protected void handleStartTag(TagElement tag) { 299 } 300 301 /** 302 * Called when an end tag is encountered. 303 */ 304 protected void handleEndTag(TagElement tag) { 305 } 306 307 /** 308 * An error has occurred. 309 */ 310 protected void handleError(int ln, String msg) { 311 /* 312 Thread.dumpStack(); 313 System.out.println("**** " + stack); 314 System.out.println("line " + ln + ": error: " + msg); 315 System.out.println(); 316 */ 317 } 318 319 /** 320 * Output text. 321 */ 322 void handleText(TagElement tag) { 323 if (tag.breaksFlow()) { 324 space = false; 325 if (!strict) { 326 ignoreSpace = true; 327 } 328 } 329 if (textpos == 0) { 330 if ((!space) || (stack == null) || last.breaksFlow() || 331 !stack.advance(dtd.pcdata)) { 332 last = tag; 333 space = false; 334 lastBlockStartPos = currentBlockStartPos; 335 return; 336 } 337 } 338 if (space) { 339 if (!ignoreSpace) { 340 // enlarge buffer if needed 341 if (textpos + 1 > text.length) { 342 char newtext[] = new char[text.length + 200]; 343 System.arraycopy(text, 0, newtext, 0, text.length); 344 text = newtext; 345 } 346 347 // output pending space 348 text[textpos++] = ' '; 349 if (!strict && !tag.getElement().isEmpty()) { 350 ignoreSpace = true; 351 } 352 } 353 space = false; 354 } 355 char newtext[] = new char[textpos]; 356 System.arraycopy(text, 0, newtext, 0, textpos); 357 // Handles cases of bad html where the title tag 358 // was getting lost when we did error recovery. 359 if (tag.getElement().getName().equals("title")) { 360 handleTitle(newtext); 361 } else { 362 handleText(newtext); 363 } 364 lastBlockStartPos = currentBlockStartPos; 365 textpos = 0; 366 last = tag; 367 space = false; 368 } 369 370 /** 371 * Invoke the error handler. 372 */ 373 protected void error(String err, String arg1, String arg2, 374 String arg3) { 375 handleError(ln, err + " " + arg1 + " " + arg2 + " " + arg3); 376 } 377 378 protected void error(String err, String arg1, String arg2) { 379 error(err, arg1, arg2, "?"); 380 } 381 protected void error(String err, String arg1) { 382 error(err, arg1, "?", "?"); 383 } 384 protected void error(String err) { 385 error(err, "?", "?", "?"); 386 } 387 388 389 /** 390 * Handle a start tag. The new tag is pushed 391 * onto the tag stack. The attribute list is 392 * checked for required attributes. 393 */ 394 protected void startTag(TagElement tag) throws ChangedCharSetException { 395 Element elem = tag.getElement(); 396 397 // If the tag is an empty tag and texpos != 0 398 // this implies that there is text before the 399 // start tag that needs to be processed before 400 // handling the tag. 401 // 402 if (!elem.isEmpty() || 403 ((last != null) && !last.breaksFlow()) || 404 (textpos != 0)) { 405 handleText(tag); 406 } else { 407 // this variable gets updated in handleText(). 408 // Since in this case we do not call handleText() 409 // we need to update it here. 410 // 411 last = tag; 412 // Note that we should really check last.breakFlows before 413 // assuming this should be false. 414 space = false; 415 } 416 lastBlockStartPos = currentBlockStartPos; 417 418 // check required attributes 419 for (AttributeList a = elem.atts ; a != null ; a = a.next) { 420 if ((a.modifier == REQUIRED) && 421 ((attributes.isEmpty()) || 422 ((!attributes.isDefined(a.name)) && 423 (!attributes.isDefined(HTML.getAttributeKey(a.name)))))) { 424 error("req.att ", a.getName(), elem.getName()); 425 } 426 } 427 428 if (elem.isEmpty()) { 429 handleEmptyTag(tag); 430 /* 431 } else if (elem.getName().equals("form")) { 432 handleStartTag(tag); 433 */ 434 } else { 435 recent = elem; 436 stack = new TagStack(tag, stack); 437 handleStartTag(tag); 438 } 439 } 440 441 /** 442 * Handle an end tag. The end tag is popped 443 * from the tag stack. 444 */ 445 protected void endTag(boolean omitted) { 446 handleText(stack.tag); 447 448 if (omitted && !stack.elem.omitEnd()) { 449 error("end.missing", stack.elem.getName()); 450 } else if (!stack.terminate()) { 451 error("end.unexpected", stack.elem.getName()); 452 } 453 454 // handle the tag 455 handleEndTag(stack.tag); 456 stack = stack.next; 457 recent = (stack != null) ? stack.elem : null; 458 } 459 460 461 boolean ignoreElement(Element elem) { 462 463 String stackElement = stack.elem.getName(); 464 String elemName = elem.getName(); 465 /* We ignore all elements that are not valid in the context of 466 a table except <td>, <th> (these we handle in 467 legalElementContext()) and #pcdata. We also ignore the 468 <font> tag in the context of <ul> and <ol> We additonally 469 ignore the <meta> and the <style> tag if the body tag has 470 been seen. **/ 471 if ((elemName.equals("html") && seenHtml) || 472 (elemName.equals("head") && seenHead) || 473 (elemName.equals("body") && seenBody)) { 474 return true; 475 } 476 if (elemName.equals("dt") || elemName.equals("dd")) { 477 TagStack s = stack; 478 while (s != null && !s.elem.getName().equals("dl")) { 479 s = s.next; 480 } 481 if (s == null) { 482 return true; 483 } 484 } 485 486 if (((stackElement.equals("table")) && 487 (!elemName.equals("#pcdata")) && (!elemName.equals("input"))) || 488 ((elemName.equals("font")) && 489 (stackElement.equals("ul") || stackElement.equals("ol"))) || 490 (elemName.equals("meta") && stack != null) || 491 (elemName.equals("style") && seenBody) || 492 (stackElement.equals("table") && elemName.equals("a"))) { 493 return true; 494 } 495 return false; 496 } 497 498 499 /** 500 * Marks the first time a tag has been seen in a document 501 */ 502 503 protected void markFirstTime(Element elem) { 504 String elemName = elem.getName(); 505 if (elemName.equals("html")) { 506 seenHtml = true; 507 } else if (elemName.equals("head")) { 508 seenHead = true; 509 } else if (elemName.equals("body")) { 510 if (buf.length == 1) { 511 // Refer to note in definition of buf for details on this. 512 char[] newBuf = new char[256]; 513 514 newBuf[0] = buf[0]; 515 buf = newBuf; 516 } 517 seenBody = true; 518 } 519 } 520 521 /** 522 * Create a legal content for an element. 523 */ 524 boolean legalElementContext(Element elem) throws ChangedCharSetException { 525 526 // System.out.println("-- legalContext -- " + elem); 527 528 // Deal with the empty stack 529 if (stack == null) { 530 // System.out.println("-- stack is empty"); 531 if (elem != dtd.html) { 532 // System.out.println("-- pushing html"); 533 startTag(makeTag(dtd.html, true)); 534 return legalElementContext(elem); 535 } 536 return true; 537 } 538 539 // Is it allowed in the current context 540 if (stack.advance(elem)) { 541 // System.out.println("-- legal context"); 542 markFirstTime(elem); 543 return true; 544 } 545 boolean insertTag = false; 546 547 // The use of all error recovery strategies are contingent 548 // on the value of the strict property. 549 // 550 // These are commonly occurring errors. if insertTag is true, 551 // then we want to adopt an error recovery strategy that 552 // involves attempting to insert an additional tag to 553 // legalize the context. The two errors addressed here 554 // are: 555 // 1) when a <td> or <th> is seen soon after a <table> tag. 556 // In this case we insert a <tr>. 557 // 2) when any other tag apart from a <tr> is seen 558 // in the context of a <tr>. In this case we would 559 // like to add a <td>. If a <tr> is seen within a 560 // <tr> context, then we will close out the current 561 // <tr>. 562 // 563 // This insertion strategy is handled later in the method. 564 // The reason for checking this now, is that in other cases 565 // we would like to apply other error recovery strategies for example 566 // ignoring tags. 567 // 568 // In certain cases it is better to ignore a tag than try to 569 // fix the situation. So the first test is to see if this 570 // is what we need to do. 571 // 572 String stackElemName = stack.elem.getName(); 573 String elemName = elem.getName(); 574 575 576 if (!strict && 577 ((stackElemName.equals("table") && elemName.equals("td")) || 578 (stackElemName.equals("table") && elemName.equals("th")) || 579 (stackElemName.equals("tr") && !elemName.equals("tr")))){ 580 insertTag = true; 581 } 582 583 584 if (!strict && !insertTag && (stack.elem.getName() != elem.getName() || 585 elem.getName().equals("body"))) { 586 if (skipTag = ignoreElement(elem)) { 587 error("tag.ignore", elem.getName()); 588 return skipTag; 589 } 590 } 591 592 // Check for anything after the start of the table besides tr, td, th 593 // or caption, and if those aren't there, insert the <tr> and call 594 // legalElementContext again. 595 if (!strict && stackElemName.equals("table") && 596 !elemName.equals("tr") && !elemName.equals("td") && 597 !elemName.equals("th") && !elemName.equals("caption")) { 598 Element e = dtd.getElement("tr"); 599 TagElement t = makeTag(e, true); 600 legalTagContext(t); 601 startTag(t); 602 error("start.missing", elem.getName()); 603 return legalElementContext(elem); 604 } 605 606 // They try to find a legal context by checking if the current 607 // tag is valid in an enclosing context. If so 608 // close out the tags by outputing end tags and then 609 // insert the current tag. If the tags that are 610 // being closed out do not have an optional end tag 611 // specification in the DTD then an html error is 612 // reported. 613 // 614 if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) { 615 for (TagStack s = stack.next ; s != null ; s = s.next) { 616 if (s.advance(elem)) { 617 while (stack != s) { 618 endTag(true); 619 } 620 return true; 621 } 622 if (!s.terminate() || (strict && !s.elem.omitEnd())) { 623 break; 624 } 625 } 626 } 627 628 // Check if we know what tag is expected next. 629 // If so insert the tag. Report an error if the 630 // tag does not have its start tag spec in the DTD as optional. 631 // 632 Element next = stack.first(); 633 if (next != null && (!strict || next.omitStart()) && 634 !(next==dtd.head && elem==dtd.pcdata) ) { 635 // System.out.println("-- omitting start tag: " + next); 636 TagElement t = makeTag(next, true); 637 legalTagContext(t); 638 startTag(t); 639 if (!next.omitStart()) { 640 error("start.missing", elem.getName()); 641 } 642 return legalElementContext(elem); 643 } 644 645 646 // Traverse the list of expected elements and determine if adding 647 // any of these elements would make for a legal context. 648 // 649 650 if (!strict) { 651 ContentModel content = stack.contentModel(); 652 Vector<Element> elemVec = new Vector<Element>(); 653 if (content != null) { 654 content.getElements(elemVec); 655 for (Element e : elemVec) { 656 // Ensure that this element has not been included as 657 // part of the exclusions in the DTD. 658 // 659 if (stack.excluded(e.getIndex())) { 660 continue; 661 } 662 663 boolean reqAtts = false; 664 665 for (AttributeList a = e.getAttributes(); a != null ; a = a.next) { 666 if (a.modifier == REQUIRED) { 667 reqAtts = true; 668 break; 669 } 670 } 671 // Ensure that no tag that has required attributes 672 // gets inserted. 673 // 674 if (reqAtts) { 675 continue; 676 } 677 678 ContentModel m = e.getContent(); 679 if (m != null && m.first(elem)) { 680 // System.out.println("-- adding a legal tag: " + e); 681 TagElement t = makeTag(e, true); 682 legalTagContext(t); 683 startTag(t); 684 error("start.missing", e.getName()); 685 return legalElementContext(elem); 686 } 687 } 688 } 689 } 690 691 // Check if the stack can be terminated. If so add the appropriate 692 // end tag. Report an error if the tag being ended does not have its 693 // end tag spec in the DTD as optional. 694 // 695 if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) { 696 // System.out.println("-- omitting end tag: " + stack.elem); 697 if (!stack.elem.omitEnd()) { 698 error("end.missing", elem.getName()); 699 } 700 701 endTag(true); 702 return legalElementContext(elem); 703 } 704 705 // At this point we know that something is screwed up. 706 return false; 707 } 708 709 /** 710 * Create a legal context for a tag. 711 */ 712 void legalTagContext(TagElement tag) throws ChangedCharSetException { 713 if (legalElementContext(tag.getElement())) { 714 markFirstTime(tag.getElement()); 715 return; 716 } 717 718 // Avoid putting a block tag in a flow tag. 719 if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) { 720 endTag(true); 721 legalTagContext(tag); 722 return; 723 } 724 725 // Avoid putting something wierd in the head of the document. 726 for (TagStack s = stack ; s != null ; s = s.next) { 727 if (s.tag.getElement() == dtd.head) { 728 while (stack != s) { 729 endTag(true); 730 } 731 endTag(true); 732 legalTagContext(tag); 733 return; 734 } 735 } 736 737 // Everything failed 738 error("tag.unexpected", tag.getElement().getName()); 739 } 740 741 /** 742 * Error context. Something went wrong, make sure we are in 743 * the document's body context 744 */ 745 void errorContext() throws ChangedCharSetException { 746 for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) { 747 handleEndTag(stack.tag); 748 } 749 if (stack == null) { 750 legalElementContext(dtd.body); 751 startTag(makeTag(dtd.body, true)); 752 } 753 } 754 755 /** 756 * Add a char to the string buffer. 757 */ 758 void addString(int c) { 759 if (strpos == str.length) { 760 char newstr[] = new char[str.length + 128]; 761 System.arraycopy(str, 0, newstr, 0, str.length); 762 str = newstr; 763 } 764 str[strpos++] = (char)c; 765 } 766 767 /** 768 * Get the string that's been accumulated. 769 */ 770 String getString(int pos) { 771 char newStr[] = new char[strpos - pos]; 772 System.arraycopy(str, pos, newStr, 0, strpos - pos); 773 strpos = pos; 774 return new String(newStr); 775 } 776 777 char[] getChars(int pos) { 778 char newStr[] = new char[strpos - pos]; 779 System.arraycopy(str, pos, newStr, 0, strpos - pos); 780 strpos = pos; 781 return newStr; 782 } 783 784 char[] getChars(int pos, int endPos) { 785 char newStr[] = new char[endPos - pos]; 786 System.arraycopy(str, pos, newStr, 0, endPos - pos); 787 // REMIND: it's not clear whether this version should set strpos or not 788 // strpos = pos; 789 return newStr; 790 } 791 792 void resetStrBuffer() { 793 strpos = 0; 794 } 795 796 int strIndexOf(char target) { 797 for (int i = 0; i < strpos; i++) { 798 if (str[i] == target) { 799 return i; 800 } 801 } 802 803 return -1; 804 } 805 806 /** 807 * Skip space. 808 * [5] 297:5 809 */ 810 void skipSpace() throws IOException { 811 while (true) { 812 switch (ch) { 813 case '\n': 814 ln++; 815 ch = readCh(); 816 lfCount++; 817 break; 818 819 case '\r': 820 ln++; 821 if ((ch = readCh()) == '\n') { 822 ch = readCh(); 823 crlfCount++; 824 } 825 else { 826 crCount++; 827 } 828 break; 829 case ' ': 830 case '\t': 831 ch = readCh(); 832 break; 833 834 default: 835 return; 836 } 837 } 838 } 839 840 /** 841 * Parse identifier. Uppercase characters are folded 842 * to lowercase when lower is true. Returns falsed if 843 * no identifier is found. [55] 346:17 844 */ 845 boolean parseIdentifier(boolean lower) throws IOException { 846 switch (ch) { 847 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 848 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 849 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 850 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 851 case 'Y': case 'Z': 852 if (lower) { 853 ch = 'a' + (ch - 'A'); 854 } 855 break; 856 857 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 858 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 859 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 860 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 861 case 'y': case 'z': 862 break; 863 864 default: 865 return false; 866 } 867 868 while (true) { 869 addString(ch); 870 871 switch (ch = readCh()) { 872 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 873 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 874 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 875 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 876 case 'Y': case 'Z': 877 if (lower) { 878 ch = 'a' + (ch - 'A'); 879 } 880 break; 881 882 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 883 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 884 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 885 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 886 case 'y': case 'z': 887 888 case '0': case '1': case '2': case '3': case '4': 889 case '5': case '6': case '7': case '8': case '9': 890 891 case '.': case '-': 892 893 case '_': // not officially allowed 894 break; 895 896 default: 897 return true; 898 } 899 } 900 } 901 902 /** 903 * Parse an entity reference. [59] 350:17 904 */ 905 private char[] parseEntityReference() throws IOException { 906 int pos = strpos; 907 908 if ((ch = readCh()) == '#') { 909 int n = 0; 910 ch = readCh(); 911 if ((ch >= '0') && (ch <= '9') || 912 ch == 'x' || ch == 'X') { 913 914 if ((ch >= '0') && (ch <= '9')) { 915 // parse decimal reference 916 while ((ch >= '0') && (ch <= '9')) { 917 n = (n * 10) + ch - '0'; 918 ch = readCh(); 919 } 920 } else { 921 // parse hexadecimal reference 922 ch = readCh(); 923 char lch = (char) Character.toLowerCase(ch); 924 while ((lch >= '0') && (lch <= '9') || 925 (lch >= 'a') && (lch <= 'f')) { 926 if (lch >= '0' && lch <= '9') { 927 n = (n * 16) + lch - '0'; 928 } else { 929 n = (n * 16) + lch - 'a' + 10; 930 } 931 ch = readCh(); 932 lch = (char) Character.toLowerCase(ch); 933 } 934 } 935 switch (ch) { 936 case '\n': 937 ln++; 938 ch = readCh(); 939 lfCount++; 940 break; 941 942 case '\r': 943 ln++; 944 if ((ch = readCh()) == '\n') { 945 ch = readCh(); 946 crlfCount++; 947 } 948 else { 949 crCount++; 950 } 951 break; 952 953 case ';': 954 ch = readCh(); 955 break; 956 } 957 char data[] = mapNumericReference(n); 958 return data; 959 } 960 addString('#'); 961 if (!parseIdentifier(false)) { 962 error("ident.expected"); 963 strpos = pos; 964 char data[] = {'&', '#'}; 965 return data; 966 } 967 } else if (!parseIdentifier(false)) { 968 char data[] = {'&'}; 969 return data; 970 } 971 972 boolean semicolon = false; 973 974 switch (ch) { 975 case '\n': 976 ln++; 977 ch = readCh(); 978 lfCount++; 979 break; 980 981 case '\r': 982 ln++; 983 if ((ch = readCh()) == '\n') { 984 ch = readCh(); 985 crlfCount++; 986 } 987 else { 988 crCount++; 989 } 990 break; 991 992 case ';': 993 semicolon = true; 994 995 ch = readCh(); 996 break; 997 } 998 999 String nm = getString(pos); 1000 Entity ent = dtd.getEntity(nm); 1001 1002 // entities are case sensitive - however if strict 1003 // is false then we will try to make a match by 1004 // converting the string to all lowercase. 1005 // 1006 if (!strict && (ent == null)) { 1007 ent = dtd.getEntity(nm.toLowerCase()); 1008 } 1009 if ((ent == null) || !ent.isGeneral()) { 1010 1011 if (nm.length() == 0) { 1012 error("invalid.entref", nm); 1013 return new char[0]; 1014 } 1015 /* given that there is not a match restore the entity reference */ 1016 String str = "&" + nm + (semicolon ? ";" : ""); 1017 1018 char b[] = new char[str.length()]; 1019 str.getChars(0, b.length, b, 0); 1020 return b; 1021 } 1022 return ent.getData(); 1023 } 1024 1025 /** 1026 * Converts numeric character reference to char array. 1027 * 1028 * Normally the code in a reference should be always converted 1029 * to the Unicode character with the same code, but due to 1030 * wide usage of Cp1252 charset most browsers map numeric references 1031 * in the range 130-159 (which are control chars in Unicode set) 1032 * to displayable characters with other codes. 1033 * 1034 * @param c the code of numeric character reference. 1035 * @return a char array corresponding to the reference code. 1036 */ 1037 private char[] mapNumericReference(int c) { 1038 char[] data; 1039 if (c >= 0xffff) { // outside unicode BMP. 1040 try { 1041 data = Character.toChars(c); 1042 } catch (IllegalArgumentException e) { 1043 data = new char[0]; 1044 } 1045 } else { 1046 data = new char[1]; 1047 data[0] = (c < 130 || c > 159) ? (char) c : cp1252Map[c - 130]; 1048 } 1049 return data; 1050 } 1051 1052 /** 1053 * Parse a comment. [92] 391:7 1054 */ 1055 void parseComment() throws IOException { 1056 1057 while (true) { 1058 int c = ch; 1059 switch (c) { 1060 case '-': 1061 /** Presuming that the start string of a comment "<!--" has 1062 already been parsed, the '-' character is valid only as 1063 part of a comment termination and further more it must 1064 be present in even numbers. Hence if strict is true, we 1065 presume the comment has been terminated and return. 1066 However if strict is false, then there is no even number 1067 requirement and this character can appear anywhere in the 1068 comment. The parser reads on until it sees the following 1069 pattern: "-->" or "--!>". 1070 **/ 1071 if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) { 1072 if ((ch = readCh()) == '>') { 1073 return; 1074 } 1075 if (ch == '!') { 1076 if ((ch = readCh()) == '>') { 1077 return; 1078 } else { 1079 /* to account for extra read()'s that happened */ 1080 addString('-'); 1081 addString('!'); 1082 continue; 1083 } 1084 } 1085 break; 1086 } 1087 1088 if ((ch = readCh()) == '-') { 1089 ch = readCh(); 1090 if (strict || ch == '>') { 1091 return; 1092 } 1093 if (ch == '!') { 1094 if ((ch = readCh()) == '>') { 1095 return; 1096 } else { 1097 /* to account for extra read()'s that happened */ 1098 addString('-'); 1099 addString('!'); 1100 continue; 1101 } 1102 } 1103 /* to account for the extra read() */ 1104 addString('-'); 1105 } 1106 break; 1107 1108 case -1: 1109 handleEOFInComment(); 1110 return; 1111 1112 case '\n': 1113 ln++; 1114 ch = readCh(); 1115 lfCount++; 1116 break; 1117 1118 case '>': 1119 ch = readCh(); 1120 break; 1121 1122 case '\r': 1123 ln++; 1124 if ((ch = readCh()) == '\n') { 1125 ch = readCh(); 1126 crlfCount++; 1127 } 1128 else { 1129 crCount++; 1130 } 1131 c = '\n'; 1132 break; 1133 default: 1134 ch = readCh(); 1135 break; 1136 } 1137 1138 addString(c); 1139 } 1140 } 1141 1142 /** 1143 * Parse literal content. [46] 343:1 and [47] 344:1 1144 */ 1145 void parseLiteral(boolean replace) throws IOException { 1146 while (true) { 1147 int c = ch; 1148 switch (c) { 1149 case -1: 1150 error("eof.literal", stack.elem.getName()); 1151 endTag(true); 1152 return; 1153 1154 case '>': 1155 ch = readCh(); 1156 int i = textpos - (stack.elem.name.length() + 2), j = 0; 1157 1158 // match end tag 1159 if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) { 1160 while ((++i < textpos) && 1161 (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++))); 1162 if (i == textpos) { 1163 textpos -= (stack.elem.name.length() + 2); 1164 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1165 textpos--; 1166 } 1167 endTag(false); 1168 return; 1169 } 1170 } 1171 break; 1172 1173 case '&': 1174 char data[] = parseEntityReference(); 1175 if (textpos + data.length > text.length) { 1176 char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)]; 1177 System.arraycopy(text, 0, newtext, 0, text.length); 1178 text = newtext; 1179 } 1180 System.arraycopy(data, 0, text, textpos, data.length); 1181 textpos += data.length; 1182 continue; 1183 1184 case '\n': 1185 ln++; 1186 ch = readCh(); 1187 lfCount++; 1188 break; 1189 1190 case '\r': 1191 ln++; 1192 if ((ch = readCh()) == '\n') { 1193 ch = readCh(); 1194 crlfCount++; 1195 } 1196 else { 1197 crCount++; 1198 } 1199 c = '\n'; 1200 break; 1201 default: 1202 ch = readCh(); 1203 break; 1204 } 1205 1206 // output character 1207 if (textpos == text.length) { 1208 char newtext[] = new char[text.length + 128]; 1209 System.arraycopy(text, 0, newtext, 0, text.length); 1210 text = newtext; 1211 } 1212 text[textpos++] = (char)c; 1213 } 1214 } 1215 1216 /** 1217 * Parse attribute value. [33] 331:1 1218 */ 1219 @SuppressWarnings("fallthrough") 1220 String parseAttributeValue(boolean lower) throws IOException { 1221 int delim = -1; 1222 1223 // Check for a delimiter 1224 switch(ch) { 1225 case '\'': 1226 case '"': 1227 delim = ch; 1228 ch = readCh(); 1229 break; 1230 } 1231 1232 // Parse the rest of the value 1233 while (true) { 1234 int c = ch; 1235 1236 switch (c) { 1237 case '\n': 1238 ln++; 1239 ch = readCh(); 1240 lfCount++; 1241 if (delim < 0) { 1242 return getString(0); 1243 } 1244 break; 1245 1246 case '\r': 1247 ln++; 1248 1249 if ((ch = readCh()) == '\n') { 1250 ch = readCh(); 1251 crlfCount++; 1252 } 1253 else { 1254 crCount++; 1255 } 1256 if (delim < 0) { 1257 return getString(0); 1258 } 1259 break; 1260 1261 case '\t': 1262 if (delim < 0) 1263 c = ' '; 1264 // Fall through 1265 case ' ': 1266 ch = readCh(); 1267 if (delim < 0) { 1268 return getString(0); 1269 } 1270 break; 1271 1272 case '>': 1273 case '<': 1274 if (delim < 0) { 1275 return getString(0); 1276 } 1277 ch = readCh(); 1278 break; 1279 1280 case '\'': 1281 case '"': 1282 ch = readCh(); 1283 if (c == delim) { 1284 return getString(0); 1285 } else if (delim == -1) { 1286 error("attvalerr"); 1287 if (strict || ch == ' ') { 1288 return getString(0); 1289 } else { 1290 continue; 1291 } 1292 } 1293 break; 1294 1295 case '=': 1296 if (delim < 0) { 1297 /* In SGML a construct like <img src=/cgi-bin/foo?x=1> 1298 is considered invalid since an = sign can only be contained 1299 in an attributes value if the string is quoted. 1300 */ 1301 error("attvalerr"); 1302 /* If strict is true then we return with the string we have thus far. 1303 Otherwise we accept the = sign as part of the attribute's value and 1304 process the rest of the img tag. */ 1305 if (strict) { 1306 return getString(0); 1307 } 1308 } 1309 ch = readCh(); 1310 break; 1311 1312 case '&': 1313 if (strict && delim < 0) { 1314 ch = readCh(); 1315 break; 1316 } 1317 1318 char data[] = parseEntityReference(); 1319 for (int i = 0 ; i < data.length ; i++) { 1320 c = data[i]; 1321 addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c); 1322 } 1323 continue; 1324 1325 case -1: 1326 return getString(0); 1327 1328 default: 1329 if (lower && (c >= 'A') && (c <= 'Z')) { 1330 c = 'a' + c - 'A'; 1331 } 1332 ch = readCh(); 1333 break; 1334 } 1335 addString(c); 1336 } 1337 } 1338 1339 1340 /** 1341 * Parse attribute specification List. [31] 327:17 1342 */ 1343 void parseAttributeSpecificationList(Element elem) throws IOException { 1344 1345 while (true) { 1346 skipSpace(); 1347 1348 switch (ch) { 1349 case '/': 1350 case '>': 1351 case '<': 1352 case -1: 1353 return; 1354 1355 case '-': 1356 if ((ch = readCh()) == '-') { 1357 ch = readCh(); 1358 parseComment(); 1359 strpos = 0; 1360 } else { 1361 error("invalid.tagchar", "-", elem.getName()); 1362 ch = readCh(); 1363 } 1364 continue; 1365 } 1366 1367 AttributeList att; 1368 String attname; 1369 String attvalue; 1370 1371 if (parseIdentifier(true)) { 1372 attname = getString(0); 1373 skipSpace(); 1374 if (ch == '=') { 1375 ch = readCh(); 1376 skipSpace(); 1377 att = elem.getAttribute(attname); 1378 // Bug ID 4102750 1379 // Load the NAME of an Attribute Case Sensitive 1380 // The case of the NAME must be intact 1381 // MG 021898 1382 attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME)); 1383 // attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION)); 1384 } else { 1385 attvalue = attname; 1386 att = elem.getAttributeByValue(attvalue); 1387 if (att == null) { 1388 att = elem.getAttribute(attname); 1389 if (att != null) { 1390 attvalue = att.getValue(); 1391 } 1392 else { 1393 // Make it null so that NULL_ATTRIBUTE_VALUE is 1394 // used 1395 attvalue = null; 1396 } 1397 } 1398 } 1399 } else if (!strict && ch == ',') { // allows for comma separated attribute-value pairs 1400 ch = readCh(); 1401 continue; 1402 } else if (!strict && ch == '"') { // allows for quoted attributes 1403 ch = readCh(); 1404 skipSpace(); 1405 if (parseIdentifier(true)) { 1406 attname = getString(0); 1407 if (ch == '"') { 1408 ch = readCh(); 1409 } 1410 skipSpace(); 1411 if (ch == '=') { 1412 ch = readCh(); 1413 skipSpace(); 1414 att = elem.getAttribute(attname); 1415 attvalue = parseAttributeValue((att != null) && 1416 (att.type != CDATA) && 1417 (att.type != NOTATION)); 1418 } else { 1419 attvalue = attname; 1420 att = elem.getAttributeByValue(attvalue); 1421 if (att == null) { 1422 att = elem.getAttribute(attname); 1423 if (att != null) { 1424 attvalue = att.getValue(); 1425 } 1426 } 1427 } 1428 } else { 1429 char str[] = {(char)ch}; 1430 error("invalid.tagchar", new String(str), elem.getName()); 1431 ch = readCh(); 1432 continue; 1433 } 1434 } else if (!strict && (attributes.isEmpty()) && (ch == '=')) { 1435 ch = readCh(); 1436 skipSpace(); 1437 attname = elem.getName(); 1438 att = elem.getAttribute(attname); 1439 attvalue = parseAttributeValue((att != null) && 1440 (att.type != CDATA) && 1441 (att.type != NOTATION)); 1442 } else if (!strict && (ch == '=')) { 1443 ch = readCh(); 1444 skipSpace(); 1445 attvalue = parseAttributeValue(true); 1446 error("attvalerr"); 1447 return; 1448 } else { 1449 char str[] = {(char)ch}; 1450 error("invalid.tagchar", new String(str), elem.getName()); 1451 if (!strict) { 1452 ch = readCh(); 1453 continue; 1454 } else { 1455 return; 1456 } 1457 } 1458 1459 if (att != null) { 1460 attname = att.getName(); 1461 } else { 1462 error("invalid.tagatt", attname, elem.getName()); 1463 } 1464 1465 // Check out the value 1466 if (attributes.isDefined(attname)) { 1467 error("multi.tagatt", attname, elem.getName()); 1468 } 1469 if (attvalue == null) { 1470 attvalue = ((att != null) && (att.value != null)) ? att.value : 1471 HTML.NULL_ATTRIBUTE_VALUE; 1472 } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) { 1473 error("invalid.tagattval", attname, elem.getName()); 1474 } 1475 HTML.Attribute attkey = HTML.getAttributeKey(attname); 1476 if (attkey == null) { 1477 attributes.addAttribute(attname, attvalue); 1478 } else { 1479 attributes.addAttribute(attkey, attvalue); 1480 } 1481 } 1482 } 1483 1484 /** 1485 * Parses th Document Declaration Type markup declaration. 1486 * Currently ignores it. 1487 */ 1488 public String parseDTDMarkup() throws IOException { 1489 1490 StringBuilder strBuff = new StringBuilder(); 1491 ch = readCh(); 1492 while(true) { 1493 switch (ch) { 1494 case '>': 1495 ch = readCh(); 1496 return strBuff.toString(); 1497 case -1: 1498 error("invalid.markup"); 1499 return strBuff.toString(); 1500 case '\n': 1501 ln++; 1502 ch = readCh(); 1503 lfCount++; 1504 break; 1505 case '"': 1506 ch = readCh(); 1507 break; 1508 case '\r': 1509 ln++; 1510 if ((ch = readCh()) == '\n') { 1511 ch = readCh(); 1512 crlfCount++; 1513 } 1514 else { 1515 crCount++; 1516 } 1517 break; 1518 default: 1519 strBuff.append((char)(ch & 0xFF)); 1520 ch = readCh(); 1521 break; 1522 } 1523 } 1524 } 1525 1526 /** 1527 * Parse markup declarations. 1528 * Currently only handles the Document Type Declaration markup. 1529 * Returns true if it is a markup declaration false otherwise. 1530 */ 1531 protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException { 1532 1533 /* Currently handles only the DOCTYPE */ 1534 if ((strBuff.length() == "DOCTYPE".length()) && 1535 (strBuff.toString().toUpperCase().equals("DOCTYPE"))) { 1536 parseDTDMarkup(); 1537 return true; 1538 } 1539 return false; 1540 } 1541 1542 /** 1543 * Parse an invalid tag. 1544 */ 1545 void parseInvalidTag() throws IOException { 1546 // ignore all data upto the close bracket '>' 1547 while (true) { 1548 skipSpace(); 1549 switch (ch) { 1550 case '>': 1551 case -1: 1552 ch = readCh(); 1553 return; 1554 case '<': 1555 return; 1556 default: 1557 ch = readCh(); 1558 1559 } 1560 } 1561 } 1562 1563 /** 1564 * Parse a start or end tag. 1565 */ 1566 @SuppressWarnings("fallthrough") 1567 void parseTag() throws IOException { 1568 Element elem; 1569 boolean net = false; 1570 boolean warned = false; 1571 boolean unknown = false; 1572 1573 switch (ch = readCh()) { 1574 case '!': 1575 switch (ch = readCh()) { 1576 case '-': 1577 // Parse comment. [92] 391:7 1578 while (true) { 1579 if (ch == '-') { 1580 if (!strict || ((ch = readCh()) == '-')) { 1581 ch = readCh(); 1582 if (!strict && ch == '-') { 1583 ch = readCh(); 1584 } 1585 // send over any text you might see 1586 // before parsing and sending the 1587 // comment 1588 if (textpos != 0) { 1589 char newtext[] = new char[textpos]; 1590 System.arraycopy(text, 0, newtext, 0, textpos); 1591 handleText(newtext); 1592 lastBlockStartPos = currentBlockStartPos; 1593 textpos = 0; 1594 } 1595 parseComment(); 1596 last = makeTag(dtd.getElement("comment"), true); 1597 handleComment(getChars(0)); 1598 continue; 1599 } else if (!warned) { 1600 warned = true; 1601 error("invalid.commentchar", "-"); 1602 } 1603 } 1604 skipSpace(); 1605 switch (ch) { 1606 case '-': 1607 continue; 1608 case '>': 1609 ch = readCh(); 1610 return; 1611 case -1: 1612 return; 1613 default: 1614 ch = readCh(); 1615 if (!warned) { 1616 warned = true; 1617 error("invalid.commentchar", 1618 String.valueOf((char)ch)); 1619 } 1620 break; 1621 } 1622 } 1623 1624 default: 1625 // deal with marked sections 1626 StringBuffer strBuff = new StringBuffer(); 1627 while (true) { 1628 strBuff.append((char)ch); 1629 if (parseMarkupDeclarations(strBuff)) { 1630 return; 1631 } 1632 switch(ch) { 1633 case '>': 1634 ch = readCh(); 1635 // Fall through 1636 case -1: 1637 error("invalid.markup"); 1638 return; 1639 case '\n': 1640 ln++; 1641 ch = readCh(); 1642 lfCount++; 1643 break; 1644 case '\r': 1645 ln++; 1646 if ((ch = readCh()) == '\n') { 1647 ch = readCh(); 1648 crlfCount++; 1649 } 1650 else { 1651 crCount++; 1652 } 1653 break; 1654 1655 default: 1656 ch = readCh(); 1657 break; 1658 } 1659 } 1660 } 1661 1662 case '/': 1663 // parse end tag [19] 317:4 1664 switch (ch = readCh()) { 1665 case '>': 1666 ch = readCh(); 1667 // Fall through 1668 case '<': 1669 // empty end tag. either </> or </< 1670 if (recent == null) { 1671 error("invalid.shortend"); 1672 return; 1673 } 1674 elem = recent; 1675 break; 1676 1677 default: 1678 if (!parseIdentifier(true)) { 1679 error("expected.endtagname"); 1680 return; 1681 } 1682 skipSpace(); 1683 switch (ch) { 1684 case '>': 1685 ch = readCh(); 1686 break; 1687 case '<': 1688 break; 1689 1690 default: 1691 error("expected", "'>'"); 1692 while ((ch != -1) && (ch != '\n') && (ch != '>')) { 1693 ch = readCh(); 1694 } 1695 if (ch == '>') { 1696 ch = readCh(); 1697 } 1698 break; 1699 } 1700 String elemStr = getString(0); 1701 if (!dtd.elementExists(elemStr)) { 1702 error("end.unrecognized", elemStr); 1703 // Ignore RE before end tag 1704 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1705 textpos--; 1706 } 1707 elem = dtd.getElement("unknown"); 1708 elem.name = elemStr; 1709 unknown = true; 1710 } else { 1711 elem = dtd.getElement(elemStr); 1712 } 1713 break; 1714 } 1715 1716 1717 // If the stack is null, we're seeing end tags without any begin 1718 // tags. Ignore them. 1719 1720 if (stack == null) { 1721 error("end.extra.tag", elem.getName()); 1722 return; 1723 } 1724 1725 // Ignore RE before end tag 1726 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1727 // In a pre tag, if there are blank lines 1728 // we do not want to remove the newline 1729 // before the end tag. Hence this code. 1730 // 1731 if (stack.pre) { 1732 if ((textpos > 1) && (text[textpos-2] != '\n')) { 1733 textpos--; 1734 } 1735 } else { 1736 textpos--; 1737 } 1738 } 1739 1740 // If the end tag is a form, since we did not put it 1741 // on the tag stack, there is no corresponding start 1742 // start tag to find. Hence do not touch the tag stack. 1743 // 1744 1745 /* 1746 if (!strict && elem.getName().equals("form")) { 1747 if (lastFormSent != null) { 1748 handleEndTag(lastFormSent); 1749 return; 1750 } else { 1751 // do nothing. 1752 return; 1753 } 1754 } 1755 */ 1756 1757 if (unknown) { 1758 // we will not see a corresponding start tag 1759 // on the the stack. If we are seeing an 1760 // end tag, lets send this on as an empty 1761 // tag with the end tag attribute set to 1762 // true. 1763 TagElement t = makeTag(elem); 1764 handleText(t); 1765 attributes.addAttribute(HTML.Attribute.ENDTAG, "true"); 1766 handleEmptyTag(makeTag(elem)); 1767 unknown = false; 1768 return; 1769 } 1770 1771 // find the corresponding start tag 1772 1773 // A commonly occurring error appears to be the insertion 1774 // of extra end tags in a table. The intent here is ignore 1775 // such extra end tags. 1776 // 1777 if (!strict) { 1778 String stackElem = stack.elem.getName(); 1779 1780 if (stackElem.equals("table")) { 1781 // If it is not a valid end tag ignore it and return 1782 // 1783 if (!elem.getName().equals(stackElem)) { 1784 error("tag.ignore", elem.getName()); 1785 return; 1786 } 1787 } 1788 1789 1790 1791 if (stackElem.equals("tr") || 1792 stackElem.equals("td")) { 1793 if ((!elem.getName().equals("table")) && 1794 (!elem.getName().equals(stackElem))) { 1795 error("tag.ignore", elem.getName()); 1796 return; 1797 } 1798 } 1799 } 1800 TagStack sp = stack; 1801 1802 while ((sp != null) && (elem != sp.elem)) { 1803 sp = sp.next; 1804 } 1805 if (sp == null) { 1806 error("unmatched.endtag", elem.getName()); 1807 return; 1808 } 1809 1810 // People put font ending tags in the darndest places. 1811 // Don't close other contexts based on them being between 1812 // a font tag and the corresponding end tag. Instead, 1813 // ignore the end tag like it doesn't exist and allow the end 1814 // of the document to close us out. 1815 String elemName = elem.getName(); 1816 if (stack != sp && 1817 (elemName.equals("font") || 1818 elemName.equals("center"))) { 1819 1820 // Since closing out a center tag can have real wierd 1821 // effects on the formatting, make sure that tags 1822 // for which omitting an end tag is legimitate 1823 // get closed out. 1824 // 1825 if (elemName.equals("center")) { 1826 while(stack.elem.omitEnd() && stack != sp) { 1827 endTag(true); 1828 } 1829 if (stack.elem == elem) { 1830 endTag(false); 1831 } 1832 } 1833 return; 1834 } 1835 // People do the same thing with center tags. In this 1836 // case we would like to close off the center tag but 1837 // not necessarily all enclosing tags. 1838 1839 1840 1841 // end tags 1842 while (stack != sp) { 1843 endTag(true); 1844 } 1845 1846 endTag(false); 1847 return; 1848 1849 case -1: 1850 error("eof"); 1851 return; 1852 } 1853 1854 // start tag [14] 314:1 1855 if (!parseIdentifier(true)) { 1856 elem = recent; 1857 if ((ch != '>') || (elem == null)) { 1858 error("expected.tagname"); 1859 return; 1860 } 1861 } else { 1862 String elemStr = getString(0); 1863 1864 if (elemStr.equals("image")) { 1865 elemStr = "img"; 1866 } 1867 1868 /* determine if this element is part of the dtd. */ 1869 1870 if (!dtd.elementExists(elemStr)) { 1871 // parseInvalidTag(); 1872 error("tag.unrecognized ", elemStr); 1873 elem = dtd.getElement("unknown"); 1874 elem.name = elemStr; 1875 unknown = true; 1876 } else { 1877 elem = dtd.getElement(elemStr); 1878 } 1879 } 1880 1881 // Parse attributes 1882 parseAttributeSpecificationList(elem); 1883 1884 switch (ch) { 1885 case '/': 1886 net = true; 1887 // Fall through 1888 case '>': 1889 ch = readCh(); 1890 if (ch == '>' && net) { 1891 ch = readCh(); 1892 } 1893 case '<': 1894 break; 1895 1896 default: 1897 error("expected", "'>'"); 1898 break; 1899 } 1900 1901 if (!strict) { 1902 if (elem.getName().equals("script")) { 1903 error("javascript.unsupported"); 1904 } 1905 } 1906 1907 // ignore RE after start tag 1908 // 1909 if (!elem.isEmpty()) { 1910 if (ch == '\n') { 1911 ln++; 1912 lfCount++; 1913 ch = readCh(); 1914 } else if (ch == '\r') { 1915 ln++; 1916 if ((ch = readCh()) == '\n') { 1917 ch = readCh(); 1918 crlfCount++; 1919 } 1920 else { 1921 crCount++; 1922 } 1923 } 1924 } 1925 1926 // ensure a legal context for the tag 1927 TagElement tag = makeTag(elem, false); 1928 1929 1930 /** In dealing with forms, we have decided to treat 1931 them as legal in any context. Also, even though 1932 they do have a start and an end tag, we will 1933 not put this tag on the stack. This is to deal 1934 several pages in the web oasis that choose to 1935 start and end forms in any possible location. **/ 1936 1937 /* 1938 if (!strict && elem.getName().equals("form")) { 1939 if (lastFormSent == null) { 1940 lastFormSent = tag; 1941 } else { 1942 handleEndTag(lastFormSent); 1943 lastFormSent = tag; 1944 } 1945 } else { 1946 */ 1947 // Smlly, if a tag is unknown, we will apply 1948 // no legalTagContext logic to it. 1949 // 1950 if (!unknown) { 1951 legalTagContext(tag); 1952 1953 // If skip tag is true, this implies that 1954 // the tag was illegal and that the error 1955 // recovery strategy adopted is to ignore 1956 // the tag. 1957 if (!strict && skipTag) { 1958 skipTag = false; 1959 return; 1960 } 1961 } 1962 /* 1963 } 1964 */ 1965 1966 startTag(tag); 1967 1968 if (!elem.isEmpty()) { 1969 switch (elem.getType()) { 1970 case CDATA: 1971 parseLiteral(false); 1972 break; 1973 case RCDATA: 1974 parseLiteral(true); 1975 break; 1976 default: 1977 if (stack != null) { 1978 stack.net = net; 1979 } 1980 break; 1981 } 1982 } 1983 } 1984 1985 private static final String START_COMMENT = "<!--"; 1986 private static final String END_COMMENT = "-->"; 1987 private static final char[] SCRIPT_END_TAG = "</script>".toCharArray(); 1988 private static final char[] SCRIPT_END_TAG_UPPER_CASE = 1989 "</SCRIPT>".toCharArray(); 1990 1991 void parseScript() throws IOException { 1992 char[] charsToAdd = new char[SCRIPT_END_TAG.length]; 1993 boolean insideComment = false; 1994 1995 /* Here, ch should be the first character after <script> */ 1996 while (true) { 1997 int i = 0; 1998 while (!insideComment && i < SCRIPT_END_TAG.length 1999 && (SCRIPT_END_TAG[i] == ch 2000 || SCRIPT_END_TAG_UPPER_CASE[i] == ch)) { 2001 charsToAdd[i] = (char) ch; 2002 ch = readCh(); 2003 i++; 2004 } 2005 if (i == SCRIPT_END_TAG.length) { 2006 2007 /* '</script>' tag detected */ 2008 /* Here, ch == the first character after </script> */ 2009 return; 2010 } else { 2011 2012 /* To account for extra read()'s that happened */ 2013 for (int j = 0; j < i; j++) { 2014 addString(charsToAdd[j]); 2015 } 2016 2017 switch (ch) { 2018 case -1: 2019 error("eof.script"); 2020 return; 2021 case '\n': 2022 ln++; 2023 ch = readCh(); 2024 lfCount++; 2025 addString('\n'); 2026 break; 2027 case '\r': 2028 ln++; 2029 if ((ch = readCh()) == '\n') { 2030 ch = readCh(); 2031 crlfCount++; 2032 } else { 2033 crCount++; 2034 } 2035 addString('\n'); 2036 break; 2037 default: 2038 addString(ch); 2039 String str = new String(getChars(0, strpos)); 2040 if (!insideComment && str.endsWith(START_COMMENT)) { 2041 insideComment = true; 2042 } 2043 if (insideComment && str.endsWith(END_COMMENT)) { 2044 insideComment = false; 2045 } 2046 ch = readCh(); 2047 break; 2048 } // switch 2049 } 2050 } // while 2051 } 2052 2053 /** 2054 * Parse Content. [24] 320:1 2055 */ 2056 void parseContent() throws IOException { 2057 Thread curThread = Thread.currentThread(); 2058 2059 for (;;) { 2060 if (curThread.isInterrupted()) { 2061 curThread.interrupt(); // resignal the interrupt 2062 break; 2063 } 2064 2065 int c = ch; 2066 currentBlockStartPos = currentPosition; 2067 2068 if (recent == dtd.script) { // means: if after starting <script> tag 2069 2070 /* Here, ch has to be the first character after <script> */ 2071 parseScript(); 2072 last = makeTag(dtd.getElement("comment"), true); 2073 2074 /* Remove leading and trailing HTML comment declarations */ 2075 String str = new String(getChars(0)).trim(); 2076 int minLength = START_COMMENT.length() + END_COMMENT.length(); 2077 if (str.startsWith(START_COMMENT) && str.endsWith(END_COMMENT) 2078 && str.length() >= (minLength)) { 2079 str = str.substring(START_COMMENT.length(), 2080 str.length() - END_COMMENT.length()); 2081 } 2082 2083 /* Handle resulting chars as comment */ 2084 handleComment(str.toCharArray()); 2085 endTag(false); 2086 lastBlockStartPos = currentPosition; 2087 2088 continue; 2089 } else { 2090 switch (c) { 2091 case '<': 2092 parseTag(); 2093 lastBlockStartPos = currentPosition; 2094 continue; 2095 2096 case '/': 2097 ch = readCh(); 2098 if ((stack != null) && stack.net) { 2099 // null end tag. 2100 endTag(false); 2101 continue; 2102 } else if (textpos == 0) { 2103 if (!legalElementContext(dtd.pcdata)) { 2104 error("unexpected.pcdata"); 2105 } 2106 if (last.breaksFlow()) { 2107 space = false; 2108 } 2109 } 2110 break; 2111 2112 case -1: 2113 return; 2114 2115 case '&': 2116 if (textpos == 0) { 2117 if (!legalElementContext(dtd.pcdata)) { 2118 error("unexpected.pcdata"); 2119 } 2120 if (last.breaksFlow()) { 2121 space = false; 2122 } 2123 } 2124 char data[] = parseEntityReference(); 2125 if (textpos + data.length + 1 > text.length) { 2126 char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)]; 2127 System.arraycopy(text, 0, newtext, 0, text.length); 2128 text = newtext; 2129 } 2130 if (space) { 2131 space = false; 2132 text[textpos++] = ' '; 2133 } 2134 System.arraycopy(data, 0, text, textpos, data.length); 2135 textpos += data.length; 2136 ignoreSpace = false; 2137 continue; 2138 2139 case '\n': 2140 ln++; 2141 lfCount++; 2142 ch = readCh(); 2143 if ((stack != null) && stack.pre) { 2144 break; 2145 } 2146 if (textpos == 0) { 2147 lastBlockStartPos = currentPosition; 2148 } 2149 if (!ignoreSpace) { 2150 space = true; 2151 } 2152 continue; 2153 2154 case '\r': 2155 ln++; 2156 c = '\n'; 2157 if ((ch = readCh()) == '\n') { 2158 ch = readCh(); 2159 crlfCount++; 2160 } 2161 else { 2162 crCount++; 2163 } 2164 if ((stack != null) && stack.pre) { 2165 break; 2166 } 2167 if (textpos == 0) { 2168 lastBlockStartPos = currentPosition; 2169 } 2170 if (!ignoreSpace) { 2171 space = true; 2172 } 2173 continue; 2174 2175 2176 case '\t': 2177 case ' ': 2178 ch = readCh(); 2179 if ((stack != null) && stack.pre) { 2180 break; 2181 } 2182 if (textpos == 0) { 2183 lastBlockStartPos = currentPosition; 2184 } 2185 if (!ignoreSpace) { 2186 space = true; 2187 } 2188 continue; 2189 2190 default: 2191 if (textpos == 0) { 2192 if (!legalElementContext(dtd.pcdata)) { 2193 error("unexpected.pcdata"); 2194 } 2195 if (last.breaksFlow()) { 2196 space = false; 2197 } 2198 } 2199 ch = readCh(); 2200 break; 2201 } 2202 } 2203 2204 // enlarge buffer if needed 2205 if (textpos + 2 > text.length) { 2206 char newtext[] = new char[text.length + 128]; 2207 System.arraycopy(text, 0, newtext, 0, text.length); 2208 text = newtext; 2209 } 2210 2211 // output pending space 2212 if (space) { 2213 if (textpos == 0) { 2214 lastBlockStartPos--; 2215 } 2216 text[textpos++] = ' '; 2217 space = false; 2218 } 2219 text[textpos++] = (char)c; 2220 ignoreSpace = false; 2221 } 2222 } 2223 2224 /** 2225 * Returns the end of line string. This will return the end of line 2226 * string that has been encountered the most, one of \r, \n or \r\n. 2227 */ 2228 String getEndOfLineString() { 2229 if (crlfCount >= crCount) { 2230 if (lfCount >= crlfCount) { 2231 return "\n"; 2232 } 2233 else { 2234 return "\r\n"; 2235 } 2236 } 2237 else { 2238 if (crCount > lfCount) { 2239 return "\r"; 2240 } 2241 else { 2242 return "\n"; 2243 } 2244 } 2245 } 2246 2247 /** 2248 * Parse an HTML stream, given a DTD. 2249 */ 2250 public synchronized void parse(Reader in) throws IOException { 2251 this.in = in; 2252 2253 this.ln = 1; 2254 2255 seenHtml = false; 2256 seenHead = false; 2257 seenBody = false; 2258 2259 crCount = lfCount = crlfCount = 0; 2260 2261 try { 2262 ch = readCh(); 2263 text = new char[1024]; 2264 str = new char[128]; 2265 2266 parseContent(); 2267 // NOTE: interruption may have occurred. Control flows out 2268 // of here normally. 2269 while (stack != null) { 2270 endTag(true); 2271 } 2272 in.close(); 2273 } catch (IOException e) { 2274 errorContext(); 2275 error("ioexception"); 2276 throw e; 2277 } catch (Exception e) { 2278 errorContext(); 2279 error("exception", e.getClass().getName(), e.getMessage()); 2280 e.printStackTrace(); 2281 } catch (ThreadDeath e) { 2282 errorContext(); 2283 error("terminated"); 2284 e.printStackTrace(); 2285 throw e; 2286 } finally { 2287 for (; stack != null ; stack = stack.next) { 2288 handleEndTag(stack.tag); 2289 } 2290 2291 text = null; 2292 str = null; 2293 } 2294 2295 } 2296 2297 2298 /* 2299 * Input cache. This is much faster than calling down to a synchronized 2300 * method of BufferedReader for each byte. Measurements done 5/30/97 2301 * show that there's no point in having a bigger buffer: Increasing 2302 * the buffer to 8192 had no measurable impact for a program discarding 2303 * one character at a time (reading from an http URL to a local machine). 2304 * NOTE: If the current encoding is bogus, and we read too much 2305 * (past the content-type) we may suffer a MalformedInputException. For 2306 * this reason the initial size is 1 and when the body is encountered the 2307 * size is adjusted to 256. 2308 */ 2309 private char buf[] = new char[1]; 2310 private int pos; 2311 private int len; 2312 /* 2313 tracks position relative to the beginning of the 2314 document. 2315 */ 2316 private int currentPosition; 2317 2318 2319 private final int readCh() throws IOException { 2320 2321 if (pos >= len) { 2322 2323 // This loop allows us to ignore interrupts if the flag 2324 // says so 2325 for (;;) { 2326 try { 2327 len = in.read(buf); 2328 break; 2329 } catch (InterruptedIOException ex) { 2330 throw ex; 2331 } 2332 } 2333 2334 if (len <= 0) { 2335 return -1; // eof 2336 } 2337 pos = 0; 2338 } 2339 ++currentPosition; 2340 2341 return buf[pos++]; 2342 } 2343 2344 2345 protected int getCurrentPos() { 2346 return currentPosition; 2347 } 2348 }