1 /* 2 * Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package javax.swing.text.html.parser; 27 28 import javax.swing.text.SimpleAttributeSet; 29 import javax.swing.text.html.HTML; 30 import javax.swing.text.ChangedCharSetException; 31 import java.io.*; 32 import java.util.Hashtable; 33 import java.util.Properties; 34 import java.util.Vector; 35 import java.util.Enumeration; 36 import java.net.URL; 37 38 import sun.misc.MessageUtils; 39 40 /** 41 * A simple DTD-driven HTML parser. The parser reads an 42 * HTML file from an InputStream and calls various methods 43 * (which should be overridden in a subclass) when tags and 44 * data are encountered. 
 * <p>
 * Unfortunately there are many badly implemented HTML parsers
 * out there, and as a result there are many badly formatted
 * HTML files. This parser attempts to parse most HTML files.
 * This means that the implementation sometimes deviates from
 * the SGML specification in favor of HTML.
 * <p>
 * The parser treats \r and \r\n as \n. Newlines after starttags
 * and before end tags are ignored just as specified in the SGML/HTML
 * specification.
 * <p>
 * The html spec does not specify how spaces are to be coalesced very well.
 * Specifically, the following scenarios are not discussed (note that a
 * space should be used here, but I am using &amp;nbsp to force the space to
 * be displayed):
 * <p>
 * '&lt;b&gt;blah&nbsp;&lt;i&gt;&nbsp;&lt;strike&gt;&nbsp;foo' which can be treated as:
 * '&lt;b&gt;blah&nbsp;&lt;i&gt;&lt;strike&gt;foo'
 * <p>as well as:
 * '&lt;p&gt;&lt;a href="xx"&gt;&nbsp;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
 * which appears to be treated as:
 * '&lt;p&gt;&lt;a href="xx"&gt;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
 * <p>
 * If <code>strict</code> is false, when a tag that breaks flow,
 * (<code>TagElement.breaksFlow</code>) or trailing whitespace is
 * encountered, all whitespace will be ignored until a non whitespace
 * character is encountered. This appears to give behavior closer to
 * the popular browsers.
*
 * @see DTD
 * @see TagElement
 * @see SimpleAttributeSet
 * @author Arthur van Hoff
 * @author Sunita Mani
 */
public
class Parser implements DTDConstants {

    // Pending character data: text[0..textpos) is what handleText(TagElement)
    // copies out and hands to handleText(char[]) / handleTitle(char[]).
    private char text[] = new char[1024];
    private int textpos = 0;
    // Tag most recently recorded by handleText(TagElement) or startTag.
    private TagElement last;
    // True while a single pending space still has to be appended to the text.
    private boolean space;

    // Scratch buffer filled by addString and drained by getString/getChars;
    // str[0..strpos) is the accumulated content.
    private char str[] = new char[128];
    private int strpos = 0;

    // The DTD driving the parse; assigned by the constructor.
    protected DTD dtd = null;

    // Current input character (the parse methods assign it from readCh()).
    private int ch;
    // Current line number, reported by getCurrentLine().
    private int ln;
    // Character source being parsed.
    private Reader in;

    // Element of the tag on top of the stack (maintained by startTag/endTag).
    private Element recent;
    // Stack of currently open tags; null when nothing is open.
    private TagStack stack;
    // Result of the last ignoreElement() check made by legalElementContext.
    private boolean skipTag = false;
    // NOTE(review): not referenced in the portion of the file reviewed here.
    private TagElement lastFormSent = null;
    // Attribute set exposed through getAttributes() and cleared by
    // flushAttributes().
    private SimpleAttributeSet attributes = new SimpleAttributeSet();

    // State for <html>, <head> and <body>.  Since people like to slap
    // together HTML documents without thinking, occasionally they
    // have multiple instances of these tags.  These booleans track
    // the first sightings of these tags so they can be safely ignored
    // by the parser if repeated.
    private boolean seenHtml = false;
    private boolean seenHead = false;
    private boolean seenBody = false;

    /**
     * The html spec does not specify how spaces are coalesced very well.
     * If strict == false, ignoreSpace is used to try and mimic the behavior
     * of the popular browsers.
     * <p>
     * The problematic scenarios are:
     * '&lt;b&gt;blah&nbsp;&lt;i&gt;&nbsp;&lt;strike&gt;&nbsp;foo' which can be treated as:
     * '&lt;b&gt;blah&nbsp;&lt;i&gt;&lt;strike&gt;foo'
     * as well as:
     * '&lt;p&gt;&lt;a href="xx"&gt;&nbsp;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
     * which appears to be treated as:
     * '&lt;p&gt;&lt;a href="xx"&gt;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
     * <p>
     * When a tag that breaks flow, or trailing whitespace is encountered
     * ignoreSpace is set to true. From then on, all whitespace will be
     * ignored.
     * ignoreSpace will be set back to false the first time a
     * non whitespace character is encountered. This appears to give
     * behavior closer to the popular browsers.
     */
    private boolean ignoreSpace;

    /**
     * This flag determines whether or not the Parser will be strict
     * in enforcing SGML compatibility.  If false, it will be lenient
     * with certain common classes of erroneous HTML constructs.
     * Strict or not, in either case an error will be recorded.
     *
     */
    protected boolean strict = false;


    /** Number of \r\n's encountered. */
    private int crlfCount;
    /** Number of \r's encountered. A \r\n will not increment this. */
    private int crCount;
    /** Number of \n's encountered. A \r\n will not increment this. */
    private int lfCount;

    //
    // To correctly identify the start of a tag/comment/text we need two
    // ivars. Two are needed as handleText isn't invoked until the tag
    // after the text has been parsed, that is the parser parses the text,
    // then a tag, then invokes handleText followed by handleStart.
    //
    /** The start position of the current block. Block is overloaded here,
     * it really means the current start position for the current comment,
     * tag, text. Use getBlockStartPosition to access this. */
    private int currentBlockStartPos;
    /** Start position of the last block. */
    private int lastBlockStartPos;

    /**
     * array for mapping numeric references in range
     * 130-159 to displayable Unicode characters.
* Entry <code>i</code> holds the replacement for code <code>130 + i</code>.
     */
    private static final char[] cp1252Map = {
        8218,  // &#130;
        402,   // &#131;
        8222,  // &#132;
        8230,  // &#133;
        8224,  // &#134;
        8225,  // &#135;
        710,   // &#136;
        8240,  // &#137;
        352,   // &#138;
        8249,  // &#139;
        338,   // &#140;
        141,   // &#141;
        142,   // &#142;
        143,   // &#143;
        144,   // &#144;
        8216,  // &#145;
        8217,  // &#146;
        8220,  // &#147;
        8221,  // &#148;
        8226,  // &#149;
        8211,  // &#150;
        8212,  // &#151;
        732,   // &#152;
        8482,  // &#153;
        353,   // &#154;
        8250,  // &#155;
        339,   // &#156;
        157,   // &#157;
        158,   // &#158;
        376    // &#159;
    };

    /**
     * Creates a parser that uses the given DTD.
     *
     * @param dtd the DTD stored in the <code>dtd</code> field
     */
    public Parser(DTD dtd) {
        this.dtd = dtd;
    }


    /**
     * @return the line number of the line currently being parsed
     */
    protected int getCurrentLine() {
        return ln;
    }

    /**
     * Returns the start position of the current block. Block is
     * overloaded here, it really means the current start position for
     * the current comment tag, text, block.... This is provided for
     * subclassers that wish to know the start of the current block when
     * called with one of the handleXXX methods.
     *
     * @return <code>lastBlockStartPos - 1</code>, but never less than 0
     */
    int getBlockStartPosition() {
        return Math.max(0, lastBlockStartPos - 1);
    }

    /**
     * Makes a TagElement.
     *
     * @param elem      the element the tag is created for
     * @param fictional passed through to the TagElement constructor; the
     *                  visible callers pass true for tags the parser
     *                  inserts itself
     * @return a new TagElement
     */
    protected TagElement makeTag(Element elem, boolean fictional) {
        return new TagElement(elem, fictional);
    }

    /**
     * Makes a non-fictional TagElement; same as
     * <code>makeTag(elem, false)</code>.
     */
    protected TagElement makeTag(Element elem) {
        return makeTag(elem, false);
    }

    /**
     * @return the parser's attribute set (the live object, not a copy)
     */
    protected SimpleAttributeSet getAttributes() {
        return attributes;
    }

    /**
     * Empties the attribute set by removing all of its own attributes.
     */
    protected void flushAttributes() {
        attributes.removeAttributes(attributes);
    }

    /**
     * Called when PCDATA is encountered.
     * The default implementation does nothing.
     */
    protected void handleText(char text[]) {
    }

    /**
     * Called when an HTML title tag is encountered.
     */
    protected void handleTitle(char text[]) {
        // default behavior is to call handleText. Subclasses
        // can override if necessary.
        handleText(text);
    }

    /**
     * Called when an HTML comment is encountered.
260 */ 261 protected void handleComment(char text[]) { 262 } 263 264 protected void handleEOFInComment() { 265 // We've reached EOF. Our recovery strategy is to 266 // see if we have more than one line in the comment; 267 // if so, we pretend that the comment was an unterminated 268 // single line comment, and reparse the lines after the 269 // first line as normal HTML content. 270 271 int commentEndPos = strIndexOf('\n'); 272 if (commentEndPos >= 0) { 273 handleComment(getChars(0, commentEndPos)); 274 try { 275 in.close(); 276 in = new CharArrayReader(getChars(commentEndPos + 1)); 277 ch = '>'; 278 } catch (IOException e) { 279 error("ioexception"); 280 } 281 282 resetStrBuffer(); 283 } else { 284 // no newline, so signal an error 285 error("eof.comment"); 286 } 287 } 288 289 /** 290 * Called when an empty tag is encountered. 291 */ 292 protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException { 293 } 294 295 /** 296 * Called when a start tag is encountered. 297 */ 298 protected void handleStartTag(TagElement tag) { 299 } 300 301 /** 302 * Called when an end tag is encountered. 303 */ 304 protected void handleEndTag(TagElement tag) { 305 } 306 307 /** 308 * An error has occurred. 309 */ 310 protected void handleError(int ln, String msg) { 311 /* 312 Thread.dumpStack(); 313 System.out.println("**** " + stack); 314 System.out.println("line " + ln + ": error: " + msg); 315 System.out.println(); 316 */ 317 } 318 319 /** 320 * Output text. 
* <p>
     * Flushes the buffered characters (<code>text[0..textpos)</code>),
     * preceded by handling of any pending space, to
     * <code>handleTitle</code> when the given tag's element is named
     * "title" and to <code>handleText(char[])</code> otherwise.
     *
     * @param tag the tag being processed; it becomes <code>last</code>
     */
    void handleText(TagElement tag) {
        // A flow-breaking tag cancels the pending space and, when lenient,
        // makes the parser drop the whitespace that follows.
        if (tag.breaksFlow()) {
            space = false;
            if (!strict) {
                ignoreSpace = true;
            }
        }
        if (textpos == 0) {
            // No buffered characters: unless a lone pending space is worth
            // emitting here, just record the tag and leave.
            if ((!space) || (stack == null) || last.breaksFlow() ||
                !stack.advance(dtd.pcdata)) {
                last = tag;
                space = false;
                lastBlockStartPos = currentBlockStartPos;
                return;
            }
        }
        if (space) {
            if (!ignoreSpace) {
                // enlarge buffer if needed
                if (textpos + 1 > text.length) {
                    char newtext[] = new char[text.length + 200];
                    System.arraycopy(text, 0, newtext, 0, text.length);
                    text = newtext;
                }

                // output pending space
                text[textpos++] = ' ';
                if (!strict && !tag.getElement().isEmpty()) {
                    ignoreSpace = true;
                }
            }
            space = false;
        }
        // Hand out a copy sized exactly to the buffered content.
        char newtext[] = new char[textpos];
        System.arraycopy(text, 0, newtext, 0, textpos);
        // Handles cases of bad html where the title tag
        // was getting lost when we did error recovery.
        if (tag.getElement().getName().equals("title")) {
            handleTitle(newtext);
        } else {
            handleText(newtext);
        }
        lastBlockStartPos = currentBlockStartPos;
        textpos = 0;
        last = tag;
        space = false;
    }

    /**
     * Invoke the error handler.
     * <p>
     * The message handed to <code>handleError</code> is the error key and
     * the three arguments joined by single spaces; the line number is the
     * current value of <code>ln</code>.
     */
    protected void error(String err, String arg1, String arg2,
        String arg3) {
        handleError(ln, err + " " + arg1 + " " + arg2 + " " + arg3);
    }

    /** Same as the four-argument form with "?" for the missing argument. */
    protected void error(String err, String arg1, String arg2) {
        error(err, arg1, arg2, "?");
    }
    /** Same as the four-argument form with "?" for the missing arguments. */
    protected void error(String err, String arg1) {
        error(err, arg1, "?", "?");
    }
    /** Same as the four-argument form with "?" for every argument. */
    protected void error(String err) {
        error(err, "?", "?", "?");
    }


    /**
     * Handle a start tag. The new tag is pushed
     * onto the tag stack. The attribute list is
     * checked for required attributes.
393 */ 394 protected void startTag(TagElement tag) throws ChangedCharSetException { 395 Element elem = tag.getElement(); 396 397 // If the tag is an empty tag and texpos != 0 398 // this implies that there is text before the 399 // start tag that needs to be processed before 400 // handling the tag. 401 // 402 if (!elem.isEmpty() || 403 ((last != null) && !last.breaksFlow()) || 404 (textpos != 0)) { 405 handleText(tag); 406 } else { 407 // this variable gets updated in handleText(). 408 // Since in this case we do not call handleText() 409 // we need to update it here. 410 // 411 last = tag; 412 // Note that we should really check last.breakFlows before 413 // assuming this should be false. 414 space = false; 415 } 416 lastBlockStartPos = currentBlockStartPos; 417 418 // check required attributes 419 for (AttributeList a = elem.atts ; a != null ; a = a.next) { 420 if ((a.modifier == REQUIRED) && 421 ((attributes.isEmpty()) || 422 ((!attributes.isDefined(a.name)) && 423 (!attributes.isDefined(HTML.getAttributeKey(a.name)))))) { 424 error("req.att ", a.getName(), elem.getName()); 425 } 426 } 427 428 if (elem.isEmpty()) { 429 handleEmptyTag(tag); 430 /* 431 } else if (elem.getName().equals("form")) { 432 handleStartTag(tag); 433 */ 434 } else { 435 recent = elem; 436 stack = new TagStack(tag, stack); 437 handleStartTag(tag); 438 } 439 } 440 441 /** 442 * Handle an end tag. The end tag is popped 443 * from the tag stack. 444 */ 445 protected void endTag(boolean omitted) { 446 handleText(stack.tag); 447 448 if (omitted && !stack.elem.omitEnd()) { 449 error("end.missing", stack.elem.getName()); 450 } else if (!stack.terminate()) { 451 error("end.unexpected", stack.elem.getName()); 452 } 453 454 // handle the tag 455 handleEndTag(stack.tag); 456 stack = stack.next; 457 recent = (stack != null) ? 
stack.elem : null; 458 } 459 460 461 boolean ignoreElement(Element elem) { 462 463 String stackElement = stack.elem.getName(); 464 String elemName = elem.getName(); 465 /* We ignore all elements that are not valid in the context of 466 a table except <td>, <th> (these we handle in 467 legalElementContext()) and #pcdata. We also ignore the 468 <font> tag in the context of <ul> and <ol> We additonally 469 ignore the <meta> and the <style> tag if the body tag has 470 been seen. **/ 471 if ((elemName.equals("html") && seenHtml) || 472 (elemName.equals("head") && seenHead) || 473 (elemName.equals("body") && seenBody)) { 474 return true; 475 } 476 if (elemName.equals("dt") || elemName.equals("dd")) { 477 TagStack s = stack; 478 while (s != null && !s.elem.getName().equals("dl")) { 479 s = s.next; 480 } 481 if (s == null) { 482 return true; 483 } 484 } 485 486 if (((stackElement.equals("table")) && 487 (!elemName.equals("#pcdata")) && (!elemName.equals("input"))) || 488 ((elemName.equals("font")) && 489 (stackElement.equals("ul") || stackElement.equals("ol"))) || 490 (elemName.equals("meta") && stack != null) || 491 (elemName.equals("style") && seenBody) || 492 (stackElement.equals("table") && elemName.equals("a"))) { 493 return true; 494 } 495 return false; 496 } 497 498 499 /** 500 * Marks the first time a tag has been seen in a document 501 */ 502 503 protected void markFirstTime(Element elem) { 504 String elemName = elem.getName(); 505 if (elemName.equals("html")) { 506 seenHtml = true; 507 } else if (elemName.equals("head")) { 508 seenHead = true; 509 } else if (elemName.equals("body")) { 510 if (buf.length == 1) { 511 // Refer to note in definition of buf for details on this. 512 char[] newBuf = new char[256]; 513 514 newBuf[0] = buf[0]; 515 buf = newBuf; 516 } 517 seenBody = true; 518 } 519 } 520 521 /** 522 * Create a legal content for an element. 
523 */ 524 boolean legalElementContext(Element elem) throws ChangedCharSetException { 525 526 // System.out.println("-- legalContext -- " + elem); 527 528 // Deal with the empty stack 529 if (stack == null) { 530 // System.out.println("-- stack is empty"); 531 if (elem != dtd.html) { 532 // System.out.println("-- pushing html"); 533 startTag(makeTag(dtd.html, true)); 534 return legalElementContext(elem); 535 } 536 return true; 537 } 538 539 // Is it allowed in the current context 540 if (stack.advance(elem)) { 541 // System.out.println("-- legal context"); 542 markFirstTime(elem); 543 return true; 544 } 545 boolean insertTag = false; 546 547 // The use of all error recovery strategies are contingent 548 // on the value of the strict property. 549 // 550 // These are commonly occurring errors. if insertTag is true, 551 // then we want to adopt an error recovery strategy that 552 // involves attempting to insert an additional tag to 553 // legalize the context. The two errors addressed here 554 // are: 555 // 1) when a <td> or <th> is seen soon after a <table> tag. 556 // In this case we insert a <tr>. 557 // 2) when any other tag apart from a <tr> is seen 558 // in the context of a <tr>. In this case we would 559 // like to add a <td>. If a <tr> is seen within a 560 // <tr> context, then we will close out the current 561 // <tr>. 562 // 563 // This insertion strategy is handled later in the method. 564 // The reason for checking this now, is that in other cases 565 // we would like to apply other error recovery strategies for example 566 // ignoring tags. 567 // 568 // In certain cases it is better to ignore a tag than try to 569 // fix the situation. So the first test is to see if this 570 // is what we need to do. 
571 // 572 String stackElemName = stack.elem.getName(); 573 String elemName = elem.getName(); 574 575 576 if (!strict && 577 ((stackElemName.equals("table") && elemName.equals("td")) || 578 (stackElemName.equals("table") && elemName.equals("th")) || 579 (stackElemName.equals("tr") && !elemName.equals("tr")))){ 580 insertTag = true; 581 } 582 583 584 if (!strict && !insertTag && (stack.elem.getName() != elem.getName() || 585 elem.getName().equals("body"))) { 586 if (skipTag = ignoreElement(elem)) { 587 error("tag.ignore", elem.getName()); 588 return skipTag; 589 } 590 } 591 592 // Check for anything after the start of the table besides tr, td, th 593 // or caption, and if those aren't there, insert the <tr> and call 594 // legalElementContext again. 595 if (!strict && stackElemName.equals("table") && 596 !elemName.equals("tr") && !elemName.equals("td") && 597 !elemName.equals("th") && !elemName.equals("caption")) { 598 Element e = dtd.getElement("tr"); 599 TagElement t = makeTag(e, true); 600 legalTagContext(t); 601 startTag(t); 602 error("start.missing", elem.getName()); 603 return legalElementContext(elem); 604 } 605 606 // They try to find a legal context by checking if the current 607 // tag is valid in an enclosing context. If so 608 // close out the tags by outputing end tags and then 609 // insert the current tag. If the tags that are 610 // being closed out do not have an optional end tag 611 // specification in the DTD then an html error is 612 // reported. 613 // 614 if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) { 615 for (TagStack s = stack.next ; s != null ; s = s.next) { 616 if (s.advance(elem)) { 617 while (stack != s) { 618 endTag(true); 619 } 620 return true; 621 } 622 if (!s.terminate() || (strict && !s.elem.omitEnd())) { 623 break; 624 } 625 } 626 } 627 628 // Check if we know what tag is expected next. 629 // If so insert the tag. Report an error if the 630 // tag does not have its start tag spec in the DTD as optional. 
631 // 632 Element next = stack.first(); 633 if (next != null && (!strict || next.omitStart()) && 634 !(next==dtd.head && elem==dtd.pcdata) ) { 635 // System.out.println("-- omitting start tag: " + next); 636 TagElement t = makeTag(next, true); 637 legalTagContext(t); 638 startTag(t); 639 if (!next.omitStart()) { 640 error("start.missing", elem.getName()); 641 } 642 return legalElementContext(elem); 643 } 644 645 646 // Traverse the list of expected elements and determine if adding 647 // any of these elements would make for a legal context. 648 // 649 650 if (!strict) { 651 ContentModel content = stack.contentModel(); 652 Vector<Element> elemVec = new Vector<Element>(); 653 if (content != null) { 654 content.getElements(elemVec); 655 for (Element e : elemVec) { 656 // Ensure that this element has not been included as 657 // part of the exclusions in the DTD. 658 // 659 if (stack.excluded(e.getIndex())) { 660 continue; 661 } 662 663 boolean reqAtts = false; 664 665 for (AttributeList a = e.getAttributes(); a != null ; a = a.next) { 666 if (a.modifier == REQUIRED) { 667 reqAtts = true; 668 break; 669 } 670 } 671 // Ensure that no tag that has required attributes 672 // gets inserted. 673 // 674 if (reqAtts) { 675 continue; 676 } 677 678 ContentModel m = e.getContent(); 679 if (m != null && m.first(elem)) { 680 // System.out.println("-- adding a legal tag: " + e); 681 TagElement t = makeTag(e, true); 682 legalTagContext(t); 683 startTag(t); 684 error("start.missing", e.getName()); 685 return legalElementContext(elem); 686 } 687 } 688 } 689 } 690 691 // Check if the stack can be terminated. If so add the appropriate 692 // end tag. Report an error if the tag being ended does not have its 693 // end tag spec in the DTD as optional. 
694 // 695 if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) { 696 // System.out.println("-- omitting end tag: " + stack.elem); 697 if (!stack.elem.omitEnd()) { 698 error("end.missing", elem.getName()); 699 } 700 701 endTag(true); 702 return legalElementContext(elem); 703 } 704 705 // At this point we know that something is screwed up. 706 return false; 707 } 708 709 /** 710 * Create a legal context for a tag. 711 */ 712 void legalTagContext(TagElement tag) throws ChangedCharSetException { 713 if (legalElementContext(tag.getElement())) { 714 markFirstTime(tag.getElement()); 715 return; 716 } 717 718 // Avoid putting a block tag in a flow tag. 719 if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) { 720 endTag(true); 721 legalTagContext(tag); 722 return; 723 } 724 725 // Avoid putting something wierd in the head of the document. 726 for (TagStack s = stack ; s != null ; s = s.next) { 727 if (s.tag.getElement() == dtd.head) { 728 while (stack != s) { 729 endTag(true); 730 } 731 endTag(true); 732 legalTagContext(tag); 733 return; 734 } 735 } 736 737 // Everything failed 738 error("tag.unexpected", tag.getElement().getName()); 739 } 740 741 /** 742 * Error context. Something went wrong, make sure we are in 743 * the document's body context 744 */ 745 void errorContext() throws ChangedCharSetException { 746 for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) { 747 handleEndTag(stack.tag); 748 } 749 if (stack == null) { 750 legalElementContext(dtd.body); 751 startTag(makeTag(dtd.body, true)); 752 } 753 } 754 755 /** 756 * Add a char to the string buffer. 757 */ 758 void addString(int c) { 759 if (strpos == str.length) { 760 char newstr[] = new char[str.length + 128]; 761 System.arraycopy(str, 0, newstr, 0, str.length); 762 str = newstr; 763 } 764 str[strpos++] = (char)c; 765 } 766 767 /** 768 * Get the string that's been accumulated. 
769 */ 770 String getString(int pos) { 771 char newStr[] = new char[strpos - pos]; 772 System.arraycopy(str, pos, newStr, 0, strpos - pos); 773 strpos = pos; 774 return new String(newStr); 775 } 776 777 char[] getChars(int pos) { 778 char newStr[] = new char[strpos - pos]; 779 System.arraycopy(str, pos, newStr, 0, strpos - pos); 780 strpos = pos; 781 return newStr; 782 } 783 784 char[] getChars(int pos, int endPos) { 785 char newStr[] = new char[endPos - pos]; 786 System.arraycopy(str, pos, newStr, 0, endPos - pos); 787 // REMIND: it's not clear whether this version should set strpos or not 788 // strpos = pos; 789 return newStr; 790 } 791 792 void resetStrBuffer() { 793 strpos = 0; 794 } 795 796 int strIndexOf(char target) { 797 for (int i = 0; i < strpos; i++) { 798 if (str[i] == target) { 799 return i; 800 } 801 } 802 803 return -1; 804 } 805 806 /** 807 * Skip space. 808 * [5] 297:5 809 */ 810 void skipSpace() throws IOException { 811 while (true) { 812 switch (ch) { 813 case '\n': 814 ln++; 815 ch = readCh(); 816 lfCount++; 817 break; 818 819 case '\r': 820 ln++; 821 if ((ch = readCh()) == '\n') { 822 ch = readCh(); 823 crlfCount++; 824 } 825 else { 826 crCount++; 827 } 828 break; 829 case ' ': 830 case '\t': 831 ch = readCh(); 832 break; 833 834 default: 835 return; 836 } 837 } 838 } 839 840 /** 841 * Parse identifier. Uppercase characters are folded 842 * to lowercase when lower is true. Returns falsed if 843 * no identifier is found. 
[55] 346:17 844 */ 845 boolean parseIdentifier(boolean lower) throws IOException { 846 switch (ch) { 847 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 848 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 849 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 850 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 851 case 'Y': case 'Z': 852 if (lower) { 853 ch = 'a' + (ch - 'A'); 854 } 855 856 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 857 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 858 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 859 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 860 case 'y': case 'z': 861 break; 862 863 default: 864 return false; 865 } 866 867 while (true) { 868 addString(ch); 869 870 switch (ch = readCh()) { 871 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 872 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 873 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 874 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 875 case 'Y': case 'Z': 876 if (lower) { 877 ch = 'a' + (ch - 'A'); 878 } 879 880 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 881 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 882 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 883 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 884 case 'y': case 'z': 885 886 case '0': case '1': case '2': case '3': case '4': 887 case '5': case '6': case '7': case '8': case '9': 888 889 case '.': case '-': 890 891 case '_': // not officially allowed 892 break; 893 894 default: 895 return true; 896 } 897 } 898 } 899 900 /** 901 * Parse an entity reference. 
[59] 350:17
     * <p>
     * Entered with <code>ch</code> on the '&amp;'. Handles decimal and
     * hexadecimal numeric references as well as named entities looked up
     * in the DTD.
     *
     * @return the replacement characters; when the reference cannot be
     *         resolved, its text is returned so that it shows up unchanged
     */
    private char[] parseEntityReference() throws IOException {
        // Remember where the entity name starts in the string buffer.
        int pos = strpos;

        if ((ch = readCh()) == '#') {
            int n = 0;
            ch = readCh();
            if ((ch >= '0') && (ch <= '9') ||
                    ch == 'x' || ch == 'X') {

                if ((ch >= '0') && (ch <= '9')) {
                    // parse decimal reference
                    while ((ch >= '0') && (ch <= '9')) {
                        n = (n * 10) + ch - '0';
                        ch = readCh();
                    }
                } else {
                    // parse hexadecimal reference
                    ch = readCh();
                    char lch = (char) Character.toLowerCase(ch);
                    while ((lch >= '0') && (lch <= '9') ||
                            (lch >= 'a') && (lch <= 'f')) {
                        if (lch >= '0' && lch <= '9') {
                            n = (n * 16) + lch - '0';
                        } else {
                            n = (n * 16) + lch - 'a' + 10;
                        }
                        ch = readCh();
                        lch = (char) Character.toLowerCase(ch);
                    }
                }
                // Swallow the terminator: a line end or ';'.
                switch (ch) {
                    case '\n':
                        ln++;
                        ch = readCh();
                        lfCount++;
                        break;

                    case '\r':
                        ln++;
                        if ((ch = readCh()) == '\n') {
                            ch = readCh();
                            crlfCount++;
                        }
                        else {
                            crCount++;
                        }
                        break;

                    case ';':
                        ch = readCh();
                        break;
                }
                char data[] = mapNumericReference(n);
                return data;
            }
            // "&#" followed by something that is not a number.
            addString('#');
            if (!parseIdentifier(false)) {
                error("ident.expected");
                strpos = pos;
                char data[] = {'&', '#'};
                return data;
            }
        } else if (!parseIdentifier(false)) {
            // A lone '&' is passed through as text.
            char data[] = {'&'};
            return data;
        }

        boolean semicolon = false;

        switch (ch) {
          case '\n':
            ln++;
            ch = readCh();
            lfCount++;
            break;

          case '\r':
            ln++;
            if ((ch = readCh()) == '\n') {
                ch = readCh();
                crlfCount++;
            }
            else {
                crCount++;
            }
            break;

          case ';':
            semicolon = true;

            ch = readCh();
            break;
        }

        String nm = getString(pos);
        Entity ent = dtd.getEntity(nm);

        // entities are case sensitive - however if strict
        // is false then we will try to make a match by
        // converting the string to all lowercase.
        //
        if (!strict && (ent == null)) {
            ent = dtd.getEntity(nm.toLowerCase());
        }
        if ((ent == null) || !ent.isGeneral()) {

            if (nm.length() == 0) {
                error("invalid.entref", nm);
                return new char[0];
            }
            /* given that there is not a match restore the entity reference */
            String str = "&" + nm + (semicolon ? ";" : "");

            char b[] = new char[str.length()];
            str.getChars(0, b.length, b, 0);
            return b;
        }
        return ent.getData();
    }

    /**
     * Converts numeric character reference to char array.
     *
     * Normally the code in a reference should be always converted
     * to the Unicode character with the same code, but due to
     * wide usage of Cp1252 charset most browsers map numeric references
     * in the range 130-159 (which are control chars in Unicode set)
     * to displayable characters with other codes.
     *
     * @param c the code of numeric character reference.
     * @return a char array corresponding to the reference code; empty when
     *         <code>Character.toChars</code> rejects the code.
     */
    private char[] mapNumericReference(int c) {
        char[] data;
        // Codes from 0xffff upwards go through Character.toChars.
        if (c >= 0xffff) { // outside unicode BMP.
            try {
                data = Character.toChars(c);
            } catch (IllegalArgumentException e) {
                data = new char[0];
            }
        } else {
            data = new char[1];
            data[0] = (c < 130 || c > 159) ? (char) c : cp1252Map[c - 130];
        }
        return data;
    }

    /**
     * Parse a comment. [92] 391:7
     * <p>
     * The comment text is accumulated in the string buffer.
     */
    void parseComment() throws IOException {

        while (true) {
            // c is the character appended at the bottom of the loop.
            int c = ch;
            switch (c) {
              case '-':
                  /* Presuming that the start string of a comment "<!--" has
                      already been parsed, the '-' character is valid only as
                      part of a comment termination and further more it must
                      be present in even numbers. Hence if strict is true, we
                      presume the comment has been terminated and return.
                      However if strict is false, then there is no even number
                      requirement and this character can appear anywhere in the
                      comment.  The parser reads on until it sees the following
                      pattern: "-->" or "--!>".
                   */
                // Lenient mode, and the previous buffered character was also
                // a '-': look for the closing '>' or "!>".
                if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) {
                    if ((ch = readCh()) == '>') {
                        return;
                    }
                    if (ch == '!') {
                        if ((ch = readCh()) == '>') {
                            return;
                        } else {
                            /* to account for extra read()'s that happened */
                            addString('-');
                            addString('!');
                            continue;
                        }
                    }
                    break;
                }

                if ((ch = readCh()) == '-') {
                    ch = readCh();
                    if (strict || ch == '>') {
                        return;
                    }
                    if (ch == '!') {
                        if ((ch = readCh()) == '>') {
                            return;
                        } else {
                            /* to account for extra read()'s that happened */
                            addString('-');
                            addString('!');
                            continue;
                        }
                    }
                    /* to account for the extra read() */
                    addString('-');
                }
                break;

              case -1:
                  handleEOFInComment();
                  return;

              case '\n':
                ln++;
                ch = readCh();
                lfCount++;
                break;

              case '>':
                ch = readCh();
                break;

              case '\r':
                ln++;
                if ((ch = readCh()) == '\n') {
                    ch = readCh();
                    crlfCount++;
                }
                else {
                    crCount++;
                }
                // \r and \r\n are stored as \n.
                c = '\n';
                break;
              default:
                ch = readCh();
                break;
            }

            addString(c);
        }
    }

    /**
     * Parse literal content.
[46] 343:1 and [47] 344:1
     * <p>
     * Characters are appended to <code>text</code> until the end tag of
     * the element on top of the stack is found or the input ends.
     *
     * @param replace NOTE(review): not read anywhere in this method
     */
    void parseLiteral(boolean replace) throws IOException {
        while (true) {
            int c = ch;
            switch (c) {
              case -1:
                error("eof.literal", stack.elem.getName());
                endTag(true);
                return;

              case '>':
                ch = readCh();
                // i is where "</name" would have to start in the text buffer
                // for this '>' to close the current element.
                int i = textpos - (stack.elem.name.length() + 2), j = 0;

                // match end tag
                if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) {
                    while ((++i < textpos) &&
                           (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++)));
                    if (i == textpos) {
                        // Drop the end tag (and a newline right before it)
                        // from the buffered text.
                        textpos -= (stack.elem.name.length() + 2);
                        if ((textpos > 0) && (text[textpos-1] == '\n')) {
                            textpos--;
                        }
                        endTag(false);
                        return;
                    }
                }
                break;

              case '&':
                char data[] = parseEntityReference();
                // Grow the buffer so the replacement text fits.
                if (textpos + data.length > text.length) {
                    char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
                    System.arraycopy(text, 0, newtext, 0, text.length);
                    text = newtext;
                }
                System.arraycopy(data, 0, text, textpos, data.length);
                textpos += data.length;
                continue;

              case '\n':
                ln++;
                ch = readCh();
                lfCount++;
                break;

              case '\r':
                ln++;
                if ((ch = readCh()) == '\n') {
                    ch = readCh();
                    crlfCount++;
                }
                else {
                    crCount++;
                }
                // \r and \r\n are stored as \n.
                c = '\n';
                break;
              default:
                ch = readCh();
                break;
            }

            // output character
            if (textpos == text.length) {
                char newtext[] = new char[text.length + 128];
                System.arraycopy(text, 0, newtext, 0, text.length);
                text = newtext;
            }
            text[textpos++] = (char)c;
        }
    }

    /**
     * Parse attribute value.
[33] 331:1 1216 */ 1217 String parseAttributeValue(boolean lower) throws IOException { 1218 int delim = -1; 1219 1220 // Check for a delimiter 1221 switch(ch) { 1222 case '\'': 1223 case '"': 1224 delim = ch; 1225 ch = readCh(); 1226 break; 1227 } 1228 1229 // Parse the rest of the value 1230 while (true) { 1231 int c = ch; 1232 1233 switch (c) { 1234 case '\n': 1235 ln++; 1236 ch = readCh(); 1237 lfCount++; 1238 if (delim < 0) { 1239 return getString(0); 1240 } 1241 break; 1242 1243 case '\r': 1244 ln++; 1245 1246 if ((ch = readCh()) == '\n') { 1247 ch = readCh(); 1248 crlfCount++; 1249 } 1250 else { 1251 crCount++; 1252 } 1253 if (delim < 0) { 1254 return getString(0); 1255 } 1256 break; 1257 1258 case '\t': 1259 if (delim < 0) 1260 c = ' '; 1261 case ' ': 1262 ch = readCh(); 1263 if (delim < 0) { 1264 return getString(0); 1265 } 1266 break; 1267 1268 case '>': 1269 case '<': 1270 if (delim < 0) { 1271 return getString(0); 1272 } 1273 ch = readCh(); 1274 break; 1275 1276 case '\'': 1277 case '"': 1278 ch = readCh(); 1279 if (c == delim) { 1280 return getString(0); 1281 } else if (delim == -1) { 1282 error("attvalerr"); 1283 if (strict || ch == ' ') { 1284 return getString(0); 1285 } else { 1286 continue; 1287 } 1288 } 1289 break; 1290 1291 case '=': 1292 if (delim < 0) { 1293 /* In SGML a construct like <img src=/cgi-bin/foo?x=1> 1294 is considered invalid since an = sign can only be contained 1295 in an attributes value if the string is quoted. 1296 */ 1297 error("attvalerr"); 1298 /* If strict is true then we return with the string we have thus far. 1299 Otherwise we accept the = sign as part of the attribute's value and 1300 process the rest of the img tag. 
*/ 1301 if (strict) { 1302 return getString(0); 1303 } 1304 } 1305 ch = readCh(); 1306 break; 1307 1308 case '&': 1309 if (strict && delim < 0) { 1310 ch = readCh(); 1311 break; 1312 } 1313 1314 char data[] = parseEntityReference(); 1315 for (int i = 0 ; i < data.length ; i++) { 1316 c = data[i]; 1317 addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c); 1318 } 1319 continue; 1320 1321 case -1: 1322 return getString(0); 1323 1324 default: 1325 if (lower && (c >= 'A') && (c <= 'Z')) { 1326 c = 'a' + c - 'A'; 1327 } 1328 ch = readCh(); 1329 break; 1330 } 1331 addString(c); 1332 } 1333 } 1334 1335 1336 /** 1337 * Parse attribute specification List. [31] 327:17 1338 */ 1339 void parseAttributeSpecificationList(Element elem) throws IOException { 1340 1341 while (true) { 1342 skipSpace(); 1343 1344 switch (ch) { 1345 case '/': 1346 case '>': 1347 case '<': 1348 case -1: 1349 return; 1350 1351 case '-': 1352 if ((ch = readCh()) == '-') { 1353 ch = readCh(); 1354 parseComment(); 1355 strpos = 0; 1356 } else { 1357 error("invalid.tagchar", "-", elem.getName()); 1358 ch = readCh(); 1359 } 1360 continue; 1361 } 1362 1363 AttributeList att; 1364 String attname; 1365 String attvalue; 1366 1367 if (parseIdentifier(true)) { 1368 attname = getString(0); 1369 skipSpace(); 1370 if (ch == '=') { 1371 ch = readCh(); 1372 skipSpace(); 1373 att = elem.getAttribute(attname); 1374 // Bug ID 4102750 1375 // Load the NAME of an Attribute Case Sensitive 1376 // The case of the NAME must be intact 1377 // MG 021898 1378 attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME)); 1379 // attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION)); 1380 } else { 1381 attvalue = attname; 1382 att = elem.getAttributeByValue(attvalue); 1383 if (att == null) { 1384 att = elem.getAttribute(attname); 1385 if (att != null) { 1386 attvalue = att.getValue(); 1387 } 1388 else { 1389 // Make it null 
so that NULL_ATTRIBUTE_VALUE is 1390 // used 1391 attvalue = null; 1392 } 1393 } 1394 } 1395 } else if (!strict && ch == ',') { // allows for comma separated attribute-value pairs 1396 ch = readCh(); 1397 continue; 1398 } else if (!strict && ch == '"') { // allows for quoted attributes 1399 ch = readCh(); 1400 skipSpace(); 1401 if (parseIdentifier(true)) { 1402 attname = getString(0); 1403 if (ch == '"') { 1404 ch = readCh(); 1405 } 1406 skipSpace(); 1407 if (ch == '=') { 1408 ch = readCh(); 1409 skipSpace(); 1410 att = elem.getAttribute(attname); 1411 attvalue = parseAttributeValue((att != null) && 1412 (att.type != CDATA) && 1413 (att.type != NOTATION)); 1414 } else { 1415 attvalue = attname; 1416 att = elem.getAttributeByValue(attvalue); 1417 if (att == null) { 1418 att = elem.getAttribute(attname); 1419 if (att != null) { 1420 attvalue = att.getValue(); 1421 } 1422 } 1423 } 1424 } else { 1425 char str[] = {(char)ch}; 1426 error("invalid.tagchar", new String(str), elem.getName()); 1427 ch = readCh(); 1428 continue; 1429 } 1430 } else if (!strict && (attributes.isEmpty()) && (ch == '=')) { 1431 ch = readCh(); 1432 skipSpace(); 1433 attname = elem.getName(); 1434 att = elem.getAttribute(attname); 1435 attvalue = parseAttributeValue((att != null) && 1436 (att.type != CDATA) && 1437 (att.type != NOTATION)); 1438 } else if (!strict && (ch == '=')) { 1439 ch = readCh(); 1440 skipSpace(); 1441 attvalue = parseAttributeValue(true); 1442 error("attvalerr"); 1443 return; 1444 } else { 1445 char str[] = {(char)ch}; 1446 error("invalid.tagchar", new String(str), elem.getName()); 1447 if (!strict) { 1448 ch = readCh(); 1449 continue; 1450 } else { 1451 return; 1452 } 1453 } 1454 1455 if (att != null) { 1456 attname = att.getName(); 1457 } else { 1458 error("invalid.tagatt", attname, elem.getName()); 1459 } 1460 1461 // Check out the value 1462 if (attributes.isDefined(attname)) { 1463 error("multi.tagatt", attname, elem.getName()); 1464 } 1465 if (attvalue == null) { 1466 
attvalue = ((att != null) && (att.value != null)) ? att.value : 1467 HTML.NULL_ATTRIBUTE_VALUE; 1468 } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) { 1469 error("invalid.tagattval", attname, elem.getName()); 1470 } 1471 HTML.Attribute attkey = HTML.getAttributeKey(attname); 1472 if (attkey == null) { 1473 attributes.addAttribute(attname, attvalue); 1474 } else { 1475 attributes.addAttribute(attkey, attvalue); 1476 } 1477 } 1478 } 1479 1480 /** 1481 * Parses th Document Declaration Type markup declaration. 1482 * Currently ignores it. 1483 */ 1484 public String parseDTDMarkup() throws IOException { 1485 1486 StringBuilder strBuff = new StringBuilder(); 1487 ch = readCh(); 1488 while(true) { 1489 switch (ch) { 1490 case '>': 1491 ch = readCh(); 1492 return strBuff.toString(); 1493 case -1: 1494 error("invalid.markup"); 1495 return strBuff.toString(); 1496 case '\n': 1497 ln++; 1498 ch = readCh(); 1499 lfCount++; 1500 break; 1501 case '"': 1502 ch = readCh(); 1503 break; 1504 case '\r': 1505 ln++; 1506 if ((ch = readCh()) == '\n') { 1507 ch = readCh(); 1508 crlfCount++; 1509 } 1510 else { 1511 crCount++; 1512 } 1513 break; 1514 default: 1515 strBuff.append((char)(ch & 0xFF)); 1516 ch = readCh(); 1517 break; 1518 } 1519 } 1520 } 1521 1522 /** 1523 * Parse markup declarations. 1524 * Currently only handles the Document Type Declaration markup. 1525 * Returns true if it is a markup declaration false otherwise. 1526 */ 1527 protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException { 1528 1529 /* Currently handles only the DOCTYPE */ 1530 if ((strBuff.length() == "DOCTYPE".length()) && 1531 (strBuff.toString().toUpperCase().equals("DOCTYPE"))) { 1532 parseDTDMarkup(); 1533 return true; 1534 } 1535 return false; 1536 } 1537 1538 /** 1539 * Parse an invalid tag. 
1540 */ 1541 void parseInvalidTag() throws IOException { 1542 // ignore all data upto the close bracket '>' 1543 while (true) { 1544 skipSpace(); 1545 switch (ch) { 1546 case '>': 1547 case -1: 1548 ch = readCh(); 1549 return; 1550 case '<': 1551 return; 1552 default: 1553 ch = readCh(); 1554 1555 } 1556 } 1557 } 1558 1559 /** 1560 * Parse a start or end tag. 1561 */ 1562 void parseTag() throws IOException { 1563 Element elem; 1564 boolean net = false; 1565 boolean warned = false; 1566 boolean unknown = false; 1567 1568 switch (ch = readCh()) { 1569 case '!': 1570 switch (ch = readCh()) { 1571 case '-': 1572 // Parse comment. [92] 391:7 1573 while (true) { 1574 if (ch == '-') { 1575 if (!strict || ((ch = readCh()) == '-')) { 1576 ch = readCh(); 1577 if (!strict && ch == '-') { 1578 ch = readCh(); 1579 } 1580 // send over any text you might see 1581 // before parsing and sending the 1582 // comment 1583 if (textpos != 0) { 1584 char newtext[] = new char[textpos]; 1585 System.arraycopy(text, 0, newtext, 0, textpos); 1586 handleText(newtext); 1587 lastBlockStartPos = currentBlockStartPos; 1588 textpos = 0; 1589 } 1590 parseComment(); 1591 last = makeTag(dtd.getElement("comment"), true); 1592 handleComment(getChars(0)); 1593 continue; 1594 } else if (!warned) { 1595 warned = true; 1596 error("invalid.commentchar", "-"); 1597 } 1598 } 1599 skipSpace(); 1600 switch (ch) { 1601 case '-': 1602 continue; 1603 case '>': 1604 ch = readCh(); 1605 case -1: 1606 return; 1607 default: 1608 ch = readCh(); 1609 if (!warned) { 1610 warned = true; 1611 error("invalid.commentchar", 1612 String.valueOf((char)ch)); 1613 } 1614 break; 1615 } 1616 } 1617 1618 default: 1619 // deal with marked sections 1620 StringBuffer strBuff = new StringBuffer(); 1621 while (true) { 1622 strBuff.append((char)ch); 1623 if (parseMarkupDeclarations(strBuff)) { 1624 return; 1625 } 1626 switch(ch) { 1627 case '>': 1628 ch = readCh(); 1629 case -1: 1630 error("invalid.markup"); 1631 return; 1632 case '\n': 
1633 ln++; 1634 ch = readCh(); 1635 lfCount++; 1636 break; 1637 case '\r': 1638 ln++; 1639 if ((ch = readCh()) == '\n') { 1640 ch = readCh(); 1641 crlfCount++; 1642 } 1643 else { 1644 crCount++; 1645 } 1646 break; 1647 1648 default: 1649 ch = readCh(); 1650 break; 1651 } 1652 } 1653 } 1654 1655 case '/': 1656 // parse end tag [19] 317:4 1657 switch (ch = readCh()) { 1658 case '>': 1659 ch = readCh(); 1660 case '<': 1661 // empty end tag. either </> or </< 1662 if (recent == null) { 1663 error("invalid.shortend"); 1664 return; 1665 } 1666 elem = recent; 1667 break; 1668 1669 default: 1670 if (!parseIdentifier(true)) { 1671 error("expected.endtagname"); 1672 return; 1673 } 1674 skipSpace(); 1675 switch (ch) { 1676 case '>': 1677 ch = readCh(); 1678 case '<': 1679 break; 1680 1681 default: 1682 error("expected", "'>'"); 1683 while ((ch != -1) && (ch != '\n') && (ch != '>')) { 1684 ch = readCh(); 1685 } 1686 if (ch == '>') { 1687 ch = readCh(); 1688 } 1689 break; 1690 } 1691 String elemStr = getString(0); 1692 if (!dtd.elementExists(elemStr)) { 1693 error("end.unrecognized", elemStr); 1694 // Ignore RE before end tag 1695 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1696 textpos--; 1697 } 1698 elem = dtd.getElement("unknown"); 1699 elem.name = elemStr; 1700 unknown = true; 1701 } else { 1702 elem = dtd.getElement(elemStr); 1703 } 1704 break; 1705 } 1706 1707 1708 // If the stack is null, we're seeing end tags without any begin 1709 // tags. Ignore them. 1710 1711 if (stack == null) { 1712 error("end.extra.tag", elem.getName()); 1713 return; 1714 } 1715 1716 // Ignore RE before end tag 1717 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1718 // In a pre tag, if there are blank lines 1719 // we do not want to remove the newline 1720 // before the end tag. Hence this code. 
1721 // 1722 if (stack.pre) { 1723 if ((textpos > 1) && (text[textpos-2] != '\n')) { 1724 textpos--; 1725 } 1726 } else { 1727 textpos--; 1728 } 1729 } 1730 1731 // If the end tag is a form, since we did not put it 1732 // on the tag stack, there is no corresponding start 1733 // start tag to find. Hence do not touch the tag stack. 1734 // 1735 1736 /* 1737 if (!strict && elem.getName().equals("form")) { 1738 if (lastFormSent != null) { 1739 handleEndTag(lastFormSent); 1740 return; 1741 } else { 1742 // do nothing. 1743 return; 1744 } 1745 } 1746 */ 1747 1748 if (unknown) { 1749 // we will not see a corresponding start tag 1750 // on the the stack. If we are seeing an 1751 // end tag, lets send this on as an empty 1752 // tag with the end tag attribute set to 1753 // true. 1754 TagElement t = makeTag(elem); 1755 handleText(t); 1756 attributes.addAttribute(HTML.Attribute.ENDTAG, "true"); 1757 handleEmptyTag(makeTag(elem)); 1758 unknown = false; 1759 return; 1760 } 1761 1762 // find the corresponding start tag 1763 1764 // A commonly occurring error appears to be the insertion 1765 // of extra end tags in a table. The intent here is ignore 1766 // such extra end tags. 1767 // 1768 if (!strict) { 1769 String stackElem = stack.elem.getName(); 1770 1771 if (stackElem.equals("table")) { 1772 // If it is not a valid end tag ignore it and return 1773 // 1774 if (!elem.getName().equals(stackElem)) { 1775 error("tag.ignore", elem.getName()); 1776 return; 1777 } 1778 } 1779 1780 1781 1782 if (stackElem.equals("tr") || 1783 stackElem.equals("td")) { 1784 if ((!elem.getName().equals("table")) && 1785 (!elem.getName().equals(stackElem))) { 1786 error("tag.ignore", elem.getName()); 1787 return; 1788 } 1789 } 1790 } 1791 TagStack sp = stack; 1792 1793 while ((sp != null) && (elem != sp.elem)) { 1794 sp = sp.next; 1795 } 1796 if (sp == null) { 1797 error("unmatched.endtag", elem.getName()); 1798 return; 1799 } 1800 1801 // People put font ending tags in the darndest places. 
1802 // Don't close other contexts based on them being between 1803 // a font tag and the corresponding end tag. Instead, 1804 // ignore the end tag like it doesn't exist and allow the end 1805 // of the document to close us out. 1806 String elemName = elem.getName(); 1807 if (stack != sp && 1808 (elemName.equals("font") || 1809 elemName.equals("center"))) { 1810 1811 // Since closing out a center tag can have real wierd 1812 // effects on the formatting, make sure that tags 1813 // for which omitting an end tag is legimitate 1814 // get closed out. 1815 // 1816 if (elemName.equals("center")) { 1817 while(stack.elem.omitEnd() && stack != sp) { 1818 endTag(true); 1819 } 1820 if (stack.elem == elem) { 1821 endTag(false); 1822 } 1823 } 1824 return; 1825 } 1826 // People do the same thing with center tags. In this 1827 // case we would like to close off the center tag but 1828 // not necessarily all enclosing tags. 1829 1830 1831 1832 // end tags 1833 while (stack != sp) { 1834 endTag(true); 1835 } 1836 1837 endTag(false); 1838 return; 1839 1840 case -1: 1841 error("eof"); 1842 return; 1843 } 1844 1845 // start tag [14] 314:1 1846 if (!parseIdentifier(true)) { 1847 elem = recent; 1848 if ((ch != '>') || (elem == null)) { 1849 error("expected.tagname"); 1850 return; 1851 } 1852 } else { 1853 String elemStr = getString(0); 1854 1855 if (elemStr.equals("image")) { 1856 elemStr = "img"; 1857 } 1858 1859 /* determine if this element is part of the dtd. 
*/ 1860 1861 if (!dtd.elementExists(elemStr)) { 1862 // parseInvalidTag(); 1863 error("tag.unrecognized ", elemStr); 1864 elem = dtd.getElement("unknown"); 1865 elem.name = elemStr; 1866 unknown = true; 1867 } else { 1868 elem = dtd.getElement(elemStr); 1869 } 1870 } 1871 1872 // Parse attributes 1873 parseAttributeSpecificationList(elem); 1874 1875 switch (ch) { 1876 case '/': 1877 net = true; 1878 case '>': 1879 ch = readCh(); 1880 if (ch == '>' && net) { 1881 ch = readCh(); 1882 } 1883 case '<': 1884 break; 1885 1886 default: 1887 error("expected", "'>'"); 1888 break; 1889 } 1890 1891 if (!strict) { 1892 if (elem.getName().equals("script")) { 1893 error("javascript.unsupported"); 1894 } 1895 } 1896 1897 // ignore RE after start tag 1898 // 1899 if (!elem.isEmpty()) { 1900 if (ch == '\n') { 1901 ln++; 1902 lfCount++; 1903 ch = readCh(); 1904 } else if (ch == '\r') { 1905 ln++; 1906 if ((ch = readCh()) == '\n') { 1907 ch = readCh(); 1908 crlfCount++; 1909 } 1910 else { 1911 crCount++; 1912 } 1913 } 1914 } 1915 1916 // ensure a legal context for the tag 1917 TagElement tag = makeTag(elem, false); 1918 1919 1920 /** In dealing with forms, we have decided to treat 1921 them as legal in any context. Also, even though 1922 they do have a start and an end tag, we will 1923 not put this tag on the stack. This is to deal 1924 several pages in the web oasis that choose to 1925 start and end forms in any possible location. **/ 1926 1927 /* 1928 if (!strict && elem.getName().equals("form")) { 1929 if (lastFormSent == null) { 1930 lastFormSent = tag; 1931 } else { 1932 handleEndTag(lastFormSent); 1933 lastFormSent = tag; 1934 } 1935 } else { 1936 */ 1937 // Smlly, if a tag is unknown, we will apply 1938 // no legalTagContext logic to it. 1939 // 1940 if (!unknown) { 1941 legalTagContext(tag); 1942 1943 // If skip tag is true, this implies that 1944 // the tag was illegal and that the error 1945 // recovery strategy adopted is to ignore 1946 // the tag. 
1947 if (!strict && skipTag) { 1948 skipTag = false; 1949 return; 1950 } 1951 } 1952 /* 1953 } 1954 */ 1955 1956 startTag(tag); 1957 1958 if (!elem.isEmpty()) { 1959 switch (elem.getType()) { 1960 case CDATA: 1961 parseLiteral(false); 1962 break; 1963 case RCDATA: 1964 parseLiteral(true); 1965 break; 1966 default: 1967 if (stack != null) { 1968 stack.net = net; 1969 } 1970 break; 1971 } 1972 } 1973 } 1974 1975 private static final String START_COMMENT = "<!--"; 1976 private static final String END_COMMENT = "-->"; 1977 private static final char[] SCRIPT_END_TAG = "</script>".toCharArray(); 1978 private static final char[] SCRIPT_END_TAG_UPPER_CASE = 1979 "</SCRIPT>".toCharArray(); 1980 1981 void parseScript() throws IOException { 1982 char[] charsToAdd = new char[SCRIPT_END_TAG.length]; 1983 boolean insideComment = false; 1984 1985 /* Here, ch should be the first character after <script> */ 1986 while (true) { 1987 int i = 0; 1988 while (!insideComment && i < SCRIPT_END_TAG.length 1989 && (SCRIPT_END_TAG[i] == ch 1990 || SCRIPT_END_TAG_UPPER_CASE[i] == ch)) { 1991 charsToAdd[i] = (char) ch; 1992 ch = readCh(); 1993 i++; 1994 } 1995 if (i == SCRIPT_END_TAG.length) { 1996 1997 /* '</script>' tag detected */ 1998 /* Here, ch == the first character after </script> */ 1999 return; 2000 } else { 2001 2002 /* To account for extra read()'s that happened */ 2003 for (int j = 0; j < i; j++) { 2004 addString(charsToAdd[j]); 2005 } 2006 2007 switch (ch) { 2008 case -1: 2009 error("eof.script"); 2010 return; 2011 case '\n': 2012 ln++; 2013 ch = readCh(); 2014 lfCount++; 2015 addString('\n'); 2016 break; 2017 case '\r': 2018 ln++; 2019 if ((ch = readCh()) == '\n') { 2020 ch = readCh(); 2021 crlfCount++; 2022 } else { 2023 crCount++; 2024 } 2025 addString('\n'); 2026 break; 2027 default: 2028 addString(ch); 2029 String str = new String(getChars(0, strpos)); 2030 if (!insideComment && str.endsWith(START_COMMENT)) { 2031 insideComment = true; 2032 } 2033 if (insideComment && 
str.endsWith(END_COMMENT)) { 2034 insideComment = false; 2035 } 2036 ch = readCh(); 2037 break; 2038 } // switch 2039 } 2040 } // while 2041 } 2042 2043 /** 2044 * Parse Content. [24] 320:1 2045 */ 2046 void parseContent() throws IOException { 2047 Thread curThread = Thread.currentThread(); 2048 2049 for (;;) { 2050 if (curThread.isInterrupted()) { 2051 curThread.interrupt(); // resignal the interrupt 2052 break; 2053 } 2054 2055 int c = ch; 2056 currentBlockStartPos = currentPosition; 2057 2058 if (recent == dtd.script) { // means: if after starting <script> tag 2059 2060 /* Here, ch has to be the first character after <script> */ 2061 parseScript(); 2062 last = makeTag(dtd.getElement("comment"), true); 2063 2064 /* Remove leading and trailing HTML comment declarations */ 2065 String str = new String(getChars(0)).trim(); 2066 int minLength = START_COMMENT.length() + END_COMMENT.length(); 2067 if (str.startsWith(START_COMMENT) && str.endsWith(END_COMMENT) 2068 && str.length() >= (minLength)) { 2069 str = str.substring(START_COMMENT.length(), 2070 str.length() - END_COMMENT.length()); 2071 } 2072 2073 /* Handle resulting chars as comment */ 2074 handleComment(str.toCharArray()); 2075 endTag(false); 2076 lastBlockStartPos = currentPosition; 2077 2078 continue; 2079 } else { 2080 switch (c) { 2081 case '<': 2082 parseTag(); 2083 lastBlockStartPos = currentPosition; 2084 continue; 2085 2086 case '/': 2087 ch = readCh(); 2088 if ((stack != null) && stack.net) { 2089 // null end tag. 
2090 endTag(false); 2091 continue; 2092 } else if (textpos == 0) { 2093 if (!legalElementContext(dtd.pcdata)) { 2094 error("unexpected.pcdata"); 2095 } 2096 if (last.breaksFlow()) { 2097 space = false; 2098 } 2099 } 2100 break; 2101 2102 case -1: 2103 return; 2104 2105 case '&': 2106 if (textpos == 0) { 2107 if (!legalElementContext(dtd.pcdata)) { 2108 error("unexpected.pcdata"); 2109 } 2110 if (last.breaksFlow()) { 2111 space = false; 2112 } 2113 } 2114 char data[] = parseEntityReference(); 2115 if (textpos + data.length + 1 > text.length) { 2116 char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)]; 2117 System.arraycopy(text, 0, newtext, 0, text.length); 2118 text = newtext; 2119 } 2120 if (space) { 2121 space = false; 2122 text[textpos++] = ' '; 2123 } 2124 System.arraycopy(data, 0, text, textpos, data.length); 2125 textpos += data.length; 2126 ignoreSpace = false; 2127 continue; 2128 2129 case '\n': 2130 ln++; 2131 lfCount++; 2132 ch = readCh(); 2133 if ((stack != null) && stack.pre) { 2134 break; 2135 } 2136 if (textpos == 0) { 2137 lastBlockStartPos = currentPosition; 2138 } 2139 if (!ignoreSpace) { 2140 space = true; 2141 } 2142 continue; 2143 2144 case '\r': 2145 ln++; 2146 c = '\n'; 2147 if ((ch = readCh()) == '\n') { 2148 ch = readCh(); 2149 crlfCount++; 2150 } 2151 else { 2152 crCount++; 2153 } 2154 if ((stack != null) && stack.pre) { 2155 break; 2156 } 2157 if (textpos == 0) { 2158 lastBlockStartPos = currentPosition; 2159 } 2160 if (!ignoreSpace) { 2161 space = true; 2162 } 2163 continue; 2164 2165 2166 case '\t': 2167 case ' ': 2168 ch = readCh(); 2169 if ((stack != null) && stack.pre) { 2170 break; 2171 } 2172 if (textpos == 0) { 2173 lastBlockStartPos = currentPosition; 2174 } 2175 if (!ignoreSpace) { 2176 space = true; 2177 } 2178 continue; 2179 2180 default: 2181 if (textpos == 0) { 2182 if (!legalElementContext(dtd.pcdata)) { 2183 error("unexpected.pcdata"); 2184 } 2185 if (last.breaksFlow()) { 2186 space = false; 
2187 } 2188 } 2189 ch = readCh(); 2190 break; 2191 } 2192 } 2193 2194 // enlarge buffer if needed 2195 if (textpos + 2 > text.length) { 2196 char newtext[] = new char[text.length + 128]; 2197 System.arraycopy(text, 0, newtext, 0, text.length); 2198 text = newtext; 2199 } 2200 2201 // output pending space 2202 if (space) { 2203 if (textpos == 0) { 2204 lastBlockStartPos--; 2205 } 2206 text[textpos++] = ' '; 2207 space = false; 2208 } 2209 text[textpos++] = (char)c; 2210 ignoreSpace = false; 2211 } 2212 } 2213 2214 /** 2215 * Returns the end of line string. This will return the end of line 2216 * string that has been encountered the most, one of \r, \n or \r\n. 2217 */ 2218 String getEndOfLineString() { 2219 if (crlfCount >= crCount) { 2220 if (lfCount >= crlfCount) { 2221 return "\n"; 2222 } 2223 else { 2224 return "\r\n"; 2225 } 2226 } 2227 else { 2228 if (crCount > lfCount) { 2229 return "\r"; 2230 } 2231 else { 2232 return "\n"; 2233 } 2234 } 2235 } 2236 2237 /** 2238 * Parse an HTML stream, given a DTD. 2239 */ 2240 public synchronized void parse(Reader in) throws IOException { 2241 this.in = in; 2242 2243 this.ln = 1; 2244 2245 seenHtml = false; 2246 seenHead = false; 2247 seenBody = false; 2248 2249 crCount = lfCount = crlfCount = 0; 2250 2251 try { 2252 ch = readCh(); 2253 text = new char[1024]; 2254 str = new char[128]; 2255 2256 parseContent(); 2257 // NOTE: interruption may have occurred. Control flows out 2258 // of here normally. 
2259 while (stack != null) { 2260 endTag(true); 2261 } 2262 in.close(); 2263 } catch (IOException e) { 2264 errorContext(); 2265 error("ioexception"); 2266 throw e; 2267 } catch (Exception e) { 2268 errorContext(); 2269 error("exception", e.getClass().getName(), e.getMessage()); 2270 e.printStackTrace(); 2271 } catch (ThreadDeath e) { 2272 errorContext(); 2273 error("terminated"); 2274 e.printStackTrace(); 2275 throw e; 2276 } finally { 2277 for (; stack != null ; stack = stack.next) { 2278 handleEndTag(stack.tag); 2279 } 2280 2281 text = null; 2282 str = null; 2283 } 2284 2285 } 2286 2287 2288 /* 2289 * Input cache. This is much faster than calling down to a synchronized 2290 * method of BufferedReader for each byte. Measurements done 5/30/97 2291 * show that there's no point in having a bigger buffer: Increasing 2292 * the buffer to 8192 had no measurable impact for a program discarding 2293 * one character at a time (reading from an http URL to a local machine). 2294 * NOTE: If the current encoding is bogus, and we read too much 2295 * (past the content-type) we may suffer a MalformedInputException. For 2296 * this reason the initial size is 1 and when the body is encountered the 2297 * size is adjusted to 256. 2298 */ 2299 private char buf[] = new char[1]; 2300 private int pos; 2301 private int len; 2302 /* 2303 tracks position relative to the beginning of the 2304 document. 2305 */ 2306 private int currentPosition; 2307 2308 2309 private final int readCh() throws IOException { 2310 2311 if (pos >= len) { 2312 2313 // This loop allows us to ignore interrupts if the flag 2314 // says so 2315 for (;;) { 2316 try { 2317 len = in.read(buf); 2318 break; 2319 } catch (InterruptedIOException ex) { 2320 throw ex; 2321 } 2322 } 2323 2324 if (len <= 0) { 2325 return -1; // eof 2326 } 2327 pos = 0; 2328 } 2329 ++currentPosition; 2330 2331 return buf[pos++]; 2332 } 2333 2334 2335 protected int getCurrentPos() { 2336 return currentPosition; 2337 } 2338 }