1 /* 2 * Copyright (c) 1998, 2008, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package javax.swing.text.html.parser; 27 28 import javax.swing.text.SimpleAttributeSet; 29 import javax.swing.text.html.HTML; 30 import javax.swing.text.ChangedCharSetException; 31 import java.io.*; 32 import java.util.Hashtable; 33 import java.util.Properties; 34 import java.util.Vector; 35 import java.util.Enumeration; 36 import java.net.URL; 37 38 import sun.misc.MessageUtils; 39 40 /** 41 * A simple DTD-driven HTML parser. The parser reads an 42 * HTML file from an InputStream and calls various methods 43 * (which should be overridden in a subclass) when tags and 44 * data are encountered. 
 * <p>
 * Unfortunately there are many badly implemented HTML parsers
 * out there, and as a result there are many badly formatted
 * HTML files. This parser attempts to parse most HTML files.
 * This means that the implementation sometimes deviates from
 * the SGML specification in favor of HTML.
 * <p>
 * The parser treats \r and \r\n as \n. Newlines after start tags
 * and before end tags are ignored just as specified in the SGML/HTML
 * specification.
 * <p>
 * The html spec does not specify how spaces are to be coalesced very well.
 * Specifically, the following scenarios are not discussed (note that a
 * space should be used here, but I am using &amp;nbsp; to force the space to
 * be displayed):
 * <p>
 * '&lt;b&gt;blah&nbsp;&lt;i&gt;&nbsp;&lt;strike&gt;&nbsp;foo' which can be treated as:
 * '&lt;b&gt;blah&nbsp;&lt;i&gt;&lt;strike&gt;foo'
 * <p>as well as:
 * '&lt;p&gt;&lt;a href="xx"&gt;&nbsp;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
 * which appears to be treated as:
 * '&lt;p&gt;&lt;a href="xx"&gt;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
 * <p>
 * If <code>strict</code> is false, when a tag that breaks flow
 * (<code>TagElement.breaksFlow</code>) or trailing whitespace is
 * encountered, all whitespace will be ignored until a non whitespace
 * character is encountered. This appears to give behavior closer to
 * the popular browsers.
73 * 74 * @see DTD 75 * @see TagElement 76 * @see SimpleAttributeSet 77 * @author Arthur van Hoff 78 * @author Sunita Mani 79 */ 80 public 81 class Parser implements DTDConstants { 82 83 private char text[] = new char[1024]; 84 private int textpos = 0; 85 private TagElement last; 86 private boolean space; 87 88 private char str[] = new char[128]; 89 private int strpos = 0; 90 91 protected DTD dtd = null; 92 93 private int ch; 94 private int ln; 95 private Reader in; 96 97 private Element recent; 98 private TagStack stack; 99 private boolean skipTag = false; 100 private TagElement lastFormSent = null; 101 private SimpleAttributeSet attributes = new SimpleAttributeSet(); 102 103 // State for <html>, <head> and <body>. Since people like to slap 104 // together HTML documents without thinking, occasionally they 105 // have multiple instances of these tags. These booleans track 106 // the first sightings of these tags so they can be safely ignored 107 // by the parser if repeated. 108 private boolean seenHtml = false; 109 private boolean seenHead = false; 110 private boolean seenBody = false; 111 112 /** 113 * The html spec does not specify how spaces are coalesced very well. 114 * If strict == false, ignoreSpace is used to try and mimic the behavior 115 * of the popular browsers. 116 * <p> 117 * The problematic scenarios are: 118 * '<b>blah <i> <strike> foo' which can be treated as: 119 * '<b>blah <i><strike>foo' 120 * as well as: 121 * '<p><a href="xx"> <em>Using</em></a></p>' 122 * which appears to be treated as: 123 * '<p><a href="xx"><em>Using</em></a></p>' 124 * <p> 125 * When a tag that breaks flow, or trailing whitespace is encountered 126 * ignoreSpace is set to true. From then on, all whitespace will be 127 * ignored. 128 * ignoreSpace will be set back to false the first time a 129 * non whitespace character is encountered. This appears to give 130 * behavior closer to the popular browsers. 
131 */ 132 private boolean ignoreSpace; 133 134 /** 135 * This flag determines whether or not the Parser will be strict 136 * in enforcing SGML compatibility. If false, it will be lenient 137 * with certain common classes of erroneous HTML constructs. 138 * Strict or not, in either case an error will be recorded. 139 * 140 */ 141 protected boolean strict = false; 142 143 144 /** Number of \r\n's encountered. */ 145 private int crlfCount; 146 /** Number of \r's encountered. A \r\n will not increment this. */ 147 private int crCount; 148 /** Number of \n's encountered. A \r\n will not increment this. */ 149 private int lfCount; 150 151 // 152 // To correctly identify the start of a tag/comment/text we need two 153 // ivars. Two are needed as handleText isn't invoked until the tag 154 // after the text has been parsed, that is the parser parses the text, 155 // then a tag, then invokes handleText followed by handleStart. 156 // 157 /** The start position of the current block. Block is overloaded here, 158 * it really means the current start position for the current comment, 159 * tag, text. Use getBlockStartPosition to access this. */ 160 private int currentBlockStartPos; 161 /** Start position of the last block. */ 162 private int lastBlockStartPos; 163 164 /** 165 * array for mapping numeric references in range 166 * 130-159 to displayable Unicode characters. 
167 */ 168 private static final char[] cp1252Map = { 169 8218, // ‚ 170 402, // ƒ 171 8222, // „ 172 8230, // … 173 8224, // † 174 8225, // ‡ 175 710, // ˆ 176 8240, // ‰ 177 352, // Š 178 8249, // ‹ 179 338, // Œ 180 141, //  181 142, // Ž 182 143, //  183 144, //  184 8216, // ‘ 185 8217, // ’ 186 8220, // “ 187 8221, // ” 188 8226, // • 189 8211, // – 190 8212, // — 191 732, // ˜ 192 8482, // ™ 193 353, // š 194 8250, // › 195 339, // œ 196 157, //  197 158, // ž 198 376 // Ÿ 199 }; 200 201 public Parser(DTD dtd) { 202 this.dtd = dtd; 203 } 204 205 206 /** 207 * @return the line number of the line currently being parsed 208 */ 209 protected int getCurrentLine() { 210 return ln; 211 } 212 213 /** 214 * Returns the start position of the current block. Block is 215 * overloaded here, it really means the current start position for 216 * the current comment tag, text, block.... This is provided for 217 * subclassers that wish to know the start of the current block when 218 * called with one of the handleXXX methods. 219 */ 220 int getBlockStartPosition() { 221 return Math.max(0, lastBlockStartPos - 1); 222 } 223 224 /** 225 * Makes a TagElement. 226 */ 227 protected TagElement makeTag(Element elem, boolean fictional) { 228 return new TagElement(elem, fictional); 229 } 230 231 protected TagElement makeTag(Element elem) { 232 return makeTag(elem, false); 233 } 234 235 protected SimpleAttributeSet getAttributes() { 236 return attributes; 237 } 238 239 protected void flushAttributes() { 240 attributes.removeAttributes(attributes); 241 } 242 243 /** 244 * Called when PCDATA is encountered. 245 */ 246 protected void handleText(char text[]) { 247 } 248 249 /** 250 * Called when an HTML title tag is encountered. 251 */ 252 protected void handleTitle(char text[]) { 253 // default behavior is to call handleText. Subclasses 254 // can override if necessary. 255 handleText(text); 256 } 257 258 /** 259 * Called when an HTML comment is encountered. 
260 */ 261 protected void handleComment(char text[]) { 262 } 263 264 protected void handleEOFInComment() { 265 // We've reached EOF. Our recovery strategy is to 266 // see if we have more than one line in the comment; 267 // if so, we pretend that the comment was an unterminated 268 // single line comment, and reparse the lines after the 269 // first line as normal HTML content. 270 271 int commentEndPos = strIndexOf('\n'); 272 if (commentEndPos >= 0) { 273 handleComment(getChars(0, commentEndPos)); 274 try { 275 in.close(); 276 in = new CharArrayReader(getChars(commentEndPos + 1)); 277 ch = '>'; 278 } catch (IOException e) { 279 error("ioexception"); 280 } 281 282 resetStrBuffer(); 283 } else { 284 // no newline, so signal an error 285 error("eof.comment"); 286 } 287 } 288 289 /** 290 * Called when an empty tag is encountered. 291 */ 292 protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException { 293 } 294 295 /** 296 * Called when a start tag is encountered. 297 */ 298 protected void handleStartTag(TagElement tag) { 299 } 300 301 /** 302 * Called when an end tag is encountered. 303 */ 304 protected void handleEndTag(TagElement tag) { 305 } 306 307 /** 308 * An error has occurred. 309 */ 310 protected void handleError(int ln, String msg) { 311 /* 312 Thread.dumpStack(); 313 System.out.println("**** " + stack); 314 System.out.println("line " + ln + ": error: " + msg); 315 System.out.println(); 316 */ 317 } 318 319 /** 320 * Output text. 
321 */ 322 void handleText(TagElement tag) { 323 if (tag.breaksFlow()) { 324 space = false; 325 if (!strict) { 326 ignoreSpace = true; 327 } 328 } 329 if (textpos == 0) { 330 if ((!space) || (stack == null) || last.breaksFlow() || 331 !stack.advance(dtd.pcdata)) { 332 last = tag; 333 space = false; 334 lastBlockStartPos = currentBlockStartPos; 335 return; 336 } 337 } 338 if (space) { 339 if (!ignoreSpace) { 340 // enlarge buffer if needed 341 if (textpos + 1 > text.length) { 342 char newtext[] = new char[text.length + 200]; 343 System.arraycopy(text, 0, newtext, 0, text.length); 344 text = newtext; 345 } 346 347 // output pending space 348 text[textpos++] = ' '; 349 if (!strict && !tag.getElement().isEmpty()) { 350 ignoreSpace = true; 351 } 352 } 353 space = false; 354 } 355 char newtext[] = new char[textpos]; 356 System.arraycopy(text, 0, newtext, 0, textpos); 357 // Handles cases of bad html where the title tag 358 // was getting lost when we did error recovery. 359 if (tag.getElement().getName().equals("title")) { 360 handleTitle(newtext); 361 } else { 362 handleText(newtext); 363 } 364 lastBlockStartPos = currentBlockStartPos; 365 textpos = 0; 366 last = tag; 367 space = false; 368 } 369 370 /** 371 * Invoke the error handler. 372 */ 373 protected void error(String err, String arg1, String arg2, 374 String arg3) { 375 handleError(ln, err + " " + arg1 + " " + arg2 + " " + arg3); 376 } 377 378 protected void error(String err, String arg1, String arg2) { 379 error(err, arg1, arg2, "?"); 380 } 381 protected void error(String err, String arg1) { 382 error(err, arg1, "?", "?"); 383 } 384 protected void error(String err) { 385 error(err, "?", "?", "?"); 386 } 387 388 389 /** 390 * Handle a start tag. The new tag is pushed 391 * onto the tag stack. The attribute list is 392 * checked for required attributes. 
393 */ 394 protected void startTag(TagElement tag) throws ChangedCharSetException { 395 Element elem = tag.getElement(); 396 397 // If the tag is an empty tag and texpos != 0 398 // this implies that there is text before the 399 // start tag that needs to be processed before 400 // handling the tag. 401 // 402 if (!elem.isEmpty() || 403 ((last != null) && !last.breaksFlow()) || 404 (textpos != 0)) { 405 handleText(tag); 406 } else { 407 // this variable gets updated in handleText(). 408 // Since in this case we do not call handleText() 409 // we need to update it here. 410 // 411 last = tag; 412 // Note that we should really check last.breakFlows before 413 // assuming this should be false. 414 space = false; 415 } 416 lastBlockStartPos = currentBlockStartPos; 417 418 // check required attributes 419 for (AttributeList a = elem.atts ; a != null ; a = a.next) { 420 if ((a.modifier == REQUIRED) && 421 ((attributes.isEmpty()) || 422 ((!attributes.isDefined(a.name)) && 423 (!attributes.isDefined(HTML.getAttributeKey(a.name)))))) { 424 error("req.att ", a.getName(), elem.getName()); 425 } 426 } 427 428 if (elem.isEmpty()) { 429 handleEmptyTag(tag); 430 /* 431 } else if (elem.getName().equals("form")) { 432 handleStartTag(tag); 433 */ 434 } else { 435 recent = elem; 436 stack = new TagStack(tag, stack); 437 handleStartTag(tag); 438 } 439 } 440 441 /** 442 * Handle an end tag. The end tag is popped 443 * from the tag stack. 444 */ 445 protected void endTag(boolean omitted) { 446 handleText(stack.tag); 447 448 if (omitted && !stack.elem.omitEnd()) { 449 error("end.missing", stack.elem.getName()); 450 } else if (!stack.terminate()) { 451 error("end.unexpected", stack.elem.getName()); 452 } 453 454 // handle the tag 455 handleEndTag(stack.tag); 456 stack = stack.next; 457 recent = (stack != null) ? 
stack.elem : null; 458 } 459 460 461 boolean ignoreElement(Element elem) { 462 463 String stackElement = stack.elem.getName(); 464 String elemName = elem.getName(); 465 /* We ignore all elements that are not valid in the context of 466 a table except <td>, <th> (these we handle in 467 legalElementContext()) and #pcdata. We also ignore the 468 <font> tag in the context of <ul> and <ol> We additonally 469 ignore the <meta> and the <style> tag if the body tag has 470 been seen. **/ 471 if ((elemName.equals("html") && seenHtml) || 472 (elemName.equals("head") && seenHead) || 473 (elemName.equals("body") && seenBody)) { 474 return true; 475 } 476 if (elemName.equals("dt") || elemName.equals("dd")) { 477 TagStack s = stack; 478 while (s != null && !s.elem.getName().equals("dl")) { 479 s = s.next; 480 } 481 if (s == null) { 482 return true; 483 } 484 } 485 486 if (((stackElement.equals("table")) && 487 (!elemName.equals("#pcdata")) && (!elemName.equals("input"))) || 488 ((elemName.equals("font")) && 489 (stackElement.equals("ul") || stackElement.equals("ol"))) || 490 (elemName.equals("meta") && stack != null) || 491 (elemName.equals("style") && seenBody) || 492 (stackElement.equals("table") && elemName.equals("a"))) { 493 return true; 494 } 495 return false; 496 } 497 498 499 /** 500 * Marks the first time a tag has been seen in a document 501 */ 502 503 protected void markFirstTime(Element elem) { 504 String elemName = elem.getName(); 505 if (elemName.equals("html")) { 506 seenHtml = true; 507 } else if (elemName.equals("head")) { 508 seenHead = true; 509 } else if (elemName.equals("body")) { 510 if (buf.length == 1) { 511 // Refer to note in definition of buf for details on this. 512 char[] newBuf = new char[256]; 513 514 newBuf[0] = buf[0]; 515 buf = newBuf; 516 } 517 seenBody = true; 518 } 519 } 520 521 /** 522 * Create a legal content for an element. 
523 */ 524 boolean legalElementContext(Element elem) throws ChangedCharSetException { 525 526 // System.out.println("-- legalContext -- " + elem); 527 528 // Deal with the empty stack 529 if (stack == null) { 530 // System.out.println("-- stack is empty"); 531 if (elem != dtd.html) { 532 // System.out.println("-- pushing html"); 533 startTag(makeTag(dtd.html, true)); 534 return legalElementContext(elem); 535 } 536 return true; 537 } 538 539 // Is it allowed in the current context 540 if (stack.advance(elem)) { 541 // System.out.println("-- legal context"); 542 markFirstTime(elem); 543 return true; 544 } 545 boolean insertTag = false; 546 547 // The use of all error recovery strategies are contingent 548 // on the value of the strict property. 549 // 550 // These are commonly occuring errors. if insertTag is true, 551 // then we want to adopt an error recovery strategy that 552 // involves attempting to insert an additional tag to 553 // legalize the context. The two errors addressed here 554 // are: 555 // 1) when a <td> or <th> is seen soon after a <table> tag. 556 // In this case we insert a <tr>. 557 // 2) when any other tag apart from a <tr> is seen 558 // in the context of a <tr>. In this case we would 559 // like to add a <td>. If a <tr> is seen within a 560 // <tr> context, then we will close out the current 561 // <tr>. 562 // 563 // This insertion strategy is handled later in the method. 564 // The reason for checking this now, is that in other cases 565 // we would like to apply other error recovery strategies for example 566 // ignoring tags. 567 // 568 // In certain cases it is better to ignore a tag than try to 569 // fix the situation. So the first test is to see if this 570 // is what we need to do. 
571 // 572 String stackElemName = stack.elem.getName(); 573 String elemName = elem.getName(); 574 575 576 if (!strict && 577 ((stackElemName.equals("table") && elemName.equals("td")) || 578 (stackElemName.equals("table") && elemName.equals("th")) || 579 (stackElemName.equals("tr") && !elemName.equals("tr")))){ 580 insertTag = true; 581 } 582 583 584 if (!strict && !insertTag && (stack.elem.getName() != elem.getName() || 585 elem.getName().equals("body"))) { 586 if (skipTag = ignoreElement(elem)) { 587 error("tag.ignore", elem.getName()); 588 return skipTag; 589 } 590 } 591 592 // Check for anything after the start of the table besides tr, td, th 593 // or caption, and if those aren't there, insert the <tr> and call 594 // legalElementContext again. 595 if (!strict && stackElemName.equals("table") && 596 !elemName.equals("tr") && !elemName.equals("td") && 597 !elemName.equals("th") && !elemName.equals("caption")) { 598 Element e = dtd.getElement("tr"); 599 TagElement t = makeTag(e, true); 600 legalTagContext(t); 601 startTag(t); 602 error("start.missing", elem.getName()); 603 return legalElementContext(elem); 604 } 605 606 // They try to find a legal context by checking if the current 607 // tag is valid in an enclosing context. If so 608 // close out the tags by outputing end tags and then 609 // insert the curent tag. If the tags that are 610 // being closed out do not have an optional end tag 611 // specification in the DTD then an html error is 612 // reported. 613 // 614 if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) { 615 for (TagStack s = stack.next ; s != null ; s = s.next) { 616 if (s.advance(elem)) { 617 while (stack != s) { 618 endTag(true); 619 } 620 return true; 621 } 622 if (!s.terminate() || (strict && !s.elem.omitEnd())) { 623 break; 624 } 625 } 626 } 627 628 // Check if we know what tag is expected next. 629 // If so insert the tag. Report an error if the 630 // tag does not have its start tag spec in the DTD as optional. 
631 // 632 Element next = stack.first(); 633 if (next != null && (!strict || next.omitStart()) && 634 !(next==dtd.head && elem==dtd.pcdata) ) { 635 // System.out.println("-- omitting start tag: " + next); 636 TagElement t = makeTag(next, true); 637 legalTagContext(t); 638 startTag(t); 639 if (!next.omitStart()) { 640 error("start.missing", elem.getName()); 641 } 642 return legalElementContext(elem); 643 } 644 645 646 // Traverse the list of expected elements and determine if adding 647 // any of these elements would make for a legal context. 648 // 649 650 if (!strict) { 651 ContentModel content = stack.contentModel(); 652 Vector<Element> elemVec = new Vector<Element>(); 653 if (content != null) { 654 content.getElements(elemVec); 655 for (Element e : elemVec) { 656 // Ensure that this element has not been included as 657 // part of the exclusions in the DTD. 658 // 659 if (stack.excluded(e.getIndex())) { 660 continue; 661 } 662 663 boolean reqAtts = false; 664 665 for (AttributeList a = e.getAttributes(); a != null ; a = a.next) { 666 if (a.modifier == REQUIRED) { 667 reqAtts = true; 668 break; 669 } 670 } 671 // Ensure that no tag that has required attributes 672 // gets inserted. 673 // 674 if (reqAtts) { 675 continue; 676 } 677 678 ContentModel m = e.getContent(); 679 if (m != null && m.first(elem)) { 680 // System.out.println("-- adding a legal tag: " + e); 681 TagElement t = makeTag(e, true); 682 legalTagContext(t); 683 startTag(t); 684 error("start.missing", e.getName()); 685 return legalElementContext(elem); 686 } 687 } 688 } 689 } 690 691 // Check if the stack can be terminated. If so add the appropriate 692 // end tag. Report an error if the tag being ended does not have its 693 // end tag spec in the DTD as optional. 
694 // 695 if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) { 696 // System.out.println("-- omitting end tag: " + stack.elem); 697 if (!stack.elem.omitEnd()) { 698 error("end.missing", elem.getName()); 699 } 700 701 endTag(true); 702 return legalElementContext(elem); 703 } 704 705 // At this point we know that something is screwed up. 706 return false; 707 } 708 709 /** 710 * Create a legal context for a tag. 711 */ 712 void legalTagContext(TagElement tag) throws ChangedCharSetException { 713 if (legalElementContext(tag.getElement())) { 714 markFirstTime(tag.getElement()); 715 return; 716 } 717 718 // Avoid putting a block tag in a flow tag. 719 if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) { 720 endTag(true); 721 legalTagContext(tag); 722 return; 723 } 724 725 // Avoid putting something wierd in the head of the document. 726 for (TagStack s = stack ; s != null ; s = s.next) { 727 if (s.tag.getElement() == dtd.head) { 728 while (stack != s) { 729 endTag(true); 730 } 731 endTag(true); 732 legalTagContext(tag); 733 return; 734 } 735 } 736 737 // Everything failed 738 error("tag.unexpected", tag.getElement().getName()); 739 } 740 741 /** 742 * Error context. Something went wrong, make sure we are in 743 * the document's body context 744 */ 745 void errorContext() throws ChangedCharSetException { 746 for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) { 747 handleEndTag(stack.tag); 748 } 749 if (stack == null) { 750 legalElementContext(dtd.body); 751 startTag(makeTag(dtd.body, true)); 752 } 753 } 754 755 /** 756 * Add a char to the string buffer. 757 */ 758 void addString(int c) { 759 if (strpos == str.length) { 760 char newstr[] = new char[str.length + 128]; 761 System.arraycopy(str, 0, newstr, 0, str.length); 762 str = newstr; 763 } 764 str[strpos++] = (char)c; 765 } 766 767 /** 768 * Get the string that's been accumulated. 
769 */ 770 String getString(int pos) { 771 char newStr[] = new char[strpos - pos]; 772 System.arraycopy(str, pos, newStr, 0, strpos - pos); 773 strpos = pos; 774 return new String(newStr); 775 } 776 777 char[] getChars(int pos) { 778 char newStr[] = new char[strpos - pos]; 779 System.arraycopy(str, pos, newStr, 0, strpos - pos); 780 strpos = pos; 781 return newStr; 782 } 783 784 char[] getChars(int pos, int endPos) { 785 char newStr[] = new char[endPos - pos]; 786 System.arraycopy(str, pos, newStr, 0, endPos - pos); 787 // REMIND: it's not clear whether this version should set strpos or not 788 // strpos = pos; 789 return newStr; 790 } 791 792 void resetStrBuffer() { 793 strpos = 0; 794 } 795 796 int strIndexOf(char target) { 797 for (int i = 0; i < strpos; i++) { 798 if (str[i] == target) { 799 return i; 800 } 801 } 802 803 return -1; 804 } 805 806 /** 807 * Skip space. 808 * [5] 297:5 809 */ 810 void skipSpace() throws IOException { 811 while (true) { 812 switch (ch) { 813 case '\n': 814 ln++; 815 ch = readCh(); 816 lfCount++; 817 break; 818 819 case '\r': 820 ln++; 821 if ((ch = readCh()) == '\n') { 822 ch = readCh(); 823 crlfCount++; 824 } 825 else { 826 crCount++; 827 } 828 break; 829 case ' ': 830 case '\t': 831 ch = readCh(); 832 break; 833 834 default: 835 return; 836 } 837 } 838 } 839 840 /** 841 * Parse identifier. Uppercase characters are folded 842 * to lowercase when lower is true. Returns falsed if 843 * no identifier is found. 
[55] 346:17 844 */ 845 boolean parseIdentifier(boolean lower) throws IOException { 846 switch (ch) { 847 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 848 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 849 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 850 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 851 case 'Y': case 'Z': 852 if (lower) { 853 ch = 'a' + (ch - 'A'); 854 } 855 856 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 857 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 858 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 859 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 860 case 'y': case 'z': 861 break; 862 863 default: 864 return false; 865 } 866 867 while (true) { 868 addString(ch); 869 870 switch (ch = readCh()) { 871 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 872 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 873 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 874 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 875 case 'Y': case 'Z': 876 if (lower) { 877 ch = 'a' + (ch - 'A'); 878 } 879 880 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 881 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 882 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 883 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 884 case 'y': case 'z': 885 886 case '0': case '1': case '2': case '3': case '4': 887 case '5': case '6': case '7': case '8': case '9': 888 889 case '.': case '-': 890 891 case '_': // not officially allowed 892 break; 893 894 default: 895 return true; 896 } 897 } 898 } 899 900 /** 901 * Parse an entity reference. 
[59] 350:17 902 */ 903 private char[] parseEntityReference() throws IOException { 904 int pos = strpos; 905 906 if ((ch = readCh()) == '#') { 907 int n = 0; 908 ch = readCh(); 909 if ((ch >= '0') && (ch <= '9') || 910 ch == 'x' || ch == 'X') { 911 912 if ((ch >= '0') && (ch <= '9')) { 913 // parse decimal reference 914 while ((ch >= '0') && (ch <= '9')) { 915 n = (n * 10) + ch - '0'; 916 ch = readCh(); 917 } 918 } else { 919 // parse hexadecimal reference 920 ch = readCh(); 921 char lch = (char) Character.toLowerCase(ch); 922 while ((lch >= '0') && (lch <= '9') || 923 (lch >= 'a') && (lch <= 'f')) { 924 if (lch >= '0' && lch <= '9') { 925 n = (n * 16) + lch - '0'; 926 } else { 927 n = (n * 16) + lch - 'a' + 10; 928 } 929 ch = readCh(); 930 lch = (char) Character.toLowerCase(ch); 931 } 932 } 933 switch (ch) { 934 case '\n': 935 ln++; 936 ch = readCh(); 937 lfCount++; 938 break; 939 940 case '\r': 941 ln++; 942 if ((ch = readCh()) == '\n') { 943 ch = readCh(); 944 crlfCount++; 945 } 946 else { 947 crCount++; 948 } 949 break; 950 951 case ';': 952 ch = readCh(); 953 break; 954 } 955 char data[] = {mapNumericReference((char) n)}; 956 return data; 957 } 958 addString('#'); 959 if (!parseIdentifier(false)) { 960 error("ident.expected"); 961 strpos = pos; 962 char data[] = {'&', '#'}; 963 return data; 964 } 965 } else if (!parseIdentifier(false)) { 966 char data[] = {'&'}; 967 return data; 968 } 969 switch (ch) { 970 case '\n': 971 ln++; 972 ch = readCh(); 973 lfCount++; 974 break; 975 976 case '\r': 977 ln++; 978 if ((ch = readCh()) == '\n') { 979 ch = readCh(); 980 crlfCount++; 981 } 982 else { 983 crCount++; 984 } 985 break; 986 987 case ';': 988 ch = readCh(); 989 break; 990 } 991 992 String nm = getString(pos); 993 Entity ent = dtd.getEntity(nm); 994 995 // entities are case sensitive - however if strict 996 // is false then we will try to make a match by 997 // converting the string to all lowercase. 
998 // 999 if (!strict && (ent == null)) { 1000 ent = dtd.getEntity(nm.toLowerCase()); 1001 } 1002 if ((ent == null) || !ent.isGeneral()) { 1003 1004 if (nm.length() == 0) { 1005 error("invalid.entref", nm); 1006 return new char[0]; 1007 } 1008 /* given that there is not a match restore the entity reference */ 1009 String str = "&" + nm + ";"; 1010 1011 char b[] = new char[str.length()]; 1012 str.getChars(0, b.length, b, 0); 1013 return b; 1014 } 1015 return ent.getData(); 1016 } 1017 1018 /** 1019 * Converts numeric character reference to Unicode character. 1020 * 1021 * Normally the code in a reference should be always converted 1022 * to the Unicode character with the same code, but due to 1023 * wide usage of Cp1252 charset most browsers map numeric references 1024 * in the range 130-159 (which are control chars in Unicode set) 1025 * to displayable characters with other codes. 1026 * 1027 * @param c the code of numeric character reference. 1028 * @return the character corresponding to the reference code. 1029 */ 1030 private char mapNumericReference(char c) { 1031 if (c < 130 || c > 159) { 1032 return c; 1033 } 1034 return cp1252Map[c - 130]; 1035 } 1036 1037 /** 1038 * Parse a comment. [92] 391:7 1039 */ 1040 void parseComment() throws IOException { 1041 1042 while (true) { 1043 int c = ch; 1044 switch (c) { 1045 case '-': 1046 /** Presuming that the start string of a comment "<!--" has 1047 already been parsed, the '-' character is valid only as 1048 part of a comment termination and further more it must 1049 be present in even numbers. Hence if strict is true, we 1050 presume the comment has been terminated and return. 1051 However if strict is false, then there is no even number 1052 requirement and this character can appear anywhere in the 1053 comment. The parser reads on until it sees the following 1054 pattern: "-->" or "--!>". 
1055 **/ 1056 if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) { 1057 if ((ch = readCh()) == '>') { 1058 return; 1059 } 1060 if (ch == '!') { 1061 if ((ch = readCh()) == '>') { 1062 return; 1063 } else { 1064 /* to account for extra read()'s that happened */ 1065 addString('-'); 1066 addString('!'); 1067 continue; 1068 } 1069 } 1070 break; 1071 } 1072 1073 if ((ch = readCh()) == '-') { 1074 ch = readCh(); 1075 if (strict || ch == '>') { 1076 return; 1077 } 1078 if (ch == '!') { 1079 if ((ch = readCh()) == '>') { 1080 return; 1081 } else { 1082 /* to account for extra read()'s that happened */ 1083 addString('-'); 1084 addString('!'); 1085 continue; 1086 } 1087 } 1088 /* to account for the extra read() */ 1089 addString('-'); 1090 } 1091 break; 1092 1093 case -1: 1094 handleEOFInComment(); 1095 return; 1096 1097 case '\n': 1098 ln++; 1099 ch = readCh(); 1100 lfCount++; 1101 break; 1102 1103 case '>': 1104 ch = readCh(); 1105 break; 1106 1107 case '\r': 1108 ln++; 1109 if ((ch = readCh()) == '\n') { 1110 ch = readCh(); 1111 crlfCount++; 1112 } 1113 else { 1114 crCount++; 1115 } 1116 c = '\n'; 1117 break; 1118 default: 1119 ch = readCh(); 1120 break; 1121 } 1122 1123 addString(c); 1124 } 1125 } 1126 1127 /** 1128 * Parse literal content. 
[46] 343:1 and [47] 344:1 1129 */ 1130 void parseLiteral(boolean replace) throws IOException { 1131 while (true) { 1132 int c = ch; 1133 switch (c) { 1134 case -1: 1135 error("eof.literal", stack.elem.getName()); 1136 endTag(true); 1137 return; 1138 1139 case '>': 1140 ch = readCh(); 1141 int i = textpos - (stack.elem.name.length() + 2), j = 0; 1142 1143 // match end tag 1144 if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) { 1145 while ((++i < textpos) && 1146 (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++))); 1147 if (i == textpos) { 1148 textpos -= (stack.elem.name.length() + 2); 1149 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1150 textpos--; 1151 } 1152 endTag(false); 1153 return; 1154 } 1155 } 1156 break; 1157 1158 case '&': 1159 char data[] = parseEntityReference(); 1160 if (textpos + data.length > text.length) { 1161 char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)]; 1162 System.arraycopy(text, 0, newtext, 0, text.length); 1163 text = newtext; 1164 } 1165 System.arraycopy(data, 0, text, textpos, data.length); 1166 textpos += data.length; 1167 continue; 1168 1169 case '\n': 1170 ln++; 1171 ch = readCh(); 1172 lfCount++; 1173 break; 1174 1175 case '\r': 1176 ln++; 1177 if ((ch = readCh()) == '\n') { 1178 ch = readCh(); 1179 crlfCount++; 1180 } 1181 else { 1182 crCount++; 1183 } 1184 c = '\n'; 1185 break; 1186 default: 1187 ch = readCh(); 1188 break; 1189 } 1190 1191 // output character 1192 if (textpos == text.length) { 1193 char newtext[] = new char[text.length + 128]; 1194 System.arraycopy(text, 0, newtext, 0, text.length); 1195 text = newtext; 1196 } 1197 text[textpos++] = (char)c; 1198 } 1199 } 1200 1201 /** 1202 * Parse attribute value. 
[33] 331:1 1203 */ 1204 String parseAttributeValue(boolean lower) throws IOException { 1205 int delim = -1; 1206 1207 // Check for a delimiter 1208 switch(ch) { 1209 case '\'': 1210 case '"': 1211 delim = ch; 1212 ch = readCh(); 1213 break; 1214 } 1215 1216 // Parse the rest of the value 1217 while (true) { 1218 int c = ch; 1219 1220 switch (c) { 1221 case '\n': 1222 ln++; 1223 ch = readCh(); 1224 lfCount++; 1225 if (delim < 0) { 1226 return getString(0); 1227 } 1228 break; 1229 1230 case '\r': 1231 ln++; 1232 1233 if ((ch = readCh()) == '\n') { 1234 ch = readCh(); 1235 crlfCount++; 1236 } 1237 else { 1238 crCount++; 1239 } 1240 if (delim < 0) { 1241 return getString(0); 1242 } 1243 break; 1244 1245 case '\t': 1246 if (delim < 0) 1247 c = ' '; 1248 case ' ': 1249 ch = readCh(); 1250 if (delim < 0) { 1251 return getString(0); 1252 } 1253 break; 1254 1255 case '>': 1256 case '<': 1257 if (delim < 0) { 1258 return getString(0); 1259 } 1260 ch = readCh(); 1261 break; 1262 1263 case '\'': 1264 case '"': 1265 ch = readCh(); 1266 if (c == delim) { 1267 return getString(0); 1268 } else if (delim == -1) { 1269 error("attvalerr"); 1270 if (strict || ch == ' ') { 1271 return getString(0); 1272 } else { 1273 continue; 1274 } 1275 } 1276 break; 1277 1278 case '=': 1279 if (delim < 0) { 1280 /* In SGML a construct like <img src=/cgi-bin/foo?x=1> 1281 is considered invalid since an = sign can only be contained 1282 in an attributes value if the string is quoted. 1283 */ 1284 error("attvalerr"); 1285 /* If strict is true then we return with the string we have thus far. 1286 Otherwise we accept the = sign as part of the attribute's value and 1287 process the rest of the img tag. 
*/ 1288 if (strict) { 1289 return getString(0); 1290 } 1291 } 1292 ch = readCh(); 1293 break; 1294 1295 case '&': 1296 if (strict && delim < 0) { 1297 ch = readCh(); 1298 break; 1299 } 1300 1301 char data[] = parseEntityReference(); 1302 for (int i = 0 ; i < data.length ; i++) { 1303 c = data[i]; 1304 addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c); 1305 } 1306 continue; 1307 1308 case -1: 1309 return getString(0); 1310 1311 default: 1312 if (lower && (c >= 'A') && (c <= 'Z')) { 1313 c = 'a' + c - 'A'; 1314 } 1315 ch = readCh(); 1316 break; 1317 } 1318 addString(c); 1319 } 1320 } 1321 1322 1323 /** 1324 * Parse attribute specification List. [31] 327:17 1325 */ 1326 void parseAttributeSpecificationList(Element elem) throws IOException { 1327 1328 while (true) { 1329 skipSpace(); 1330 1331 switch (ch) { 1332 case '/': 1333 case '>': 1334 case '<': 1335 case -1: 1336 return; 1337 1338 case '-': 1339 if ((ch = readCh()) == '-') { 1340 ch = readCh(); 1341 parseComment(); 1342 strpos = 0; 1343 } else { 1344 error("invalid.tagchar", "-", elem.getName()); 1345 ch = readCh(); 1346 } 1347 continue; 1348 } 1349 1350 AttributeList att; 1351 String attname; 1352 String attvalue; 1353 1354 if (parseIdentifier(true)) { 1355 attname = getString(0); 1356 skipSpace(); 1357 if (ch == '=') { 1358 ch = readCh(); 1359 skipSpace(); 1360 att = elem.getAttribute(attname); 1361 // Bug ID 4102750 1362 // Load the NAME of an Attribute Case Sensitive 1363 // The case of the NAME must be intact 1364 // MG 021898 1365 attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME)); 1366 // attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION)); 1367 } else { 1368 attvalue = attname; 1369 att = elem.getAttributeByValue(attvalue); 1370 if (att == null) { 1371 att = elem.getAttribute(attname); 1372 if (att != null) { 1373 attvalue = att.getValue(); 1374 } 1375 else { 1376 // Make it null 
so that NULL_ATTRIBUTE_VALUE is 1377 // used 1378 attvalue = null; 1379 } 1380 } 1381 } 1382 } else if (!strict && ch == ',') { // allows for comma separated attribute-value pairs 1383 ch = readCh(); 1384 continue; 1385 } else if (!strict && ch == '"') { // allows for quoted attributes 1386 ch = readCh(); 1387 skipSpace(); 1388 if (parseIdentifier(true)) { 1389 attname = getString(0); 1390 if (ch == '"') { 1391 ch = readCh(); 1392 } 1393 skipSpace(); 1394 if (ch == '=') { 1395 ch = readCh(); 1396 skipSpace(); 1397 att = elem.getAttribute(attname); 1398 attvalue = parseAttributeValue((att != null) && 1399 (att.type != CDATA) && 1400 (att.type != NOTATION)); 1401 } else { 1402 attvalue = attname; 1403 att = elem.getAttributeByValue(attvalue); 1404 if (att == null) { 1405 att = elem.getAttribute(attname); 1406 if (att != null) { 1407 attvalue = att.getValue(); 1408 } 1409 } 1410 } 1411 } else { 1412 char str[] = {(char)ch}; 1413 error("invalid.tagchar", new String(str), elem.getName()); 1414 ch = readCh(); 1415 continue; 1416 } 1417 } else if (!strict && (attributes.isEmpty()) && (ch == '=')) { 1418 ch = readCh(); 1419 skipSpace(); 1420 attname = elem.getName(); 1421 att = elem.getAttribute(attname); 1422 attvalue = parseAttributeValue((att != null) && 1423 (att.type != CDATA) && 1424 (att.type != NOTATION)); 1425 } else if (!strict && (ch == '=')) { 1426 ch = readCh(); 1427 skipSpace(); 1428 attvalue = parseAttributeValue(true); 1429 error("attvalerr"); 1430 return; 1431 } else { 1432 char str[] = {(char)ch}; 1433 error("invalid.tagchar", new String(str), elem.getName()); 1434 if (!strict) { 1435 ch = readCh(); 1436 continue; 1437 } else { 1438 return; 1439 } 1440 } 1441 1442 if (att != null) { 1443 attname = att.getName(); 1444 } else { 1445 error("invalid.tagatt", attname, elem.getName()); 1446 } 1447 1448 // Check out the value 1449 if (attributes.isDefined(attname)) { 1450 error("multi.tagatt", attname, elem.getName()); 1451 } 1452 if (attvalue == null) { 1453 
attvalue = ((att != null) && (att.value != null)) ? att.value : 1454 HTML.NULL_ATTRIBUTE_VALUE; 1455 } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) { 1456 error("invalid.tagattval", attname, elem.getName()); 1457 } 1458 HTML.Attribute attkey = HTML.getAttributeKey(attname); 1459 if (attkey == null) { 1460 attributes.addAttribute(attname, attvalue); 1461 } else { 1462 attributes.addAttribute(attkey, attvalue); 1463 } 1464 } 1465 } 1466 1467 /** 1468 * Parses th Document Declaration Type markup declaration. 1469 * Currently ignores it. 1470 */ 1471 public String parseDTDMarkup() throws IOException { 1472 1473 StringBuilder strBuff = new StringBuilder(); 1474 ch = readCh(); 1475 while(true) { 1476 switch (ch) { 1477 case '>': 1478 ch = readCh(); 1479 return strBuff.toString(); 1480 case -1: 1481 error("invalid.markup"); 1482 return strBuff.toString(); 1483 case '\n': 1484 ln++; 1485 ch = readCh(); 1486 lfCount++; 1487 break; 1488 case '"': 1489 ch = readCh(); 1490 break; 1491 case '\r': 1492 ln++; 1493 if ((ch = readCh()) == '\n') { 1494 ch = readCh(); 1495 crlfCount++; 1496 } 1497 else { 1498 crCount++; 1499 } 1500 break; 1501 default: 1502 strBuff.append((char)(ch & 0xFF)); 1503 ch = readCh(); 1504 break; 1505 } 1506 } 1507 } 1508 1509 /** 1510 * Parse markup declarations. 1511 * Currently only handles the Document Type Declaration markup. 1512 * Returns true if it is a markup declaration false otherwise. 1513 */ 1514 protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException { 1515 1516 /* Currently handles only the DOCTYPE */ 1517 if ((strBuff.length() == "DOCTYPE".length()) && 1518 (strBuff.toString().toUpperCase().equals("DOCTYPE"))) { 1519 parseDTDMarkup(); 1520 return true; 1521 } 1522 return false; 1523 } 1524 1525 /** 1526 * Parse an invalid tag. 
1527 */ 1528 void parseInvalidTag() throws IOException { 1529 // ignore all data upto the close bracket '>' 1530 while (true) { 1531 skipSpace(); 1532 switch (ch) { 1533 case '>': 1534 case -1: 1535 ch = readCh(); 1536 return; 1537 case '<': 1538 return; 1539 default: 1540 ch = readCh(); 1541 1542 } 1543 } 1544 } 1545 1546 /** 1547 * Parse a start or end tag. 1548 */ 1549 void parseTag() throws IOException { 1550 Element elem; 1551 boolean net = false; 1552 boolean warned = false; 1553 boolean unknown = false; 1554 1555 switch (ch = readCh()) { 1556 case '!': 1557 switch (ch = readCh()) { 1558 case '-': 1559 // Parse comment. [92] 391:7 1560 while (true) { 1561 if (ch == '-') { 1562 if (!strict || ((ch = readCh()) == '-')) { 1563 ch = readCh(); 1564 if (!strict && ch == '-') { 1565 ch = readCh(); 1566 } 1567 // send over any text you might see 1568 // before parsing and sending the 1569 // comment 1570 if (textpos != 0) { 1571 char newtext[] = new char[textpos]; 1572 System.arraycopy(text, 0, newtext, 0, textpos); 1573 handleText(newtext); 1574 lastBlockStartPos = currentBlockStartPos; 1575 textpos = 0; 1576 } 1577 parseComment(); 1578 last = makeTag(dtd.getElement("comment"), true); 1579 handleComment(getChars(0)); 1580 continue; 1581 } else if (!warned) { 1582 warned = true; 1583 error("invalid.commentchar", "-"); 1584 } 1585 } 1586 skipSpace(); 1587 switch (ch) { 1588 case '-': 1589 continue; 1590 case '>': 1591 ch = readCh(); 1592 case -1: 1593 return; 1594 default: 1595 ch = readCh(); 1596 if (!warned) { 1597 warned = true; 1598 error("invalid.commentchar", 1599 String.valueOf((char)ch)); 1600 } 1601 break; 1602 } 1603 } 1604 1605 default: 1606 // deal with marked sections 1607 StringBuffer strBuff = new StringBuffer(); 1608 while (true) { 1609 strBuff.append((char)ch); 1610 if (parseMarkupDeclarations(strBuff)) { 1611 return; 1612 } 1613 switch(ch) { 1614 case '>': 1615 ch = readCh(); 1616 case -1: 1617 error("invalid.markup"); 1618 return; 1619 case '\n': 
1620 ln++; 1621 ch = readCh(); 1622 lfCount++; 1623 break; 1624 case '\r': 1625 ln++; 1626 if ((ch = readCh()) == '\n') { 1627 ch = readCh(); 1628 crlfCount++; 1629 } 1630 else { 1631 crCount++; 1632 } 1633 break; 1634 1635 default: 1636 ch = readCh(); 1637 break; 1638 } 1639 } 1640 } 1641 1642 case '/': 1643 // parse end tag [19] 317:4 1644 switch (ch = readCh()) { 1645 case '>': 1646 ch = readCh(); 1647 case '<': 1648 // empty end tag. either </> or </< 1649 if (recent == null) { 1650 error("invalid.shortend"); 1651 return; 1652 } 1653 elem = recent; 1654 break; 1655 1656 default: 1657 if (!parseIdentifier(true)) { 1658 error("expected.endtagname"); 1659 return; 1660 } 1661 skipSpace(); 1662 switch (ch) { 1663 case '>': 1664 ch = readCh(); 1665 case '<': 1666 break; 1667 1668 default: 1669 error("expected", "'>'"); 1670 while ((ch != -1) && (ch != '\n') && (ch != '>')) { 1671 ch = readCh(); 1672 } 1673 if (ch == '>') { 1674 ch = readCh(); 1675 } 1676 break; 1677 } 1678 String elemStr = getString(0); 1679 if (!dtd.elementExists(elemStr)) { 1680 error("end.unrecognized", elemStr); 1681 // Ignore RE before end tag 1682 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1683 textpos--; 1684 } 1685 elem = dtd.getElement("unknown"); 1686 elem.name = elemStr; 1687 unknown = true; 1688 } else { 1689 elem = dtd.getElement(elemStr); 1690 } 1691 break; 1692 } 1693 1694 1695 // If the stack is null, we're seeing end tags without any begin 1696 // tags. Ignore them. 1697 1698 if (stack == null) { 1699 error("end.extra.tag", elem.getName()); 1700 return; 1701 } 1702 1703 // Ignore RE before end tag 1704 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1705 // In a pre tag, if there are blank lines 1706 // we do not want to remove the newline 1707 // before the end tag. Hence this code. 
1708 // 1709 if (stack.pre) { 1710 if ((textpos > 1) && (text[textpos-2] != '\n')) { 1711 textpos--; 1712 } 1713 } else { 1714 textpos--; 1715 } 1716 } 1717 1718 // If the end tag is a form, since we did not put it 1719 // on the tag stack, there is no corresponding start 1720 // start tag to find. Hence do not touch the tag stack. 1721 // 1722 1723 /* 1724 if (!strict && elem.getName().equals("form")) { 1725 if (lastFormSent != null) { 1726 handleEndTag(lastFormSent); 1727 return; 1728 } else { 1729 // do nothing. 1730 return; 1731 } 1732 } 1733 */ 1734 1735 if (unknown) { 1736 // we will not see a corresponding start tag 1737 // on the the stack. If we are seeing an 1738 // end tag, lets send this on as an empty 1739 // tag with the end tag attribute set to 1740 // true. 1741 TagElement t = makeTag(elem); 1742 handleText(t); 1743 attributes.addAttribute(HTML.Attribute.ENDTAG, "true"); 1744 handleEmptyTag(makeTag(elem)); 1745 unknown = false; 1746 return; 1747 } 1748 1749 // find the corresponding start tag 1750 1751 // A commonly occuring error appears to be the insertion 1752 // of extra end tags in a table. The intent here is ignore 1753 // such extra end tags. 1754 // 1755 if (!strict) { 1756 String stackElem = stack.elem.getName(); 1757 1758 if (stackElem.equals("table")) { 1759 // If it isnt a valid end tag ignore it and return 1760 // 1761 if (!elem.getName().equals(stackElem)) { 1762 error("tag.ignore", elem.getName()); 1763 return; 1764 } 1765 } 1766 1767 1768 1769 if (stackElem.equals("tr") || 1770 stackElem.equals("td")) { 1771 if ((!elem.getName().equals("table")) && 1772 (!elem.getName().equals(stackElem))) { 1773 error("tag.ignore", elem.getName()); 1774 return; 1775 } 1776 } 1777 } 1778 TagStack sp = stack; 1779 1780 while ((sp != null) && (elem != sp.elem)) { 1781 sp = sp.next; 1782 } 1783 if (sp == null) { 1784 error("unmatched.endtag", elem.getName()); 1785 return; 1786 } 1787 1788 // People put font ending tags in the darndest places. 
1789 // Don't close other contexts based on them being between 1790 // a font tag and the corresponding end tag. Instead, 1791 // ignore the end tag like it doesn't exist and allow the end 1792 // of the document to close us out. 1793 String elemName = elem.getName(); 1794 if (stack != sp && 1795 (elemName.equals("font") || 1796 elemName.equals("center"))) { 1797 1798 // Since closing out a center tag can have real wierd 1799 // effects on the formatting, make sure that tags 1800 // for which omitting an end tag is legimitate 1801 // get closed out. 1802 // 1803 if (elemName.equals("center")) { 1804 while(stack.elem.omitEnd() && stack != sp) { 1805 endTag(true); 1806 } 1807 if (stack.elem == elem) { 1808 endTag(false); 1809 } 1810 } 1811 return; 1812 } 1813 // People do the same thing with center tags. In this 1814 // case we would like to close off the center tag but 1815 // not necessarily all enclosing tags. 1816 1817 1818 1819 // end tags 1820 while (stack != sp) { 1821 endTag(true); 1822 } 1823 1824 endTag(false); 1825 return; 1826 1827 case -1: 1828 error("eof"); 1829 return; 1830 } 1831 1832 // start tag [14] 314:1 1833 if (!parseIdentifier(true)) { 1834 elem = recent; 1835 if ((ch != '>') || (elem == null)) { 1836 error("expected.tagname"); 1837 return; 1838 } 1839 } else { 1840 String elemStr = getString(0); 1841 1842 if (elemStr.equals("image")) { 1843 elemStr = "img"; 1844 } 1845 1846 /* determine if this element is part of the dtd. 
*/ 1847 1848 if (!dtd.elementExists(elemStr)) { 1849 // parseInvalidTag(); 1850 error("tag.unrecognized ", elemStr); 1851 elem = dtd.getElement("unknown"); 1852 elem.name = elemStr; 1853 unknown = true; 1854 } else { 1855 elem = dtd.getElement(elemStr); 1856 } 1857 } 1858 1859 // Parse attributes 1860 parseAttributeSpecificationList(elem); 1861 1862 switch (ch) { 1863 case '/': 1864 net = true; 1865 case '>': 1866 ch = readCh(); 1867 if (ch == '>' && net) { 1868 ch = readCh(); 1869 } 1870 case '<': 1871 break; 1872 1873 default: 1874 error("expected", "'>'"); 1875 break; 1876 } 1877 1878 if (!strict) { 1879 if (elem.getName().equals("script")) { 1880 error("javascript.unsupported"); 1881 } 1882 } 1883 1884 // ignore RE after start tag 1885 // 1886 if (!elem.isEmpty()) { 1887 if (ch == '\n') { 1888 ln++; 1889 lfCount++; 1890 ch = readCh(); 1891 } else if (ch == '\r') { 1892 ln++; 1893 if ((ch = readCh()) == '\n') { 1894 ch = readCh(); 1895 crlfCount++; 1896 } 1897 else { 1898 crCount++; 1899 } 1900 } 1901 } 1902 1903 // ensure a legal context for the tag 1904 TagElement tag = makeTag(elem, false); 1905 1906 1907 /** In dealing with forms, we have decided to treat 1908 them as legal in any context. Also, even though 1909 they do have a start and an end tag, we will 1910 not put this tag on the stack. This is to deal 1911 several pages in the web oasis that choose to 1912 start and end forms in any possible location. **/ 1913 1914 /* 1915 if (!strict && elem.getName().equals("form")) { 1916 if (lastFormSent == null) { 1917 lastFormSent = tag; 1918 } else { 1919 handleEndTag(lastFormSent); 1920 lastFormSent = tag; 1921 } 1922 } else { 1923 */ 1924 // Smlly, if a tag is unknown, we will apply 1925 // no legalTagContext logic to it. 1926 // 1927 if (!unknown) { 1928 legalTagContext(tag); 1929 1930 // If skip tag is true, this implies that 1931 // the tag was illegal and that the error 1932 // recovery strategy adopted is to ignore 1933 // the tag. 
1934 if (!strict && skipTag) { 1935 skipTag = false; 1936 return; 1937 } 1938 } 1939 /* 1940 } 1941 */ 1942 1943 startTag(tag); 1944 1945 if (!elem.isEmpty()) { 1946 switch (elem.getType()) { 1947 case CDATA: 1948 parseLiteral(false); 1949 break; 1950 case RCDATA: 1951 parseLiteral(true); 1952 break; 1953 default: 1954 if (stack != null) { 1955 stack.net = net; 1956 } 1957 break; 1958 } 1959 } 1960 } 1961 1962 private static final String START_COMMENT = "<!--"; 1963 private static final String END_COMMENT = "-->"; 1964 private static final char[] SCRIPT_END_TAG = "</script>".toCharArray(); 1965 private static final char[] SCRIPT_END_TAG_UPPER_CASE = 1966 "</SCRIPT>".toCharArray(); 1967 1968 void parseScript() throws IOException { 1969 char[] charsToAdd = new char[SCRIPT_END_TAG.length]; 1970 1971 /* Here, ch should be the first character after <script> */ 1972 while (true) { 1973 int i = 0; 1974 while (i < SCRIPT_END_TAG.length 1975 && (SCRIPT_END_TAG[i] == ch 1976 || SCRIPT_END_TAG_UPPER_CASE[i] == ch)) { 1977 charsToAdd[i] = (char) ch; 1978 ch = readCh(); 1979 i++; 1980 } 1981 if (i == SCRIPT_END_TAG.length) { 1982 1983 /* '</script>' tag detected */ 1984 /* Here, ch == '>' */ 1985 ch = readCh(); 1986 /* Here, ch == the first character after </script> */ 1987 return; 1988 } else { 1989 1990 /* To account for extra read()'s that happened */ 1991 for (int j = 0; j < i; j++) { 1992 addString(charsToAdd[j]); 1993 } 1994 1995 switch (ch) { 1996 case -1: 1997 error("eof.script"); 1998 return; 1999 case '\n': 2000 ln++; 2001 ch = readCh(); 2002 lfCount++; 2003 addString('\n'); 2004 break; 2005 case '\r': 2006 ln++; 2007 if ((ch = readCh()) == '\n') { 2008 ch = readCh(); 2009 crlfCount++; 2010 } else { 2011 crCount++; 2012 } 2013 addString('\n'); 2014 break; 2015 default: 2016 addString(ch); 2017 ch = readCh(); 2018 break; 2019 } // switch 2020 } 2021 } // while 2022 } 2023 2024 /** 2025 * Parse Content. 
[24] 320:1 2026 */ 2027 void parseContent() throws IOException { 2028 Thread curThread = Thread.currentThread(); 2029 2030 for (;;) { 2031 if (curThread.isInterrupted()) { 2032 curThread.interrupt(); // resignal the interrupt 2033 break; 2034 } 2035 2036 int c = ch; 2037 currentBlockStartPos = currentPosition; 2038 2039 if (recent == dtd.script) { // means: if after starting <script> tag 2040 2041 /* Here, ch has to be the first character after <script> */ 2042 parseScript(); 2043 last = makeTag(dtd.getElement("comment"), true); 2044 2045 /* Remove leading and trailing HTML comment declarations */ 2046 String str = new String(getChars(0)).trim(); 2047 int minLength = START_COMMENT.length() + END_COMMENT.length(); 2048 if (str.startsWith(START_COMMENT) && str.endsWith(END_COMMENT) 2049 && str.length() >= (minLength)) { 2050 str = str.substring(START_COMMENT.length(), 2051 str.length() - END_COMMENT.length()); 2052 } 2053 2054 /* Handle resulting chars as comment */ 2055 handleComment(str.toCharArray()); 2056 endTag(false); 2057 lastBlockStartPos = currentPosition; 2058 } else { 2059 switch (c) { 2060 case '<': 2061 parseTag(); 2062 lastBlockStartPos = currentPosition; 2063 continue; 2064 2065 case '/': 2066 ch = readCh(); 2067 if ((stack != null) && stack.net) { 2068 // null end tag. 
2069 endTag(false); 2070 continue; 2071 } 2072 break; 2073 2074 case -1: 2075 return; 2076 2077 case '&': 2078 if (textpos == 0) { 2079 if (!legalElementContext(dtd.pcdata)) { 2080 error("unexpected.pcdata"); 2081 } 2082 if (last.breaksFlow()) { 2083 space = false; 2084 } 2085 } 2086 char data[] = parseEntityReference(); 2087 if (textpos + data.length + 1 > text.length) { 2088 char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)]; 2089 System.arraycopy(text, 0, newtext, 0, text.length); 2090 text = newtext; 2091 } 2092 if (space) { 2093 space = false; 2094 text[textpos++] = ' '; 2095 } 2096 System.arraycopy(data, 0, text, textpos, data.length); 2097 textpos += data.length; 2098 ignoreSpace = false; 2099 continue; 2100 2101 case '\n': 2102 ln++; 2103 lfCount++; 2104 ch = readCh(); 2105 if ((stack != null) && stack.pre) { 2106 break; 2107 } 2108 if (textpos == 0) { 2109 lastBlockStartPos = currentPosition; 2110 } 2111 if (!ignoreSpace) { 2112 space = true; 2113 } 2114 continue; 2115 2116 case '\r': 2117 ln++; 2118 c = '\n'; 2119 if ((ch = readCh()) == '\n') { 2120 ch = readCh(); 2121 crlfCount++; 2122 } 2123 else { 2124 crCount++; 2125 } 2126 if ((stack != null) && stack.pre) { 2127 break; 2128 } 2129 if (textpos == 0) { 2130 lastBlockStartPos = currentPosition; 2131 } 2132 if (!ignoreSpace) { 2133 space = true; 2134 } 2135 continue; 2136 2137 2138 case '\t': 2139 case ' ': 2140 ch = readCh(); 2141 if ((stack != null) && stack.pre) { 2142 break; 2143 } 2144 if (textpos == 0) { 2145 lastBlockStartPos = currentPosition; 2146 } 2147 if (!ignoreSpace) { 2148 space = true; 2149 } 2150 continue; 2151 2152 default: 2153 if (textpos == 0) { 2154 if (!legalElementContext(dtd.pcdata)) { 2155 error("unexpected.pcdata"); 2156 } 2157 if (last.breaksFlow()) { 2158 space = false; 2159 } 2160 } 2161 ch = readCh(); 2162 break; 2163 } 2164 } 2165 2166 // enlarge buffer if needed 2167 if (textpos + 2 > text.length) { 2168 char newtext[] = new 
char[text.length + 128]; 2169 System.arraycopy(text, 0, newtext, 0, text.length); 2170 text = newtext; 2171 } 2172 2173 // output pending space 2174 if (space) { 2175 if (textpos == 0) { 2176 lastBlockStartPos--; 2177 } 2178 text[textpos++] = ' '; 2179 space = false; 2180 } 2181 text[textpos++] = (char)c; 2182 ignoreSpace = false; 2183 } 2184 } 2185 2186 /** 2187 * Returns the end of line string. This will return the end of line 2188 * string that has been encountered the most, one of \r, \n or \r\n. 2189 */ 2190 String getEndOfLineString() { 2191 if (crlfCount >= crCount) { 2192 if (lfCount >= crlfCount) { 2193 return "\n"; 2194 } 2195 else { 2196 return "\r\n"; 2197 } 2198 } 2199 else { 2200 if (crCount > lfCount) { 2201 return "\r"; 2202 } 2203 else { 2204 return "\n"; 2205 } 2206 } 2207 } 2208 2209 /** 2210 * Parse an HTML stream, given a DTD. 2211 */ 2212 public synchronized void parse(Reader in) throws IOException { 2213 this.in = in; 2214 2215 this.ln = 1; 2216 2217 seenHtml = false; 2218 seenHead = false; 2219 seenBody = false; 2220 2221 crCount = lfCount = crlfCount = 0; 2222 2223 try { 2224 ch = readCh(); 2225 text = new char[1024]; 2226 str = new char[128]; 2227 2228 parseContent(); 2229 // NOTE: interruption may have occurred. Control flows out 2230 // of here normally. 2231 while (stack != null) { 2232 endTag(true); 2233 } 2234 in.close(); 2235 } catch (IOException e) { 2236 errorContext(); 2237 error("ioexception"); 2238 throw e; 2239 } catch (Exception e) { 2240 errorContext(); 2241 error("exception", e.getClass().getName(), e.getMessage()); 2242 e.printStackTrace(); 2243 } catch (ThreadDeath e) { 2244 errorContext(); 2245 error("terminated"); 2246 e.printStackTrace(); 2247 throw e; 2248 } finally { 2249 for (; stack != null ; stack = stack.next) { 2250 handleEndTag(stack.tag); 2251 } 2252 2253 text = null; 2254 str = null; 2255 } 2256 2257 } 2258 2259 2260 /* 2261 * Input cache. 
This is much faster than calling down to a synchronized 2262 * method of BufferedReader for each byte. Measurements done 5/30/97 2263 * show that there's no point in having a bigger buffer: Increasing 2264 * the buffer to 8192 had no measurable impact for a program discarding 2265 * one character at a time (reading from an http URL to a local machine). 2266 * NOTE: If the current encoding is bogus, and we read too much 2267 * (past the content-type) we may suffer a MalformedInputException. For 2268 * this reason the initial size is 1 and when the body is encountered the 2269 * size is adjusted to 256. 2270 */ 2271 private char buf[] = new char[1]; 2272 private int pos; 2273 private int len; 2274 /* 2275 tracks position relative to the beginning of the 2276 document. 2277 */ 2278 private int currentPosition; 2279 2280 2281 private final int readCh() throws IOException { 2282 2283 if (pos >= len) { 2284 2285 // This loop allows us to ignore interrupts if the flag 2286 // says so 2287 for (;;) { 2288 try { 2289 len = in.read(buf); 2290 break; 2291 } catch (InterruptedIOException ex) { 2292 throw ex; 2293 } 2294 } 2295 2296 if (len <= 0) { 2297 return -1; // eof 2298 } 2299 pos = 0; 2300 } 2301 ++currentPosition; 2302 2303 return buf[pos++]; 2304 } 2305 2306 2307 protected int getCurrentPos() { 2308 return currentPosition; 2309 } 2310 }