1 /* 2 * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package javax.swing.text.html.parser; 27 28 import javax.swing.text.SimpleAttributeSet; 29 import javax.swing.text.html.HTML; 30 import javax.swing.text.ChangedCharSetException; 31 import java.io.*; 32 import java.util.Hashtable; 33 import java.util.Properties; 34 import java.util.Vector; 35 import java.util.Enumeration; 36 import java.net.URL; 37 38 import sun.misc.MessageUtils; 39 40 /** 41 * A simple DTD-driven HTML parser. The parser reads an 42 * HTML file from an InputStream and calls various methods 43 * (which should be overridden in a subclass) when tags and 44 * data are encountered. 
 * <p>
 * Unfortunately there are many badly implemented HTML parsers
 * out there, and as a result there are many badly formatted
 * HTML files. This parser attempts to parse most HTML files.
 * This means that the implementation sometimes deviates from
 * the SGML specification in favor of HTML.
 * <p>
 * The parser treats \r and \r\n as \n. Newlines after start tags
 * and before end tags are ignored just as specified in the SGML/HTML
 * specification.
 * <p>
 * The html spec does not specify how spaces are to be coalesced very well.
 * Specifically, the following scenarios are not discussed (note that a
 * space should be used here, but I am using &amp;nbsp; to force the space to
 * be displayed):
 * <p>
 * '&lt;b>blah&nbsp;&lt;i>&nbsp;&lt;strike>&nbsp;foo' which can be treated as:
 * '&lt;b>blah&nbsp;&lt;i>&lt;strike>foo'
 * <p>as well as:
 * '&lt;p>&lt;a href="xx">&nbsp;&lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
 * which appears to be treated as:
 * '&lt;p>&lt;a href="xx">&lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
 * <p>
 * If <code>strict</code> is false, when a tag that breaks flow,
 * (<code>TagElement.breaksFlow</code>) or trailing whitespace is
 * encountered, all whitespace will be ignored until a non whitespace
 * character is encountered. This appears to give behavior closer to
 * the popular browsers.
73 * 74 * @see DTD 75 * @see TagElement 76 * @see SimpleAttributeSet 77 * @author Arthur van Hoff 78 * @author Sunita Mani 79 */ 80 public 81 class Parser implements DTDConstants { 82 83 // Maximum codepoint value within BMP 84 private static final int MAX_BMP_BOUND = 65535; 85 86 private char text[] = new char[1024]; 87 private int textpos = 0; 88 private TagElement last; 89 private boolean space; 90 91 private char str[] = new char[128]; 92 private int strpos = 0; 93 94 protected DTD dtd = null; 95 96 private int ch; 97 private int ln; 98 private Reader in; 99 100 private Element recent; 101 private TagStack stack; 102 private boolean skipTag = false; 103 private TagElement lastFormSent = null; 104 private SimpleAttributeSet attributes = new SimpleAttributeSet(); 105 106 // State for <html>, <head> and <body>. Since people like to slap 107 // together HTML documents without thinking, occasionally they 108 // have multiple instances of these tags. These booleans track 109 // the first sightings of these tags so they can be safely ignored 110 // by the parser if repeated. 111 private boolean seenHtml = false; 112 private boolean seenHead = false; 113 private boolean seenBody = false; 114 115 /** 116 * The html spec does not specify how spaces are coalesced very well. 117 * If strict == false, ignoreSpace is used to try and mimic the behavior 118 * of the popular browsers. 119 * <p> 120 * The problematic scenarios are: 121 * '<b>blah <i> <strike> foo' which can be treated as: 122 * '<b>blah <i><strike>foo' 123 * as well as: 124 * '<p><a href="xx"> <em>Using</em></a></p>' 125 * which appears to be treated as: 126 * '<p><a href="xx"><em>Using</em></a></p>' 127 * <p> 128 * When a tag that breaks flow, or trailing whitespace is encountered 129 * ignoreSpace is set to true. From then on, all whitespace will be 130 * ignored. 131 * ignoreSpace will be set back to false the first time a 132 * non whitespace character is encountered. 
This appears to give 133 * behavior closer to the popular browsers. 134 */ 135 private boolean ignoreSpace; 136 137 /** 138 * This flag determines whether or not the Parser will be strict 139 * in enforcing SGML compatibility. If false, it will be lenient 140 * with certain common classes of erroneous HTML constructs. 141 * Strict or not, in either case an error will be recorded. 142 * 143 */ 144 protected boolean strict = false; 145 146 147 /** Number of \r\n's encountered. */ 148 private int crlfCount; 149 /** Number of \r's encountered. A \r\n will not increment this. */ 150 private int crCount; 151 /** Number of \n's encountered. A \r\n will not increment this. */ 152 private int lfCount; 153 154 // 155 // To correctly identify the start of a tag/comment/text we need two 156 // ivars. Two are needed as handleText isn't invoked until the tag 157 // after the text has been parsed, that is the parser parses the text, 158 // then a tag, then invokes handleText followed by handleStart. 159 // 160 /** The start position of the current block. Block is overloaded here, 161 * it really means the current start position for the current comment, 162 * tag, text. Use getBlockStartPosition to access this. */ 163 private int currentBlockStartPos; 164 /** Start position of the last block. */ 165 private int lastBlockStartPos; 166 167 /** 168 * array for mapping numeric references in range 169 * 130-159 to displayable Unicode characters. 
170 */ 171 private static final char[] cp1252Map = { 172 8218, // ‚ 173 402, // ƒ 174 8222, // „ 175 8230, // … 176 8224, // † 177 8225, // ‡ 178 710, // ˆ 179 8240, // ‰ 180 352, // Š 181 8249, // ‹ 182 338, // Œ 183 141, //  184 142, // Ž 185 143, //  186 144, //  187 8216, // ‘ 188 8217, // ’ 189 8220, // “ 190 8221, // ” 191 8226, // • 192 8211, // – 193 8212, // — 194 732, // ˜ 195 8482, // ™ 196 353, // š 197 8250, // › 198 339, // œ 199 157, //  200 158, // ž 201 376 // Ÿ 202 }; 203 204 public Parser(DTD dtd) { 205 this.dtd = dtd; 206 } 207 208 209 /** 210 * @return the line number of the line currently being parsed 211 */ 212 protected int getCurrentLine() { 213 return ln; 214 } 215 216 /** 217 * Returns the start position of the current block. Block is 218 * overloaded here, it really means the current start position for 219 * the current comment tag, text, block.... This is provided for 220 * subclassers that wish to know the start of the current block when 221 * called with one of the handleXXX methods. 222 */ 223 int getBlockStartPosition() { 224 return Math.max(0, lastBlockStartPos - 1); 225 } 226 227 /** 228 * Makes a TagElement. 229 */ 230 protected TagElement makeTag(Element elem, boolean fictional) { 231 return new TagElement(elem, fictional); 232 } 233 234 protected TagElement makeTag(Element elem) { 235 return makeTag(elem, false); 236 } 237 238 protected SimpleAttributeSet getAttributes() { 239 return attributes; 240 } 241 242 protected void flushAttributes() { 243 attributes.removeAttributes(attributes); 244 } 245 246 /** 247 * Called when PCDATA is encountered. 248 */ 249 protected void handleText(char text[]) { 250 } 251 252 /** 253 * Called when an HTML title tag is encountered. 254 */ 255 protected void handleTitle(char text[]) { 256 // default behavior is to call handleText. Subclasses 257 // can override if necessary. 258 handleText(text); 259 } 260 261 /** 262 * Called when an HTML comment is encountered. 
263 */ 264 protected void handleComment(char text[]) { 265 } 266 267 protected void handleEOFInComment() { 268 // We've reached EOF. Our recovery strategy is to 269 // see if we have more than one line in the comment; 270 // if so, we pretend that the comment was an unterminated 271 // single line comment, and reparse the lines after the 272 // first line as normal HTML content. 273 274 int commentEndPos = strIndexOf('\n'); 275 if (commentEndPos >= 0) { 276 handleComment(getChars(0, commentEndPos)); 277 try { 278 in.close(); 279 in = new CharArrayReader(getChars(commentEndPos + 1)); 280 ch = '>'; 281 } catch (IOException e) { 282 error("ioexception"); 283 } 284 285 resetStrBuffer(); 286 } else { 287 // no newline, so signal an error 288 error("eof.comment"); 289 } 290 } 291 292 /** 293 * Called when an empty tag is encountered. 294 */ 295 protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException { 296 } 297 298 /** 299 * Called when a start tag is encountered. 300 */ 301 protected void handleStartTag(TagElement tag) { 302 } 303 304 /** 305 * Called when an end tag is encountered. 306 */ 307 protected void handleEndTag(TagElement tag) { 308 } 309 310 /** 311 * An error has occurred. 312 */ 313 protected void handleError(int ln, String msg) { 314 /* 315 Thread.dumpStack(); 316 System.out.println("**** " + stack); 317 System.out.println("line " + ln + ": error: " + msg); 318 System.out.println(); 319 */ 320 } 321 322 /** 323 * Output text. 
324 */ 325 void handleText(TagElement tag) { 326 if (tag.breaksFlow()) { 327 space = false; 328 if (!strict) { 329 ignoreSpace = true; 330 } 331 } 332 if (textpos == 0) { 333 if ((!space) || (stack == null) || last.breaksFlow() || 334 !stack.advance(dtd.pcdata)) { 335 last = tag; 336 space = false; 337 lastBlockStartPos = currentBlockStartPos; 338 return; 339 } 340 } 341 if (space) { 342 if (!ignoreSpace) { 343 // enlarge buffer if needed 344 if (textpos + 1 > text.length) { 345 char newtext[] = new char[text.length + 200]; 346 System.arraycopy(text, 0, newtext, 0, text.length); 347 text = newtext; 348 } 349 350 // output pending space 351 text[textpos++] = ' '; 352 if (!strict && !tag.getElement().isEmpty()) { 353 ignoreSpace = true; 354 } 355 } 356 space = false; 357 } 358 char newtext[] = new char[textpos]; 359 System.arraycopy(text, 0, newtext, 0, textpos); 360 // Handles cases of bad html where the title tag 361 // was getting lost when we did error recovery. 362 if (tag.getElement().getName().equals("title")) { 363 handleTitle(newtext); 364 } else { 365 handleText(newtext); 366 } 367 lastBlockStartPos = currentBlockStartPos; 368 textpos = 0; 369 last = tag; 370 space = false; 371 } 372 373 /** 374 * Invoke the error handler. 375 */ 376 protected void error(String err, String arg1, String arg2, 377 String arg3) { 378 handleError(ln, err + " " + arg1 + " " + arg2 + " " + arg3); 379 } 380 381 protected void error(String err, String arg1, String arg2) { 382 error(err, arg1, arg2, "?"); 383 } 384 protected void error(String err, String arg1) { 385 error(err, arg1, "?", "?"); 386 } 387 protected void error(String err) { 388 error(err, "?", "?", "?"); 389 } 390 391 392 /** 393 * Handle a start tag. The new tag is pushed 394 * onto the tag stack. The attribute list is 395 * checked for required attributes. 
396 */ 397 protected void startTag(TagElement tag) throws ChangedCharSetException { 398 Element elem = tag.getElement(); 399 400 // If the tag is an empty tag and texpos != 0 401 // this implies that there is text before the 402 // start tag that needs to be processed before 403 // handling the tag. 404 // 405 if (!elem.isEmpty() || 406 ((last != null) && !last.breaksFlow()) || 407 (textpos != 0)) { 408 handleText(tag); 409 } else { 410 // this variable gets updated in handleText(). 411 // Since in this case we do not call handleText() 412 // we need to update it here. 413 // 414 last = tag; 415 // Note that we should really check last.breakFlows before 416 // assuming this should be false. 417 space = false; 418 } 419 lastBlockStartPos = currentBlockStartPos; 420 421 // check required attributes 422 for (AttributeList a = elem.atts ; a != null ; a = a.next) { 423 if ((a.modifier == REQUIRED) && 424 ((attributes.isEmpty()) || 425 ((!attributes.isDefined(a.name)) && 426 (!attributes.isDefined(HTML.getAttributeKey(a.name)))))) { 427 error("req.att ", a.getName(), elem.getName()); 428 } 429 } 430 431 if (elem.isEmpty()) { 432 handleEmptyTag(tag); 433 /* 434 } else if (elem.getName().equals("form")) { 435 handleStartTag(tag); 436 */ 437 } else { 438 recent = elem; 439 stack = new TagStack(tag, stack); 440 handleStartTag(tag); 441 } 442 } 443 444 /** 445 * Handle an end tag. The end tag is popped 446 * from the tag stack. 447 */ 448 protected void endTag(boolean omitted) { 449 handleText(stack.tag); 450 451 if (omitted && !stack.elem.omitEnd()) { 452 error("end.missing", stack.elem.getName()); 453 } else if (!stack.terminate()) { 454 error("end.unexpected", stack.elem.getName()); 455 } 456 457 // handle the tag 458 handleEndTag(stack.tag); 459 stack = stack.next; 460 recent = (stack != null) ? 
stack.elem : null; 461 } 462 463 464 boolean ignoreElement(Element elem) { 465 466 String stackElement = stack.elem.getName(); 467 String elemName = elem.getName(); 468 /* We ignore all elements that are not valid in the context of 469 a table except <td>, <th> (these we handle in 470 legalElementContext()) and #pcdata. We also ignore the 471 <font> tag in the context of <ul> and <ol> We additonally 472 ignore the <meta> and the <style> tag if the body tag has 473 been seen. **/ 474 if ((elemName.equals("html") && seenHtml) || 475 (elemName.equals("head") && seenHead) || 476 (elemName.equals("body") && seenBody)) { 477 return true; 478 } 479 if (elemName.equals("dt") || elemName.equals("dd")) { 480 TagStack s = stack; 481 while (s != null && !s.elem.getName().equals("dl")) { 482 s = s.next; 483 } 484 if (s == null) { 485 return true; 486 } 487 } 488 489 if (((stackElement.equals("table")) && 490 (!elemName.equals("#pcdata")) && (!elemName.equals("input"))) || 491 ((elemName.equals("font")) && 492 (stackElement.equals("ul") || stackElement.equals("ol"))) || 493 (elemName.equals("meta") && stack != null) || 494 (elemName.equals("style") && seenBody) || 495 (stackElement.equals("table") && elemName.equals("a"))) { 496 return true; 497 } 498 return false; 499 } 500 501 502 /** 503 * Marks the first time a tag has been seen in a document 504 */ 505 506 protected void markFirstTime(Element elem) { 507 String elemName = elem.getName(); 508 if (elemName.equals("html")) { 509 seenHtml = true; 510 } else if (elemName.equals("head")) { 511 seenHead = true; 512 } else if (elemName.equals("body")) { 513 if (buf.length == 1) { 514 // Refer to note in definition of buf for details on this. 515 char[] newBuf = new char[256]; 516 517 newBuf[0] = buf[0]; 518 buf = newBuf; 519 } 520 seenBody = true; 521 } 522 } 523 524 /** 525 * Create a legal content for an element. 
526 */ 527 boolean legalElementContext(Element elem) throws ChangedCharSetException { 528 529 // System.out.println("-- legalContext -- " + elem); 530 531 // Deal with the empty stack 532 if (stack == null) { 533 // System.out.println("-- stack is empty"); 534 if (elem != dtd.html) { 535 // System.out.println("-- pushing html"); 536 startTag(makeTag(dtd.html, true)); 537 return legalElementContext(elem); 538 } 539 return true; 540 } 541 542 // Is it allowed in the current context 543 if (stack.advance(elem)) { 544 // System.out.println("-- legal context"); 545 markFirstTime(elem); 546 return true; 547 } 548 boolean insertTag = false; 549 550 // The use of all error recovery strategies are contingent 551 // on the value of the strict property. 552 // 553 // These are commonly occuring errors. if insertTag is true, 554 // then we want to adopt an error recovery strategy that 555 // involves attempting to insert an additional tag to 556 // legalize the context. The two errors addressed here 557 // are: 558 // 1) when a <td> or <th> is seen soon after a <table> tag. 559 // In this case we insert a <tr>. 560 // 2) when any other tag apart from a <tr> is seen 561 // in the context of a <tr>. In this case we would 562 // like to add a <td>. If a <tr> is seen within a 563 // <tr> context, then we will close out the current 564 // <tr>. 565 // 566 // This insertion strategy is handled later in the method. 567 // The reason for checking this now, is that in other cases 568 // we would like to apply other error recovery strategies for example 569 // ignoring tags. 570 // 571 // In certain cases it is better to ignore a tag than try to 572 // fix the situation. So the first test is to see if this 573 // is what we need to do. 
574 // 575 String stackElemName = stack.elem.getName(); 576 String elemName = elem.getName(); 577 578 579 if (!strict && 580 ((stackElemName.equals("table") && elemName.equals("td")) || 581 (stackElemName.equals("table") && elemName.equals("th")) || 582 (stackElemName.equals("tr") && !elemName.equals("tr")))){ 583 insertTag = true; 584 } 585 586 587 if (!strict && !insertTag && (stack.elem.getName() != elem.getName() || 588 elem.getName().equals("body"))) { 589 if (skipTag = ignoreElement(elem)) { 590 error("tag.ignore", elem.getName()); 591 return skipTag; 592 } 593 } 594 595 // Check for anything after the start of the table besides tr, td, th 596 // or caption, and if those aren't there, insert the <tr> and call 597 // legalElementContext again. 598 if (!strict && stackElemName.equals("table") && 599 !elemName.equals("tr") && !elemName.equals("td") && 600 !elemName.equals("th") && !elemName.equals("caption")) { 601 Element e = dtd.getElement("tr"); 602 TagElement t = makeTag(e, true); 603 legalTagContext(t); 604 startTag(t); 605 error("start.missing", elem.getName()); 606 return legalElementContext(elem); 607 } 608 609 // They try to find a legal context by checking if the current 610 // tag is valid in an enclosing context. If so 611 // close out the tags by outputing end tags and then 612 // insert the curent tag. If the tags that are 613 // being closed out do not have an optional end tag 614 // specification in the DTD then an html error is 615 // reported. 616 // 617 if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) { 618 for (TagStack s = stack.next ; s != null ; s = s.next) { 619 if (s.advance(elem)) { 620 while (stack != s) { 621 endTag(true); 622 } 623 return true; 624 } 625 if (!s.terminate() || (strict && !s.elem.omitEnd())) { 626 break; 627 } 628 } 629 } 630 631 // Check if we know what tag is expected next. 632 // If so insert the tag. Report an error if the 633 // tag does not have its start tag spec in the DTD as optional. 
634 // 635 Element next = stack.first(); 636 if (next != null && (!strict || next.omitStart()) && 637 !(next==dtd.head && elem==dtd.pcdata) ) { 638 // System.out.println("-- omitting start tag: " + next); 639 TagElement t = makeTag(next, true); 640 legalTagContext(t); 641 startTag(t); 642 if (!next.omitStart()) { 643 error("start.missing", elem.getName()); 644 } 645 return legalElementContext(elem); 646 } 647 648 649 // Traverse the list of expected elements and determine if adding 650 // any of these elements would make for a legal context. 651 // 652 653 if (!strict) { 654 ContentModel content = stack.contentModel(); 655 Vector<Element> elemVec = new Vector<Element>(); 656 if (content != null) { 657 content.getElements(elemVec); 658 for (Element e : elemVec) { 659 // Ensure that this element has not been included as 660 // part of the exclusions in the DTD. 661 // 662 if (stack.excluded(e.getIndex())) { 663 continue; 664 } 665 666 boolean reqAtts = false; 667 668 for (AttributeList a = e.getAttributes(); a != null ; a = a.next) { 669 if (a.modifier == REQUIRED) { 670 reqAtts = true; 671 break; 672 } 673 } 674 // Ensure that no tag that has required attributes 675 // gets inserted. 676 // 677 if (reqAtts) { 678 continue; 679 } 680 681 ContentModel m = e.getContent(); 682 if (m != null && m.first(elem)) { 683 // System.out.println("-- adding a legal tag: " + e); 684 TagElement t = makeTag(e, true); 685 legalTagContext(t); 686 startTag(t); 687 error("start.missing", e.getName()); 688 return legalElementContext(elem); 689 } 690 } 691 } 692 } 693 694 // Check if the stack can be terminated. If so add the appropriate 695 // end tag. Report an error if the tag being ended does not have its 696 // end tag spec in the DTD as optional. 
697 // 698 if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) { 699 // System.out.println("-- omitting end tag: " + stack.elem); 700 if (!stack.elem.omitEnd()) { 701 error("end.missing", elem.getName()); 702 } 703 704 endTag(true); 705 return legalElementContext(elem); 706 } 707 708 // At this point we know that something is screwed up. 709 return false; 710 } 711 712 /** 713 * Create a legal context for a tag. 714 */ 715 void legalTagContext(TagElement tag) throws ChangedCharSetException { 716 if (legalElementContext(tag.getElement())) { 717 markFirstTime(tag.getElement()); 718 return; 719 } 720 721 // Avoid putting a block tag in a flow tag. 722 if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) { 723 endTag(true); 724 legalTagContext(tag); 725 return; 726 } 727 728 // Avoid putting something wierd in the head of the document. 729 for (TagStack s = stack ; s != null ; s = s.next) { 730 if (s.tag.getElement() == dtd.head) { 731 while (stack != s) { 732 endTag(true); 733 } 734 endTag(true); 735 legalTagContext(tag); 736 return; 737 } 738 } 739 740 // Everything failed 741 error("tag.unexpected", tag.getElement().getName()); 742 } 743 744 /** 745 * Error context. Something went wrong, make sure we are in 746 * the document's body context 747 */ 748 void errorContext() throws ChangedCharSetException { 749 for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) { 750 handleEndTag(stack.tag); 751 } 752 if (stack == null) { 753 legalElementContext(dtd.body); 754 startTag(makeTag(dtd.body, true)); 755 } 756 } 757 758 /** 759 * Add a char to the string buffer. 760 */ 761 void addString(int c) { 762 if (strpos == str.length) { 763 char newstr[] = new char[str.length + 128]; 764 System.arraycopy(str, 0, newstr, 0, str.length); 765 str = newstr; 766 } 767 str[strpos++] = (char)c; 768 } 769 770 /** 771 * Get the string that's been accumulated. 
772 */ 773 String getString(int pos) { 774 char newStr[] = new char[strpos - pos]; 775 System.arraycopy(str, pos, newStr, 0, strpos - pos); 776 strpos = pos; 777 return new String(newStr); 778 } 779 780 char[] getChars(int pos) { 781 char newStr[] = new char[strpos - pos]; 782 System.arraycopy(str, pos, newStr, 0, strpos - pos); 783 strpos = pos; 784 return newStr; 785 } 786 787 char[] getChars(int pos, int endPos) { 788 char newStr[] = new char[endPos - pos]; 789 System.arraycopy(str, pos, newStr, 0, endPos - pos); 790 // REMIND: it's not clear whether this version should set strpos or not 791 // strpos = pos; 792 return newStr; 793 } 794 795 void resetStrBuffer() { 796 strpos = 0; 797 } 798 799 int strIndexOf(char target) { 800 for (int i = 0; i < strpos; i++) { 801 if (str[i] == target) { 802 return i; 803 } 804 } 805 806 return -1; 807 } 808 809 /** 810 * Skip space. 811 * [5] 297:5 812 */ 813 void skipSpace() throws IOException { 814 while (true) { 815 switch (ch) { 816 case '\n': 817 ln++; 818 ch = readCh(); 819 lfCount++; 820 break; 821 822 case '\r': 823 ln++; 824 if ((ch = readCh()) == '\n') { 825 ch = readCh(); 826 crlfCount++; 827 } 828 else { 829 crCount++; 830 } 831 break; 832 case ' ': 833 case '\t': 834 ch = readCh(); 835 break; 836 837 default: 838 return; 839 } 840 } 841 } 842 843 /** 844 * Parse identifier. Uppercase characters are folded 845 * to lowercase when lower is true. Returns falsed if 846 * no identifier is found. 
[55] 346:17 847 */ 848 boolean parseIdentifier(boolean lower) throws IOException { 849 switch (ch) { 850 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 851 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 852 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 853 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 854 case 'Y': case 'Z': 855 if (lower) { 856 ch = 'a' + (ch - 'A'); 857 } 858 859 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 860 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 861 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 862 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 863 case 'y': case 'z': 864 break; 865 866 default: 867 return false; 868 } 869 870 while (true) { 871 addString(ch); 872 873 switch (ch = readCh()) { 874 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 875 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 876 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 877 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 878 case 'Y': case 'Z': 879 if (lower) { 880 ch = 'a' + (ch - 'A'); 881 } 882 883 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 884 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 885 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 886 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 887 case 'y': case 'z': 888 889 case '0': case '1': case '2': case '3': case '4': 890 case '5': case '6': case '7': case '8': case '9': 891 892 case '.': case '-': 893 894 case '_': // not officially allowed 895 break; 896 897 default: 898 return true; 899 } 900 } 901 } 902 903 /** 904 * Parse an entity reference. 
[59] 350:17 905 */ 906 private char[] parseEntityReference() throws IOException { 907 int pos = strpos; 908 909 if ((ch = readCh()) == '#') { 910 int n = 0; 911 ch = readCh(); 912 if ((ch >= '0') && (ch <= '9') || 913 ch == 'x' || ch == 'X') { 914 915 if ((ch >= '0') && (ch <= '9')) { 916 // parse decimal reference 917 while ((ch >= '0') && (ch <= '9')) { 918 n = (n * 10) + ch - '0'; 919 ch = readCh(); 920 } 921 } else { 922 // parse hexadecimal reference 923 ch = readCh(); 924 char lch = (char) Character.toLowerCase(ch); 925 while ((lch >= '0') && (lch <= '9') || 926 (lch >= 'a') && (lch <= 'f')) { 927 if (lch >= '0' && lch <= '9') { 928 n = (n * 16) + lch - '0'; 929 } else { 930 n = (n * 16) + lch - 'a' + 10; 931 } 932 ch = readCh(); 933 lch = (char) Character.toLowerCase(ch); 934 } 935 } 936 switch (ch) { 937 case '\n': 938 ln++; 939 ch = readCh(); 940 lfCount++; 941 break; 942 943 case '\r': 944 ln++; 945 if ((ch = readCh()) == '\n') { 946 ch = readCh(); 947 crlfCount++; 948 } 949 else { 950 crCount++; 951 } 952 break; 953 954 case ';': 955 ch = readCh(); 956 break; 957 } 958 // Check if n codepoint is within BMP; convert into surrogate 959 // pair otherwise 960 try { 961 char data[]; 962 if (n <= MAX_BMP_BOUND) { 963 data = Character.toChars(mapNumericReference((char) n)); 964 } else { 965 data = Character.toChars(n); 966 } 967 968 return data; 969 } 970 catch(IllegalArgumentException ex) { 971 error(ex.toString()); 972 return new char[0]; 973 } 974 } 975 addString('#'); 976 if (!parseIdentifier(false)) { 977 error("ident.expected"); 978 strpos = pos; 979 char data[] = {'&', '#'}; 980 return data; 981 } 982 } else if (!parseIdentifier(false)) { 983 char data[] = {'&'}; 984 return data; 985 } 986 987 boolean semicolon = false; 988 989 switch (ch) { 990 case '\n': 991 ln++; 992 ch = readCh(); 993 lfCount++; 994 break; 995 996 case '\r': 997 ln++; 998 if ((ch = readCh()) == '\n') { 999 ch = readCh(); 1000 crlfCount++; 1001 } 1002 else { 1003 crCount++; 1004 } 
1005 break; 1006 1007 case ';': 1008 semicolon = true; 1009 1010 ch = readCh(); 1011 break; 1012 } 1013 1014 String nm = getString(pos); 1015 Entity ent = dtd.getEntity(nm); 1016 1017 // entities are case sensitive - however if strict 1018 // is false then we will try to make a match by 1019 // converting the string to all lowercase. 1020 // 1021 if (!strict && (ent == null)) { 1022 ent = dtd.getEntity(nm.toLowerCase()); 1023 } 1024 if ((ent == null) || !ent.isGeneral()) { 1025 1026 if (nm.length() == 0) { 1027 error("invalid.entref", nm); 1028 return new char[0]; 1029 } 1030 /* given that there is not a match restore the entity reference */ 1031 String str = "&" + nm + (semicolon ? ";" : ""); 1032 1033 char b[] = new char[str.length()]; 1034 str.getChars(0, b.length, b, 0); 1035 return b; 1036 } 1037 return ent.getData(); 1038 } 1039 1040 /** 1041 * Converts numeric character reference to Unicode character. 1042 * 1043 * Normally the code in a reference should be always converted 1044 * to the Unicode character with the same code, but due to 1045 * wide usage of Cp1252 charset most browsers map numeric references 1046 * in the range 130-159 (which are control chars in Unicode set) 1047 * to displayable characters with other codes. 1048 * 1049 * @param c the code of numeric character reference. 1050 * @return the character corresponding to the reference code. 1051 */ 1052 private char mapNumericReference(char c) { 1053 if (c < 130 || c > 159) { 1054 return c; 1055 } 1056 return cp1252Map[c - 130]; 1057 } 1058 1059 /** 1060 * Parse a comment. [92] 391:7 1061 */ 1062 void parseComment() throws IOException { 1063 1064 while (true) { 1065 int c = ch; 1066 switch (c) { 1067 case '-': 1068 /** Presuming that the start string of a comment "<!--" has 1069 already been parsed, the '-' character is valid only as 1070 part of a comment termination and further more it must 1071 be present in even numbers. 
Hence if strict is true, we 1072 presume the comment has been terminated and return. 1073 However if strict is false, then there is no even number 1074 requirement and this character can appear anywhere in the 1075 comment. The parser reads on until it sees the following 1076 pattern: "-->" or "--!>". 1077 **/ 1078 if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) { 1079 if ((ch = readCh()) == '>') { 1080 return; 1081 } 1082 if (ch == '!') { 1083 if ((ch = readCh()) == '>') { 1084 return; 1085 } else { 1086 /* to account for extra read()'s that happened */ 1087 addString('-'); 1088 addString('!'); 1089 continue; 1090 } 1091 } 1092 break; 1093 } 1094 1095 if ((ch = readCh()) == '-') { 1096 ch = readCh(); 1097 if (strict || ch == '>') { 1098 return; 1099 } 1100 if (ch == '!') { 1101 if ((ch = readCh()) == '>') { 1102 return; 1103 } else { 1104 /* to account for extra read()'s that happened */ 1105 addString('-'); 1106 addString('!'); 1107 continue; 1108 } 1109 } 1110 /* to account for the extra read() */ 1111 addString('-'); 1112 } 1113 break; 1114 1115 case -1: 1116 handleEOFInComment(); 1117 return; 1118 1119 case '\n': 1120 ln++; 1121 ch = readCh(); 1122 lfCount++; 1123 break; 1124 1125 case '>': 1126 ch = readCh(); 1127 break; 1128 1129 case '\r': 1130 ln++; 1131 if ((ch = readCh()) == '\n') { 1132 ch = readCh(); 1133 crlfCount++; 1134 } 1135 else { 1136 crCount++; 1137 } 1138 c = '\n'; 1139 break; 1140 default: 1141 ch = readCh(); 1142 break; 1143 } 1144 1145 addString(c); 1146 } 1147 } 1148 1149 /** 1150 * Parse literal content. 
[46] 343:1 and [47] 344:1 1151 */ 1152 void parseLiteral(boolean replace) throws IOException { 1153 while (true) { 1154 int c = ch; 1155 switch (c) { 1156 case -1: 1157 error("eof.literal", stack.elem.getName()); 1158 endTag(true); 1159 return; 1160 1161 case '>': 1162 ch = readCh(); 1163 int i = textpos - (stack.elem.name.length() + 2), j = 0; 1164 1165 // match end tag 1166 if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) { 1167 while ((++i < textpos) && 1168 (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++))); 1169 if (i == textpos) { 1170 textpos -= (stack.elem.name.length() + 2); 1171 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1172 textpos--; 1173 } 1174 endTag(false); 1175 return; 1176 } 1177 } 1178 break; 1179 1180 case '&': 1181 char data[] = parseEntityReference(); 1182 if (textpos + data.length > text.length) { 1183 char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)]; 1184 System.arraycopy(text, 0, newtext, 0, text.length); 1185 text = newtext; 1186 } 1187 System.arraycopy(data, 0, text, textpos, data.length); 1188 textpos += data.length; 1189 continue; 1190 1191 case '\n': 1192 ln++; 1193 ch = readCh(); 1194 lfCount++; 1195 break; 1196 1197 case '\r': 1198 ln++; 1199 if ((ch = readCh()) == '\n') { 1200 ch = readCh(); 1201 crlfCount++; 1202 } 1203 else { 1204 crCount++; 1205 } 1206 c = '\n'; 1207 break; 1208 default: 1209 ch = readCh(); 1210 break; 1211 } 1212 1213 // output character 1214 if (textpos == text.length) { 1215 char newtext[] = new char[text.length + 128]; 1216 System.arraycopy(text, 0, newtext, 0, text.length); 1217 text = newtext; 1218 } 1219 text[textpos++] = (char)c; 1220 } 1221 } 1222 1223 /** 1224 * Parse attribute value. 
[33] 331:1 1225 */ 1226 String parseAttributeValue(boolean lower) throws IOException { 1227 int delim = -1; 1228 1229 // Check for a delimiter 1230 switch(ch) { 1231 case '\'': 1232 case '"': 1233 delim = ch; 1234 ch = readCh(); 1235 break; 1236 } 1237 1238 // Parse the rest of the value 1239 while (true) { 1240 int c = ch; 1241 1242 switch (c) { 1243 case '\n': 1244 ln++; 1245 ch = readCh(); 1246 lfCount++; 1247 if (delim < 0) { 1248 return getString(0); 1249 } 1250 break; 1251 1252 case '\r': 1253 ln++; 1254 1255 if ((ch = readCh()) == '\n') { 1256 ch = readCh(); 1257 crlfCount++; 1258 } 1259 else { 1260 crCount++; 1261 } 1262 if (delim < 0) { 1263 return getString(0); 1264 } 1265 break; 1266 1267 case '\t': 1268 if (delim < 0) 1269 c = ' '; 1270 case ' ': 1271 ch = readCh(); 1272 if (delim < 0) { 1273 return getString(0); 1274 } 1275 break; 1276 1277 case '>': 1278 case '<': 1279 if (delim < 0) { 1280 return getString(0); 1281 } 1282 ch = readCh(); 1283 break; 1284 1285 case '\'': 1286 case '"': 1287 ch = readCh(); 1288 if (c == delim) { 1289 return getString(0); 1290 } else if (delim == -1) { 1291 error("attvalerr"); 1292 if (strict || ch == ' ') { 1293 return getString(0); 1294 } else { 1295 continue; 1296 } 1297 } 1298 break; 1299 1300 case '=': 1301 if (delim < 0) { 1302 /* In SGML a construct like <img src=/cgi-bin/foo?x=1> 1303 is considered invalid since an = sign can only be contained 1304 in an attributes value if the string is quoted. 1305 */ 1306 error("attvalerr"); 1307 /* If strict is true then we return with the string we have thus far. 1308 Otherwise we accept the = sign as part of the attribute's value and 1309 process the rest of the img tag. 
*/ 1310 if (strict) { 1311 return getString(0); 1312 } 1313 } 1314 ch = readCh(); 1315 break; 1316 1317 case '&': 1318 if (strict && delim < 0) { 1319 ch = readCh(); 1320 break; 1321 } 1322 1323 char data[] = parseEntityReference(); 1324 for (int i = 0 ; i < data.length ; i++) { 1325 c = data[i]; 1326 addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c); 1327 } 1328 continue; 1329 1330 case -1: 1331 return getString(0); 1332 1333 default: 1334 if (lower && (c >= 'A') && (c <= 'Z')) { 1335 c = 'a' + c - 'A'; 1336 } 1337 ch = readCh(); 1338 break; 1339 } 1340 addString(c); 1341 } 1342 } 1343 1344 1345 /** 1346 * Parse attribute specification List. [31] 327:17 1347 */ 1348 void parseAttributeSpecificationList(Element elem) throws IOException { 1349 1350 while (true) { 1351 skipSpace(); 1352 1353 switch (ch) { 1354 case '/': 1355 case '>': 1356 case '<': 1357 case -1: 1358 return; 1359 1360 case '-': 1361 if ((ch = readCh()) == '-') { 1362 ch = readCh(); 1363 parseComment(); 1364 strpos = 0; 1365 } else { 1366 error("invalid.tagchar", "-", elem.getName()); 1367 ch = readCh(); 1368 } 1369 continue; 1370 } 1371 1372 AttributeList att; 1373 String attname; 1374 String attvalue; 1375 1376 if (parseIdentifier(true)) { 1377 attname = getString(0); 1378 skipSpace(); 1379 if (ch == '=') { 1380 ch = readCh(); 1381 skipSpace(); 1382 att = elem.getAttribute(attname); 1383 // Bug ID 4102750 1384 // Load the NAME of an Attribute Case Sensitive 1385 // The case of the NAME must be intact 1386 // MG 021898 1387 attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME)); 1388 // attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION)); 1389 } else { 1390 attvalue = attname; 1391 att = elem.getAttributeByValue(attvalue); 1392 if (att == null) { 1393 att = elem.getAttribute(attname); 1394 if (att != null) { 1395 attvalue = att.getValue(); 1396 } 1397 else { 1398 // Make it null 
so that NULL_ATTRIBUTE_VALUE is 1399 // used 1400 attvalue = null; 1401 } 1402 } 1403 } 1404 } else if (!strict && ch == ',') { // allows for comma separated attribute-value pairs 1405 ch = readCh(); 1406 continue; 1407 } else if (!strict && ch == '"') { // allows for quoted attributes 1408 ch = readCh(); 1409 skipSpace(); 1410 if (parseIdentifier(true)) { 1411 attname = getString(0); 1412 if (ch == '"') { 1413 ch = readCh(); 1414 } 1415 skipSpace(); 1416 if (ch == '=') { 1417 ch = readCh(); 1418 skipSpace(); 1419 att = elem.getAttribute(attname); 1420 attvalue = parseAttributeValue((att != null) && 1421 (att.type != CDATA) && 1422 (att.type != NOTATION)); 1423 } else { 1424 attvalue = attname; 1425 att = elem.getAttributeByValue(attvalue); 1426 if (att == null) { 1427 att = elem.getAttribute(attname); 1428 if (att != null) { 1429 attvalue = att.getValue(); 1430 } 1431 } 1432 } 1433 } else { 1434 char str[] = {(char)ch}; 1435 error("invalid.tagchar", new String(str), elem.getName()); 1436 ch = readCh(); 1437 continue; 1438 } 1439 } else if (!strict && (attributes.isEmpty()) && (ch == '=')) { 1440 ch = readCh(); 1441 skipSpace(); 1442 attname = elem.getName(); 1443 att = elem.getAttribute(attname); 1444 attvalue = parseAttributeValue((att != null) && 1445 (att.type != CDATA) && 1446 (att.type != NOTATION)); 1447 } else if (!strict && (ch == '=')) { 1448 ch = readCh(); 1449 skipSpace(); 1450 attvalue = parseAttributeValue(true); 1451 error("attvalerr"); 1452 return; 1453 } else { 1454 char str[] = {(char)ch}; 1455 error("invalid.tagchar", new String(str), elem.getName()); 1456 if (!strict) { 1457 ch = readCh(); 1458 continue; 1459 } else { 1460 return; 1461 } 1462 } 1463 1464 if (att != null) { 1465 attname = att.getName(); 1466 } else { 1467 error("invalid.tagatt", attname, elem.getName()); 1468 } 1469 1470 // Check out the value 1471 if (attributes.isDefined(attname)) { 1472 error("multi.tagatt", attname, elem.getName()); 1473 } 1474 if (attvalue == null) { 1475 
attvalue = ((att != null) && (att.value != null)) ? att.value : 1476 HTML.NULL_ATTRIBUTE_VALUE; 1477 } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) { 1478 error("invalid.tagattval", attname, elem.getName()); 1479 } 1480 HTML.Attribute attkey = HTML.getAttributeKey(attname); 1481 if (attkey == null) { 1482 attributes.addAttribute(attname, attvalue); 1483 } else { 1484 attributes.addAttribute(attkey, attvalue); 1485 } 1486 } 1487 } 1488 1489 /** 1490 * Parses th Document Declaration Type markup declaration. 1491 * Currently ignores it. 1492 */ 1493 public String parseDTDMarkup() throws IOException { 1494 1495 StringBuilder strBuff = new StringBuilder(); 1496 ch = readCh(); 1497 while(true) { 1498 switch (ch) { 1499 case '>': 1500 ch = readCh(); 1501 return strBuff.toString(); 1502 case -1: 1503 error("invalid.markup"); 1504 return strBuff.toString(); 1505 case '\n': 1506 ln++; 1507 ch = readCh(); 1508 lfCount++; 1509 break; 1510 case '"': 1511 ch = readCh(); 1512 break; 1513 case '\r': 1514 ln++; 1515 if ((ch = readCh()) == '\n') { 1516 ch = readCh(); 1517 crlfCount++; 1518 } 1519 else { 1520 crCount++; 1521 } 1522 break; 1523 default: 1524 strBuff.append((char)(ch & 0xFF)); 1525 ch = readCh(); 1526 break; 1527 } 1528 } 1529 } 1530 1531 /** 1532 * Parse markup declarations. 1533 * Currently only handles the Document Type Declaration markup. 1534 * Returns true if it is a markup declaration false otherwise. 1535 */ 1536 protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException { 1537 1538 /* Currently handles only the DOCTYPE */ 1539 if ((strBuff.length() == "DOCTYPE".length()) && 1540 (strBuff.toString().toUpperCase().equals("DOCTYPE"))) { 1541 parseDTDMarkup(); 1542 return true; 1543 } 1544 return false; 1545 } 1546 1547 /** 1548 * Parse an invalid tag. 
1549 */ 1550 void parseInvalidTag() throws IOException { 1551 // ignore all data upto the close bracket '>' 1552 while (true) { 1553 skipSpace(); 1554 switch (ch) { 1555 case '>': 1556 case -1: 1557 ch = readCh(); 1558 return; 1559 case '<': 1560 return; 1561 default: 1562 ch = readCh(); 1563 1564 } 1565 } 1566 } 1567 1568 /** 1569 * Parse a start or end tag. 1570 */ 1571 void parseTag() throws IOException { 1572 Element elem; 1573 boolean net = false; 1574 boolean warned = false; 1575 boolean unknown = false; 1576 1577 switch (ch = readCh()) { 1578 case '!': 1579 switch (ch = readCh()) { 1580 case '-': 1581 // Parse comment. [92] 391:7 1582 while (true) { 1583 if (ch == '-') { 1584 if (!strict || ((ch = readCh()) == '-')) { 1585 ch = readCh(); 1586 if (!strict && ch == '-') { 1587 ch = readCh(); 1588 } 1589 // send over any text you might see 1590 // before parsing and sending the 1591 // comment 1592 if (textpos != 0) { 1593 char newtext[] = new char[textpos]; 1594 System.arraycopy(text, 0, newtext, 0, textpos); 1595 handleText(newtext); 1596 lastBlockStartPos = currentBlockStartPos; 1597 textpos = 0; 1598 } 1599 parseComment(); 1600 last = makeTag(dtd.getElement("comment"), true); 1601 handleComment(getChars(0)); 1602 continue; 1603 } else if (!warned) { 1604 warned = true; 1605 error("invalid.commentchar", "-"); 1606 } 1607 } 1608 skipSpace(); 1609 switch (ch) { 1610 case '-': 1611 continue; 1612 case '>': 1613 ch = readCh(); 1614 case -1: 1615 return; 1616 default: 1617 ch = readCh(); 1618 if (!warned) { 1619 warned = true; 1620 error("invalid.commentchar", 1621 String.valueOf((char)ch)); 1622 } 1623 break; 1624 } 1625 } 1626 1627 default: 1628 // deal with marked sections 1629 StringBuffer strBuff = new StringBuffer(); 1630 while (true) { 1631 strBuff.append((char)ch); 1632 if (parseMarkupDeclarations(strBuff)) { 1633 return; 1634 } 1635 switch(ch) { 1636 case '>': 1637 ch = readCh(); 1638 case -1: 1639 error("invalid.markup"); 1640 return; 1641 case '\n': 
1642 ln++; 1643 ch = readCh(); 1644 lfCount++; 1645 break; 1646 case '\r': 1647 ln++; 1648 if ((ch = readCh()) == '\n') { 1649 ch = readCh(); 1650 crlfCount++; 1651 } 1652 else { 1653 crCount++; 1654 } 1655 break; 1656 1657 default: 1658 ch = readCh(); 1659 break; 1660 } 1661 } 1662 } 1663 1664 case '/': 1665 // parse end tag [19] 317:4 1666 switch (ch = readCh()) { 1667 case '>': 1668 ch = readCh(); 1669 case '<': 1670 // empty end tag. either </> or </< 1671 if (recent == null) { 1672 error("invalid.shortend"); 1673 return; 1674 } 1675 elem = recent; 1676 break; 1677 1678 default: 1679 if (!parseIdentifier(true)) { 1680 error("expected.endtagname"); 1681 return; 1682 } 1683 skipSpace(); 1684 switch (ch) { 1685 case '>': 1686 ch = readCh(); 1687 case '<': 1688 break; 1689 1690 default: 1691 error("expected", "'>'"); 1692 while ((ch != -1) && (ch != '\n') && (ch != '>')) { 1693 ch = readCh(); 1694 } 1695 if (ch == '>') { 1696 ch = readCh(); 1697 } 1698 break; 1699 } 1700 String elemStr = getString(0); 1701 if (!dtd.elementExists(elemStr)) { 1702 error("end.unrecognized", elemStr); 1703 // Ignore RE before end tag 1704 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1705 textpos--; 1706 } 1707 elem = dtd.getElement("unknown"); 1708 elem.name = elemStr; 1709 unknown = true; 1710 } else { 1711 elem = dtd.getElement(elemStr); 1712 } 1713 break; 1714 } 1715 1716 1717 // If the stack is null, we're seeing end tags without any begin 1718 // tags. Ignore them. 1719 1720 if (stack == null) { 1721 error("end.extra.tag", elem.getName()); 1722 return; 1723 } 1724 1725 // Ignore RE before end tag 1726 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1727 // In a pre tag, if there are blank lines 1728 // we do not want to remove the newline 1729 // before the end tag. Hence this code. 
1730 // 1731 if (stack.pre) { 1732 if ((textpos > 1) && (text[textpos-2] != '\n')) { 1733 textpos--; 1734 } 1735 } else { 1736 textpos--; 1737 } 1738 } 1739 1740 // If the end tag is a form, since we did not put it 1741 // on the tag stack, there is no corresponding start 1742 // start tag to find. Hence do not touch the tag stack. 1743 // 1744 1745 /* 1746 if (!strict && elem.getName().equals("form")) { 1747 if (lastFormSent != null) { 1748 handleEndTag(lastFormSent); 1749 return; 1750 } else { 1751 // do nothing. 1752 return; 1753 } 1754 } 1755 */ 1756 1757 if (unknown) { 1758 // we will not see a corresponding start tag 1759 // on the the stack. If we are seeing an 1760 // end tag, lets send this on as an empty 1761 // tag with the end tag attribute set to 1762 // true. 1763 TagElement t = makeTag(elem); 1764 handleText(t); 1765 attributes.addAttribute(HTML.Attribute.ENDTAG, "true"); 1766 handleEmptyTag(makeTag(elem)); 1767 unknown = false; 1768 return; 1769 } 1770 1771 // find the corresponding start tag 1772 1773 // A commonly occuring error appears to be the insertion 1774 // of extra end tags in a table. The intent here is ignore 1775 // such extra end tags. 1776 // 1777 if (!strict) { 1778 String stackElem = stack.elem.getName(); 1779 1780 if (stackElem.equals("table")) { 1781 // If it isnt a valid end tag ignore it and return 1782 // 1783 if (!elem.getName().equals(stackElem)) { 1784 error("tag.ignore", elem.getName()); 1785 return; 1786 } 1787 } 1788 1789 1790 1791 if (stackElem.equals("tr") || 1792 stackElem.equals("td")) { 1793 if ((!elem.getName().equals("table")) && 1794 (!elem.getName().equals(stackElem))) { 1795 error("tag.ignore", elem.getName()); 1796 return; 1797 } 1798 } 1799 } 1800 TagStack sp = stack; 1801 1802 while ((sp != null) && (elem != sp.elem)) { 1803 sp = sp.next; 1804 } 1805 if (sp == null) { 1806 error("unmatched.endtag", elem.getName()); 1807 return; 1808 } 1809 1810 // People put font ending tags in the darndest places. 
1811 // Don't close other contexts based on them being between 1812 // a font tag and the corresponding end tag. Instead, 1813 // ignore the end tag like it doesn't exist and allow the end 1814 // of the document to close us out. 1815 String elemName = elem.getName(); 1816 if (stack != sp && 1817 (elemName.equals("font") || 1818 elemName.equals("center"))) { 1819 1820 // Since closing out a center tag can have real wierd 1821 // effects on the formatting, make sure that tags 1822 // for which omitting an end tag is legimitate 1823 // get closed out. 1824 // 1825 if (elemName.equals("center")) { 1826 while(stack.elem.omitEnd() && stack != sp) { 1827 endTag(true); 1828 } 1829 if (stack.elem == elem) { 1830 endTag(false); 1831 } 1832 } 1833 return; 1834 } 1835 // People do the same thing with center tags. In this 1836 // case we would like to close off the center tag but 1837 // not necessarily all enclosing tags. 1838 1839 1840 1841 // end tags 1842 while (stack != sp) { 1843 endTag(true); 1844 } 1845 1846 endTag(false); 1847 return; 1848 1849 case -1: 1850 error("eof"); 1851 return; 1852 } 1853 1854 // start tag [14] 314:1 1855 if (!parseIdentifier(true)) { 1856 elem = recent; 1857 if ((ch != '>') || (elem == null)) { 1858 error("expected.tagname"); 1859 return; 1860 } 1861 } else { 1862 String elemStr = getString(0); 1863 1864 if (elemStr.equals("image")) { 1865 elemStr = "img"; 1866 } 1867 1868 /* determine if this element is part of the dtd. 
*/ 1869 1870 if (!dtd.elementExists(elemStr)) { 1871 // parseInvalidTag(); 1872 error("tag.unrecognized ", elemStr); 1873 elem = dtd.getElement("unknown"); 1874 elem.name = elemStr; 1875 unknown = true; 1876 } else { 1877 elem = dtd.getElement(elemStr); 1878 } 1879 } 1880 1881 // Parse attributes 1882 parseAttributeSpecificationList(elem); 1883 1884 switch (ch) { 1885 case '/': 1886 net = true; 1887 case '>': 1888 ch = readCh(); 1889 if (ch == '>' && net) { 1890 ch = readCh(); 1891 } 1892 case '<': 1893 break; 1894 1895 default: 1896 error("expected", "'>'"); 1897 break; 1898 } 1899 1900 if (!strict) { 1901 if (elem.getName().equals("script")) { 1902 error("javascript.unsupported"); 1903 } 1904 } 1905 1906 // ignore RE after start tag 1907 // 1908 if (!elem.isEmpty()) { 1909 if (ch == '\n') { 1910 ln++; 1911 lfCount++; 1912 ch = readCh(); 1913 } else if (ch == '\r') { 1914 ln++; 1915 if ((ch = readCh()) == '\n') { 1916 ch = readCh(); 1917 crlfCount++; 1918 } 1919 else { 1920 crCount++; 1921 } 1922 } 1923 } 1924 1925 // ensure a legal context for the tag 1926 TagElement tag = makeTag(elem, false); 1927 1928 1929 /** In dealing with forms, we have decided to treat 1930 them as legal in any context. Also, even though 1931 they do have a start and an end tag, we will 1932 not put this tag on the stack. This is to deal 1933 several pages in the web oasis that choose to 1934 start and end forms in any possible location. **/ 1935 1936 /* 1937 if (!strict && elem.getName().equals("form")) { 1938 if (lastFormSent == null) { 1939 lastFormSent = tag; 1940 } else { 1941 handleEndTag(lastFormSent); 1942 lastFormSent = tag; 1943 } 1944 } else { 1945 */ 1946 // Smlly, if a tag is unknown, we will apply 1947 // no legalTagContext logic to it. 1948 // 1949 if (!unknown) { 1950 legalTagContext(tag); 1951 1952 // If skip tag is true, this implies that 1953 // the tag was illegal and that the error 1954 // recovery strategy adopted is to ignore 1955 // the tag. 
1956 if (!strict && skipTag) { 1957 skipTag = false; 1958 return; 1959 } 1960 } 1961 /* 1962 } 1963 */ 1964 1965 startTag(tag); 1966 1967 if (!elem.isEmpty()) { 1968 switch (elem.getType()) { 1969 case CDATA: 1970 parseLiteral(false); 1971 break; 1972 case RCDATA: 1973 parseLiteral(true); 1974 break; 1975 default: 1976 if (stack != null) { 1977 stack.net = net; 1978 } 1979 break; 1980 } 1981 } 1982 } 1983 1984 private static final String START_COMMENT = "<!--"; 1985 private static final String END_COMMENT = "-->"; 1986 private static final char[] SCRIPT_END_TAG = "</script>".toCharArray(); 1987 private static final char[] SCRIPT_END_TAG_UPPER_CASE = 1988 "</SCRIPT>".toCharArray(); 1989 1990 void parseScript() throws IOException { 1991 char[] charsToAdd = new char[SCRIPT_END_TAG.length]; 1992 1993 /* Here, ch should be the first character after <script> */ 1994 while (true) { 1995 int i = 0; 1996 while (i < SCRIPT_END_TAG.length 1997 && (SCRIPT_END_TAG[i] == ch 1998 || SCRIPT_END_TAG_UPPER_CASE[i] == ch)) { 1999 charsToAdd[i] = (char) ch; 2000 ch = readCh(); 2001 i++; 2002 } 2003 if (i == SCRIPT_END_TAG.length) { 2004 2005 /* '</script>' tag detected */ 2006 /* Here, ch == the first character after </script> */ 2007 return; 2008 } else { 2009 2010 /* To account for extra read()'s that happened */ 2011 for (int j = 0; j < i; j++) { 2012 addString(charsToAdd[j]); 2013 } 2014 2015 switch (ch) { 2016 case -1: 2017 error("eof.script"); 2018 return; 2019 case '\n': 2020 ln++; 2021 ch = readCh(); 2022 lfCount++; 2023 addString('\n'); 2024 break; 2025 case '\r': 2026 ln++; 2027 if ((ch = readCh()) == '\n') { 2028 ch = readCh(); 2029 crlfCount++; 2030 } else { 2031 crCount++; 2032 } 2033 addString('\n'); 2034 break; 2035 default: 2036 addString(ch); 2037 ch = readCh(); 2038 break; 2039 } // switch 2040 } 2041 } // while 2042 } 2043 2044 /** 2045 * Parse Content. 
[24] 320:1 2046 */ 2047 void parseContent() throws IOException { 2048 Thread curThread = Thread.currentThread(); 2049 2050 for (;;) { 2051 if (curThread.isInterrupted()) { 2052 curThread.interrupt(); // resignal the interrupt 2053 break; 2054 } 2055 2056 int c = ch; 2057 currentBlockStartPos = currentPosition; 2058 2059 if (recent == dtd.script) { // means: if after starting <script> tag 2060 2061 /* Here, ch has to be the first character after <script> */ 2062 parseScript(); 2063 last = makeTag(dtd.getElement("comment"), true); 2064 2065 /* Remove leading and trailing HTML comment declarations */ 2066 String str = new String(getChars(0)).trim(); 2067 int minLength = START_COMMENT.length() + END_COMMENT.length(); 2068 if (str.startsWith(START_COMMENT) && str.endsWith(END_COMMENT) 2069 && str.length() >= (minLength)) { 2070 str = str.substring(START_COMMENT.length(), 2071 str.length() - END_COMMENT.length()); 2072 } 2073 2074 /* Handle resulting chars as comment */ 2075 handleComment(str.toCharArray()); 2076 endTag(false); 2077 lastBlockStartPos = currentPosition; 2078 2079 continue; 2080 } else { 2081 switch (c) { 2082 case '<': 2083 parseTag(); 2084 lastBlockStartPos = currentPosition; 2085 continue; 2086 2087 case '/': 2088 ch = readCh(); 2089 if ((stack != null) && stack.net) { 2090 // null end tag. 
2091 endTag(false); 2092 continue; 2093 } 2094 break; 2095 2096 case -1: 2097 return; 2098 2099 case '&': 2100 if (textpos == 0) { 2101 if (!legalElementContext(dtd.pcdata)) { 2102 error("unexpected.pcdata"); 2103 } 2104 if (last.breaksFlow()) { 2105 space = false; 2106 } 2107 } 2108 char data[] = parseEntityReference(); 2109 if (textpos + data.length + 1 > text.length) { 2110 char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)]; 2111 System.arraycopy(text, 0, newtext, 0, text.length); 2112 text = newtext; 2113 } 2114 if (space) { 2115 space = false; 2116 text[textpos++] = ' '; 2117 } 2118 System.arraycopy(data, 0, text, textpos, data.length); 2119 textpos += data.length; 2120 ignoreSpace = false; 2121 continue; 2122 2123 case '\n': 2124 ln++; 2125 lfCount++; 2126 ch = readCh(); 2127 if ((stack != null) && stack.pre) { 2128 break; 2129 } 2130 if (textpos == 0) { 2131 lastBlockStartPos = currentPosition; 2132 } 2133 if (!ignoreSpace) { 2134 space = true; 2135 } 2136 continue; 2137 2138 case '\r': 2139 ln++; 2140 c = '\n'; 2141 if ((ch = readCh()) == '\n') { 2142 ch = readCh(); 2143 crlfCount++; 2144 } 2145 else { 2146 crCount++; 2147 } 2148 if ((stack != null) && stack.pre) { 2149 break; 2150 } 2151 if (textpos == 0) { 2152 lastBlockStartPos = currentPosition; 2153 } 2154 if (!ignoreSpace) { 2155 space = true; 2156 } 2157 continue; 2158 2159 2160 case '\t': 2161 case ' ': 2162 ch = readCh(); 2163 if ((stack != null) && stack.pre) { 2164 break; 2165 } 2166 if (textpos == 0) { 2167 lastBlockStartPos = currentPosition; 2168 } 2169 if (!ignoreSpace) { 2170 space = true; 2171 } 2172 continue; 2173 2174 default: 2175 if (textpos == 0) { 2176 if (!legalElementContext(dtd.pcdata)) { 2177 error("unexpected.pcdata"); 2178 } 2179 if (last.breaksFlow()) { 2180 space = false; 2181 } 2182 } 2183 ch = readCh(); 2184 break; 2185 } 2186 } 2187 2188 // enlarge buffer if needed 2189 if (textpos + 2 > text.length) { 2190 char newtext[] = new 
char[text.length + 128]; 2191 System.arraycopy(text, 0, newtext, 0, text.length); 2192 text = newtext; 2193 } 2194 2195 // output pending space 2196 if (space) { 2197 if (textpos == 0) { 2198 lastBlockStartPos--; 2199 } 2200 text[textpos++] = ' '; 2201 space = false; 2202 } 2203 text[textpos++] = (char)c; 2204 ignoreSpace = false; 2205 } 2206 } 2207 2208 /** 2209 * Returns the end of line string. This will return the end of line 2210 * string that has been encountered the most, one of \r, \n or \r\n. 2211 */ 2212 String getEndOfLineString() { 2213 if (crlfCount >= crCount) { 2214 if (lfCount >= crlfCount) { 2215 return "\n"; 2216 } 2217 else { 2218 return "\r\n"; 2219 } 2220 } 2221 else { 2222 if (crCount > lfCount) { 2223 return "\r"; 2224 } 2225 else { 2226 return "\n"; 2227 } 2228 } 2229 } 2230 2231 /** 2232 * Parse an HTML stream, given a DTD. 2233 */ 2234 public synchronized void parse(Reader in) throws IOException { 2235 this.in = in; 2236 2237 this.ln = 1; 2238 2239 seenHtml = false; 2240 seenHead = false; 2241 seenBody = false; 2242 2243 crCount = lfCount = crlfCount = 0; 2244 2245 try { 2246 ch = readCh(); 2247 text = new char[1024]; 2248 str = new char[128]; 2249 2250 parseContent(); 2251 // NOTE: interruption may have occurred. Control flows out 2252 // of here normally. 2253 while (stack != null) { 2254 endTag(true); 2255 } 2256 in.close(); 2257 } catch (IOException e) { 2258 errorContext(); 2259 error("ioexception"); 2260 throw e; 2261 } catch (Exception e) { 2262 errorContext(); 2263 error("exception", e.getClass().getName(), e.getMessage()); 2264 e.printStackTrace(); 2265 } catch (ThreadDeath e) { 2266 errorContext(); 2267 error("terminated"); 2268 e.printStackTrace(); 2269 throw e; 2270 } finally { 2271 for (; stack != null ; stack = stack.next) { 2272 handleEndTag(stack.tag); 2273 } 2274 2275 text = null; 2276 str = null; 2277 } 2278 2279 } 2280 2281 2282 /* 2283 * Input cache. 
This is much faster than calling down to a synchronized 2284 * method of BufferedReader for each byte. Measurements done 5/30/97 2285 * show that there's no point in having a bigger buffer: Increasing 2286 * the buffer to 8192 had no measurable impact for a program discarding 2287 * one character at a time (reading from an http URL to a local machine). 2288 * NOTE: If the current encoding is bogus, and we read too much 2289 * (past the content-type) we may suffer a MalformedInputException. For 2290 * this reason the initial size is 1 and when the body is encountered the 2291 * size is adjusted to 256. 2292 */ 2293 private char buf[] = new char[1]; 2294 private int pos; 2295 private int len; 2296 /* 2297 tracks position relative to the beginning of the 2298 document. 2299 */ 2300 private int currentPosition; 2301 2302 2303 private final int readCh() throws IOException { 2304 2305 if (pos >= len) { 2306 2307 // This loop allows us to ignore interrupts if the flag 2308 // says so 2309 for (;;) { 2310 try { 2311 len = in.read(buf); 2312 break; 2313 } catch (InterruptedIOException ex) { 2314 throw ex; 2315 } 2316 } 2317 2318 if (len <= 0) { 2319 return -1; // eof 2320 } 2321 pos = 0; 2322 } 2323 ++currentPosition; 2324 2325 return buf[pos++]; 2326 } 2327 2328 2329 protected int getCurrentPos() { 2330 return currentPosition; 2331 } 2332 }