1 /*
   2  * Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package javax.swing.text.html.parser;
  27 
  28 import javax.swing.text.SimpleAttributeSet;
  29 import javax.swing.text.html.HTML;
  30 import javax.swing.text.ChangedCharSetException;
  31 import java.io.*;
  32 import java.util.Hashtable;
  33 import java.util.Properties;
  34 import java.util.Vector;
  35 import java.util.Enumeration;
  36 import java.net.URL;
  37 
  38 import sun.misc.MessageUtils;
  39 
  40 /**
  41  * A simple DTD-driven HTML parser. The parser reads an
  42  * HTML file from an InputStream and calls various methods
  43  * (which should be overridden in a subclass) when tags and
  44  * data are encountered.
  45  * <p>
  46  * Unfortunately there are many badly implemented HTML parsers
  47  * out there, and as a result there are many badly formatted
  48  * HTML files. This parser attempts to parse most HTML files.
  49  * This means that the implementation sometimes deviates from
  50  * the SGML specification in favor of HTML.
  51  * <p>
  52  * The parser treats \r and \r\n as \n. Newlines after starttags
  53  * and before end tags are ignored just as specified in the SGML/HTML
  54  * specification.
  55  * <p>
  56  * The html spec does not specify how spaces are to be coalesced very well.
  57  * Specifically, the following scenarios are not discussed (note that a
  58  * space should be used here, but I am using &amp;nbsp to force the space to
  59  * be displayed):
  60  * <p>
  61  * '&lt;b&gt;blah&nbsp;&lt;i&gt;&nbsp;&lt;strike&gt;&nbsp;foo' which can be treated as:
  62  * '&lt;b&gt;blah&nbsp;&lt;i&gt;&lt;strike&gt;foo'
  63  * <p>as well as:
  64  * '&lt;p&gt;&lt;a href="xx"&gt;&nbsp;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
  65  * which appears to be treated as:
  66  * '&lt;p&gt;&lt;a href="xx"&gt;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
  67  * <p>
 * If <code>strict</code> is false, when a tag that breaks flow
 * (<code>TagElement.breaksFlow</code>) or trailing whitespace is
 * encountered, all whitespace will be ignored until a non-whitespace
 * character is encountered. This appears to give behavior closer to
 * the popular browsers.
  73  *
  74  * @see DTD
  75  * @see TagElement
  76  * @see SimpleAttributeSet
  77  * @author Arthur van Hoff
  78  * @author Sunita Mani
  79  */
  80 public
  81 class Parser implements DTDConstants {
  82 
  83     private char text[] = new char[1024];
  84     private int textpos = 0;
  85     private TagElement last;
  86     private boolean space;
  87 
  88     private char str[] = new char[128];
  89     private int strpos = 0;
  90 
  91     protected DTD dtd = null;
  92 
  93     private int ch;
  94     private int ln;
  95     private Reader in;
  96 
  97     private Element recent;
  98     private TagStack stack;
  99     private boolean skipTag = false;
 100     private TagElement lastFormSent = null;
 101     private SimpleAttributeSet attributes = new SimpleAttributeSet();
 102 
 103     // State for <html>, <head> and <body>.  Since people like to slap
 104     // together HTML documents without thinking, occasionally they
 105     // have multiple instances of these tags.  These booleans track
 106     // the first sightings of these tags so they can be safely ignored
 107     // by the parser if repeated.
 108     private boolean seenHtml = false;
 109     private boolean seenHead = false;
 110     private boolean seenBody = false;
 111 
 112     /**
 113      * The html spec does not specify how spaces are coalesced very well.
 114      * If strict == false, ignoreSpace is used to try and mimic the behavior
 115      * of the popular browsers.
 116      * <p>
 117      * The problematic scenarios are:
 118      * '&lt;b>blah &lt;i> &lt;strike> foo' which can be treated as:
 119      * '&lt;b>blah &lt;i>&lt;strike>foo'
 120      * as well as:
 121      * '&lt;p>&lt;a href="xx"> &lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
 122      * which appears to be treated as:
 123      * '&lt;p>&lt;a href="xx">&lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
 124      * <p>
 125      * When a tag that breaks flow, or trailing whitespace is encountered
 126      * ignoreSpace is set to true. From then on, all whitespace will be
 127      * ignored.
 128      * ignoreSpace will be set back to false the first time a
 129      * non whitespace character is encountered. This appears to give
 130      * behavior closer to the popular browsers.
 131      */
 132     private boolean ignoreSpace;
 133 
 134     /**
 135      * This flag determines whether or not the Parser will be strict
 136      * in enforcing SGML compatibility.  If false, it will be lenient
 137      * with certain common classes of erroneous HTML constructs.
 138      * Strict or not, in either case an error will be recorded.
 139      *
 140      */
 141     protected boolean strict = false;
 142 
 143 
 144     /** Number of \r\n's encountered. */
 145     private int crlfCount;
 146     /** Number of \r's encountered. A \r\n will not increment this. */
 147     private int crCount;
 148     /** Number of \n's encountered. A \r\n will not increment this. */
 149     private int lfCount;
 150 
 151     //
 152     // To correctly identify the start of a tag/comment/text we need two
 153     // ivars. Two are needed as handleText isn't invoked until the tag
 154     // after the text has been parsed, that is the parser parses the text,
 155     // then a tag, then invokes handleText followed by handleStart.
 156     //
 157     /** The start position of the current block. Block is overloaded here,
 158      * it really means the current start position for the current comment,
 159      * tag, text. Use getBlockStartPosition to access this. */
 160     private int currentBlockStartPos;
 161     /** Start position of the last block. */
 162     private int lastBlockStartPos;
 163 
 164     /**
 165      * array for mapping numeric references in range
 166      * 130-159 to displayable Unicode characters.
 167      */
 168     private static final char[] cp1252Map = {
 169         8218,  // ‚
 170         402,   // ƒ
 171         8222,  // „
 172         8230,  // …
 173         8224,  // †
 174         8225,  // ‡
 175         710,   // ˆ
 176         8240,  // ‰
 177         352,   // Š
 178         8249,  // ‹
 179         338,   // Œ
 180         141,   // 
 181         142,   // Ž
 182         143,   // 
 183         144,   // 
 184         8216,  // ‘
 185         8217,  // ’
 186         8220,  // “
 187         8221,  // ”
 188         8226,  // •
 189         8211,  // –
 190         8212,  // —
 191         732,   // ˜
 192         8482,  // ™
 193         353,   // š
 194         8250,  // ›
 195         339,   // œ
 196         157,   // 
 197         158,   // ž
 198         376    // Ÿ
 199     };
 200 
 201     public Parser(DTD dtd) {
 202         this.dtd = dtd;
 203     }
 204 
 205 
 206     /**
 207      * @return the line number of the line currently being parsed
 208      */
 209     protected int getCurrentLine() {
 210         return ln;
 211     }
 212 
 213     /**
 214      * Returns the start position of the current block. Block is
 215      * overloaded here, it really means the current start position for
 216      * the current comment tag, text, block.... This is provided for
 217      * subclassers that wish to know the start of the current block when
 218      * called with one of the handleXXX methods.
 219      */
 220     int getBlockStartPosition() {
 221         return Math.max(0, lastBlockStartPos - 1);
 222     }
 223 
 224     /**
 225      * Makes a TagElement.
 226      */
 227     protected TagElement makeTag(Element elem, boolean fictional) {
 228         return new TagElement(elem, fictional);
 229     }
 230 
 231     protected TagElement makeTag(Element elem) {
 232         return makeTag(elem, false);
 233     }
 234 
 235     protected SimpleAttributeSet getAttributes() {
 236         return attributes;
 237     }
 238 
 239     protected void flushAttributes() {
 240         attributes.removeAttributes(attributes);
 241     }
 242 
 243     /**
 244      * Called when PCDATA is encountered.
 245      */
 246     protected void handleText(char text[]) {
 247     }
 248 
 249     /**
 250      * Called when an HTML title tag is encountered.
 251      */
 252     protected void handleTitle(char text[]) {
 253         // default behavior is to call handleText. Subclasses
 254         // can override if necessary.
 255         handleText(text);
 256     }
 257 
 258     /**
 259      * Called when an HTML comment is encountered.
 260      */
 261     protected void handleComment(char text[]) {
 262     }
 263 
 264     protected void handleEOFInComment() {
 265         // We've reached EOF.  Our recovery strategy is to
 266         // see if we have more than one line in the comment;
 267         // if so, we pretend that the comment was an unterminated
 268         // single line comment, and reparse the lines after the
 269         // first line as normal HTML content.
 270 
 271         int commentEndPos = strIndexOf('\n');
 272         if (commentEndPos >= 0) {
 273             handleComment(getChars(0, commentEndPos));
 274             try {
 275                 in.close();
 276                 in = new CharArrayReader(getChars(commentEndPos + 1));
 277                 ch = '>';
 278             } catch (IOException e) {
 279                 error("ioexception");
 280             }
 281 
 282             resetStrBuffer();
 283         } else {
 284             // no newline, so signal an error
 285             error("eof.comment");
 286         }
 287     }
 288 
 289     /**
 290      * Called when an empty tag is encountered.
 291      */
 292     protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
 293     }
 294 
 295     /**
 296      * Called when a start tag is encountered.
 297      */
 298     protected void handleStartTag(TagElement tag) {
 299     }
 300 
 301     /**
 302      * Called when an end tag is encountered.
 303      */
 304     protected void handleEndTag(TagElement tag) {
 305     }
 306 
 307     /**
 308      * An error has occurred.
 309      */
 310     protected void handleError(int ln, String msg) {
 311         /*
 312         Thread.dumpStack();
 313         System.out.println("**** " + stack);
 314         System.out.println("line " + ln + ": error: " + msg);
 315         System.out.println();
 316         */
 317     }
 318 
 319     /**
 320      * Output text.
 321      */
 322     void handleText(TagElement tag) {
 323         if (tag.breaksFlow()) {
 324             space = false;
 325             if (!strict) {
 326                 ignoreSpace = true;
 327             }
 328         }
 329         if (textpos == 0) {
 330             if ((!space) || (stack == null) || last.breaksFlow() ||
 331                 !stack.advance(dtd.pcdata)) {
 332                 last = tag;
 333                 space = false;
 334                 lastBlockStartPos = currentBlockStartPos;
 335                 return;
 336             }
 337         }
 338         if (space) {
 339             if (!ignoreSpace) {
 340                 // enlarge buffer if needed
 341                 if (textpos + 1 > text.length) {
 342                     char newtext[] = new char[text.length + 200];
 343                     System.arraycopy(text, 0, newtext, 0, text.length);
 344                     text = newtext;
 345                 }
 346 
 347                 // output pending space
 348                 text[textpos++] = ' ';
 349                 if (!strict && !tag.getElement().isEmpty()) {
 350                     ignoreSpace = true;
 351                 }
 352             }
 353             space = false;
 354         }
 355         char newtext[] = new char[textpos];
 356         System.arraycopy(text, 0, newtext, 0, textpos);
 357         // Handles cases of bad html where the title tag
 358         // was getting lost when we did error recovery.
 359         if (tag.getElement().getName().equals("title")) {
 360             handleTitle(newtext);
 361         } else {
 362             handleText(newtext);
 363         }
 364         lastBlockStartPos = currentBlockStartPos;
 365         textpos = 0;
 366         last = tag;
 367         space = false;
 368     }
 369 
 370     /**
 371      * Invoke the error handler.
 372      */
 373     protected void error(String err, String arg1, String arg2,
 374         String arg3) {
 375         handleError(ln, err + " " + arg1 + " " + arg2 + " " + arg3);
 376     }
 377 
 378     protected void error(String err, String arg1, String arg2) {
 379         error(err, arg1, arg2, "?");
 380     }
 381     protected void error(String err, String arg1) {
 382         error(err, arg1, "?", "?");
 383     }
 384     protected void error(String err) {
 385         error(err, "?", "?", "?");
 386     }
 387 
 388 
 389     /**
 390      * Handle a start tag. The new tag is pushed
 391      * onto the tag stack. The attribute list is
 392      * checked for required attributes.
 393      */
 394     protected void startTag(TagElement tag) throws ChangedCharSetException {
 395         Element elem = tag.getElement();
 396 
 397         // If the tag is an empty tag and texpos != 0
 398         // this implies that there is text before the
 399         // start tag that needs to be processed before
 400         // handling the tag.
 401         //
 402         if (!elem.isEmpty() ||
 403                     ((last != null) && !last.breaksFlow()) ||
 404                     (textpos != 0)) {
 405             handleText(tag);
 406         } else {
 407             // this variable gets updated in handleText().
 408             // Since in this case we do not call handleText()
 409             // we need to update it here.
 410             //
 411             last = tag;
 412             // Note that we should really check last.breakFlows before
 413             // assuming this should be false.
 414             space = false;
 415         }
 416         lastBlockStartPos = currentBlockStartPos;
 417 
 418         // check required attributes
 419         for (AttributeList a = elem.atts ; a != null ; a = a.next) {
 420             if ((a.modifier == REQUIRED) &&
 421                 ((attributes.isEmpty()) ||
 422                  ((!attributes.isDefined(a.name)) &&
 423                   (!attributes.isDefined(HTML.getAttributeKey(a.name)))))) {
 424                 error("req.att ", a.getName(), elem.getName());
 425             }
 426         }
 427 
 428         if (elem.isEmpty()) {
 429             handleEmptyTag(tag);
 430             /*
 431         } else if (elem.getName().equals("form")) {
 432             handleStartTag(tag);
 433             */
 434         } else {
 435             recent = elem;
 436             stack = new TagStack(tag, stack);
 437             handleStartTag(tag);
 438         }
 439     }
 440 
 441     /**
 442      * Handle an end tag. The end tag is popped
 443      * from the tag stack.
 444      */
 445     protected void endTag(boolean omitted) {
 446         handleText(stack.tag);
 447 
 448         if (omitted && !stack.elem.omitEnd()) {
 449             error("end.missing", stack.elem.getName());
 450         } else if (!stack.terminate()) {
 451             error("end.unexpected", stack.elem.getName());
 452         }
 453 
 454         // handle the tag
 455         handleEndTag(stack.tag);
 456         stack = stack.next;
 457         recent = (stack != null) ? stack.elem : null;
 458     }
 459 
 460 
 461     boolean ignoreElement(Element elem) {
 462 
 463         String stackElement = stack.elem.getName();
 464         String elemName = elem.getName();
 465         /* We ignore all elements that are not valid in the context of
 466            a table except <td>, <th> (these we handle in
 467            legalElementContext()) and #pcdata.  We also ignore the
 468            <font> tag in the context of <ul> and <ol> We additonally
 469            ignore the <meta> and the <style> tag if the body tag has
 470            been seen. **/
 471         if ((elemName.equals("html") && seenHtml) ||
 472             (elemName.equals("head") && seenHead) ||
 473             (elemName.equals("body") && seenBody)) {
 474             return true;
 475         }
 476         if (elemName.equals("dt") || elemName.equals("dd")) {
 477             TagStack s = stack;
 478             while (s != null && !s.elem.getName().equals("dl")) {
 479                 s = s.next;
 480             }
 481             if (s == null) {
 482                 return true;
 483             }
 484         }
 485 
 486         if (((stackElement.equals("table")) &&
 487              (!elemName.equals("#pcdata")) && (!elemName.equals("input"))) ||
 488             ((elemName.equals("font")) &&
 489              (stackElement.equals("ul") || stackElement.equals("ol"))) ||
 490             (elemName.equals("meta") && stack != null) ||
 491             (elemName.equals("style") && seenBody) ||
 492             (stackElement.equals("table") && elemName.equals("a"))) {
 493             return true;
 494         }
 495         return false;
 496     }
 497 
 498 
 499     /**
 500      * Marks the first time a tag has been seen in a document
 501      */
 502 
 503     protected void markFirstTime(Element elem) {
 504         String elemName = elem.getName();
 505         if (elemName.equals("html")) {
 506             seenHtml = true;
 507         } else if (elemName.equals("head")) {
 508             seenHead = true;
 509         } else if (elemName.equals("body")) {
 510             if (buf.length == 1) {
 511                 // Refer to note in definition of buf for details on this.
 512                 char[] newBuf = new char[256];
 513 
 514                 newBuf[0] = buf[0];
 515                 buf = newBuf;
 516             }
 517             seenBody = true;
 518         }
 519     }
 520 
 521     /**
 522      * Create a legal content for an element.
 523      */
 524     boolean legalElementContext(Element elem) throws ChangedCharSetException {
 525 
 526         // System.out.println("-- legalContext -- " + elem);
 527 
 528         // Deal with the empty stack
 529         if (stack == null) {
 530             // System.out.println("-- stack is empty");
 531             if (elem != dtd.html) {
 532                 // System.out.println("-- pushing html");
 533                 startTag(makeTag(dtd.html, true));
 534                 return legalElementContext(elem);
 535             }
 536             return true;
 537         }
 538 
 539         // Is it allowed in the current context
 540         if (stack.advance(elem)) {
 541             // System.out.println("-- legal context");
 542             markFirstTime(elem);
 543             return true;
 544         }
 545         boolean insertTag = false;
 546 
 547         // The use of all error recovery strategies are contingent
 548         // on the value of the strict property.
 549         //
 550         // These are commonly occurring errors.  if insertTag is true,
 551         // then we want to adopt an error recovery strategy that
 552         // involves attempting to insert an additional tag to
 553         // legalize the context.  The two errors addressed here
 554         // are:
 555         // 1) when a <td> or <th> is seen soon after a <table> tag.
 556         //    In this case we insert a <tr>.
 557         // 2) when any other tag apart from a <tr> is seen
 558         //    in the context of a <tr>.  In this case we would
 559         //    like to add a <td>.  If a <tr> is seen within a
 560         //    <tr> context, then we will close out the current
 561         //    <tr>.
 562         //
 563         // This insertion strategy is handled later in the method.
 564         // The reason for checking this now, is that in other cases
 565         // we would like to apply other error recovery strategies for example
 566         // ignoring tags.
 567         //
 568         // In certain cases it is better to ignore a tag than try to
 569         // fix the situation.  So the first test is to see if this
 570         // is what we need to do.
 571         //
 572         String stackElemName = stack.elem.getName();
 573         String elemName = elem.getName();
 574 
 575 
 576         if (!strict &&
 577             ((stackElemName.equals("table") && elemName.equals("td")) ||
 578              (stackElemName.equals("table") && elemName.equals("th")) ||
 579              (stackElemName.equals("tr") && !elemName.equals("tr")))){
 580              insertTag = true;
 581         }
 582 
 583 
 584         if (!strict && !insertTag && (stack.elem.getName() != elem.getName() ||
 585                                       elem.getName().equals("body"))) {
 586             if (skipTag = ignoreElement(elem)) {
 587                 error("tag.ignore", elem.getName());
 588                 return skipTag;
 589             }
 590         }
 591 
 592         // Check for anything after the start of the table besides tr, td, th
 593         // or caption, and if those aren't there, insert the <tr> and call
 594         // legalElementContext again.
 595         if (!strict && stackElemName.equals("table") &&
 596             !elemName.equals("tr") && !elemName.equals("td") &&
 597             !elemName.equals("th") && !elemName.equals("caption")) {
 598             Element e = dtd.getElement("tr");
 599             TagElement t = makeTag(e, true);
 600             legalTagContext(t);
 601             startTag(t);
 602             error("start.missing", elem.getName());
 603             return legalElementContext(elem);
 604         }
 605 
 606         // They try to find a legal context by checking if the current
 607         // tag is valid in an enclosing context.  If so
 608         // close out the tags by outputing end tags and then
 609         // insert the current tag.  If the tags that are
 610         // being closed out do not have an optional end tag
 611         // specification in the DTD then an html error is
 612         // reported.
 613         //
 614         if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) {
 615             for (TagStack s = stack.next ; s != null ; s = s.next) {
 616                 if (s.advance(elem)) {
 617                     while (stack != s) {
 618                         endTag(true);
 619                     }
 620                     return true;
 621                 }
 622                 if (!s.terminate() || (strict && !s.elem.omitEnd())) {
 623                     break;
 624                 }
 625             }
 626         }
 627 
 628         // Check if we know what tag is expected next.
 629         // If so insert the tag.  Report an error if the
 630         // tag does not have its start tag spec in the DTD as optional.
 631         //
 632         Element next = stack.first();
 633         if (next != null && (!strict || next.omitStart()) &&
 634            !(next==dtd.head && elem==dtd.pcdata) ) {
 635             // System.out.println("-- omitting start tag: " + next);
 636             TagElement t = makeTag(next, true);
 637             legalTagContext(t);
 638             startTag(t);
 639             if (!next.omitStart()) {
 640                 error("start.missing", elem.getName());
 641             }
 642             return legalElementContext(elem);
 643         }
 644 
 645 
 646         // Traverse the list of expected elements and determine if adding
 647         // any of these elements would make for a legal context.
 648         //
 649 
 650         if (!strict) {
 651             ContentModel content = stack.contentModel();
 652             Vector<Element> elemVec = new Vector<Element>();
 653             if (content != null) {
 654                 content.getElements(elemVec);
 655                 for (Element e : elemVec) {
 656                     // Ensure that this element has not been included as
 657                     // part of the exclusions in the DTD.
 658                     //
 659                     if (stack.excluded(e.getIndex())) {
 660                         continue;
 661                     }
 662 
 663                     boolean reqAtts = false;
 664 
 665                     for (AttributeList a = e.getAttributes(); a != null ; a = a.next) {
 666                         if (a.modifier == REQUIRED) {
 667                             reqAtts = true;
 668                             break;
 669                         }
 670                     }
 671                     // Ensure that no tag that has required attributes
 672                     // gets inserted.
 673                     //
 674                     if (reqAtts) {
 675                         continue;
 676                     }
 677 
 678                     ContentModel m = e.getContent();
 679                     if (m != null && m.first(elem)) {
 680                         // System.out.println("-- adding a legal tag: " + e);
 681                         TagElement t = makeTag(e, true);
 682                         legalTagContext(t);
 683                         startTag(t);
 684                         error("start.missing", e.getName());
 685                         return legalElementContext(elem);
 686                     }
 687                 }
 688             }
 689         }
 690 
 691         // Check if the stack can be terminated.  If so add the appropriate
 692         // end tag.  Report an error if the tag being ended does not have its
 693         // end tag spec in the DTD as optional.
 694         //
 695         if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) {
 696             // System.out.println("-- omitting end tag: " + stack.elem);
 697             if (!stack.elem.omitEnd()) {
 698                 error("end.missing", elem.getName());
 699             }
 700 
 701             endTag(true);
 702             return legalElementContext(elem);
 703         }
 704 
 705         // At this point we know that something is screwed up.
 706         return false;
 707     }
 708 
 709     /**
 710      * Create a legal context for a tag.
 711      */
 712     void legalTagContext(TagElement tag) throws ChangedCharSetException {
 713         if (legalElementContext(tag.getElement())) {
 714             markFirstTime(tag.getElement());
 715             return;
 716         }
 717 
 718         // Avoid putting a block tag in a flow tag.
 719         if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) {
 720             endTag(true);
 721             legalTagContext(tag);
 722             return;
 723         }
 724 
 725         // Avoid putting something wierd in the head of the document.
 726         for (TagStack s = stack ; s != null ; s = s.next) {
 727             if (s.tag.getElement() == dtd.head) {
 728                 while (stack != s) {
 729                     endTag(true);
 730                 }
 731                 endTag(true);
 732                 legalTagContext(tag);
 733                 return;
 734             }
 735         }
 736 
 737         // Everything failed
 738         error("tag.unexpected", tag.getElement().getName());
 739     }
 740 
 741     /**
 742      * Error context. Something went wrong, make sure we are in
 743      * the document's body context
 744      */
 745     void errorContext() throws ChangedCharSetException {
 746         for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) {
 747             handleEndTag(stack.tag);
 748         }
 749         if (stack == null) {
 750             legalElementContext(dtd.body);
 751             startTag(makeTag(dtd.body, true));
 752         }
 753     }
 754 
 755     /**
 756      * Add a char to the string buffer.
 757      */
 758     void addString(int c) {
 759         if (strpos  == str.length) {
 760             char newstr[] = new char[str.length + 128];
 761             System.arraycopy(str, 0, newstr, 0, str.length);
 762             str = newstr;
 763         }
 764         str[strpos++] = (char)c;
 765     }
 766 
 767     /**
 768      * Get the string that's been accumulated.
 769      */
 770     String getString(int pos) {
 771         char newStr[] = new char[strpos - pos];
 772         System.arraycopy(str, pos, newStr, 0, strpos - pos);
 773         strpos = pos;
 774         return new String(newStr);
 775     }
 776 
 777     char[] getChars(int pos) {
 778         char newStr[] = new char[strpos - pos];
 779         System.arraycopy(str, pos, newStr, 0, strpos - pos);
 780         strpos = pos;
 781         return newStr;
 782     }
 783 
 784     char[] getChars(int pos, int endPos) {
 785         char newStr[] = new char[endPos - pos];
 786         System.arraycopy(str, pos, newStr, 0, endPos - pos);
 787         // REMIND: it's not clear whether this version should set strpos or not
 788         // strpos = pos;
 789         return newStr;
 790     }
 791 
 792     void resetStrBuffer() {
 793         strpos = 0;
 794     }
 795 
 796     int strIndexOf(char target) {
 797         for (int i = 0; i < strpos; i++) {
 798             if (str[i] == target) {
 799                 return i;
 800             }
 801         }
 802 
 803         return -1;
 804     }
 805 
 806     /**
 807      * Skip space.
 808      * [5] 297:5
 809      */
 810     void skipSpace() throws IOException {
 811         while (true) {
 812             switch (ch) {
 813               case '\n':
 814                 ln++;
 815                 ch = readCh();
 816                 lfCount++;
 817                 break;
 818 
 819               case '\r':
 820                 ln++;
 821                 if ((ch = readCh()) == '\n') {
 822                     ch = readCh();
 823                     crlfCount++;
 824                 }
 825                 else {
 826                     crCount++;
 827                 }
 828                 break;
 829               case ' ':
 830               case '\t':
 831                 ch = readCh();
 832                 break;
 833 
 834               default:
 835                 return;
 836             }
 837         }
 838     }
 839 
    /**
     * Parse identifier. Uppercase characters are folded
     * to lowercase when lower is true. Returns false if
     * no identifier is found. [55] 346:17
     */
 845     boolean parseIdentifier(boolean lower) throws IOException {
 846         switch (ch) {
 847           case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 848           case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 849           case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 850           case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 851           case 'Y': case 'Z':
 852             if (lower) {
 853                 ch = 'a' + (ch - 'A');
 854             }
 855             break;
 856 
 857           case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 858           case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 859           case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 860           case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 861           case 'y': case 'z':
 862             break;
 863 
 864           default:
 865             return false;
 866         }
 867 
 868         while (true) {
 869             addString(ch);
 870 
 871             switch (ch = readCh()) {
 872               case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 873               case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 874               case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 875               case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 876               case 'Y': case 'Z':
 877                 if (lower) {
 878                     ch = 'a' + (ch - 'A');
 879                 }
 880                 break;
 881 
 882               case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 883               case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 884               case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 885               case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 886               case 'y': case 'z':
 887 
 888               case '0': case '1': case '2': case '3': case '4':
 889               case '5': case '6': case '7': case '8': case '9':
 890 
 891               case '.': case '-':
 892 
 893               case '_': // not officially allowed
 894                 break;
 895 
 896               default:
 897                 return true;
 898             }
 899         }
 900     }
 901 
 902     /**
 903      * Parse an entity reference. [59] 350:17
 904      */
 905     private char[] parseEntityReference() throws IOException {
 906         int pos = strpos;
 907 
 908         if ((ch = readCh()) == '#') {
 909             int n = 0;
 910             ch = readCh();
 911             if ((ch >= '0') && (ch <= '9') ||
 912                     ch == 'x' || ch == 'X') {
 913 
 914                 if ((ch >= '0') && (ch <= '9')) {
 915                     // parse decimal reference
 916                     while ((ch >= '0') && (ch <= '9')) {
 917                         n = (n * 10) + ch - '0';
 918                         ch = readCh();
 919                     }
 920                 } else {
 921                     // parse hexadecimal reference
 922                     ch = readCh();
 923                     char lch = (char) Character.toLowerCase(ch);
 924                     while ((lch >= '0') && (lch <= '9') ||
 925                             (lch >= 'a') && (lch <= 'f')) {
 926                         if (lch >= '0' && lch <= '9') {
 927                             n = (n * 16) + lch - '0';
 928                         } else {
 929                             n = (n * 16) + lch - 'a' + 10;
 930                         }
 931                         ch = readCh();
 932                         lch = (char) Character.toLowerCase(ch);
 933                     }
 934                 }
 935                 switch (ch) {
 936                     case '\n':
 937                         ln++;
 938                         ch = readCh();
 939                         lfCount++;
 940                         break;
 941 
 942                     case '\r':
 943                         ln++;
 944                         if ((ch = readCh()) == '\n') {
 945                             ch = readCh();
 946                             crlfCount++;
 947                         }
 948                         else {
 949                             crCount++;
 950                         }
 951                         break;
 952 
 953                     case ';':
 954                         ch = readCh();
 955                         break;
 956                 }
 957                 char data[] = mapNumericReference(n);
 958                 return data;
 959             }
 960             addString('#');
 961             if (!parseIdentifier(false)) {
 962                 error("ident.expected");
 963                 strpos = pos;
 964                 char data[] = {'&', '#'};
 965                 return data;
 966             }
 967         } else if (!parseIdentifier(false)) {
 968             char data[] = {'&'};
 969             return data;
 970         }
 971 
 972         boolean semicolon = false;
 973 
 974         switch (ch) {
 975           case '\n':
 976             ln++;
 977             ch = readCh();
 978             lfCount++;
 979             break;
 980 
 981           case '\r':
 982             ln++;
 983             if ((ch = readCh()) == '\n') {
 984                 ch = readCh();
 985                 crlfCount++;
 986             }
 987             else {
 988                 crCount++;
 989             }
 990             break;
 991 
 992           case ';':
 993             semicolon = true;
 994 
 995             ch = readCh();
 996             break;
 997         }
 998 
 999         String nm = getString(pos);
1000         Entity ent = dtd.getEntity(nm);
1001 
1002         // entities are case sensitive - however if strict
1003         // is false then we will try to make a match by
1004         // converting the string to all lowercase.
1005         //
1006         if (!strict && (ent == null)) {
1007             ent = dtd.getEntity(nm.toLowerCase());
1008         }
1009         if ((ent == null) || !ent.isGeneral()) {
1010 
1011             if (nm.length() == 0) {
1012                 error("invalid.entref", nm);
1013                 return new char[0];
1014             }
1015             /* given that there is not a match restore the entity reference */
1016             String str = "&" + nm + (semicolon ? ";" : "");
1017 
1018             char b[] = new char[str.length()];
1019             str.getChars(0, b.length, b, 0);
1020             return b;
1021         }
1022         return ent.getData();
1023     }
1024 
1025     /**
1026      * Converts numeric character reference to char array.
1027      *
1028      * Normally the code in a reference should be always converted
1029      * to the Unicode character with the same code, but due to
1030      * wide usage of Cp1252 charset most browsers map numeric references
1031      * in the range 130-159 (which are control chars in Unicode set)
1032      * to displayable characters with other codes.
1033      *
1034      * @param c the code of numeric character reference.
1035      * @return a char array corresponding to the reference code.
1036      */
1037     private char[] mapNumericReference(int c) {
1038         char[] data;
1039         if (c >= 0xffff) { // outside unicode BMP.
1040             try {
1041                 data = Character.toChars(c);
1042             } catch (IllegalArgumentException e) {
1043                 data = new char[0];
1044             }
1045         } else {
1046             data = new char[1];
1047             data[0] = (c < 130 || c > 159) ? (char) c : cp1252Map[c - 130];
1048         }
1049         return data;
1050     }
1051 
1052     /**
1053      * Parse a comment. [92] 391:7
1054      */
1055     void parseComment() throws IOException {
1056 
1057         while (true) {
1058             int c = ch;
1059             switch (c) {
1060               case '-':
1061                   /** Presuming that the start string of a comment "<!--" has
1062                       already been parsed, the '-' character is valid only as
1063                       part of a comment termination and further more it must
1064                       be present in even numbers. Hence if strict is true, we
1065                       presume the comment has been terminated and return.
1066                       However if strict is false, then there is no even number
1067                       requirement and this character can appear anywhere in the
1068                       comment.  The parser reads on until it sees the following
1069                       pattern: "-->" or "--!>".
1070                    **/
1071                 if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) {
1072                     if ((ch = readCh()) == '>') {
1073                         return;
1074                     }
1075                     if (ch == '!') {
1076                         if ((ch = readCh()) == '>') {
1077                             return;
1078                         } else {
1079                             /* to account for extra read()'s that happened */
1080                             addString('-');
1081                             addString('!');
1082                             continue;
1083                         }
1084                     }
1085                     break;
1086                 }
1087 
1088                 if ((ch = readCh()) == '-') {
1089                     ch = readCh();
1090                     if (strict || ch == '>') {
1091                         return;
1092                     }
1093                     if (ch == '!') {
1094                         if ((ch = readCh()) == '>') {
1095                             return;
1096                         } else {
1097                             /* to account for extra read()'s that happened */
1098                             addString('-');
1099                             addString('!');
1100                             continue;
1101                         }
1102                     }
1103                     /* to account for the extra read() */
1104                     addString('-');
1105                 }
1106                 break;
1107 
1108               case -1:
1109                   handleEOFInComment();
1110                   return;
1111 
1112               case '\n':
1113                 ln++;
1114                 ch = readCh();
1115                 lfCount++;
1116                 break;
1117 
1118               case '>':
1119                 ch = readCh();
1120                 break;
1121 
1122               case '\r':
1123                 ln++;
1124                 if ((ch = readCh()) == '\n') {
1125                     ch = readCh();
1126                     crlfCount++;
1127                 }
1128                 else {
1129                     crCount++;
1130                 }
1131                 c = '\n';
1132                 break;
1133               default:
1134                 ch = readCh();
1135                 break;
1136             }
1137 
1138             addString(c);
1139         }
1140     }
1141 
1142     /**
1143      * Parse literal content. [46] 343:1 and [47] 344:1
1144      */
1145     void parseLiteral(boolean replace) throws IOException {
1146         while (true) {
1147             int c = ch;
1148             switch (c) {
1149               case -1:
1150                 error("eof.literal", stack.elem.getName());
1151                 endTag(true);
1152                 return;
1153 
1154               case '>':
1155                 ch = readCh();
1156                 int i = textpos - (stack.elem.name.length() + 2), j = 0;
1157 
1158                 // match end tag
1159                 if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) {
1160                     while ((++i < textpos) &&
1161                            (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++)));
1162                     if (i == textpos) {
1163                         textpos -= (stack.elem.name.length() + 2);
1164                         if ((textpos > 0) && (text[textpos-1] == '\n')) {
1165                             textpos--;
1166                         }
1167                         endTag(false);
1168                         return;
1169                     }
1170                 }
1171                 break;
1172 
1173               case '&':
1174                 char data[] = parseEntityReference();
1175                 if (textpos + data.length > text.length) {
1176                     char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
1177                     System.arraycopy(text, 0, newtext, 0, text.length);
1178                     text = newtext;
1179                 }
1180                 System.arraycopy(data, 0, text, textpos, data.length);
1181                 textpos += data.length;
1182                 continue;
1183 
1184               case '\n':
1185                 ln++;
1186                 ch = readCh();
1187                 lfCount++;
1188                 break;
1189 
1190               case '\r':
1191                 ln++;
1192                 if ((ch = readCh()) == '\n') {
1193                     ch = readCh();
1194                     crlfCount++;
1195                 }
1196                 else {
1197                     crCount++;
1198                 }
1199                 c = '\n';
1200                 break;
1201               default:
1202                 ch = readCh();
1203                 break;
1204             }
1205 
1206             // output character
1207             if (textpos == text.length) {
1208                 char newtext[] = new char[text.length + 128];
1209                 System.arraycopy(text, 0, newtext, 0, text.length);
1210                 text = newtext;
1211             }
1212             text[textpos++] = (char)c;
1213         }
1214     }
1215 
1216     /**
1217      * Parse attribute value. [33] 331:1
1218      */
1219     @SuppressWarnings("fallthrough")
1220     String parseAttributeValue(boolean lower) throws IOException {
1221         int delim = -1;
1222 
1223         // Check for a delimiter
1224         switch(ch) {
1225           case '\'':
1226           case '"':
1227             delim = ch;
1228             ch = readCh();
1229             break;
1230         }
1231 
1232         // Parse the rest of the value
1233         while (true) {
1234             int c = ch;
1235 
1236             switch (c) {
1237               case '\n':
1238                 ln++;
1239                 ch = readCh();
1240                 lfCount++;
1241                 if (delim < 0) {
1242                     return getString(0);
1243                 }
1244                 break;
1245 
1246               case '\r':
1247                 ln++;
1248 
1249                 if ((ch = readCh()) == '\n') {
1250                     ch = readCh();
1251                     crlfCount++;
1252                 }
1253                 else {
1254                     crCount++;
1255                 }
1256                 if (delim < 0) {
1257                     return getString(0);
1258                 }
1259                 break;
1260 
1261               case '\t':
1262                   if (delim < 0)
1263                       c = ' ';
1264                   // Fall through
1265               case ' ':
1266                 ch = readCh();
1267                 if (delim < 0) {
1268                     return getString(0);
1269                 }
1270                 break;
1271 
1272               case '>':
1273               case '<':
1274                 if (delim < 0) {
1275                     return getString(0);
1276                 }
1277                 ch = readCh();
1278                 break;
1279 
1280               case '\'':
1281               case '"':
1282                 ch = readCh();
1283                 if (c == delim) {
1284                     return getString(0);
1285                 } else if (delim == -1) {
1286                     error("attvalerr");
1287                     if (strict || ch == ' ') {
1288                         return getString(0);
1289                     } else {
1290                         continue;
1291                     }
1292                 }
1293                 break;
1294 
1295             case '=':
1296                 if (delim < 0) {
1297                     /* In SGML a construct like <img src=/cgi-bin/foo?x=1>
1298                        is considered invalid since an = sign can only be contained
1299                        in an attributes value if the string is quoted.
1300                        */
1301                     error("attvalerr");
1302                     /* If strict is true then we return with the string we have thus far.
1303                        Otherwise we accept the = sign as part of the attribute's value and
1304                        process the rest of the img tag. */
1305                     if (strict) {
1306                         return getString(0);
1307                     }
1308                 }
1309                 ch = readCh();
1310                 break;
1311 
1312               case '&':
1313                 if (strict && delim < 0) {
1314                     ch = readCh();
1315                     break;
1316                 }
1317 
1318                 char data[] = parseEntityReference();
1319                 for (int i = 0 ; i < data.length ; i++) {
1320                     c = data[i];
1321                     addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c);
1322                 }
1323                 continue;
1324 
1325               case -1:
1326                 return getString(0);
1327 
1328               default:
1329                 if (lower && (c >= 'A') && (c <= 'Z')) {
1330                     c = 'a' + c - 'A';
1331                 }
1332                 ch = readCh();
1333                 break;
1334             }
1335             addString(c);
1336         }
1337     }
1338 
1339 
1340     /**
1341      * Parse attribute specification List. [31] 327:17
1342      */
1343     void parseAttributeSpecificationList(Element elem) throws IOException {
1344 
1345         while (true) {
1346             skipSpace();
1347 
1348             switch (ch) {
1349               case '/':
1350               case '>':
1351               case '<':
1352               case -1:
1353                 return;
1354 
1355               case '-':
1356                 if ((ch = readCh()) == '-') {
1357                     ch = readCh();
1358                     parseComment();
1359                     strpos = 0;
1360                 } else {
1361                     error("invalid.tagchar", "-", elem.getName());
1362                     ch = readCh();
1363                 }
1364                 continue;
1365             }
1366 
1367             AttributeList att;
1368             String attname;
1369             String attvalue;
1370 
1371             if (parseIdentifier(true)) {
1372                 attname = getString(0);
1373                 skipSpace();
1374                 if (ch == '=') {
1375                     ch = readCh();
1376                     skipSpace();
1377                     att = elem.getAttribute(attname);
1378 //  Bug ID 4102750
1379 //  Load the NAME of an Attribute Case Sensitive
1380 //  The case of the NAME  must be intact
1381 //  MG 021898
1382                     attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME));
1383 //                  attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION));
1384                 } else {
1385                     attvalue = attname;
1386                     att = elem.getAttributeByValue(attvalue);
1387                     if (att == null) {
1388                         att = elem.getAttribute(attname);
1389                         if (att != null) {
1390                             attvalue = att.getValue();
1391                         }
1392                         else {
1393                             // Make it null so that NULL_ATTRIBUTE_VALUE is
1394                             // used
1395                             attvalue = null;
1396                         }
1397                     }
1398                 }
1399             } else if (!strict && ch == ',') { // allows for comma separated attribute-value pairs
1400                 ch = readCh();
1401                 continue;
1402             } else if (!strict && ch == '"') { // allows for quoted attributes
1403                 ch = readCh();
1404                 skipSpace();
1405                 if (parseIdentifier(true)) {
1406                     attname = getString(0);
1407                     if (ch == '"') {
1408                         ch = readCh();
1409                     }
1410                     skipSpace();
1411                     if (ch == '=') {
1412                         ch = readCh();
1413                         skipSpace();
1414                         att = elem.getAttribute(attname);
1415                         attvalue = parseAttributeValue((att != null) &&
1416                                                 (att.type != CDATA) &&
1417                                                 (att.type != NOTATION));
1418                     } else {
1419                         attvalue = attname;
1420                         att = elem.getAttributeByValue(attvalue);
1421                         if (att == null) {
1422                             att = elem.getAttribute(attname);
1423                             if (att != null) {
1424                                 attvalue = att.getValue();
1425                             }
1426                         }
1427                     }
1428                 } else {
1429                     char str[] = {(char)ch};
1430                     error("invalid.tagchar", new String(str), elem.getName());
1431                     ch = readCh();
1432                     continue;
1433                 }
1434             } else if (!strict && (attributes.isEmpty()) && (ch == '=')) {
1435                 ch = readCh();
1436                 skipSpace();
1437                 attname = elem.getName();
1438                 att = elem.getAttribute(attname);
1439                 attvalue = parseAttributeValue((att != null) &&
1440                                                (att.type != CDATA) &&
1441                                                (att.type != NOTATION));
1442             } else if (!strict && (ch == '=')) {
1443                 ch = readCh();
1444                 skipSpace();
1445                 attvalue = parseAttributeValue(true);
1446                 error("attvalerr");
1447                 return;
1448             } else {
1449                 char str[] = {(char)ch};
1450                 error("invalid.tagchar", new String(str), elem.getName());
1451                 if (!strict) {
1452                     ch = readCh();
1453                     continue;
1454                 } else {
1455                     return;
1456                 }
1457             }
1458 
1459             if (att != null) {
1460                 attname = att.getName();
1461             } else {
1462                 error("invalid.tagatt", attname, elem.getName());
1463             }
1464 
1465             // Check out the value
1466             if (attributes.isDefined(attname)) {
1467                 error("multi.tagatt", attname, elem.getName());
1468             }
1469             if (attvalue == null) {
1470                 attvalue = ((att != null) && (att.value != null)) ? att.value :
1471                     HTML.NULL_ATTRIBUTE_VALUE;
1472             } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) {
1473                 error("invalid.tagattval", attname, elem.getName());
1474             }
1475             HTML.Attribute attkey = HTML.getAttributeKey(attname);
1476             if (attkey == null) {
1477                 attributes.addAttribute(attname, attvalue);
1478             } else {
1479                 attributes.addAttribute(attkey, attvalue);
1480             }
1481         }
1482     }
1483 
1484     /**
1485      * Parses th Document Declaration Type markup declaration.
1486      * Currently ignores it.
1487      */
1488     public String parseDTDMarkup() throws IOException {
1489 
1490         StringBuilder strBuff = new StringBuilder();
1491         ch = readCh();
1492         while(true) {
1493             switch (ch) {
1494             case '>':
1495                 ch = readCh();
1496                 return strBuff.toString();
1497             case -1:
1498                 error("invalid.markup");
1499                 return strBuff.toString();
1500             case '\n':
1501                 ln++;
1502                 ch = readCh();
1503                 lfCount++;
1504                 break;
1505             case '"':
1506                 ch = readCh();
1507                 break;
1508             case '\r':
1509                 ln++;
1510                 if ((ch = readCh()) == '\n') {
1511                     ch = readCh();
1512                     crlfCount++;
1513                 }
1514                 else {
1515                     crCount++;
1516                 }
1517                 break;
1518             default:
1519                 strBuff.append((char)(ch & 0xFF));
1520                 ch = readCh();
1521                 break;
1522             }
1523         }
1524     }
1525 
1526     /**
1527      * Parse markup declarations.
1528      * Currently only handles the Document Type Declaration markup.
1529      * Returns true if it is a markup declaration false otherwise.
1530      */
1531     protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException {
1532 
1533         /* Currently handles only the DOCTYPE */
1534         if ((strBuff.length() == "DOCTYPE".length()) &&
1535             (strBuff.toString().toUpperCase().equals("DOCTYPE"))) {
1536             parseDTDMarkup();
1537             return true;
1538         }
1539         return false;
1540     }
1541 
1542     /**
1543      * Parse an invalid tag.
1544      */
1545     void parseInvalidTag() throws IOException {
1546         // ignore all data upto the close bracket '>'
1547         while (true) {
1548             skipSpace();
1549             switch (ch) {
1550               case '>':
1551               case -1:
1552                   ch = readCh();
1553                 return;
1554               case '<':
1555                   return;
1556               default:
1557                   ch = readCh();
1558 
1559             }
1560         }
1561     }
1562 
1563     /**
1564      * Parse a start or end tag.
1565      */
1566     @SuppressWarnings("fallthrough")
1567     void parseTag() throws IOException {
1568         Element elem;
1569         boolean net = false;
1570         boolean warned = false;
1571         boolean unknown = false;
1572 
1573         switch (ch = readCh()) {
1574           case '!':
1575             switch (ch = readCh()) {
1576               case '-':
1577                 // Parse comment. [92] 391:7
1578                 while (true) {
1579                     if (ch == '-') {
1580                         if (!strict || ((ch = readCh()) == '-')) {
1581                             ch = readCh();
1582                             if (!strict && ch == '-') {
1583                                 ch = readCh();
1584                             }
1585                             // send over any text you might see
1586                             // before parsing and sending the
1587                             // comment
1588                             if (textpos != 0) {
1589                                 char newtext[] = new char[textpos];
1590                                 System.arraycopy(text, 0, newtext, 0, textpos);
1591                                 handleText(newtext);
1592                                 lastBlockStartPos = currentBlockStartPos;
1593                                 textpos = 0;
1594                             }
1595                             parseComment();
1596                             last = makeTag(dtd.getElement("comment"), true);
1597                             handleComment(getChars(0));
1598                             continue;
1599                         } else if (!warned) {
1600                             warned = true;
1601                             error("invalid.commentchar", "-");
1602                         }
1603                     }
1604                     skipSpace();
1605                     switch (ch) {
1606                       case '-':
1607                         continue;
1608                       case '>':
1609                         ch = readCh();
1610                         return;
1611                       case -1:
1612                         return;
1613                       default:
1614                         ch = readCh();
1615                         if (!warned) {
1616                             warned = true;
1617                             error("invalid.commentchar",
1618                                   String.valueOf((char)ch));
1619                         }
1620                         break;
1621                     }
1622                 }
1623 
1624               default:
1625                 // deal with marked sections
1626                 StringBuffer strBuff = new StringBuffer();
1627                 while (true) {
1628                     strBuff.append((char)ch);
1629                     if (parseMarkupDeclarations(strBuff)) {
1630                         return;
1631                     }
1632                     switch(ch) {
1633                       case '>':
1634                         ch = readCh();
1635                         // Fall through
1636                       case -1:
1637                         error("invalid.markup");
1638                         return;
1639                       case '\n':
1640                         ln++;
1641                         ch = readCh();
1642                         lfCount++;
1643                         break;
1644                       case '\r':
1645                         ln++;
1646                         if ((ch = readCh()) == '\n') {
1647                             ch = readCh();
1648                             crlfCount++;
1649                         }
1650                         else {
1651                             crCount++;
1652                         }
1653                         break;
1654 
1655                       default:
1656                         ch = readCh();
1657                         break;
1658                     }
1659                 }
1660             }
1661 
1662           case '/':
1663             // parse end tag [19] 317:4
1664             switch (ch = readCh()) {
1665               case '>':
1666                 ch = readCh();
1667                 // Fall through
1668               case '<':
1669                 // empty end tag. either </> or </<
1670                 if (recent == null) {
1671                     error("invalid.shortend");
1672                     return;
1673                 }
1674                 elem = recent;
1675                 break;
1676 
1677               default:
1678                 if (!parseIdentifier(true)) {
1679                     error("expected.endtagname");
1680                     return;
1681                 }
1682                 skipSpace();
1683                 switch (ch) {
1684                   case '>':
1685                     ch = readCh();
1686                     break;
1687                   case '<':
1688                     break;
1689 
1690                   default:
1691                     error("expected", "'>'");
1692                     while ((ch != -1) && (ch != '\n') && (ch != '>')) {
1693                         ch = readCh();
1694                     }
1695                     if (ch == '>') {
1696                         ch = readCh();
1697                     }
1698                     break;
1699                 }
1700                 String elemStr = getString(0);
1701                 if (!dtd.elementExists(elemStr)) {
1702                     error("end.unrecognized", elemStr);
1703                     // Ignore RE before end tag
1704                     if ((textpos > 0) && (text[textpos-1] == '\n')) {
1705                         textpos--;
1706                     }
1707                     elem = dtd.getElement("unknown");
1708                     elem.name = elemStr;
1709                     unknown = true;
1710                 } else {
1711                     elem = dtd.getElement(elemStr);
1712                 }
1713                 break;
1714             }
1715 
1716 
1717             // If the stack is null, we're seeing end tags without any begin
1718             // tags.  Ignore them.
1719 
1720             if (stack == null) {
1721                 error("end.extra.tag", elem.getName());
1722                 return;
1723             }
1724 
1725             // Ignore RE before end tag
1726             if ((textpos > 0) && (text[textpos-1] == '\n')) {
1727                 // In a pre tag, if there are blank lines
1728                 // we do not want to remove the newline
1729                 // before the end tag.  Hence this code.
1730                 //
1731                 if (stack.pre) {
1732                     if ((textpos > 1) && (text[textpos-2] != '\n')) {
1733                         textpos--;
1734                     }
1735                 } else {
1736                     textpos--;
1737                 }
1738             }
1739 
1740             // If the end tag is a form, since we did not put it
1741             // on the tag stack, there is no corresponding start
1742             // start tag to find. Hence do not touch the tag stack.
1743             //
1744 
1745             /*
1746             if (!strict && elem.getName().equals("form")) {
1747                 if (lastFormSent != null) {
1748                     handleEndTag(lastFormSent);
1749                     return;
1750                 } else {
1751                     // do nothing.
1752                     return;
1753                 }
1754             }
1755             */
1756 
1757             if (unknown) {
1758                 // we will not see a corresponding start tag
1759                 // on the the stack.  If we are seeing an
1760                 // end tag, lets send this on as an empty
1761                 // tag with the end tag attribute set to
1762                 // true.
1763                 TagElement t = makeTag(elem);
1764                 handleText(t);
1765                 attributes.addAttribute(HTML.Attribute.ENDTAG, "true");
1766                 handleEmptyTag(makeTag(elem));
1767                 unknown = false;
1768                 return;
1769             }
1770 
1771             // find the corresponding start tag
1772 
1773             // A commonly occurring error appears to be the insertion
1774             // of extra end tags in a table.  The intent here is ignore
1775             // such extra end tags.
1776             //
1777             if (!strict) {
1778                 String stackElem = stack.elem.getName();
1779 
1780                 if (stackElem.equals("table")) {
1781                     // If it is not a valid end tag ignore it and return
1782                     //
1783                     if (!elem.getName().equals(stackElem)) {
1784                         error("tag.ignore", elem.getName());
1785                         return;
1786                     }
1787                 }
1788 
1789 
1790 
1791                 if (stackElem.equals("tr") ||
1792                     stackElem.equals("td")) {
1793                     if ((!elem.getName().equals("table")) &&
1794                         (!elem.getName().equals(stackElem))) {
1795                         error("tag.ignore", elem.getName());
1796                         return;
1797                     }
1798                 }
1799             }
1800             TagStack sp = stack;
1801 
1802             while ((sp != null) && (elem != sp.elem)) {
1803                 sp = sp.next;
1804             }
1805             if (sp == null) {
1806                 error("unmatched.endtag", elem.getName());
1807                 return;
1808             }
1809 
1810             // People put font ending tags in the darndest places.
1811             // Don't close other contexts based on them being between
1812             // a font tag and the corresponding end tag.  Instead,
1813             // ignore the end tag like it doesn't exist and allow the end
1814             // of the document to close us out.
1815             String elemName = elem.getName();
1816             if (stack != sp &&
1817                 (elemName.equals("font") ||
1818                  elemName.equals("center"))) {
1819 
1820                 // Since closing out a center tag can have real wierd
1821                 // effects on the formatting,  make sure that tags
1822                 // for which omitting an end tag is legimitate
1823                 // get closed out.
1824                 //
1825                 if (elemName.equals("center")) {
1826                     while(stack.elem.omitEnd() && stack != sp) {
1827                         endTag(true);
1828                     }
1829                     if (stack.elem == elem) {
1830                         endTag(false);
1831                     }
1832                 }
1833                 return;
1834             }
1835             // People do the same thing with center tags.  In this
1836             // case we would like to close off the center tag but
1837             // not necessarily all enclosing tags.
1838 
1839 
1840 
1841             // end tags
1842             while (stack != sp) {
1843                 endTag(true);
1844             }
1845 
1846             endTag(false);
1847             return;
1848 
1849           case -1:
1850             error("eof");
1851             return;
1852         }
1853 
1854         // start tag [14] 314:1
1855         if (!parseIdentifier(true)) {
1856             elem = recent;
1857             if ((ch != '>') || (elem == null)) {
1858                 error("expected.tagname");
1859                 return;
1860             }
1861         } else {
1862             String elemStr = getString(0);
1863 
1864             if (elemStr.equals("image")) {
1865                 elemStr = "img";
1866             }
1867 
1868             /* determine if this element is part of the dtd. */
1869 
1870             if (!dtd.elementExists(elemStr)) {
1871                 //              parseInvalidTag();
1872                 error("tag.unrecognized ", elemStr);
1873                 elem = dtd.getElement("unknown");
1874                 elem.name = elemStr;
1875                 unknown = true;
1876             } else {
1877                 elem = dtd.getElement(elemStr);
1878             }
1879         }
1880 
1881         // Parse attributes
1882         parseAttributeSpecificationList(elem);
1883 
1884         switch (ch) {
1885           case '/':
1886             net = true;
1887             // Fall through
1888           case '>':
1889             ch = readCh();
1890             if (ch == '>' && net) {
1891                 ch = readCh();
1892             }
1893           case '<':
1894             break;
1895 
1896           default:
1897             error("expected", "'>'");
1898             break;
1899         }
1900 
1901         if (!strict) {
1902           if (elem.getName().equals("script")) {
1903             error("javascript.unsupported");
1904           }
1905         }
1906 
1907         // ignore RE after start tag
1908         //
1909         if (!elem.isEmpty())  {
1910             if (ch == '\n') {
1911                 ln++;
1912                 lfCount++;
1913                 ch = readCh();
1914             } else if (ch == '\r') {
1915                 ln++;
1916                 if ((ch = readCh()) == '\n') {
1917                     ch = readCh();
1918                     crlfCount++;
1919                 }
1920                 else {
1921                     crCount++;
1922                 }
1923             }
1924         }
1925 
1926         // ensure a legal context for the tag
1927         TagElement tag = makeTag(elem, false);
1928 
1929 
1930         /** In dealing with forms, we have decided to treat
1931             them as legal in any context.  Also, even though
1932             they do have a start and an end tag, we will
1933             not put this tag on the stack.  This is to deal
1934             several pages in the web oasis that choose to
1935             start and end forms in any possible location. **/
1936 
1937         /*
1938         if (!strict && elem.getName().equals("form")) {
1939             if (lastFormSent == null) {
1940                 lastFormSent = tag;
1941             } else {
1942                 handleEndTag(lastFormSent);
1943                 lastFormSent = tag;
1944             }
1945         } else {
1946         */
1947             // Smlly, if a tag is unknown, we will apply
1948             // no legalTagContext logic to it.
1949             //
1950             if (!unknown) {
1951                 legalTagContext(tag);
1952 
1953                 // If skip tag is true,  this implies that
1954                 // the tag was illegal and that the error
1955                 // recovery strategy adopted is to ignore
1956                 // the tag.
1957                 if (!strict && skipTag) {
1958                     skipTag = false;
1959                     return;
1960                 }
1961             }
1962             /*
1963         }
1964             */
1965 
1966         startTag(tag);
1967 
1968         if (!elem.isEmpty()) {
1969             switch (elem.getType()) {
1970               case CDATA:
1971                 parseLiteral(false);
1972                 break;
1973               case RCDATA:
1974                 parseLiteral(true);
1975                 break;
1976               default:
1977                 if (stack != null) {
1978                     stack.net = net;
1979                 }
1980                 break;
1981             }
1982         }
1983     }
1984 
    /** Character sequence that opens an HTML comment. */
    private static final String START_COMMENT = "<!--";
    /** Character sequence that closes an HTML comment. */
    private static final String END_COMMENT = "-->";
    /**
     * Lower-case spelling of the script end tag; parseScript() compares the
     * input against it one character at a time.
     */
    private static final char[] SCRIPT_END_TAG = "</script>".toCharArray();
    /**
     * Upper-case spelling of the script end tag; parseScript() accepts either
     * spelling at each character position.
     */
    private static final char[] SCRIPT_END_TAG_UPPER_CASE =
                                        "</SCRIPT>".toCharArray();
1990 
1991     void parseScript() throws IOException {
1992         char[] charsToAdd = new char[SCRIPT_END_TAG.length];
1993         boolean insideComment = false;
1994 
1995         /* Here, ch should be the first character after <script> */
1996         while (true) {
1997             int i = 0;
1998             while (!insideComment && i < SCRIPT_END_TAG.length
1999                        && (SCRIPT_END_TAG[i] == ch
2000                            || SCRIPT_END_TAG_UPPER_CASE[i] == ch)) {
2001                 charsToAdd[i] = (char) ch;
2002                 ch = readCh();
2003                 i++;
2004             }
2005             if (i == SCRIPT_END_TAG.length) {
2006 
2007                 /*  '</script>' tag detected */
2008                 /* Here, ch == the first character after </script> */
2009                 return;
2010             } else {
2011 
2012                 /* To account for extra read()'s that happened */
2013                 for (int j = 0; j < i; j++) {
2014                     addString(charsToAdd[j]);
2015                 }
2016 
2017                 switch (ch) {
2018                 case -1:
2019                     error("eof.script");
2020                     return;
2021                 case '\n':
2022                     ln++;
2023                     ch = readCh();
2024                     lfCount++;
2025                     addString('\n');
2026                     break;
2027                 case '\r':
2028                     ln++;
2029                     if ((ch = readCh()) == '\n') {
2030                         ch = readCh();
2031                         crlfCount++;
2032                     } else {
2033                         crCount++;
2034                     }
2035                     addString('\n');
2036                     break;
2037                 default:
2038                     addString(ch);
2039                     String str = new String(getChars(0, strpos));
2040                     if (!insideComment && str.endsWith(START_COMMENT)) {
2041                         insideComment = true;
2042                     }
2043                     if (insideComment && str.endsWith(END_COMMENT)) {
2044                         insideComment = false;
2045                     }
2046                     ch = readCh();
2047                     break;
2048                 } // switch
2049             }
2050         } // while
2051     }
2052 
2053     /**
2054      * Parse Content. [24] 320:1
2055      */
2056     void parseContent() throws IOException {
2057         Thread curThread = Thread.currentThread();
2058 
2059         for (;;) {
2060             if (curThread.isInterrupted()) {
2061                 curThread.interrupt(); // resignal the interrupt
2062                 break;
2063             }
2064 
2065             int c = ch;
2066             currentBlockStartPos = currentPosition;
2067 
2068             if (recent == dtd.script) { // means: if after starting <script> tag
2069 
2070                 /* Here, ch has to be the first character after <script> */
2071                 parseScript();
2072                 last = makeTag(dtd.getElement("comment"), true);
2073 
2074                 /* Remove leading and trailing HTML comment declarations */
2075                 String str = new String(getChars(0)).trim();
2076                 int minLength = START_COMMENT.length() + END_COMMENT.length();
2077                 if (str.startsWith(START_COMMENT) && str.endsWith(END_COMMENT)
2078                        && str.length() >= (minLength)) {
2079                     str = str.substring(START_COMMENT.length(),
2080                                       str.length() - END_COMMENT.length());
2081                 }
2082 
2083                 /* Handle resulting chars as comment */
2084                 handleComment(str.toCharArray());
2085                 endTag(false);
2086                 lastBlockStartPos = currentPosition;
2087 
2088                 continue;
2089             } else {
2090                 switch (c) {
2091                   case '<':
2092                     parseTag();
2093                     lastBlockStartPos = currentPosition;
2094                     continue;
2095 
2096                   case '/':
2097                     ch = readCh();
2098                     if ((stack != null) && stack.net) {
2099                         // null end tag.
2100                         endTag(false);
2101                         continue;
2102                     } else if (textpos == 0) {
2103                         if (!legalElementContext(dtd.pcdata)) {
2104                             error("unexpected.pcdata");
2105                         }
2106                         if (last.breaksFlow()) {
2107                             space = false;
2108                         }
2109                     }
2110                     break;
2111 
2112                   case -1:
2113                     return;
2114 
2115                   case '&':
2116                     if (textpos == 0) {
2117                         if (!legalElementContext(dtd.pcdata)) {
2118                             error("unexpected.pcdata");
2119                         }
2120                         if (last.breaksFlow()) {
2121                             space = false;
2122                         }
2123                     }
2124                     char data[] = parseEntityReference();
2125                     if (textpos + data.length + 1 > text.length) {
2126                         char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
2127                         System.arraycopy(text, 0, newtext, 0, text.length);
2128                         text = newtext;
2129                     }
2130                     if (space) {
2131                         space = false;
2132                         text[textpos++] = ' ';
2133                     }
2134                     System.arraycopy(data, 0, text, textpos, data.length);
2135                     textpos += data.length;
2136                     ignoreSpace = false;
2137                     continue;
2138 
2139                   case '\n':
2140                     ln++;
2141                     lfCount++;
2142                     ch = readCh();
2143                     if ((stack != null) && stack.pre) {
2144                         break;
2145                     }
2146                     if (textpos == 0) {
2147                         lastBlockStartPos = currentPosition;
2148                     }
2149                     if (!ignoreSpace) {
2150                         space = true;
2151                     }
2152                     continue;
2153 
2154                   case '\r':
2155                     ln++;
2156                     c = '\n';
2157                     if ((ch = readCh()) == '\n') {
2158                         ch = readCh();
2159                         crlfCount++;
2160                     }
2161                     else {
2162                         crCount++;
2163                     }
2164                     if ((stack != null) && stack.pre) {
2165                         break;
2166                     }
2167                     if (textpos == 0) {
2168                         lastBlockStartPos = currentPosition;
2169                     }
2170                     if (!ignoreSpace) {
2171                         space = true;
2172                     }
2173                     continue;
2174 
2175 
2176                   case '\t':
2177                   case ' ':
2178                     ch = readCh();
2179                     if ((stack != null) && stack.pre) {
2180                         break;
2181                     }
2182                     if (textpos == 0) {
2183                         lastBlockStartPos = currentPosition;
2184                     }
2185                     if (!ignoreSpace) {
2186                         space = true;
2187                     }
2188                     continue;
2189 
2190                   default:
2191                     if (textpos == 0) {
2192                         if (!legalElementContext(dtd.pcdata)) {
2193                             error("unexpected.pcdata");
2194                         }
2195                         if (last.breaksFlow()) {
2196                             space = false;
2197                         }
2198                     }
2199                     ch = readCh();
2200                     break;
2201                 }
2202             }
2203 
2204             // enlarge buffer if needed
2205             if (textpos + 2 > text.length) {
2206                 char newtext[] = new char[text.length + 128];
2207                 System.arraycopy(text, 0, newtext, 0, text.length);
2208                 text = newtext;
2209             }
2210 
2211             // output pending space
2212             if (space) {
2213                 if (textpos == 0) {
2214                     lastBlockStartPos--;
2215                 }
2216                 text[textpos++] = ' ';
2217                 space = false;
2218             }
2219             text[textpos++] = (char)c;
2220             ignoreSpace = false;
2221         }
2222     }
2223 
2224     /**
2225      * Returns the end of line string. This will return the end of line
2226      * string that has been encountered the most, one of \r, \n or \r\n.
2227      */
2228     String getEndOfLineString() {
2229         if (crlfCount >= crCount) {
2230             if (lfCount >= crlfCount) {
2231                 return "\n";
2232             }
2233             else {
2234                 return "\r\n";
2235             }
2236         }
2237         else {
2238             if (crCount > lfCount) {
2239                 return "\r";
2240             }
2241             else {
2242                 return "\n";
2243             }
2244         }
2245     }
2246 
2247     /**
2248      * Parse an HTML stream, given a DTD.
2249      */
2250     public synchronized void parse(Reader in) throws IOException {
2251         this.in = in;
2252 
2253         this.ln = 1;
2254 
2255         seenHtml = false;
2256         seenHead = false;
2257         seenBody = false;
2258 
2259         crCount = lfCount = crlfCount = 0;
2260 
2261         try {
2262             ch = readCh();
2263             text = new char[1024];
2264             str = new char[128];
2265 
2266             parseContent();
2267             // NOTE: interruption may have occurred.  Control flows out
2268             // of here normally.
2269             while (stack != null) {
2270                 endTag(true);
2271             }
2272             in.close();
2273         } catch (IOException e) {
2274             errorContext();
2275             error("ioexception");
2276             throw e;
2277         } catch (Exception e) {
2278             errorContext();
2279             error("exception", e.getClass().getName(), e.getMessage());
2280             e.printStackTrace();
2281         } catch (ThreadDeath e) {
2282             errorContext();
2283             error("terminated");
2284             e.printStackTrace();
2285             throw e;
2286         } finally {
2287             for (; stack != null ; stack = stack.next) {
2288                 handleEndTag(stack.tag);
2289             }
2290 
2291             text = null;
2292             str = null;
2293         }
2294 
2295     }
2296 
2297 
    /*
     * Input cache.  This is much faster than calling down to a synchronized
     * method of BufferedReader for each character.  Measurements done 5/30/97
     * show that there's no point in having a bigger buffer:  Increasing
     * the buffer to 8192 had no measurable impact for a program discarding
     * one character at a time (reading from an http URL to a local machine).
     * NOTE: If the current encoding is bogus, and we read too much
     * (past the content-type) we may suffer a MalformedInputException. For
     * this reason the initial size is 1 and when the body is encountered the
     * size is adjusted to 256.
     */
    private char buf[] = new char[1];
    // Index in buf of the next character readCh() will return.
    private int pos;
    // Result of the most recent read into buf; readCh() refills once pos
    // reaches this value.
    private int len;
    /*
     * Tracks the position relative to the beginning of the document.
     * Incremented by readCh() for every character it returns.
     */
    private int currentPosition;
2317 
2318 
2319     private final int readCh() throws IOException {
2320 
2321         if (pos >= len) {
2322 
2323             // This loop allows us to ignore interrupts if the flag
2324             // says so
2325             for (;;) {
2326                 try {
2327                     len = in.read(buf);
2328                     break;
2329                 } catch (InterruptedIOException ex) {
2330                     throw ex;
2331                 }
2332             }
2333 
2334             if (len <= 0) {
2335                 return -1;      // eof
2336             }
2337             pos = 0;
2338         }
2339         ++currentPosition;
2340 
2341         return buf[pos++];
2342     }
2343 
2344 
    /**
     * Returns the current value of {@code currentPosition}, the counter that
     * {@code readCh()} increments for every character it returns.
     *
     * @return the current position counter
     */
    protected int getCurrentPos() {
        return currentPosition;
    }
2348 }