1 /*
   2  * Copyright (c) 1998, 2008, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package javax.swing.text.html.parser;
  27 
  28 import javax.swing.text.SimpleAttributeSet;
  29 import javax.swing.text.html.HTML;
  30 import javax.swing.text.ChangedCharSetException;
  31 import java.io.*;
  32 import java.util.Hashtable;
  33 import java.util.Properties;
  34 import java.util.Vector;
  35 import java.util.Enumeration;
  36 import java.net.URL;
  37 
  38 import sun.misc.MessageUtils;
  39 
  40 /**
  41  * A simple DTD-driven HTML parser. The parser reads an
  42  * HTML file from an InputStream and calls various methods
  43  * (which should be overridden in a subclass) when tags and
  44  * data are encountered.
  45  * <p>
  46  * Unfortunately there are many badly implemented HTML parsers
  47  * out there, and as a result there are many badly formatted
  48  * HTML files. This parser attempts to parse most HTML files.
  49  * This means that the implementation sometimes deviates from
  50  * the SGML specification in favor of HTML.
  51  * <p>
  52  * The parser treats \r and \r\n as \n. Newlines after starttags
  53  * and before end tags are ignored just as specified in the SGML/HTML
  54  * specification.
  55  * <p>
  56  * The html spec does not specify how spaces are to be coalesced very well.
  57  * Specifically, the following scenarios are not discussed (note that a
  58  * space should be used here, but I am using &amp;nbsp to force the space to
  59  * be displayed):
  60  * <p>
  61  * '&lt;b>blah&nbsp;&lt;i>&nbsp;&lt;strike>&nbsp;foo' which can be treated as:
  62  * '&lt;b>blah&nbsp;&lt;i>&lt;strike>foo'
  63  * <p>as well as:
  64  * '&lt;p>&lt;a href="xx">&nbsp;&lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
  65  * which appears to be treated as:
  66  * '&lt;p>&lt;a href="xx">&lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
  67  * <p>
  68  * If <code>strict</code> is false, when a tag that breaks flow,
 * (<code>TagElement.breaksFlow</code>) or trailing whitespace is
  70  * encountered, all whitespace will be ignored until a non whitespace
  71  * character is encountered. This appears to give behavior closer to
  72  * the popular browsers.
  73  *
  74  * @see DTD
  75  * @see TagElement
  76  * @see SimpleAttributeSet
  77  * @author Arthur van Hoff
  78  * @author Sunita Mani
  79  */
  80 public
  81 class Parser implements DTDConstants {
  82 
  83     private char text[] = new char[1024];
  84     private int textpos = 0;
  85     private TagElement last;
  86     private boolean space;
  87 
  88     private char str[] = new char[128];
  89     private int strpos = 0;
  90 
  91     protected DTD dtd = null;
  92 
  93     private int ch;
  94     private int ln;
  95     private Reader in;
  96 
  97     private Element recent;
  98     private TagStack stack;
  99     private boolean skipTag = false;
 100     private TagElement lastFormSent = null;
 101     private SimpleAttributeSet attributes = new SimpleAttributeSet();
 102 
 103     // State for <html>, <head> and <body>.  Since people like to slap
 104     // together HTML documents without thinking, occasionally they
 105     // have multiple instances of these tags.  These booleans track
 106     // the first sightings of these tags so they can be safely ignored
 107     // by the parser if repeated.
 108     private boolean seenHtml = false;
 109     private boolean seenHead = false;
 110     private boolean seenBody = false;
 111 
 112     /**
 113      * The html spec does not specify how spaces are coalesced very well.
 114      * If strict == false, ignoreSpace is used to try and mimic the behavior
 115      * of the popular browsers.
 116      * <p>
 117      * The problematic scenarios are:
 118      * '&lt;b>blah &lt;i> &lt;strike> foo' which can be treated as:
 119      * '&lt;b>blah &lt;i>&lt;strike>foo'
 120      * as well as:
 121      * '&lt;p>&lt;a href="xx"> &lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
 122      * which appears to be treated as:
 123      * '&lt;p>&lt;a href="xx">&lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
 124      * <p>
 125      * When a tag that breaks flow, or trailing whitespace is encountered
 126      * ignoreSpace is set to true. From then on, all whitespace will be
 127      * ignored.
 128      * ignoreSpace will be set back to false the first time a
 129      * non whitespace character is encountered. This appears to give
 130      * behavior closer to the popular browsers.
 131      */
 132     private boolean ignoreSpace;
 133 
 134     /**
 135      * This flag determines whether or not the Parser will be strict
 136      * in enforcing SGML compatibility.  If false, it will be lenient
 137      * with certain common classes of erroneous HTML constructs.
 138      * Strict or not, in either case an error will be recorded.
 139      *
 140      */
 141     protected boolean strict = false;
 142 
 143 
 144     /** Number of \r\n's encountered. */
 145     private int crlfCount;
 146     /** Number of \r's encountered. A \r\n will not increment this. */
 147     private int crCount;
 148     /** Number of \n's encountered. A \r\n will not increment this. */
 149     private int lfCount;
 150 
 151     //
 152     // To correctly identify the start of a tag/comment/text we need two
 153     // ivars. Two are needed as handleText isn't invoked until the tag
 154     // after the text has been parsed, that is the parser parses the text,
 155     // then a tag, then invokes handleText followed by handleStart.
 156     //
 157     /** The start position of the current block. Block is overloaded here,
 158      * it really means the current start position for the current comment,
 159      * tag, text. Use getBlockStartPosition to access this. */
 160     private int currentBlockStartPos;
 161     /** Start position of the last block. */
 162     private int lastBlockStartPos;
 163 
 164     /**
 165      * array for mapping numeric references in range
 166      * 130-159 to displayable Unicode characters.
 167      */
 168     private static final char[] cp1252Map = {
 169         8218,  // &#130;
 170         402,   // &#131;
 171         8222,  // &#132;
 172         8230,  // &#133;
 173         8224,  // &#134;
 174         8225,  // &#135;
 175         710,   // &#136;
 176         8240,  // &#137;
 177         352,   // &#138;
 178         8249,  // &#139;
 179         338,   // &#140;
 180         141,   // &#141;
 181         142,   // &#142;
 182         143,   // &#143;
 183         144,   // &#144;
 184         8216,  // &#145;
 185         8217,  // &#146;
 186         8220,  // &#147;
 187         8221,  // &#148;
 188         8226,  // &#149;
 189         8211,  // &#150;
 190         8212,  // &#151;
 191         732,   // &#152;
 192         8482,  // &#153;
 193         353,   // &#154;
 194         8250,  // &#155;
 195         339,   // &#156;
 196         157,   // &#157;
 197         158,   // &#158;
 198         376    // &#159;
 199     };
 200 
 201     public Parser(DTD dtd) {
 202         this.dtd = dtd;
 203     }
 204 
 205 
 206     /**
 207      * @return the line number of the line currently being parsed
 208      */
 209     protected int getCurrentLine() {
 210         return ln;
 211     }
 212 
 213     /**
 214      * Returns the start position of the current block. Block is
 215      * overloaded here, it really means the current start position for
 216      * the current comment tag, text, block.... This is provided for
 217      * subclassers that wish to know the start of the current block when
 218      * called with one of the handleXXX methods.
 219      */
 220     int getBlockStartPosition() {
 221         return Math.max(0, lastBlockStartPos - 1);
 222     }
 223 
 224     /**
 225      * Makes a TagElement.
 226      */
 227     protected TagElement makeTag(Element elem, boolean fictional) {
 228         return new TagElement(elem, fictional);
 229     }
 230 
 231     protected TagElement makeTag(Element elem) {
 232         return makeTag(elem, false);
 233     }
 234 
 235     protected SimpleAttributeSet getAttributes() {
 236         return attributes;
 237     }
 238 
 239     protected void flushAttributes() {
 240         attributes.removeAttributes(attributes);
 241     }
 242 
 243     /**
 244      * Called when PCDATA is encountered.
 245      */
 246     protected void handleText(char text[]) {
 247     }
 248 
 249     /**
 250      * Called when an HTML title tag is encountered.
 251      */
 252     protected void handleTitle(char text[]) {
 253         // default behavior is to call handleText. Subclasses
 254         // can override if necessary.
 255         handleText(text);
 256     }
 257 
 258     /**
 259      * Called when an HTML comment is encountered.
 260      */
 261     protected void handleComment(char text[]) {
 262     }
 263 
 264     protected void handleEOFInComment() {
 265         // We've reached EOF.  Our recovery strategy is to
 266         // see if we have more than one line in the comment;
 267         // if so, we pretend that the comment was an unterminated
 268         // single line comment, and reparse the lines after the
 269         // first line as normal HTML content.
 270 
 271         int commentEndPos = strIndexOf('\n');
 272         if (commentEndPos >= 0) {
 273             handleComment(getChars(0, commentEndPos));
 274             try {
 275                 in.close();
 276                 in = new CharArrayReader(getChars(commentEndPos + 1));
 277                 ch = '>';
 278             } catch (IOException e) {
 279                 error("ioexception");
 280             }
 281 
 282             resetStrBuffer();
 283         } else {
 284             // no newline, so signal an error
 285             error("eof.comment");
 286         }
 287     }
 288 
 289     /**
 290      * Called when an empty tag is encountered.
 291      */
 292     protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
 293     }
 294 
 295     /**
 296      * Called when a start tag is encountered.
 297      */
 298     protected void handleStartTag(TagElement tag) {
 299     }
 300 
 301     /**
 302      * Called when an end tag is encountered.
 303      */
 304     protected void handleEndTag(TagElement tag) {
 305     }
 306 
 307     /**
 308      * An error has occurred.
 309      */
 310     protected void handleError(int ln, String msg) {
 311         /*
 312         Thread.dumpStack();
 313         System.out.println("**** " + stack);
 314         System.out.println("line " + ln + ": error: " + msg);
 315         System.out.println();
 316         */
 317     }
 318 
 319     /**
 320      * Output text.
 321      */
 322     void handleText(TagElement tag) {
 323         if (tag.breaksFlow()) {
 324             space = false;
 325             if (!strict) {
 326                 ignoreSpace = true;
 327             }
 328         }
 329         if (textpos == 0) {
 330             if ((!space) || (stack == null) || last.breaksFlow() ||
 331                 !stack.advance(dtd.pcdata)) {
 332                 last = tag;
 333                 space = false;
 334                 lastBlockStartPos = currentBlockStartPos;
 335                 return;
 336             }
 337         }
 338         if (space) {
 339             if (!ignoreSpace) {
 340                 // enlarge buffer if needed
 341                 if (textpos + 1 > text.length) {
 342                     char newtext[] = new char[text.length + 200];
 343                     System.arraycopy(text, 0, newtext, 0, text.length);
 344                     text = newtext;
 345                 }
 346 
 347                 // output pending space
 348                 text[textpos++] = ' ';
 349                 if (!strict && !tag.getElement().isEmpty()) {
 350                     ignoreSpace = true;
 351                 }
 352             }
 353             space = false;
 354         }
 355         char newtext[] = new char[textpos];
 356         System.arraycopy(text, 0, newtext, 0, textpos);
 357         // Handles cases of bad html where the title tag
 358         // was getting lost when we did error recovery.
 359         if (tag.getElement().getName().equals("title")) {
 360             handleTitle(newtext);
 361         } else {
 362             handleText(newtext);
 363         }
 364         lastBlockStartPos = currentBlockStartPos;
 365         textpos = 0;
 366         last = tag;
 367         space = false;
 368     }
 369 
 370     /**
 371      * Invoke the error handler.
 372      */
 373     protected void error(String err, String arg1, String arg2,
 374         String arg3) {
 375         handleError(ln, err + " " + arg1 + " " + arg2 + " " + arg3);
 376     }
 377 
 378     protected void error(String err, String arg1, String arg2) {
 379         error(err, arg1, arg2, "?");
 380     }
 381     protected void error(String err, String arg1) {
 382         error(err, arg1, "?", "?");
 383     }
 384     protected void error(String err) {
 385         error(err, "?", "?", "?");
 386     }
 387 
 388 
 389     /**
 390      * Handle a start tag. The new tag is pushed
 391      * onto the tag stack. The attribute list is
 392      * checked for required attributes.
 393      */
 394     protected void startTag(TagElement tag) throws ChangedCharSetException {
 395         Element elem = tag.getElement();
 396 
 397         // If the tag is an empty tag and texpos != 0
 398         // this implies that there is text before the
 399         // start tag that needs to be processed before
 400         // handling the tag.
 401         //
 402         if (!elem.isEmpty() ||
 403                     ((last != null) && !last.breaksFlow()) ||
 404                     (textpos != 0)) {
 405             handleText(tag);
 406         } else {
 407             // this variable gets updated in handleText().
 408             // Since in this case we do not call handleText()
 409             // we need to update it here.
 410             //
 411             last = tag;
 412             // Note that we should really check last.breakFlows before
 413             // assuming this should be false.
 414             space = false;
 415         }
 416         lastBlockStartPos = currentBlockStartPos;
 417 
 418         // check required attributes
 419         for (AttributeList a = elem.atts ; a != null ; a = a.next) {
 420             if ((a.modifier == REQUIRED) &&
 421                 ((attributes.isEmpty()) ||
 422                  ((!attributes.isDefined(a.name)) &&
 423                   (!attributes.isDefined(HTML.getAttributeKey(a.name)))))) {
 424                 error("req.att ", a.getName(), elem.getName());
 425             }
 426         }
 427 
 428         if (elem.isEmpty()) {
 429             handleEmptyTag(tag);
 430             /*
 431         } else if (elem.getName().equals("form")) {
 432             handleStartTag(tag);
 433             */
 434         } else {
 435             recent = elem;
 436             stack = new TagStack(tag, stack);
 437             handleStartTag(tag);
 438         }
 439     }
 440 
 441     /**
 442      * Handle an end tag. The end tag is popped
 443      * from the tag stack.
 444      */
 445     protected void endTag(boolean omitted) {
 446         handleText(stack.tag);
 447 
 448         if (omitted && !stack.elem.omitEnd()) {
 449             error("end.missing", stack.elem.getName());
 450         } else if (!stack.terminate()) {
 451             error("end.unexpected", stack.elem.getName());
 452         }
 453 
 454         // handle the tag
 455         handleEndTag(stack.tag);
 456         stack = stack.next;
 457         recent = (stack != null) ? stack.elem : null;
 458     }
 459 
 460 
 461     boolean ignoreElement(Element elem) {
 462 
 463         String stackElement = stack.elem.getName();
 464         String elemName = elem.getName();
 465         /* We ignore all elements that are not valid in the context of
 466            a table except <td>, <th> (these we handle in
 467            legalElementContext()) and #pcdata.  We also ignore the
 468            <font> tag in the context of <ul> and <ol> We additonally
 469            ignore the <meta> and the <style> tag if the body tag has
 470            been seen. **/
 471         if ((elemName.equals("html") && seenHtml) ||
 472             (elemName.equals("head") && seenHead) ||
 473             (elemName.equals("body") && seenBody)) {
 474             return true;
 475         }
 476         if (elemName.equals("dt") || elemName.equals("dd")) {
 477             TagStack s = stack;
 478             while (s != null && !s.elem.getName().equals("dl")) {
 479                 s = s.next;
 480             }
 481             if (s == null) {
 482                 return true;
 483             }
 484         }
 485 
 486         if (((stackElement.equals("table")) &&
 487              (!elemName.equals("#pcdata")) && (!elemName.equals("input"))) ||
 488             ((elemName.equals("font")) &&
 489              (stackElement.equals("ul") || stackElement.equals("ol"))) ||
 490             (elemName.equals("meta") && stack != null) ||
 491             (elemName.equals("style") && seenBody) ||
 492             (stackElement.equals("table") && elemName.equals("a"))) {
 493             return true;
 494         }
 495         return false;
 496     }
 497 
 498 
 499     /**
 500      * Marks the first time a tag has been seen in a document
 501      */
 502 
 503     protected void markFirstTime(Element elem) {
 504         String elemName = elem.getName();
 505         if (elemName.equals("html")) {
 506             seenHtml = true;
 507         } else if (elemName.equals("head")) {
 508             seenHead = true;
 509         } else if (elemName.equals("body")) {
 510             if (buf.length == 1) {
 511                 // Refer to note in definition of buf for details on this.
 512                 char[] newBuf = new char[256];
 513 
 514                 newBuf[0] = buf[0];
 515                 buf = newBuf;
 516             }
 517             seenBody = true;
 518         }
 519     }
 520 
 521     /**
 522      * Create a legal content for an element.
 523      */
 524     boolean legalElementContext(Element elem) throws ChangedCharSetException {
 525 
 526         // System.out.println("-- legalContext -- " + elem);
 527 
 528         // Deal with the empty stack
 529         if (stack == null) {
 530             // System.out.println("-- stack is empty");
 531             if (elem != dtd.html) {
 532                 // System.out.println("-- pushing html");
 533                 startTag(makeTag(dtd.html, true));
 534                 return legalElementContext(elem);
 535             }
 536             return true;
 537         }
 538 
 539         // Is it allowed in the current context
 540         if (stack.advance(elem)) {
 541             // System.out.println("-- legal context");
 542             markFirstTime(elem);
 543             return true;
 544         }
 545         boolean insertTag = false;
 546 
 547         // The use of all error recovery strategies are contingent
 548         // on the value of the strict property.
 549         //
 550         // These are commonly occuring errors.  if insertTag is true,
 551         // then we want to adopt an error recovery strategy that
 552         // involves attempting to insert an additional tag to
 553         // legalize the context.  The two errors addressed here
 554         // are:
 555         // 1) when a <td> or <th> is seen soon after a <table> tag.
 556         //    In this case we insert a <tr>.
 557         // 2) when any other tag apart from a <tr> is seen
 558         //    in the context of a <tr>.  In this case we would
 559         //    like to add a <td>.  If a <tr> is seen within a
 560         //    <tr> context, then we will close out the current
 561         //    <tr>.
 562         //
 563         // This insertion strategy is handled later in the method.
 564         // The reason for checking this now, is that in other cases
 565         // we would like to apply other error recovery strategies for example
 566         // ignoring tags.
 567         //
 568         // In certain cases it is better to ignore a tag than try to
 569         // fix the situation.  So the first test is to see if this
 570         // is what we need to do.
 571         //
 572         String stackElemName = stack.elem.getName();
 573         String elemName = elem.getName();
 574 
 575 
 576         if (!strict &&
 577             ((stackElemName.equals("table") && elemName.equals("td")) ||
 578              (stackElemName.equals("table") && elemName.equals("th")) ||
 579              (stackElemName.equals("tr") && !elemName.equals("tr")))){
 580              insertTag = true;
 581         }
 582 
 583 
 584         if (!strict && !insertTag && (stack.elem.getName() != elem.getName() ||
 585                                       elem.getName().equals("body"))) {
 586             if (skipTag = ignoreElement(elem)) {
 587                 error("tag.ignore", elem.getName());
 588                 return skipTag;
 589             }
 590         }
 591 
 592         // Check for anything after the start of the table besides tr, td, th
 593         // or caption, and if those aren't there, insert the <tr> and call
 594         // legalElementContext again.
 595         if (!strict && stackElemName.equals("table") &&
 596             !elemName.equals("tr") && !elemName.equals("td") &&
 597             !elemName.equals("th") && !elemName.equals("caption")) {
 598             Element e = dtd.getElement("tr");
 599             TagElement t = makeTag(e, true);
 600             legalTagContext(t);
 601             startTag(t);
 602             error("start.missing", elem.getName());
 603             return legalElementContext(elem);
 604         }
 605 
 606         // They try to find a legal context by checking if the current
 607         // tag is valid in an enclosing context.  If so
 608         // close out the tags by outputing end tags and then
 609         // insert the curent tag.  If the tags that are
 610         // being closed out do not have an optional end tag
 611         // specification in the DTD then an html error is
 612         // reported.
 613         //
 614         if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) {
 615             for (TagStack s = stack.next ; s != null ; s = s.next) {
 616                 if (s.advance(elem)) {
 617                     while (stack != s) {
 618                         endTag(true);
 619                     }
 620                     return true;
 621                 }
 622                 if (!s.terminate() || (strict && !s.elem.omitEnd())) {
 623                     break;
 624                 }
 625             }
 626         }
 627 
 628         // Check if we know what tag is expected next.
 629         // If so insert the tag.  Report an error if the
 630         // tag does not have its start tag spec in the DTD as optional.
 631         //
 632         Element next = stack.first();
 633         if (next != null && (!strict || next.omitStart()) &&
 634            !(next==dtd.head && elem==dtd.pcdata) ) {
 635             // System.out.println("-- omitting start tag: " + next);
 636             TagElement t = makeTag(next, true);
 637             legalTagContext(t);
 638             startTag(t);
 639             if (!next.omitStart()) {
 640                 error("start.missing", elem.getName());
 641             }
 642             return legalElementContext(elem);
 643         }
 644 
 645 
 646         // Traverse the list of expected elements and determine if adding
 647         // any of these elements would make for a legal context.
 648         //
 649 
 650         if (!strict) {
 651             ContentModel content = stack.contentModel();
 652             Vector<Element> elemVec = new Vector<Element>();
 653             if (content != null) {
 654                 content.getElements(elemVec);
 655                 for (Element e : elemVec) {
 656                     // Ensure that this element has not been included as
 657                     // part of the exclusions in the DTD.
 658                     //
 659                     if (stack.excluded(e.getIndex())) {
 660                         continue;
 661                     }
 662 
 663                     boolean reqAtts = false;
 664 
 665                     for (AttributeList a = e.getAttributes(); a != null ; a = a.next) {
 666                         if (a.modifier == REQUIRED) {
 667                             reqAtts = true;
 668                             break;
 669                         }
 670                     }
 671                     // Ensure that no tag that has required attributes
 672                     // gets inserted.
 673                     //
 674                     if (reqAtts) {
 675                         continue;
 676                     }
 677 
 678                     ContentModel m = e.getContent();
 679                     if (m != null && m.first(elem)) {
 680                         // System.out.println("-- adding a legal tag: " + e);
 681                         TagElement t = makeTag(e, true);
 682                         legalTagContext(t);
 683                         startTag(t);
 684                         error("start.missing", e.getName());
 685                         return legalElementContext(elem);
 686                     }
 687                 }
 688             }
 689         }
 690 
 691         // Check if the stack can be terminated.  If so add the appropriate
 692         // end tag.  Report an error if the tag being ended does not have its
 693         // end tag spec in the DTD as optional.
 694         //
 695         if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) {
 696             // System.out.println("-- omitting end tag: " + stack.elem);
 697             if (!stack.elem.omitEnd()) {
 698                 error("end.missing", elem.getName());
 699             }
 700 
 701             endTag(true);
 702             return legalElementContext(elem);
 703         }
 704 
 705         // At this point we know that something is screwed up.
 706         return false;
 707     }
 708 
 709     /**
 710      * Create a legal context for a tag.
 711      */
 712     void legalTagContext(TagElement tag) throws ChangedCharSetException {
 713         if (legalElementContext(tag.getElement())) {
 714             markFirstTime(tag.getElement());
 715             return;
 716         }
 717 
 718         // Avoid putting a block tag in a flow tag.
 719         if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) {
 720             endTag(true);
 721             legalTagContext(tag);
 722             return;
 723         }
 724 
 725         // Avoid putting something wierd in the head of the document.
 726         for (TagStack s = stack ; s != null ; s = s.next) {
 727             if (s.tag.getElement() == dtd.head) {
 728                 while (stack != s) {
 729                     endTag(true);
 730                 }
 731                 endTag(true);
 732                 legalTagContext(tag);
 733                 return;
 734             }
 735         }
 736 
 737         // Everything failed
 738         error("tag.unexpected", tag.getElement().getName());
 739     }
 740 
 741     /**
 742      * Error context. Something went wrong, make sure we are in
 743      * the document's body context
 744      */
 745     void errorContext() throws ChangedCharSetException {
 746         for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) {
 747             handleEndTag(stack.tag);
 748         }
 749         if (stack == null) {
 750             legalElementContext(dtd.body);
 751             startTag(makeTag(dtd.body, true));
 752         }
 753     }
 754 
 755     /**
 756      * Add a char to the string buffer.
 757      */
 758     void addString(int c) {
 759         if (strpos  == str.length) {
 760             char newstr[] = new char[str.length + 128];
 761             System.arraycopy(str, 0, newstr, 0, str.length);
 762             str = newstr;
 763         }
 764         str[strpos++] = (char)c;
 765     }
 766 
 767     /**
 768      * Get the string that's been accumulated.
 769      */
 770     String getString(int pos) {
 771         char newStr[] = new char[strpos - pos];
 772         System.arraycopy(str, pos, newStr, 0, strpos - pos);
 773         strpos = pos;
 774         return new String(newStr);
 775     }
 776 
 777     char[] getChars(int pos) {
 778         char newStr[] = new char[strpos - pos];
 779         System.arraycopy(str, pos, newStr, 0, strpos - pos);
 780         strpos = pos;
 781         return newStr;
 782     }
 783 
 784     char[] getChars(int pos, int endPos) {
 785         char newStr[] = new char[endPos - pos];
 786         System.arraycopy(str, pos, newStr, 0, endPos - pos);
 787         // REMIND: it's not clear whether this version should set strpos or not
 788         // strpos = pos;
 789         return newStr;
 790     }
 791 
 792     void resetStrBuffer() {
 793         strpos = 0;
 794     }
 795 
 796     int strIndexOf(char target) {
 797         for (int i = 0; i < strpos; i++) {
 798             if (str[i] == target) {
 799                 return i;
 800             }
 801         }
 802 
 803         return -1;
 804     }
 805 
 806     /**
 807      * Skip space.
 808      * [5] 297:5
 809      */
 810     void skipSpace() throws IOException {
 811         while (true) {
 812             switch (ch) {
 813               case '\n':
 814                 ln++;
 815                 ch = readCh();
 816                 lfCount++;
 817                 break;
 818 
 819               case '\r':
 820                 ln++;
 821                 if ((ch = readCh()) == '\n') {
 822                     ch = readCh();
 823                     crlfCount++;
 824                 }
 825                 else {
 826                     crCount++;
 827                 }
 828                 break;
 829               case ' ':
 830               case '\t':
 831                 ch = readCh();
 832                 break;
 833 
 834               default:
 835                 return;
 836             }
 837         }
 838     }
 839 
 840     /**
 841      * Parse identifier. Uppercase characters are folded
     * to lowercase when lower is true. Returns false if
 843      * no identifier is found. [55] 346:17
 844      */
 845     boolean parseIdentifier(boolean lower) throws IOException {
 846         switch (ch) {
 847           case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 848           case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 849           case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 850           case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 851           case 'Y': case 'Z':
 852             if (lower) {
 853                 ch = 'a' + (ch - 'A');
 854             }
 855 
 856           case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 857           case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 858           case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 859           case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 860           case 'y': case 'z':
 861             break;
 862 
 863           default:
 864             return false;
 865         }
 866 
 867         while (true) {
 868             addString(ch);
 869 
 870             switch (ch = readCh()) {
 871               case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 872               case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 873               case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 874               case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 875               case 'Y': case 'Z':
 876                 if (lower) {
 877                     ch = 'a' + (ch - 'A');
 878                 }
 879 
 880               case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 881               case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 882               case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 883               case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 884               case 'y': case 'z':
 885 
 886               case '0': case '1': case '2': case '3': case '4':
 887               case '5': case '6': case '7': case '8': case '9':
 888 
 889               case '.': case '-':
 890 
 891               case '_': // not officially allowed
 892                 break;
 893 
 894               default:
 895                 return true;
 896             }
 897         }
 898     }
 899 
 900     /**
 901      * Parse an entity reference. [59] 350:17
 902      */
 903     private char[] parseEntityReference() throws IOException {
 904         int pos = strpos;
 905 
 906         if ((ch = readCh()) == '#') {
 907             int n = 0;
 908             ch = readCh();
 909             if ((ch >= '0') && (ch <= '9') ||
 910                     ch == 'x' || ch == 'X') {
 911 
 912                 if ((ch >= '0') && (ch <= '9')) {
 913                     // parse decimal reference
 914                     while ((ch >= '0') && (ch <= '9')) {
 915                         n = (n * 10) + ch - '0';
 916                         ch = readCh();
 917                     }
 918                 } else {
 919                     // parse hexadecimal reference
 920                     ch = readCh();
 921                     char lch = (char) Character.toLowerCase(ch);
 922                     while ((lch >= '0') && (lch <= '9') ||
 923                             (lch >= 'a') && (lch <= 'f')) {
 924                         if (lch >= '0' && lch <= '9') {
 925                             n = (n * 16) + lch - '0';
 926                         } else {
 927                             n = (n * 16) + lch - 'a' + 10;
 928                         }
 929                         ch = readCh();
 930                         lch = (char) Character.toLowerCase(ch);
 931                     }
 932                 }
 933                 switch (ch) {
 934                     case '\n':
 935                         ln++;
 936                         ch = readCh();
 937                         lfCount++;
 938                         break;
 939 
 940                     case '\r':
 941                         ln++;
 942                         if ((ch = readCh()) == '\n') {
 943                             ch = readCh();
 944                             crlfCount++;
 945                         }
 946                         else {
 947                             crCount++;
 948                         }
 949                         break;
 950 
 951                     case ';':
 952                         ch = readCh();
 953                         break;
 954                 }
 955                 char data[] = {mapNumericReference((char) n)};
 956                 return data;
 957             }
 958             addString('#');
 959             if (!parseIdentifier(false)) {
 960                 error("ident.expected");
 961                 strpos = pos;
 962                 char data[] = {'&', '#'};
 963                 return data;
 964             }
 965         } else if (!parseIdentifier(false)) {
 966             char data[] = {'&'};
 967             return data;
 968         }
 969         switch (ch) {
 970           case '\n':
 971             ln++;
 972             ch = readCh();
 973             lfCount++;
 974             break;
 975 
 976           case '\r':
 977             ln++;
 978             if ((ch = readCh()) == '\n') {
 979                 ch = readCh();
 980                 crlfCount++;
 981             }
 982             else {
 983                 crCount++;
 984             }
 985             break;
 986 
 987           case ';':
 988             ch = readCh();
 989             break;
 990         }
 991 
 992         String nm = getString(pos);
 993         Entity ent = dtd.getEntity(nm);
 994 
 995         // entities are case sensitive - however if strict
 996         // is false then we will try to make a match by
 997         // converting the string to all lowercase.
 998         //
 999         if (!strict && (ent == null)) {
1000             ent = dtd.getEntity(nm.toLowerCase());
1001         }
1002         if ((ent == null) || !ent.isGeneral()) {
1003 
1004             if (nm.length() == 0) {
1005                 error("invalid.entref", nm);
1006                 return new char[0];
1007             }
1008             /* given that there is not a match restore the entity reference */
1009             String str = "&" + nm + ";";
1010 
1011             char b[] = new char[str.length()];
1012             str.getChars(0, b.length, b, 0);
1013             return b;
1014         }
1015         return ent.getData();
1016     }
1017 
1018     /**
1019      * Converts numeric character reference to Unicode character.
1020      *
1021      * Normally the code in a reference should be always converted
1022      * to the Unicode character with the same code, but due to
1023      * wide usage of Cp1252 charset most browsers map numeric references
1024      * in the range 130-159 (which are control chars in Unicode set)
1025      * to displayable characters with other codes.
1026      *
1027      * @param c the code of numeric character reference.
1028      * @return the character corresponding to the reference code.
1029      */
1030     private char mapNumericReference(char c) {
1031         if (c < 130 || c > 159) {
1032             return c;
1033         }
1034         return cp1252Map[c - 130];
1035     }
1036 
1037     /**
1038      * Parse a comment. [92] 391:7
1039      */
1040     void parseComment() throws IOException {
1041 
1042         while (true) {
1043             int c = ch;
1044             switch (c) {
1045               case '-':
1046                   /** Presuming that the start string of a comment "<!--" has
1047                       already been parsed, the '-' character is valid only as
1048                       part of a comment termination and further more it must
1049                       be present in even numbers. Hence if strict is true, we
1050                       presume the comment has been terminated and return.
1051                       However if strict is false, then there is no even number
1052                       requirement and this character can appear anywhere in the
1053                       comment.  The parser reads on until it sees the following
1054                       pattern: "-->" or "--!>".
1055                    **/
1056                 if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) {
1057                     if ((ch = readCh()) == '>') {
1058                         return;
1059                     }
1060                     if (ch == '!') {
1061                         if ((ch = readCh()) == '>') {
1062                             return;
1063                         } else {
1064                             /* to account for extra read()'s that happened */
1065                             addString('-');
1066                             addString('!');
1067                             continue;
1068                         }
1069                     }
1070                     break;
1071                 }
1072 
1073                 if ((ch = readCh()) == '-') {
1074                     ch = readCh();
1075                     if (strict || ch == '>') {
1076                         return;
1077                     }
1078                     if (ch == '!') {
1079                         if ((ch = readCh()) == '>') {
1080                             return;
1081                         } else {
1082                             /* to account for extra read()'s that happened */
1083                             addString('-');
1084                             addString('!');
1085                             continue;
1086                         }
1087                     }
1088                     /* to account for the extra read() */
1089                     addString('-');
1090                 }
1091                 break;
1092 
1093               case -1:
1094                   handleEOFInComment();
1095                   return;
1096 
1097               case '\n':
1098                 ln++;
1099                 ch = readCh();
1100                 lfCount++;
1101                 break;
1102 
1103               case '>':
1104                 ch = readCh();
1105                 break;
1106 
1107               case '\r':
1108                 ln++;
1109                 if ((ch = readCh()) == '\n') {
1110                     ch = readCh();
1111                     crlfCount++;
1112                 }
1113                 else {
1114                     crCount++;
1115                 }
1116                 c = '\n';
1117                 break;
1118               default:
1119                 ch = readCh();
1120                 break;
1121             }
1122 
1123             addString(c);
1124         }
1125     }
1126 
1127     /**
1128      * Parse literal content. [46] 343:1 and [47] 344:1
1129      */
1130     void parseLiteral(boolean replace) throws IOException {
1131         while (true) {
1132             int c = ch;
1133             switch (c) {
1134               case -1:
1135                 error("eof.literal", stack.elem.getName());
1136                 endTag(true);
1137                 return;
1138 
1139               case '>':
1140                 ch = readCh();
1141                 int i = textpos - (stack.elem.name.length() + 2), j = 0;
1142 
1143                 // match end tag
1144                 if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) {
1145                     while ((++i < textpos) &&
1146                            (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++)));
1147                     if (i == textpos) {
1148                         textpos -= (stack.elem.name.length() + 2);
1149                         if ((textpos > 0) && (text[textpos-1] == '\n')) {
1150                             textpos--;
1151                         }
1152                         endTag(false);
1153                         return;
1154                     }
1155                 }
1156                 break;
1157 
1158               case '&':
1159                 char data[] = parseEntityReference();
1160                 if (textpos + data.length > text.length) {
1161                     char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
1162                     System.arraycopy(text, 0, newtext, 0, text.length);
1163                     text = newtext;
1164                 }
1165                 System.arraycopy(data, 0, text, textpos, data.length);
1166                 textpos += data.length;
1167                 continue;
1168 
1169               case '\n':
1170                 ln++;
1171                 ch = readCh();
1172                 lfCount++;
1173                 break;
1174 
1175               case '\r':
1176                 ln++;
1177                 if ((ch = readCh()) == '\n') {
1178                     ch = readCh();
1179                     crlfCount++;
1180                 }
1181                 else {
1182                     crCount++;
1183                 }
1184                 c = '\n';
1185                 break;
1186               default:
1187                 ch = readCh();
1188                 break;
1189             }
1190 
1191             // output character
1192             if (textpos == text.length) {
1193                 char newtext[] = new char[text.length + 128];
1194                 System.arraycopy(text, 0, newtext, 0, text.length);
1195                 text = newtext;
1196             }
1197             text[textpos++] = (char)c;
1198         }
1199     }
1200 
1201     /**
1202      * Parse attribute value. [33] 331:1
1203      */
1204     String parseAttributeValue(boolean lower) throws IOException {
1205         int delim = -1;
1206 
1207         // Check for a delimiter
1208         switch(ch) {
1209           case '\'':
1210           case '"':
1211             delim = ch;
1212             ch = readCh();
1213             break;
1214         }
1215 
1216         // Parse the rest of the value
1217         while (true) {
1218             int c = ch;
1219 
1220             switch (c) {
1221               case '\n':
1222                 ln++;
1223                 ch = readCh();
1224                 lfCount++;
1225                 if (delim < 0) {
1226                     return getString(0);
1227                 }
1228                 break;
1229 
1230               case '\r':
1231                 ln++;
1232 
1233                 if ((ch = readCh()) == '\n') {
1234                     ch = readCh();
1235                     crlfCount++;
1236                 }
1237                 else {
1238                     crCount++;
1239                 }
1240                 if (delim < 0) {
1241                     return getString(0);
1242                 }
1243                 break;
1244 
1245               case '\t':
1246                   if (delim < 0)
1247                       c = ' ';
1248               case ' ':
1249                 ch = readCh();
1250                 if (delim < 0) {
1251                     return getString(0);
1252                 }
1253                 break;
1254 
1255               case '>':
1256               case '<':
1257                 if (delim < 0) {
1258                     return getString(0);
1259                 }
1260                 ch = readCh();
1261                 break;
1262 
1263               case '\'':
1264               case '"':
1265                 ch = readCh();
1266                 if (c == delim) {
1267                     return getString(0);
1268                 } else if (delim == -1) {
1269                     error("attvalerr");
1270                     if (strict || ch == ' ') {
1271                         return getString(0);
1272                     } else {
1273                         continue;
1274                     }
1275                 }
1276                 break;
1277 
1278             case '=':
1279                 if (delim < 0) {
1280                     /* In SGML a construct like <img src=/cgi-bin/foo?x=1>
1281                        is considered invalid since an = sign can only be contained
1282                        in an attributes value if the string is quoted.
1283                        */
1284                     error("attvalerr");
1285                     /* If strict is true then we return with the string we have thus far.
1286                        Otherwise we accept the = sign as part of the attribute's value and
1287                        process the rest of the img tag. */
1288                     if (strict) {
1289                         return getString(0);
1290                     }
1291                 }
1292                 ch = readCh();
1293                 break;
1294 
1295               case '&':
1296                 if (strict && delim < 0) {
1297                     ch = readCh();
1298                     break;
1299                 }
1300 
1301                 char data[] = parseEntityReference();
1302                 for (int i = 0 ; i < data.length ; i++) {
1303                     c = data[i];
1304                     addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c);
1305                 }
1306                 continue;
1307 
1308               case -1:
1309                 return getString(0);
1310 
1311               default:
1312                 if (lower && (c >= 'A') && (c <= 'Z')) {
1313                     c = 'a' + c - 'A';
1314                 }
1315                 ch = readCh();
1316                 break;
1317             }
1318             addString(c);
1319         }
1320     }
1321 
1322 
1323     /**
1324      * Parse attribute specification List. [31] 327:17
1325      */
1326     void parseAttributeSpecificationList(Element elem) throws IOException {
1327 
1328         while (true) {
1329             skipSpace();
1330 
1331             switch (ch) {
1332               case '/':
1333               case '>':
1334               case '<':
1335               case -1:
1336                 return;
1337 
1338               case '-':
1339                 if ((ch = readCh()) == '-') {
1340                     ch = readCh();
1341                     parseComment();
1342                     strpos = 0;
1343                 } else {
1344                     error("invalid.tagchar", "-", elem.getName());
1345                     ch = readCh();
1346                 }
1347                 continue;
1348             }
1349 
1350             AttributeList att;
1351             String attname;
1352             String attvalue;
1353 
1354             if (parseIdentifier(true)) {
1355                 attname = getString(0);
1356                 skipSpace();
1357                 if (ch == '=') {
1358                     ch = readCh();
1359                     skipSpace();
1360                     att = elem.getAttribute(attname);
1361 //  Bug ID 4102750
1362 //  Load the NAME of an Attribute Case Sensitive
1363 //  The case of the NAME  must be intact
1364 //  MG 021898
1365                     attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME));
1366 //                  attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION));
1367                 } else {
1368                     attvalue = attname;
1369                     att = elem.getAttributeByValue(attvalue);
1370                     if (att == null) {
1371                         att = elem.getAttribute(attname);
1372                         if (att != null) {
1373                             attvalue = att.getValue();
1374                         }
1375                         else {
1376                             // Make it null so that NULL_ATTRIBUTE_VALUE is
1377                             // used
1378                             attvalue = null;
1379                         }
1380                     }
1381                 }
1382             } else if (!strict && ch == ',') { // allows for comma separated attribute-value pairs
1383                 ch = readCh();
1384                 continue;
1385             } else if (!strict && ch == '"') { // allows for quoted attributes
1386                 ch = readCh();
1387                 skipSpace();
1388                 if (parseIdentifier(true)) {
1389                     attname = getString(0);
1390                     if (ch == '"') {
1391                         ch = readCh();
1392                     }
1393                     skipSpace();
1394                     if (ch == '=') {
1395                         ch = readCh();
1396                         skipSpace();
1397                         att = elem.getAttribute(attname);
1398                         attvalue = parseAttributeValue((att != null) &&
1399                                                 (att.type != CDATA) &&
1400                                                 (att.type != NOTATION));
1401                     } else {
1402                         attvalue = attname;
1403                         att = elem.getAttributeByValue(attvalue);
1404                         if (att == null) {
1405                             att = elem.getAttribute(attname);
1406                             if (att != null) {
1407                                 attvalue = att.getValue();
1408                             }
1409                         }
1410                     }
1411                 } else {
1412                     char str[] = {(char)ch};
1413                     error("invalid.tagchar", new String(str), elem.getName());
1414                     ch = readCh();
1415                     continue;
1416                 }
1417             } else if (!strict && (attributes.isEmpty()) && (ch == '=')) {
1418                 ch = readCh();
1419                 skipSpace();
1420                 attname = elem.getName();
1421                 att = elem.getAttribute(attname);
1422                 attvalue = parseAttributeValue((att != null) &&
1423                                                (att.type != CDATA) &&
1424                                                (att.type != NOTATION));
1425             } else if (!strict && (ch == '=')) {
1426                 ch = readCh();
1427                 skipSpace();
1428                 attvalue = parseAttributeValue(true);
1429                 error("attvalerr");
1430                 return;
1431             } else {
1432                 char str[] = {(char)ch};
1433                 error("invalid.tagchar", new String(str), elem.getName());
1434                 if (!strict) {
1435                     ch = readCh();
1436                     continue;
1437                 } else {
1438                     return;
1439                 }
1440             }
1441 
1442             if (att != null) {
1443                 attname = att.getName();
1444             } else {
1445                 error("invalid.tagatt", attname, elem.getName());
1446             }
1447 
1448             // Check out the value
1449             if (attributes.isDefined(attname)) {
1450                 error("multi.tagatt", attname, elem.getName());
1451             }
1452             if (attvalue == null) {
1453                 attvalue = ((att != null) && (att.value != null)) ? att.value :
1454                     HTML.NULL_ATTRIBUTE_VALUE;
1455             } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) {
1456                 error("invalid.tagattval", attname, elem.getName());
1457             }
1458             HTML.Attribute attkey = HTML.getAttributeKey(attname);
1459             if (attkey == null) {
1460                 attributes.addAttribute(attname, attvalue);
1461             } else {
1462                 attributes.addAttribute(attkey, attvalue);
1463             }
1464         }
1465     }
1466 
1467     /**
1468      * Parses th Document Declaration Type markup declaration.
1469      * Currently ignores it.
1470      */
1471     public String parseDTDMarkup() throws IOException {
1472 
1473         StringBuilder strBuff = new StringBuilder();
1474         ch = readCh();
1475         while(true) {
1476             switch (ch) {
1477             case '>':
1478                 ch = readCh();
1479                 return strBuff.toString();
1480             case -1:
1481                 error("invalid.markup");
1482                 return strBuff.toString();
1483             case '\n':
1484                 ln++;
1485                 ch = readCh();
1486                 lfCount++;
1487                 break;
1488             case '"':
1489                 ch = readCh();
1490                 break;
1491             case '\r':
1492                 ln++;
1493                 if ((ch = readCh()) == '\n') {
1494                     ch = readCh();
1495                     crlfCount++;
1496                 }
1497                 else {
1498                     crCount++;
1499                 }
1500                 break;
1501             default:
1502                 strBuff.append((char)(ch & 0xFF));
1503                 ch = readCh();
1504                 break;
1505             }
1506         }
1507     }
1508 
1509     /**
1510      * Parse markup declarations.
1511      * Currently only handles the Document Type Declaration markup.
1512      * Returns true if it is a markup declaration false otherwise.
1513      */
1514     protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException {
1515 
1516         /* Currently handles only the DOCTYPE */
1517         if ((strBuff.length() == "DOCTYPE".length()) &&
1518             (strBuff.toString().toUpperCase().equals("DOCTYPE"))) {
1519             parseDTDMarkup();
1520             return true;
1521         }
1522         return false;
1523     }
1524 
1525     /**
1526      * Parse an invalid tag.
1527      */
1528     void parseInvalidTag() throws IOException {
1529         // ignore all data upto the close bracket '>'
1530         while (true) {
1531             skipSpace();
1532             switch (ch) {
1533               case '>':
1534               case -1:
1535                   ch = readCh();
1536                 return;
1537               case '<':
1538                   return;
1539               default:
1540                   ch = readCh();
1541 
1542             }
1543         }
1544     }
1545 
1546     /**
1547      * Parse a start or end tag.
1548      */
1549     void parseTag() throws IOException {
1550         Element elem;
1551         boolean net = false;
1552         boolean warned = false;
1553         boolean unknown = false;
1554 
1555         switch (ch = readCh()) {
1556           case '!':
1557             switch (ch = readCh()) {
1558               case '-':
1559                 // Parse comment. [92] 391:7
1560                 while (true) {
1561                     if (ch == '-') {
1562                         if (!strict || ((ch = readCh()) == '-')) {
1563                             ch = readCh();
1564                             if (!strict && ch == '-') {
1565                                 ch = readCh();
1566                             }
1567                             // send over any text you might see
1568                             // before parsing and sending the
1569                             // comment
1570                             if (textpos != 0) {
1571                                 char newtext[] = new char[textpos];
1572                                 System.arraycopy(text, 0, newtext, 0, textpos);
1573                                 handleText(newtext);
1574                                 lastBlockStartPos = currentBlockStartPos;
1575                                 textpos = 0;
1576                             }
1577                             parseComment();
1578                             last = makeTag(dtd.getElement("comment"), true);
1579                             handleComment(getChars(0));
1580                             continue;
1581                         } else if (!warned) {
1582                             warned = true;
1583                             error("invalid.commentchar", "-");
1584                         }
1585                     }
1586                     skipSpace();
1587                     switch (ch) {
1588                       case '-':
1589                         continue;
1590                       case '>':
1591                         ch = readCh();
1592                       case -1:
1593                         return;
1594                       default:
1595                         ch = readCh();
1596                         if (!warned) {
1597                             warned = true;
1598                             error("invalid.commentchar",
1599                                   String.valueOf((char)ch));
1600                         }
1601                         break;
1602                     }
1603                 }
1604 
1605               default:
1606                 // deal with marked sections
1607                 StringBuffer strBuff = new StringBuffer();
1608                 while (true) {
1609                     strBuff.append((char)ch);
1610                     if (parseMarkupDeclarations(strBuff)) {
1611                         return;
1612                     }
1613                     switch(ch) {
1614                       case '>':
1615                         ch = readCh();
1616                       case -1:
1617                         error("invalid.markup");
1618                         return;
1619                       case '\n':
1620                         ln++;
1621                         ch = readCh();
1622                         lfCount++;
1623                         break;
1624                       case '\r':
1625                         ln++;
1626                         if ((ch = readCh()) == '\n') {
1627                             ch = readCh();
1628                             crlfCount++;
1629                         }
1630                         else {
1631                             crCount++;
1632                         }
1633                         break;
1634 
1635                       default:
1636                         ch = readCh();
1637                         break;
1638                     }
1639                 }
1640             }
1641 
1642           case '/':
1643             // parse end tag [19] 317:4
1644             switch (ch = readCh()) {
1645               case '>':
1646                 ch = readCh();
1647               case '<':
1648                 // empty end tag. either </> or </<
1649                 if (recent == null) {
1650                     error("invalid.shortend");
1651                     return;
1652                 }
1653                 elem = recent;
1654                 break;
1655 
1656               default:
1657                 if (!parseIdentifier(true)) {
1658                     error("expected.endtagname");
1659                     return;
1660                 }
1661                 skipSpace();
1662                 switch (ch) {
1663                   case '>':
1664                     ch = readCh();
1665                   case '<':
1666                     break;
1667 
1668                   default:
1669                     error("expected", "'>'");
1670                     while ((ch != -1) && (ch != '\n') && (ch != '>')) {
1671                         ch = readCh();
1672                     }
1673                     if (ch == '>') {
1674                         ch = readCh();
1675                     }
1676                     break;
1677                 }
1678                 String elemStr = getString(0);
1679                 if (!dtd.elementExists(elemStr)) {
1680                     error("end.unrecognized", elemStr);
1681                     // Ignore RE before end tag
1682                     if ((textpos > 0) && (text[textpos-1] == '\n')) {
1683                         textpos--;
1684                     }
1685                     elem = dtd.getElement("unknown");
1686                     elem.name = elemStr;
1687                     unknown = true;
1688                 } else {
1689                     elem = dtd.getElement(elemStr);
1690                 }
1691                 break;
1692             }
1693 
1694 
1695             // If the stack is null, we're seeing end tags without any begin
1696             // tags.  Ignore them.
1697 
1698             if (stack == null) {
1699                 error("end.extra.tag", elem.getName());
1700                 return;
1701             }
1702 
1703             // Ignore RE before end tag
1704             if ((textpos > 0) && (text[textpos-1] == '\n')) {
1705                 // In a pre tag, if there are blank lines
1706                 // we do not want to remove the newline
1707                 // before the end tag.  Hence this code.
1708                 //
1709                 if (stack.pre) {
1710                     if ((textpos > 1) && (text[textpos-2] != '\n')) {
1711                         textpos--;
1712                     }
1713                 } else {
1714                     textpos--;
1715                 }
1716             }
1717 
1718             // If the end tag is a form, since we did not put it
1719             // on the tag stack, there is no corresponding start
1720             // start tag to find. Hence do not touch the tag stack.
1721             //
1722 
1723             /*
1724             if (!strict && elem.getName().equals("form")) {
1725                 if (lastFormSent != null) {
1726                     handleEndTag(lastFormSent);
1727                     return;
1728                 } else {
1729                     // do nothing.
1730                     return;
1731                 }
1732             }
1733             */
1734 
1735             if (unknown) {
1736                 // we will not see a corresponding start tag
1737                 // on the the stack.  If we are seeing an
1738                 // end tag, lets send this on as an empty
1739                 // tag with the end tag attribute set to
1740                 // true.
1741                 TagElement t = makeTag(elem);
1742                 handleText(t);
1743                 attributes.addAttribute(HTML.Attribute.ENDTAG, "true");
1744                 handleEmptyTag(makeTag(elem));
1745                 unknown = false;
1746                 return;
1747             }
1748 
1749             // find the corresponding start tag
1750 
1751             // A commonly occuring error appears to be the insertion
1752             // of extra end tags in a table.  The intent here is ignore
1753             // such extra end tags.
1754             //
1755             if (!strict) {
1756                 String stackElem = stack.elem.getName();
1757 
1758                 if (stackElem.equals("table")) {
1759                     // If it isnt a valid end tag ignore it and return
1760                     //
1761                     if (!elem.getName().equals(stackElem)) {
1762                         error("tag.ignore", elem.getName());
1763                         return;
1764                     }
1765                 }
1766 
1767 
1768 
1769                 if (stackElem.equals("tr") ||
1770                     stackElem.equals("td")) {
1771                     if ((!elem.getName().equals("table")) &&
1772                         (!elem.getName().equals(stackElem))) {
1773                         error("tag.ignore", elem.getName());
1774                         return;
1775                     }
1776                 }
1777             }
1778             TagStack sp = stack;
1779 
1780             while ((sp != null) && (elem != sp.elem)) {
1781                 sp = sp.next;
1782             }
1783             if (sp == null) {
1784                 error("unmatched.endtag", elem.getName());
1785                 return;
1786             }
1787 
1788             // People put font ending tags in the darndest places.
1789             // Don't close other contexts based on them being between
1790             // a font tag and the corresponding end tag.  Instead,
1791             // ignore the end tag like it doesn't exist and allow the end
1792             // of the document to close us out.
1793             String elemName = elem.getName();
1794             if (stack != sp &&
1795                 (elemName.equals("font") ||
1796                  elemName.equals("center"))) {
1797 
1798                 // Since closing out a center tag can have real wierd
1799                 // effects on the formatting,  make sure that tags
1800                 // for which omitting an end tag is legimitate
1801                 // get closed out.
1802                 //
1803                 if (elemName.equals("center")) {
1804                     while(stack.elem.omitEnd() && stack != sp) {
1805                         endTag(true);
1806                     }
1807                     if (stack.elem == elem) {
1808                         endTag(false);
1809                     }
1810                 }
1811                 return;
1812             }
1813             // People do the same thing with center tags.  In this
1814             // case we would like to close off the center tag but
1815             // not necessarily all enclosing tags.
1816 
1817 
1818 
1819             // end tags
1820             while (stack != sp) {
1821                 endTag(true);
1822             }
1823 
1824             endTag(false);
1825             return;
1826 
1827           case -1:
1828             error("eof");
1829             return;
1830         }
1831 
1832         // start tag [14] 314:1
1833         if (!parseIdentifier(true)) {
1834             elem = recent;
1835             if ((ch != '>') || (elem == null)) {
1836                 error("expected.tagname");
1837                 return;
1838             }
1839         } else {
1840             String elemStr = getString(0);
1841 
1842             if (elemStr.equals("image")) {
1843                 elemStr = "img";
1844             }
1845 
1846             /* determine if this element is part of the dtd. */
1847 
1848             if (!dtd.elementExists(elemStr)) {
1849                 //              parseInvalidTag();
1850                 error("tag.unrecognized ", elemStr);
1851                 elem = dtd.getElement("unknown");
1852                 elem.name = elemStr;
1853                 unknown = true;
1854             } else {
1855                 elem = dtd.getElement(elemStr);
1856             }
1857         }
1858 
1859         // Parse attributes
1860         parseAttributeSpecificationList(elem);
1861 
1862         switch (ch) {
1863           case '/':
1864             net = true;
1865           case '>':
1866             ch = readCh();
1867             if (ch == '>' && net) {
1868                 ch = readCh();
1869             }
1870           case '<':
1871             break;
1872 
1873           default:
1874             error("expected", "'>'");
1875             break;
1876         }
1877 
1878         if (!strict) {
1879           if (elem.getName().equals("script")) {
1880             error("javascript.unsupported");
1881           }
1882         }
1883 
1884         // ignore RE after start tag
1885         //
1886         if (!elem.isEmpty())  {
1887             if (ch == '\n') {
1888                 ln++;
1889                 lfCount++;
1890                 ch = readCh();
1891             } else if (ch == '\r') {
1892                 ln++;
1893                 if ((ch = readCh()) == '\n') {
1894                     ch = readCh();
1895                     crlfCount++;
1896                 }
1897                 else {
1898                     crCount++;
1899                 }
1900             }
1901         }
1902 
1903         // ensure a legal context for the tag
1904         TagElement tag = makeTag(elem, false);
1905 
1906 
1907         /** In dealing with forms, we have decided to treat
1908             them as legal in any context.  Also, even though
1909             they do have a start and an end tag, we will
1910             not put this tag on the stack.  This is to deal
1911             several pages in the web oasis that choose to
1912             start and end forms in any possible location. **/
1913 
1914         /*
1915         if (!strict && elem.getName().equals("form")) {
1916             if (lastFormSent == null) {
1917                 lastFormSent = tag;
1918             } else {
1919                 handleEndTag(lastFormSent);
1920                 lastFormSent = tag;
1921             }
1922         } else {
1923         */
1924             // Smlly, if a tag is unknown, we will apply
1925             // no legalTagContext logic to it.
1926             //
1927             if (!unknown) {
1928                 legalTagContext(tag);
1929 
1930                 // If skip tag is true,  this implies that
1931                 // the tag was illegal and that the error
1932                 // recovery strategy adopted is to ignore
1933                 // the tag.
1934                 if (!strict && skipTag) {
1935                     skipTag = false;
1936                     return;
1937                 }
1938             }
1939             /*
1940         }
1941             */
1942 
1943         startTag(tag);
1944 
1945         if (!elem.isEmpty()) {
1946             switch (elem.getType()) {
1947               case CDATA:
1948                 parseLiteral(false);
1949                 break;
1950               case RCDATA:
1951                 parseLiteral(true);
1952                 break;
1953               default:
1954                 if (stack != null) {
1955                     stack.net = net;
1956                 }
1957                 break;
1958             }
1959         }
1960     }
1961 
1962     private static final String START_COMMENT = "<!--";
1963     private static final String END_COMMENT = "-->";
1964     private static final char[] SCRIPT_END_TAG = "</script>".toCharArray();
1965     private static final char[] SCRIPT_END_TAG_UPPER_CASE = "</SCRIPT>".toCharArray();
1966 
1967     /**
1968      * Passes script content to addString() until a "</script>" end tag (each
1969      * letter in either case) or EOF, reported as "eof.script".  ch is the
1970      * first character after the start tag on entry, after the end tag on exit.
1971      */
1972     void parseScript() throws IOException {
1973         char[] charsToAdd = new char[SCRIPT_END_TAG.length];
1974         while (true) {
1975             int i = 0;
1976             while (i < SCRIPT_END_TAG.length
1977                        && (SCRIPT_END_TAG[i] == ch
1978                            || SCRIPT_END_TAG_UPPER_CASE[i] == ch)) {
1979                 charsToAdd[i] = (char) ch;
1980                 ch = readCh();
1981                 i++;
1982             }
1983             if (i == SCRIPT_END_TAG.length) {
1984                 // The loop above already read past the closing '>'; another
1985                 // readCh() here would drop the character after </script>.
1986                 return;
1987             }
1988             if (i > 0) {
1989                 // Partial match: keep the consumed characters, then re-test
1990                 // ch, which may itself open the end tag ("<</script>").
1991                 for (int j = 0; j < i; j++) {
1992                     addString(charsToAdd[j]);
1993                 }
1994                 continue;
1995             }
1996             switch (ch) {
1997             case -1:
1998                 error("eof.script");
1999                 return;
2000             case '\n':
2001                 ln++;
2002                 ch = readCh();
2003                 lfCount++;
2004                 addString('\n');
2005                 break;
2006             case '\r':
2007                 ln++;
2008                 if ((ch = readCh()) == '\n') {
2009                     ch = readCh();
2010                     crlfCount++;
2011                 } else {
2012                     crCount++;
2013                 }
2014                 addString('\n');
2015                 break;
2016             default:
2017                 addString(ch);
2018                 ch = readCh();
2019                 break;
2020             } // switch
2021         } // while
2022     }
2023 
2024     /**
2025      * Parse Content. [24] 320:1
2026      * Loops until EOF or thread interruption.  Markup goes to parseTag(); a
2027      * script body is read by parseScript() and reported via handleComment();
2028      * other characters are buffered in text, with whitespace runs collapsed
2029      * to one space unless stack.pre is set.
2030      */
2031     void parseContent() throws IOException {
2032         Thread curThread = Thread.currentThread();
2033         for (;;) {
2034             if (curThread.isInterrupted()) {
2035                 curThread.interrupt(); // resignal the interrupt
2036                 break;
2037             }
2038             int c = ch;
2039             currentBlockStartPos = currentPosition;
2040 
2041             if (recent == dtd.script) { // means: if after starting <script> tag
2042                 /* Here, ch has to be the first character after <script> */
2043                 parseScript();
2044                 last = makeTag(dtd.getElement("comment"), true);
2045                 /* Remove leading and trailing HTML comment declarations */
2046                 String str = new String(getChars(0)).trim();
2047                 int minLength = START_COMMENT.length() + END_COMMENT.length();
2048                 if (str.startsWith(START_COMMENT) && str.endsWith(END_COMMENT)
2049                        && str.length() >= (minLength)) {
2050                     str = str.substring(START_COMMENT.length(), str.length() - END_COMMENT.length());
2051                 }
2052                 /* Handle resulting chars as comment */
2053                 handleComment(str.toCharArray());
2054                 endTag(false);
2055                 lastBlockStartPos = currentPosition;
2056                 // c was script content that parseScript() already consumed;
2057                 // falling through would append it to text as character data.
2058                 continue;
2059             } else {
2060                 switch (c) {
2061                   case '<':
2062                     parseTag();
2063                     lastBlockStartPos = currentPosition;
2064                     continue;
2065 
2066                   case '/':
2067                     ch = readCh();
2068                     if ((stack != null) && stack.net) {
2069                         // null end tag.
2070                         endTag(false);
2071                         continue;
2072                     }
2073                     break;
2074 
2075                   case -1:
2076                     return;
2077 
2078                   case '&':
2079                     if (textpos == 0) {
2080                         if (!legalElementContext(dtd.pcdata)) {
2081                             error("unexpected.pcdata");
2082                         }
2083                         if (last.breaksFlow()) {
2084                             space = false;
2085                         }
2086                     }
2087                     char data[] = parseEntityReference();
2088                     if (textpos + data.length + 1 > text.length) {
2089                         char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
2090                         System.arraycopy(text, 0, newtext, 0, text.length);
2091                         text = newtext;
2092                     }
2093                     if (space) {
2094                         space = false;
2095                         text[textpos++] = ' ';
2096                     }
2097                     System.arraycopy(data, 0, text, textpos, data.length);
2098                     textpos += data.length;
2099                     ignoreSpace = false;
2100                     continue;
2101 
2102                   case '\n':
2103                     ln++;
2104                     lfCount++;
2105                     ch = readCh();
2106                     if ((stack != null) && stack.pre) {
2107                         break;
2108                     }
2109                     if (textpos == 0) {
2110                         lastBlockStartPos = currentPosition;
2111                     }
2112                     if (!ignoreSpace) {
2113                         space = true;
2114                     }
2115                     continue;
2116 
2117                   case '\r':
2118                     ln++;
2119                     c = '\n';
2120                     if ((ch = readCh()) == '\n') {
2121                         ch = readCh();
2122                         crlfCount++;
2123                     }
2124                     else {
2125                         crCount++;
2126                     }
2127                     if ((stack != null) && stack.pre) {
2128                         break;
2129                     }
2130                     if (textpos == 0) {
2131                         lastBlockStartPos = currentPosition;
2132                     }
2133                     if (!ignoreSpace) {
2134                         space = true;
2135                     }
2136                     continue;
2137 
2138                   case '\t':
2139                   case ' ':
2140                     ch = readCh();
2141                     if ((stack != null) && stack.pre) {
2142                         break;
2143                     }
2144                     if (textpos == 0) {
2145                         lastBlockStartPos = currentPosition;
2146                     }
2147                     if (!ignoreSpace) {
2148                         space = true;
2149                     }
2150                     continue;
2151 
2152                   default:
2153                     if (textpos == 0) {
2154                         if (!legalElementContext(dtd.pcdata)) {
2155                             error("unexpected.pcdata");
2156                         }
2157                         if (last.breaksFlow()) {
2158                             space = false;
2159                         }
2160                     }
2161                     ch = readCh();
2162                     break;
2163                 }
2164             }
2165 
2166             // enlarge buffer if needed
2167             if (textpos + 2 > text.length) {
2168                 char newtext[] = new char[text.length + 128];
2169                 System.arraycopy(text, 0, newtext, 0, text.length);
2170                 text = newtext;
2171             }
2172 
2173             // output pending space
2174             if (space) {
2175                 if (textpos == 0) {
2176                     lastBlockStartPos--;
2177                 }
2178                 text[textpos++] = ' ';
2179                 space = false;
2180             }
2181             text[textpos++] = (char)c;
2182             ignoreSpace = false;
2183         }
2184     }
2185 
2186     /**
2187      * Returns the end of line string. This will return the end of line
2188      * string that has been encountered the most, one of \r, \n or \r\n.
2189      */
2190     String getEndOfLineString() {
2191         if (crlfCount >= crCount) {
2192             if (lfCount >= crlfCount) {
2193                 return "\n";
2194             }
2195             else {
2196                 return "\r\n";
2197             }
2198         }
2199         else {
2200             if (crCount > lfCount) {
2201                 return "\r";
2202             }
2203             else {
2204                 return "\n";
2205             }
2206         }
2207     }
2208 
2209     /**
2210      * Parse an HTML stream, given a DTD.
2211      */
2212     public synchronized void parse(Reader in) throws IOException {
2213         this.in = in;
2214 
2215         this.ln = 1;
2216 
2217         seenHtml = false;
2218         seenHead = false;
2219         seenBody = false;
2220 
2221         crCount = lfCount = crlfCount = 0;
2222 
2223         try {
2224             ch = readCh();
2225             text = new char[1024];
2226             str = new char[128];
2227 
2228             parseContent();
2229             // NOTE: interruption may have occurred.  Control flows out
2230             // of here normally.
2231             while (stack != null) {
2232                 endTag(true);
2233             }
2234             in.close();
2235         } catch (IOException e) {
2236             errorContext();
2237             error("ioexception");
2238             throw e;
2239         } catch (Exception e) {
2240             errorContext();
2241             error("exception", e.getClass().getName(), e.getMessage());
2242             e.printStackTrace();
2243         } catch (ThreadDeath e) {
2244             errorContext();
2245             error("terminated");
2246             e.printStackTrace();
2247             throw e;
2248         } finally {
2249             for (; stack != null ; stack = stack.next) {
2250                 handleEndTag(stack.tag);
2251             }
2252 
2253             text = null;
2254             str = null;
2255         }
2256 
2257     }
2258 
2259 
2260     /*
2261      * Input cache.  This is much faster than calling down to a synchronized
2262      * method of BufferedReader for each byte.  Measurements done 5/30/97
2263      * show that there's no point in having a bigger buffer:  Increasing
2264      * the buffer to 8192 had no measurable impact for a program discarding
2265      * one character at a time (reading from an http URL to a local machine).
2266      * NOTE: If the current encoding is bogus, and we read too much
2267      * (past the content-type) we may suffer a MalformedInputException. For
2268      * this reason the initial size is 1 and when the body is encountered the
2269      * size is adjusted to 256.
2270      */
2271     private char buf[] = new char[1];
2272     private int pos;    // index in buf of the next character readCh() returns
2273     private int len;    // result of the last in.read(buf); <= 0 makes readCh() return -1
2274     /*
2275         tracks position relative to the beginning of the
2276         document; readCh() increments it for every character it returns.
2277     */
2278     private int currentPosition;
2279 
2280 
2281     /**
2282      * Returns the next input character, refilling buf from the reader
2283      * once it has been used up.  currentPosition advances for every
2284      * character returned.
2285      *
2286      * @return the next character, or -1 at end of input
2287      * @throws IOException if the underlying reader fails
2288      */
2289     private final int readCh() throws IOException {
2290         final boolean exhausted = (pos >= len);
2291 
2292         if (exhausted) {
2293             // Any failure, InterruptedIOException included, reaches the
2294             // caller unchanged.
2295             len = in.read(buf);
2296             if (len <= 0) {
2297                 return -1;      // eof
2298             }
2299             pos = 0;
2300         }
2301 
2302         ++currentPosition;
2303         return buf[pos++];
2304     }
2305 
2306     /** Returns currentPosition, which readCh() increments per character returned. */
2307     protected int getCurrentPos() {
2308         return currentPosition;
2309     }
2310 }