New src/share/classes/javax/swing/text/html/parser/Parser.java

   1 /*
   2  * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package javax.swing.text.html.parser;
  27 
  28 import javax.swing.text.SimpleAttributeSet;
  29 import javax.swing.text.html.HTML;
  30 import javax.swing.text.ChangedCharSetException;
  31 import java.io.*;
  32 import java.util.Hashtable;
  33 import java.util.Properties;
  34 import java.util.Vector;
  35 import java.util.Enumeration;
  36 import java.net.URL;
  37 
  38 import sun.misc.MessageUtils;
  39 
  40 /**
  41  * A simple DTD-driven HTML parser. The parser reads an
  42  * HTML file from an InputStream and calls various methods
  43  * (which should be overridden in a subclass) when tags and
  44  * data are encountered.
  45  * <p>
  46  * Unfortunately there are many badly implemented HTML parsers
  47  * out there, and as a result there are many badly formatted
  48  * HTML files. This parser attempts to parse most HTML files.
  49  * This means that the implementation sometimes deviates from
  50  * the SGML specification in favor of HTML.
  51  * <p>
  52  * The parser treats \r and \r\n as \n. Newlines after starttags
  53  * and before end tags are ignored just as specified in the SGML/HTML
  54  * specification.
  55  * <p>
  56  * The html spec does not specify how spaces are to be coalesced very well.
  57  * Specifically, the following scenarios are not discussed (note that a
  58  * space should be used here, but I am using &amp;nbsp to force the space to
  59  * be displayed):
  60  * <p>
  61  * '&lt;b>blah&nbsp;&lt;i>&nbsp;&lt;strike>&nbsp;foo' which can be treated as:
  62  * '&lt;b>blah&nbsp;&lt;i>&lt;strike>foo'
  63  * <p>as well as:
  64  * '&lt;p>&lt;a href="xx">&nbsp;&lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
  65  * which appears to be treated as:
  66  * '&lt;p>&lt;a href="xx">&lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
  67  * <p>
  68  * If <code>strict</code> is false, when a tag that breaks flow,
  69  * (<code>TagElement.breaksFlow</code>) or trailing whitespace is
  70  * encountered, all whitespace will be ignored until a non whitespace
  71  * character is encountered. This appears to give behavior closer to
  72  * the popular browsers.
  73  *
  74  * @see DTD
  75  * @see TagElement
  76  * @see SimpleAttributeSet
  77  * @author Arthur van Hoff
  78  * @author Sunita Mani
  79  */
  80 public
  81 class Parser implements DTDConstants {
  82 
  83     // Maximum codepoint value within BMP
  84     private static final int MAX_BMP_BOUND = 65535;
  85 
  86     private char text[] = new char[1024];
  87     private int textpos = 0;
  88     private TagElement last;
  89     private boolean space;
  90 
  91     private char str[] = new char[128];
  92     private int strpos = 0;
  93 
  94     protected DTD dtd = null;
  95 
  96     private int ch;
  97     private int ln;
  98     private Reader in;
  99 
 100     private Element recent;
 101     private TagStack stack;
 102     private boolean skipTag = false;
 103     private TagElement lastFormSent = null;
 104     private SimpleAttributeSet attributes = new SimpleAttributeSet();
 105 
 106     // State for <html>, <head> and <body>.  Since people like to slap
 107     // together HTML documents without thinking, occasionally they
 108     // have multiple instances of these tags.  These booleans track
 109     // the first sightings of these tags so they can be safely ignored
 110     // by the parser if repeated.
 111     private boolean seenHtml = false;
 112     private boolean seenHead = false;
 113     private boolean seenBody = false;
 114 
 115     /**
 116      * The html spec does not specify how spaces are coalesced very well.
 117      * If strict == false, ignoreSpace is used to try and mimic the behavior
 118      * of the popular browsers.
 119      * <p>
 120      * The problematic scenarios are:
 121      * '&lt;b>blah &lt;i> &lt;strike> foo' which can be treated as:
 122      * '&lt;b>blah &lt;i>&lt;strike>foo'
 123      * as well as:
 124      * '&lt;p>&lt;a href="xx"> &lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
 125      * which appears to be treated as:
 126      * '&lt;p>&lt;a href="xx">&lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
 127      * <p>
 128      * When a tag that breaks flow, or trailing whitespace is encountered
 129      * ignoreSpace is set to true. From then on, all whitespace will be
 130      * ignored.
 131      * ignoreSpace will be set back to false the first time a
 132      * non whitespace character is encountered. This appears to give
 133      * behavior closer to the popular browsers.
 134      */
 135     private boolean ignoreSpace;
 136 
 137     /**
 138      * This flag determines whether or not the Parser will be strict
 139      * in enforcing SGML compatibility.  If false, it will be lenient
 140      * with certain common classes of erroneous HTML constructs.
 141      * Strict or not, in either case an error will be recorded.
 142      *
 143      */
 144     protected boolean strict = false;
 145 
 146 
 147     /** Number of \r\n's encountered. */
 148     private int crlfCount;
 149     /** Number of \r's encountered. A \r\n will not increment this. */
 150     private int crCount;
 151     /** Number of \n's encountered. A \r\n will not increment this. */
 152     private int lfCount;
 153 
 154     //
 155     // To correctly identify the start of a tag/comment/text we need two
 156     // ivars. Two are needed as handleText isn't invoked until the tag
 157     // after the text has been parsed, that is the parser parses the text,
 158     // then a tag, then invokes handleText followed by handleStart.
 159     //
 160     /** The start position of the current block. Block is overloaded here,
 161      * it really means the current start position for the current comment,
 162      * tag, text. Use getBlockStartPosition to access this. */
 163     private int currentBlockStartPos;
 164     /** Start position of the last block. */
 165     private int lastBlockStartPos;
 166 
 167     /**
 168      * array for mapping numeric references in range
 169      * 130-159 to displayable Unicode characters.
 170      */
 171     private static final char[] cp1252Map = {
 172         8218,  // &#130;
 173         402,   // &#131;
 174         8222,  // &#132;
 175         8230,  // &#133;
 176         8224,  // &#134;
 177         8225,  // &#135;
 178         710,   // &#136;
 179         8240,  // &#137;
 180         352,   // &#138;
 181         8249,  // &#139;
 182         338,   // &#140;
 183         141,   // &#141;
 184         142,   // &#142;
 185         143,   // &#143;
 186         144,   // &#144;
 187         8216,  // &#145;
 188         8217,  // &#146;
 189         8220,  // &#147;
 190         8221,  // &#148;
 191         8226,  // &#149;
 192         8211,  // &#150;
 193         8212,  // &#151;
 194         732,   // &#152;
 195         8482,  // &#153;
 196         353,   // &#154;
 197         8250,  // &#155;
 198         339,   // &#156;
 199         157,   // &#157;
 200         158,   // &#158;
 201         376    // &#159;
 202     };
 203 
    /**
     * Creates a parser driven by the given DTD.
     *
     * @param dtd the DTD to store in the {@code dtd} field
     */
    public Parser(DTD dtd) {
        this.dtd = dtd;
    }
 207 
 208 
    /**
     * Returns the parser's line counter.
     *
     * @return the line number of the line currently being parsed
     */
    protected int getCurrentLine() {
        return ln;
    }
 215 
 216     /**
 217      * Returns the start position of the current block. Block is
 218      * overloaded here, it really means the current start position for
 219      * the current comment tag, text, block.... This is provided for
 220      * subclassers that wish to know the start of the current block when
 221      * called with one of the handleXXX methods.
 222      */
 223     int getBlockStartPosition() {
 224         return Math.max(0, lastBlockStartPos - 1);
 225     }
 226 
    /**
     * Makes a TagElement for the given element.
     *
     * @param elem the element the tag refers to
     * @param fictional passed through to the TagElement constructor; the
     *        parser passes {@code true} for tags it inserts on its own
     * @return the new TagElement
     */
    protected TagElement makeTag(Element elem, boolean fictional) {
        return new TagElement(elem, fictional);
    }

    /**
     * Makes a TagElement for the given element with {@code fictional}
     * set to {@code false}.
     *
     * @param elem the element the tag refers to
     * @return the result of {@code makeTag(elem, false)}
     */
    protected TagElement makeTag(Element elem) {
        return makeTag(elem, false);
    }
 237 
    /**
     * Returns the parser's attribute set. The set itself is returned,
     * not a copy.
     *
     * @return the {@code attributes} field
     */
    protected SimpleAttributeSet getAttributes() {
        return attributes;
    }
 241 
    /**
     * Empties the parser's attribute set by asking it to remove every
     * attribute it currently contains.
     */
    protected void flushAttributes() {
        attributes.removeAttributes(attributes);
    }
 245 
    /**
     * Called when PCDATA is encountered. The default implementation
     * does nothing; subclasses override it to receive the text.
     *
     * @param text the characters of the text run
     */
    protected void handleText(char text[]) {
    }
 251 
    /**
     * Called when an HTML title tag is encountered.
     *
     * @param text the characters of the title text
     */
    protected void handleTitle(char text[]) {
        // default behavior is to call handleText. Subclasses
        // can override if necessary.
        handleText(text);
    }
 260 
    /**
     * Called when an HTML comment is encountered. The default
     * implementation does nothing.
     *
     * @param text the characters of the comment
     */
    protected void handleComment(char text[]) {
    }
 266 
 267     protected void handleEOFInComment() {
 268         // We've reached EOF.  Our recovery strategy is to
 269         // see if we have more than one line in the comment;
 270         // if so, we pretend that the comment was an unterminated
 271         // single line comment, and reparse the lines after the
 272         // first line as normal HTML content.
 273 
 274         int commentEndPos = strIndexOf('\n');
 275         if (commentEndPos >= 0) {
 276             handleComment(getChars(0, commentEndPos));
 277             try {
 278                 in.close();
 279                 in = new CharArrayReader(getChars(commentEndPos + 1));
 280                 ch = '>';
 281             } catch (IOException e) {
 282                 error("ioexception");
 283             }
 284 
 285             resetStrBuffer();
 286         } else {
 287             // no newline, so signal an error
 288             error("eof.comment");
 289         }
 290     }
 291 
    /**
     * Called when an empty tag is encountered. startTag invokes this
     * for elements whose {@code isEmpty()} is true. The default
     * implementation does nothing.
     *
     * @param tag the empty tag
     * @throws ChangedCharSetException declared so that overriding
     *         implementations may throw it
     */
    protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
    }
 297 
    /**
     * Called when a start tag is encountered. startTag invokes this for
     * non-empty elements, after the tag has been pushed onto the tag
     * stack. The default implementation does nothing.
     *
     * @param tag the start tag
     */
    protected void handleStartTag(TagElement tag) {
    }
 303 
    /**
     * Called when an end tag is encountered. The default implementation
     * does nothing.
     *
     * @param tag the tag being closed
     */
    protected void handleEndTag(TagElement tag) {
    }
 309 
    /**
     * An error has occurred. Invoked by the error(...) methods with the
     * current line number and the assembled message. The default
     * implementation does nothing; the body holds only disabled debug
     * output.
     *
     * @param ln the line number at which the error was reported
     * @param msg the error key followed by its arguments
     */
    protected void handleError(int ln, String msg) {
        /*
        Thread.dumpStack();
        System.out.println("**** " + stack);
        System.out.println("line " + ln + ": error: " + msg);
        System.out.println();
        */
    }
 321 
 322     /**
 323      * Output text.
 324      */
 325     void handleText(TagElement tag) {
 326         if (tag.breaksFlow()) {
 327             space = false;
 328             if (!strict) {
 329                 ignoreSpace = true;
 330             }
 331         }
 332         if (textpos == 0) {
 333             if ((!space) || (stack == null) || last.breaksFlow() ||
 334                 !stack.advance(dtd.pcdata)) {
 335                 last = tag;
 336                 space = false;
 337                 lastBlockStartPos = currentBlockStartPos;
 338                 return;
 339             }
 340         }
 341         if (space) {
 342             if (!ignoreSpace) {
 343                 // enlarge buffer if needed
 344                 if (textpos + 1 > text.length) {
 345                     char newtext[] = new char[text.length + 200];
 346                     System.arraycopy(text, 0, newtext, 0, text.length);
 347                     text = newtext;
 348                 }
 349 
 350                 // output pending space
 351                 text[textpos++] = ' ';
 352                 if (!strict && !tag.getElement().isEmpty()) {
 353                     ignoreSpace = true;
 354                 }
 355             }
 356             space = false;
 357         }
 358         char newtext[] = new char[textpos];
 359         System.arraycopy(text, 0, newtext, 0, textpos);
 360         // Handles cases of bad html where the title tag
 361         // was getting lost when we did error recovery.
 362         if (tag.getElement().getName().equals("title")) {
 363             handleTitle(newtext);
 364         } else {
 365             handleText(newtext);
 366         }
 367         lastBlockStartPos = currentBlockStartPos;
 368         textpos = 0;
 369         last = tag;
 370         space = false;
 371     }
 372 
 373     /**
 374      * Invoke the error handler.
 375      */
 376     protected void error(String err, String arg1, String arg2,
 377         String arg3) {
 378         handleError(ln, err + " " + arg1 + " " + arg2 + " " + arg3);
 379     }
 380 
 381     protected void error(String err, String arg1, String arg2) {
 382         error(err, arg1, arg2, "?");
 383     }
 384     protected void error(String err, String arg1) {
 385         error(err, arg1, "?", "?");
 386     }
 387     protected void error(String err) {
 388         error(err, "?", "?", "?");
 389     }
 390 
 391 
 392     /**
 393      * Handle a start tag. The new tag is pushed
 394      * onto the tag stack. The attribute list is
 395      * checked for required attributes.
 396      */
 397     protected void startTag(TagElement tag) throws ChangedCharSetException {
 398         Element elem = tag.getElement();
 399 
 400         // If the tag is an empty tag and texpos != 0
 401         // this implies that there is text before the
 402         // start tag that needs to be processed before
 403         // handling the tag.
 404         //
 405         if (!elem.isEmpty() ||
 406                     ((last != null) && !last.breaksFlow()) ||
 407                     (textpos != 0)) {
 408             handleText(tag);
 409         } else {
 410             // this variable gets updated in handleText().
 411             // Since in this case we do not call handleText()
 412             // we need to update it here.
 413             //
 414             last = tag;
 415             // Note that we should really check last.breakFlows before
 416             // assuming this should be false.
 417             space = false;
 418         }
 419         lastBlockStartPos = currentBlockStartPos;
 420 
 421         // check required attributes
 422         for (AttributeList a = elem.atts ; a != null ; a = a.next) {
 423             if ((a.modifier == REQUIRED) &&
 424                 ((attributes.isEmpty()) ||
 425                  ((!attributes.isDefined(a.name)) &&
 426                   (!attributes.isDefined(HTML.getAttributeKey(a.name)))))) {
 427                 error("req.att ", a.getName(), elem.getName());
 428             }
 429         }
 430 
 431         if (elem.isEmpty()) {
 432             handleEmptyTag(tag);
 433             /*
 434         } else if (elem.getName().equals("form")) {
 435             handleStartTag(tag);
 436             */
 437         } else {
 438             recent = elem;
 439             stack = new TagStack(tag, stack);
 440             handleStartTag(tag);
 441         }
 442     }
 443 
 444     /**
 445      * Handle an end tag. The end tag is popped
 446      * from the tag stack.
 447      */
 448     protected void endTag(boolean omitted) {
 449         handleText(stack.tag);
 450 
 451         if (omitted && !stack.elem.omitEnd()) {
 452             error("end.missing", stack.elem.getName());
 453         } else if (!stack.terminate()) {
 454             error("end.unexpected", stack.elem.getName());
 455         }
 456 
 457         // handle the tag
 458         handleEndTag(stack.tag);
 459         stack = stack.next;
 460         recent = (stack != null) ? stack.elem : null;
 461     }
 462 
 463 
 464     boolean ignoreElement(Element elem) {
 465 
 466         String stackElement = stack.elem.getName();
 467         String elemName = elem.getName();
 468         /* We ignore all elements that are not valid in the context of
 469            a table except <td>, <th> (these we handle in
 470            legalElementContext()) and #pcdata.  We also ignore the
 471            <font> tag in the context of <ul> and <ol> We additonally
 472            ignore the <meta> and the <style> tag if the body tag has
 473            been seen. **/
 474         if ((elemName.equals("html") && seenHtml) ||
 475             (elemName.equals("head") && seenHead) ||
 476             (elemName.equals("body") && seenBody)) {
 477             return true;
 478         }
 479         if (elemName.equals("dt") || elemName.equals("dd")) {
 480             TagStack s = stack;
 481             while (s != null && !s.elem.getName().equals("dl")) {
 482                 s = s.next;
 483             }
 484             if (s == null) {
 485                 return true;
 486             }
 487         }
 488 
 489         if (((stackElement.equals("table")) &&
 490              (!elemName.equals("#pcdata")) && (!elemName.equals("input"))) ||
 491             ((elemName.equals("font")) &&
 492              (stackElement.equals("ul") || stackElement.equals("ol"))) ||
 493             (elemName.equals("meta") && stack != null) ||
 494             (elemName.equals("style") && seenBody) ||
 495             (stackElement.equals("table") && elemName.equals("a"))) {
 496             return true;
 497         }
 498         return false;
 499     }
 500 
 501 
 502     /**
 503      * Marks the first time a tag has been seen in a document
 504      */
 505 
 506     protected void markFirstTime(Element elem) {
 507         String elemName = elem.getName();
 508         if (elemName.equals("html")) {
 509             seenHtml = true;
 510         } else if (elemName.equals("head")) {
 511             seenHead = true;
 512         } else if (elemName.equals("body")) {
 513             if (buf.length == 1) {
 514                 // Refer to note in definition of buf for details on this.
 515                 char[] newBuf = new char[256];
 516 
 517                 newBuf[0] = buf[0];
 518                 buf = newBuf;
 519             }
 520             seenBody = true;
 521         }
 522     }
 523 
 524     /**
 525      * Create a legal content for an element.
 526      */
 527     boolean legalElementContext(Element elem) throws ChangedCharSetException {
 528 
 529         // System.out.println("-- legalContext -- " + elem);
 530 
 531         // Deal with the empty stack
 532         if (stack == null) {
 533             // System.out.println("-- stack is empty");
 534             if (elem != dtd.html) {
 535                 // System.out.println("-- pushing html");
 536                 startTag(makeTag(dtd.html, true));
 537                 return legalElementContext(elem);
 538             }
 539             return true;
 540         }
 541 
 542         // Is it allowed in the current context
 543         if (stack.advance(elem)) {
 544             // System.out.println("-- legal context");
 545             markFirstTime(elem);
 546             return true;
 547         }
 548         boolean insertTag = false;
 549 
 550         // The use of all error recovery strategies are contingent
 551         // on the value of the strict property.
 552         //
 553         // These are commonly occuring errors.  if insertTag is true,
 554         // then we want to adopt an error recovery strategy that
 555         // involves attempting to insert an additional tag to
 556         // legalize the context.  The two errors addressed here
 557         // are:
 558         // 1) when a <td> or <th> is seen soon after a <table> tag.
 559         //    In this case we insert a <tr>.
 560         // 2) when any other tag apart from a <tr> is seen
 561         //    in the context of a <tr>.  In this case we would
 562         //    like to add a <td>.  If a <tr> is seen within a
 563         //    <tr> context, then we will close out the current
 564         //    <tr>.
 565         //
 566         // This insertion strategy is handled later in the method.
 567         // The reason for checking this now, is that in other cases
 568         // we would like to apply other error recovery strategies for example
 569         // ignoring tags.
 570         //
 571         // In certain cases it is better to ignore a tag than try to
 572         // fix the situation.  So the first test is to see if this
 573         // is what we need to do.
 574         //
 575         String stackElemName = stack.elem.getName();
 576         String elemName = elem.getName();
 577 
 578 
 579         if (!strict &&
 580             ((stackElemName.equals("table") && elemName.equals("td")) ||
 581              (stackElemName.equals("table") && elemName.equals("th")) ||
 582              (stackElemName.equals("tr") && !elemName.equals("tr")))){
 583              insertTag = true;
 584         }
 585 
 586 
 587         if (!strict && !insertTag && (stack.elem.getName() != elem.getName() ||
 588                                       elem.getName().equals("body"))) {
 589             if (skipTag = ignoreElement(elem)) {
 590                 error("tag.ignore", elem.getName());
 591                 return skipTag;
 592             }
 593         }
 594 
 595         // Check for anything after the start of the table besides tr, td, th
 596         // or caption, and if those aren't there, insert the <tr> and call
 597         // legalElementContext again.
 598         if (!strict && stackElemName.equals("table") &&
 599             !elemName.equals("tr") && !elemName.equals("td") &&
 600             !elemName.equals("th") && !elemName.equals("caption")) {
 601             Element e = dtd.getElement("tr");
 602             TagElement t = makeTag(e, true);
 603             legalTagContext(t);
 604             startTag(t);
 605             error("start.missing", elem.getName());
 606             return legalElementContext(elem);
 607         }
 608 
 609         // They try to find a legal context by checking if the current
 610         // tag is valid in an enclosing context.  If so
 611         // close out the tags by outputing end tags and then
 612         // insert the curent tag.  If the tags that are
 613         // being closed out do not have an optional end tag
 614         // specification in the DTD then an html error is
 615         // reported.
 616         //
 617         if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) {
 618             for (TagStack s = stack.next ; s != null ; s = s.next) {
 619                 if (s.advance(elem)) {
 620                     while (stack != s) {
 621                         endTag(true);
 622                     }
 623                     return true;
 624                 }
 625                 if (!s.terminate() || (strict && !s.elem.omitEnd())) {
 626                     break;
 627                 }
 628             }
 629         }
 630 
 631         // Check if we know what tag is expected next.
 632         // If so insert the tag.  Report an error if the
 633         // tag does not have its start tag spec in the DTD as optional.
 634         //
 635         Element next = stack.first();
 636         if (next != null && (!strict || next.omitStart()) &&
 637            !(next==dtd.head && elem==dtd.pcdata) ) {
 638             // System.out.println("-- omitting start tag: " + next);
 639             TagElement t = makeTag(next, true);
 640             legalTagContext(t);
 641             startTag(t);
 642             if (!next.omitStart()) {
 643                 error("start.missing", elem.getName());
 644             }
 645             return legalElementContext(elem);
 646         }
 647 
 648 
 649         // Traverse the list of expected elements and determine if adding
 650         // any of these elements would make for a legal context.
 651         //
 652 
 653         if (!strict) {
 654             ContentModel content = stack.contentModel();
 655             Vector<Element> elemVec = new Vector<Element>();
 656             if (content != null) {
 657                 content.getElements(elemVec);
 658                 for (Element e : elemVec) {
 659                     // Ensure that this element has not been included as
 660                     // part of the exclusions in the DTD.
 661                     //
 662                     if (stack.excluded(e.getIndex())) {
 663                         continue;
 664                     }
 665 
 666                     boolean reqAtts = false;
 667 
 668                     for (AttributeList a = e.getAttributes(); a != null ; a = a.next) {
 669                         if (a.modifier == REQUIRED) {
 670                             reqAtts = true;
 671                             break;
 672                         }
 673                     }
 674                     // Ensure that no tag that has required attributes
 675                     // gets inserted.
 676                     //
 677                     if (reqAtts) {
 678                         continue;
 679                     }
 680 
 681                     ContentModel m = e.getContent();
 682                     if (m != null && m.first(elem)) {
 683                         // System.out.println("-- adding a legal tag: " + e);
 684                         TagElement t = makeTag(e, true);
 685                         legalTagContext(t);
 686                         startTag(t);
 687                         error("start.missing", e.getName());
 688                         return legalElementContext(elem);
 689                     }
 690                 }
 691             }
 692         }
 693 
 694         // Check if the stack can be terminated.  If so add the appropriate
 695         // end tag.  Report an error if the tag being ended does not have its
 696         // end tag spec in the DTD as optional.
 697         //
 698         if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) {
 699             // System.out.println("-- omitting end tag: " + stack.elem);
 700             if (!stack.elem.omitEnd()) {
 701                 error("end.missing", elem.getName());
 702             }
 703 
 704             endTag(true);
 705             return legalElementContext(elem);
 706         }
 707 
 708         // At this point we know that something is screwed up.
 709         return false;
 710     }
 711 
 712     /**
 713      * Create a legal context for a tag.
 714      */
 715     void legalTagContext(TagElement tag) throws ChangedCharSetException {
 716         if (legalElementContext(tag.getElement())) {
 717             markFirstTime(tag.getElement());
 718             return;
 719         }
 720 
 721         // Avoid putting a block tag in a flow tag.
 722         if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) {
 723             endTag(true);
 724             legalTagContext(tag);
 725             return;
 726         }
 727 
 728         // Avoid putting something wierd in the head of the document.
 729         for (TagStack s = stack ; s != null ; s = s.next) {
 730             if (s.tag.getElement() == dtd.head) {
 731                 while (stack != s) {
 732                     endTag(true);
 733                 }
 734                 endTag(true);
 735                 legalTagContext(tag);
 736                 return;
 737             }
 738         }
 739 
 740         // Everything failed
 741         error("tag.unexpected", tag.getElement().getName());
 742     }
 743 
 744     /**
 745      * Error context. Something went wrong, make sure we are in
 746      * the document's body context
 747      */
 748     void errorContext() throws ChangedCharSetException {
 749         for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) {
 750             handleEndTag(stack.tag);
 751         }
 752         if (stack == null) {
 753             legalElementContext(dtd.body);
 754             startTag(makeTag(dtd.body, true));
 755         }
 756     }
 757 
 758     /**
 759      * Add a char to the string buffer.
 760      */
 761     void addString(int c) {
 762         if (strpos  == str.length) {
 763             char newstr[] = new char[str.length + 128];
 764             System.arraycopy(str, 0, newstr, 0, str.length);
 765             str = newstr;
 766         }
 767         str[strpos++] = (char)c;
 768     }
 769 
 770     /**
 771      * Get the string that's been accumulated.
 772      */
 773     String getString(int pos) {
 774         char newStr[] = new char[strpos - pos];
 775         System.arraycopy(str, pos, newStr, 0, strpos - pos);
 776         strpos = pos;
 777         return new String(newStr);
 778     }
 779 
 780     char[] getChars(int pos) {
 781         char newStr[] = new char[strpos - pos];
 782         System.arraycopy(str, pos, newStr, 0, strpos - pos);
 783         strpos = pos;
 784         return newStr;
 785     }
 786 
 787     char[] getChars(int pos, int endPos) {
 788         char newStr[] = new char[endPos - pos];
 789         System.arraycopy(str, pos, newStr, 0, endPos - pos);
 790         // REMIND: it's not clear whether this version should set strpos or not
 791         // strpos = pos;
 792         return newStr;
 793     }
 794 
    /**
     * Discards the contents of the string buffer by resetting its
     * position to zero. The backing array is kept.
     */
    void resetStrBuffer() {
        strpos = 0;
    }
 798 
 799     int strIndexOf(char target) {
 800         for (int i = 0; i < strpos; i++) {
 801             if (str[i] == target) {
 802                 return i;
 803             }
 804         }
 805 
 806         return -1;
 807     }
 808 
 809     /**
 810      * Skip space.
 811      * [5] 297:5
 812      */
 813     void skipSpace() throws IOException {
 814         while (true) {
 815             switch (ch) {
 816               case '\n':
 817                 ln++;
 818                 ch = readCh();
 819                 lfCount++;
 820                 break;
 821 
 822               case '\r':
 823                 ln++;
 824                 if ((ch = readCh()) == '\n') {
 825                     ch = readCh();
 826                     crlfCount++;
 827                 }
 828                 else {
 829                     crCount++;
 830                 }
 831                 break;
 832               case ' ':
 833               case '\t':
 834                 ch = readCh();
 835                 break;
 836 
 837               default:
 838                 return;
 839             }
 840         }
 841     }
 842 
 843     /**
 844      * Parse identifier. Uppercase characters are folded
 845      * to lowercase when lower is true. Returns falsed if
 846      * no identifier is found. [55] 346:17
 847      */
 848     boolean parseIdentifier(boolean lower) throws IOException {
 849         switch (ch) {
 850           case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 851           case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 852           case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 853           case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 854           case 'Y': case 'Z':
 855             if (lower) {
 856                 ch = 'a' + (ch - 'A');
 857             }
 858 
 859           case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 860           case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 861           case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 862           case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 863           case 'y': case 'z':
 864             break;
 865 
 866           default:
 867             return false;
 868         }
 869 
 870         while (true) {
 871             addString(ch);
 872 
 873             switch (ch = readCh()) {
 874               case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 875               case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 876               case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 877               case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 878               case 'Y': case 'Z':
 879                 if (lower) {
 880                     ch = 'a' + (ch - 'A');
 881                 }
 882 
 883               case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 884               case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 885               case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 886               case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 887               case 'y': case 'z':
 888 
 889               case '0': case '1': case '2': case '3': case '4':
 890               case '5': case '6': case '7': case '8': case '9':
 891 
 892               case '.': case '-':
 893 
 894               case '_': // not officially allowed
 895                 break;
 896 
 897               default:
 898                 return true;
 899             }
 900         }
 901     }
 902 
 903     /**
 904      * Parse an entity reference. [59] 350:17
 905      */
 906     private char[] parseEntityReference() throws IOException {
 907         int pos = strpos;
 908 
 909         if ((ch = readCh()) == '#') {
 910             int n = 0;
 911             ch = readCh();
 912             if ((ch >= '0') && (ch <= '9') ||
 913                     ch == 'x' || ch == 'X') {
 914 
 915                 if ((ch >= '0') && (ch <= '9')) {
 916                     // parse decimal reference
 917                     while ((ch >= '0') && (ch <= '9')) {
 918                         n = (n * 10) + ch - '0';
 919                         ch = readCh();
 920                     }
 921                 } else {
 922                     // parse hexadecimal reference
 923                     ch = readCh();
 924                     char lch = (char) Character.toLowerCase(ch);
 925                     while ((lch >= '0') && (lch <= '9') ||
 926                             (lch >= 'a') && (lch <= 'f')) {
 927                         if (lch >= '0' && lch <= '9') {
 928                             n = (n * 16) + lch - '0';
 929                         } else {
 930                             n = (n * 16) + lch - 'a' + 10;
 931                         }
 932                         ch = readCh();
 933                         lch = (char) Character.toLowerCase(ch);
 934                     }
 935                 }
 936                 switch (ch) {
 937                     case '\n':
 938                         ln++;
 939                         ch = readCh();
 940                         lfCount++;
 941                         break;
 942 
 943                     case '\r':
 944                         ln++;
 945                         if ((ch = readCh()) == '\n') {
 946                             ch = readCh();
 947                             crlfCount++;
 948                         }
 949                         else {
 950                             crCount++;
 951                         }
 952                         break;
 953 
 954                     case ';':
 955                         ch = readCh();
 956                         break;
 957                 }
 958                 // Check if n codepoint is within BMP; convert into surrogate
 959                 // pair otherwise 
 960                 try { 
 961                     char data[];
 962                     if (n <= MAX_BMP_BOUND) { 
 963                         data = Character.toChars(mapNumericReference((char) n));
 964                     } else {
 965                         data = Character.toChars(n);
 966                     }
 967                         
 968                     return data; 
 969                 }
 970                 catch(IllegalArgumentException ex) {
 971                     error(ex.toString()); 
 972                     return new char[0]; 
 973                 } 
 974             }
 975             addString('#');
 976             if (!parseIdentifier(false)) {
 977                 error("ident.expected");
 978                 strpos = pos;
 979                 char data[] = {'&', '#'};
 980                 return data;
 981             }
 982         } else if (!parseIdentifier(false)) {
 983             char data[] = {'&'};
 984             return data;
 985         }
 986 
 987         boolean semicolon = false;
 988 
 989         switch (ch) {
 990           case '\n':
 991             ln++;
 992             ch = readCh();
 993             lfCount++;
 994             break;
 995 
 996           case '\r':
 997             ln++;
 998             if ((ch = readCh()) == '\n') {
 999                 ch = readCh();
1000                 crlfCount++;
1001             }
1002             else {
1003                 crCount++;
1004             }
1005             break;
1006 
1007           case ';':
1008             semicolon = true;
1009 
1010             ch = readCh();
1011             break;
1012         }
1013 
1014         String nm = getString(pos);
1015         Entity ent = dtd.getEntity(nm);
1016 
1017         // entities are case sensitive - however if strict
1018         // is false then we will try to make a match by
1019         // converting the string to all lowercase.
1020         //
1021         if (!strict && (ent == null)) {
1022             ent = dtd.getEntity(nm.toLowerCase());
1023         }
1024         if ((ent == null) || !ent.isGeneral()) {
1025 
1026             if (nm.length() == 0) {
1027                 error("invalid.entref", nm);
1028                 return new char[0];
1029             }
1030             /* given that there is not a match restore the entity reference */
1031             String str = "&" + nm + (semicolon ? ";" : "");
1032 
1033             char b[] = new char[str.length()];
1034             str.getChars(0, b.length, b, 0);
1035             return b;
1036         }
1037         return ent.getData();
1038     }
1039 
1040     /**
1041      * Converts numeric character reference to Unicode character.
1042      *
1043      * Normally the code in a reference should be always converted
1044      * to the Unicode character with the same code, but due to
1045      * wide usage of Cp1252 charset most browsers map numeric references
1046      * in the range 130-159 (which are control chars in Unicode set)
1047      * to displayable characters with other codes.
1048      *
1049      * @param c the code of numeric character reference.
1050      * @return the character corresponding to the reference code.
1051      */
1052     private char mapNumericReference(char c) {
1053         if (c < 130 || c > 159) {
1054             return c;
1055         }
1056         return cp1252Map[c - 130];
1057     }
1058 
    /**
     * Parse a comment. [92] 391:7
     *
     * Reads characters starting at <code>ch</code> and appends the
     * comment text to the string buffer via <code>addString</code> until
     * a terminator is recognized or end of input is reached (in which
     * case <code>handleEOFInComment</code> is called). Line terminators
     * update the line number and the LF / CR / CRLF counters; a CR or
     * CR LF is stored in the buffer as a single '\n'.
     *
     * In strict mode the comment ends at the first "--". Otherwise it
     * ends at "-->" or "--!>".
     */
    void parseComment() throws IOException {

        while (true) {
            // c is the character to be stored at the bottom of the loop;
            // ch is advanced inside the switch.
            int c = ch;
            switch (c) {
              case '-':
                  /** Presuming that the start string of a comment "<!--" has
                      already been parsed, the '-' character is valid only as
                      part of a comment termination and further more it must
                      be present in even numbers. Hence if strict is true, we
                      presume the comment has been terminated and return.
                      However if strict is false, then there is no even number
                      requirement and this character can appear anywhere in the
                      comment.  The parser reads on until it sees the following
                      pattern: "-->" or "--!>".
                   **/
                // Lenient mode, and the previously stored character was
                // already a '-': this '-' may be the second dash of a
                // terminator.
                if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) {
                    if ((ch = readCh()) == '>') {
                        return;
                    }
                    if (ch == '!') {
                        if ((ch = readCh()) == '>') {
                            return;
                        } else {
                            /* to account for extra read()'s that happened */
                            addString('-');
                            addString('!');
                            continue;
                        }
                    }
                    break;
                }

                // Look one character ahead for a second '-'.
                if ((ch = readCh()) == '-') {
                    ch = readCh();
                    if (strict || ch == '>') {
                        return;
                    }
                    if (ch == '!') {
                        if ((ch = readCh()) == '>') {
                            return;
                        } else {
                            /* to account for extra read()'s that happened */
                            addString('-');
                            addString('!');
                            continue;
                        }
                    }
                    /* to account for the extra read() */
                    addString('-');
                }
                break;

              case -1:
                  // End of input inside the comment.
                  handleEOFInComment();
                  return;

              case '\n':
                ln++;
                ch = readCh();
                lfCount++;
                break;

              case '>':
                // A '>' that is not part of a terminator is ordinary text.
                ch = readCh();
                break;

              case '\r':
                ln++;
                if ((ch = readCh()) == '\n') {
                    ch = readCh();
                    crlfCount++;
                }
                else {
                    crCount++;
                }
                // Store CR and CR LF as a single newline.
                c = '\n';
                break;
              default:
                ch = readCh();
                break;
            }

            addString(c);
        }
    }
1148 
1149     /**
1150      * Parse literal content. [46] 343:1 and [47] 344:1
1151      */
1152     void parseLiteral(boolean replace) throws IOException {
1153         while (true) {
1154             int c = ch;
1155             switch (c) {
1156               case -1:
1157                 error("eof.literal", stack.elem.getName());
1158                 endTag(true);
1159                 return;
1160 
1161               case '>':
1162                 ch = readCh();
1163                 int i = textpos - (stack.elem.name.length() + 2), j = 0;
1164 
1165                 // match end tag
1166                 if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) {
1167                     while ((++i < textpos) &&
1168                            (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++)));
1169                     if (i == textpos) {
1170                         textpos -= (stack.elem.name.length() + 2);
1171                         if ((textpos > 0) && (text[textpos-1] == '\n')) {
1172                             textpos--;
1173                         }
1174                         endTag(false);
1175                         return;
1176                     }
1177                 }
1178                 break;
1179 
1180               case '&':
1181                 char data[] = parseEntityReference();
1182                 if (textpos + data.length > text.length) {
1183                     char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
1184                     System.arraycopy(text, 0, newtext, 0, text.length);
1185                     text = newtext;
1186                 }
1187                 System.arraycopy(data, 0, text, textpos, data.length);
1188                 textpos += data.length;
1189                 continue;
1190 
1191               case '\n':
1192                 ln++;
1193                 ch = readCh();
1194                 lfCount++;
1195                 break;
1196 
1197               case '\r':
1198                 ln++;
1199                 if ((ch = readCh()) == '\n') {
1200                     ch = readCh();
1201                     crlfCount++;
1202                 }
1203                 else {
1204                     crCount++;
1205                 }
1206                 c = '\n';
1207                 break;
1208               default:
1209                 ch = readCh();
1210                 break;
1211             }
1212 
1213             // output character
1214             if (textpos == text.length) {
1215                 char newtext[] = new char[text.length + 128];
1216                 System.arraycopy(text, 0, newtext, 0, text.length);
1217                 text = newtext;
1218             }
1219             text[textpos++] = (char)c;
1220         }
1221     }
1222 
    /**
     * Parse attribute value. [33] 331:1
     *
     * The value may be enclosed in single or double quotes. An unquoted
     * value ends at the first whitespace character, line terminator,
     * '&lt;' or '&gt;'; a quoted value ends at the matching quote. End
     * of input ends the value in either case. Entity references are
     * expanded, except for an unquoted value in strict mode where the
     * '&amp;' is kept as an ordinary character.
     *
     * @param lower if true, ASCII uppercase letters in the value are
     *              folded to lowercase
     * @return everything accumulated in the string buffer from index 0
     */
    String parseAttributeValue(boolean lower) throws IOException {
        // -1 means the value is not quoted; otherwise the opening quote.
        int delim = -1;

        // Check for a delimiter
        switch(ch) {
          case '\'':
          case '"':
            delim = ch;
            ch = readCh();
            break;
        }

        // Parse the rest of the value
        while (true) {
            // c is what gets appended at the bottom of the loop; ch is
            // advanced inside the switch.
            int c = ch;

            switch (c) {
              case '\n':
                ln++;
                ch = readCh();
                lfCount++;
                if (delim < 0) {
                    return getString(0);
                }
                break;

              case '\r':
                ln++;

                if ((ch = readCh()) == '\n') {
                    ch = readCh();
                    crlfCount++;
                }
                else {
                    crCount++;
                }
                if (delim < 0) {
                    return getString(0);
                }
                break;

              case '\t':
                  if (delim < 0)
                      c = ' ';
                  // falls through to the ' ' case
              case ' ':
                ch = readCh();
                if (delim < 0) {
                    return getString(0);
                }
                break;

              case '>':
              case '<':
                // Ends an unquoted value without being consumed; inside
                // quotes it is an ordinary character.
                if (delim < 0) {
                    return getString(0);
                }
                ch = readCh();
                break;

              case '\'':
              case '"':
                ch = readCh();
                if (c == delim) {
                    // Closing quote.
                    return getString(0);
                } else if (delim == -1) {
                    // A quote in the middle of an unquoted value.
                    error("attvalerr");
                    if (strict || ch == ' ') {
                        return getString(0);
                    } else {
                        // Drop the stray quote and keep reading.
                        continue;
                    }
                }
                // The other kind of quote inside a quoted value is kept.
                break;

            case '=':
                if (delim < 0) {
                    /* In SGML a construct like <img src=/cgi-bin/foo?x=1>
                       is considered invalid since an = sign can only be contained
                       in an attributes value if the string is quoted.
                       */
                    error("attvalerr");
                    /* If strict is true then we return with the string we have thus far.
                       Otherwise we accept the = sign as part of the attribute's value and
                       process the rest of the img tag. */
                    if (strict) {
                        return getString(0);
                    }
                }
                ch = readCh();
                break;

              case '&':
                if (strict && delim < 0) {
                    // Kept literally: no entity expansion here.
                    ch = readCh();
                    break;
                }

                // Append the expansion, applying the same case folding as
                // for ordinary characters.
                char data[] = parseEntityReference();
                for (int i = 0 ; i < data.length ; i++) {
                    c = data[i];
                    addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c);
                }
                continue;

              case -1:
                return getString(0);

              default:
                if (lower && (c >= 'A') && (c <= 'Z')) {
                    c = 'a' + c - 'A';
                }
                ch = readCh();
                break;
            }
            addString(c);
        }
    }
1343 
1344 
    /**
     * Parse attribute specification List. [31] 327:17
     *
     * Reads attribute specifications until '/', '&gt;', '&lt;' or end of
     * input is reached and adds each one to <code>attributes</code>.
     * Problems (unknown attribute, duplicate attribute, value outside
     * the declared value list, unexpected character) are reported
     * through <code>error</code>. When <code>strict</code> is false a
     * number of malformed constructs are tolerated: comma separators,
     * quoted attribute names, and a leading '=' directly after the tag
     * name.
     *
     * @param elem the element whose attribute declarations are consulted
     */
    void parseAttributeSpecificationList(Element elem) throws IOException {

        while (true) {
            skipSpace();

            switch (ch) {
              case '/':
              case '>':
              case '<':
              case -1:
                // End of the attribute list; the character is not consumed.
                return;

              case '-':
                if ((ch = readCh()) == '-') {
                    // "--" starts a comment inside the tag; its text is
                    // discarded by resetting the string buffer.
                    ch = readCh();
                    parseComment();
                    strpos = 0;
                } else {
                    error("invalid.tagchar", "-", elem.getName());
                    // NOTE(review): this also skips the character that
                    // followed the '-' - confirm that is intended.
                    ch = readCh();
                }
                continue;
            }

            AttributeList att;
            String attname;
            String attvalue;

            if (parseIdentifier(true)) {
                attname = getString(0);
                skipSpace();
                if (ch == '=') {
                    ch = readCh();
                    skipSpace();
                    att = elem.getAttribute(attname);
                    // Bug ID 4102750 (MG 021898): the value of a NAME
                    // attribute is loaded case sensitively, so NAME is
                    // excluded from lowercase folding together with CDATA
                    // and NOTATION.  Before that fix the call was:
                    //   parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION));
                    // NOTE(review): the quoted-name and leading-'=' paths
                    // further down do not exclude NAME - confirm whether
                    // that is intentional.
                    attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME));
                } else {
                    // No '=': the identifier is either a value that
                    // identifies its attribute, or an attribute name
                    // given without a value.
                    attvalue = attname;
                    att = elem.getAttributeByValue(attvalue);
                    if (att == null) {
                        att = elem.getAttribute(attname);
                        if (att != null) {
                            attvalue = att.getValue();
                        }
                        else {
                            // Make it null so that NULL_ATTRIBUTE_VALUE is
                            // used
                            attvalue = null;
                        }
                    }
                }
            } else if (!strict && ch == ',') { // allows for comma separated attribute-value pairs
                ch = readCh();
                continue;
            } else if (!strict && ch == '"') { // allows for quoted attributes
                ch = readCh();
                skipSpace();
                if (parseIdentifier(true)) {
                    attname = getString(0);
                    if (ch == '"') {
                        ch = readCh();
                    }
                    skipSpace();
                    if (ch == '=') {
                        ch = readCh();
                        skipSpace();
                        att = elem.getAttribute(attname);
                        attvalue = parseAttributeValue((att != null) &&
                                                (att.type != CDATA) &&
                                                (att.type != NOTATION));
                    } else {
                        // Unlike the unquoted path above, attvalue stays
                        // equal to attname when no attribute matches.
                        attvalue = attname;
                        att = elem.getAttributeByValue(attvalue);
                        if (att == null) {
                            att = elem.getAttribute(attname);
                            if (att != null) {
                                attvalue = att.getValue();
                            }
                        }
                    }
                } else {
                    char str[] = {(char)ch};
                    error("invalid.tagchar", new String(str), elem.getName());
                    ch = readCh();
                    continue;
                }
            } else if (!strict && (attributes.isEmpty()) && (ch == '=')) {
                // '=' before any attribute has been collected: the element
                // name itself is used as the attribute name.
                ch = readCh();
                skipSpace();
                attname = elem.getName();
                att = elem.getAttribute(attname);
                attvalue = parseAttributeValue((att != null) &&
                                               (att.type != CDATA) &&
                                               (att.type != NOTATION));
            } else if (!strict && (ch == '=')) {
                // A stray '=' later in the list: the value is read and
                // dropped, an error is reported and parsing of the list
                // stops.
                ch = readCh();
                skipSpace();
                attvalue = parseAttributeValue(true);
                error("attvalerr");
                return;
            } else {
                char str[] = {(char)ch};
                error("invalid.tagchar", new String(str), elem.getName());
                if (!strict) {
                    // Skip the offending character and carry on.
                    ch = readCh();
                    continue;
                } else {
                    return;
                }
            }

            if (att != null) {
                // Use the attribute's declared name.
                attname = att.getName();
            } else {
                error("invalid.tagatt", attname, elem.getName());
            }

            // Check out the value
            if (attributes.isDefined(attname)) {
                error("multi.tagatt", attname, elem.getName());
            }
            if (attvalue == null) {
                attvalue = ((att != null) && (att.value != null)) ? att.value :
                    HTML.NULL_ATTRIBUTE_VALUE;
            } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) {
                error("invalid.tagattval", attname, elem.getName());
            }
            // Use the HTML.Attribute key when one exists for this name,
            // the plain name otherwise.
            HTML.Attribute attkey = HTML.getAttributeKey(attname);
            if (attkey == null) {
                attributes.addAttribute(attname, attvalue);
            } else {
                attributes.addAttribute(attkey, attvalue);
            }
        }
    }
1488 
1489     /**
1490      * Parses th Document Declaration Type markup declaration.
1491      * Currently ignores it.
1492      */
1493     public String parseDTDMarkup() throws IOException {
1494 
1495         StringBuilder strBuff = new StringBuilder();
1496         ch = readCh();
1497         while(true) {
1498             switch (ch) {
1499             case '>':
1500                 ch = readCh();
1501                 return strBuff.toString();
1502             case -1:
1503                 error("invalid.markup");
1504                 return strBuff.toString();
1505             case '\n':
1506                 ln++;
1507                 ch = readCh();
1508                 lfCount++;
1509                 break;
1510             case '"':
1511                 ch = readCh();
1512                 break;
1513             case '\r':
1514                 ln++;
1515                 if ((ch = readCh()) == '\n') {
1516                     ch = readCh();
1517                     crlfCount++;
1518                 }
1519                 else {
1520                     crCount++;
1521                 }
1522                 break;
1523             default:
1524                 strBuff.append((char)(ch & 0xFF));
1525                 ch = readCh();
1526                 break;
1527             }
1528         }
1529     }
1530 
1531     /**
1532      * Parse markup declarations.
1533      * Currently only handles the Document Type Declaration markup.
1534      * Returns true if it is a markup declaration false otherwise.
1535      */
1536     protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException {
1537 
1538         /* Currently handles only the DOCTYPE */
1539         if ((strBuff.length() == "DOCTYPE".length()) &&
1540             (strBuff.toString().toUpperCase().equals("DOCTYPE"))) {
1541             parseDTDMarkup();
1542             return true;
1543         }
1544         return false;
1545     }
1546 
1547     /**
1548      * Parse an invalid tag.
1549      */
1550     void parseInvalidTag() throws IOException {
1551         // ignore all data upto the close bracket '>'
1552         while (true) {
1553             skipSpace();
1554             switch (ch) {
1555               case '>':
1556               case -1:
1557                   ch = readCh();
1558                 return;
1559               case '<':
1560                   return;
1561               default:
1562                   ch = readCh();
1563 
1564             }
1565         }
1566     }
1567 
1568     /**
1569      * Parse a start or end tag.
1570      */
1571     void parseTag() throws IOException {
1572         Element elem;
1573         boolean net = false;
1574         boolean warned = false;
1575         boolean unknown = false;
1576 
1577         switch (ch = readCh()) {
1578           case '!':
1579             switch (ch = readCh()) {
1580               case '-':
1581                 // Parse comment. [92] 391:7
1582                 while (true) {
1583                     if (ch == '-') {
1584                         if (!strict || ((ch = readCh()) == '-')) {
1585                             ch = readCh();
1586                             if (!strict && ch == '-') {
1587                                 ch = readCh();
1588                             }
1589                             // send over any text you might see
1590                             // before parsing and sending the
1591                             // comment
1592                             if (textpos != 0) {
1593                                 char newtext[] = new char[textpos];
1594                                 System.arraycopy(text, 0, newtext, 0, textpos);
1595                                 handleText(newtext);
1596                                 lastBlockStartPos = currentBlockStartPos;
1597                                 textpos = 0;
1598                             }
1599                             parseComment();
1600                             last = makeTag(dtd.getElement("comment"), true);
1601                             handleComment(getChars(0));
1602                             continue;
1603                         } else if (!warned) {
1604                             warned = true;
1605                             error("invalid.commentchar", "-");
1606                         }
1607                     }
1608                     skipSpace();
1609                     switch (ch) {
1610                       case '-':
1611                         continue;
1612                       case '>':
1613                         ch = readCh();
1614                       case -1:
1615                         return;
1616                       default:
1617                         ch = readCh();
1618                         if (!warned) {
1619                             warned = true;
1620                             error("invalid.commentchar",
1621                                   String.valueOf((char)ch));
1622                         }
1623                         break;
1624                     }
1625                 }
1626 
1627               default:
1628                 // deal with marked sections
1629                 StringBuffer strBuff = new StringBuffer();
1630                 while (true) {
1631                     strBuff.append((char)ch);
1632                     if (parseMarkupDeclarations(strBuff)) {
1633                         return;
1634                     }
1635                     switch(ch) {
1636                       case '>':
1637                         ch = readCh();
1638                       case -1:
1639                         error("invalid.markup");
1640                         return;
1641                       case '\n':
1642                         ln++;
1643                         ch = readCh();
1644                         lfCount++;
1645                         break;
1646                       case '\r':
1647                         ln++;
1648                         if ((ch = readCh()) == '\n') {
1649                             ch = readCh();
1650                             crlfCount++;
1651                         }
1652                         else {
1653                             crCount++;
1654                         }
1655                         break;
1656 
1657                       default:
1658                         ch = readCh();
1659                         break;
1660                     }
1661                 }
1662             }
1663 
1664           case '/':
1665             // parse end tag [19] 317:4
1666             switch (ch = readCh()) {
1667               case '>':
1668                 ch = readCh();
1669               case '<':
1670                 // empty end tag. either </> or </<
1671                 if (recent == null) {
1672                     error("invalid.shortend");
1673                     return;
1674                 }
1675                 elem = recent;
1676                 break;
1677 
1678               default:
1679                 if (!parseIdentifier(true)) {
1680                     error("expected.endtagname");
1681                     return;
1682                 }
1683                 skipSpace();
1684                 switch (ch) {
1685                   case '>':
1686                     ch = readCh();
1687                   case '<':
1688                     break;
1689 
1690                   default:
1691                     error("expected", "'>'");
1692                     while ((ch != -1) && (ch != '\n') && (ch != '>')) {
1693                         ch = readCh();
1694                     }
1695                     if (ch == '>') {
1696                         ch = readCh();
1697                     }
1698                     break;
1699                 }
1700                 String elemStr = getString(0);
1701                 if (!dtd.elementExists(elemStr)) {
1702                     error("end.unrecognized", elemStr);
1703                     // Ignore RE before end tag
1704                     if ((textpos > 0) && (text[textpos-1] == '\n')) {
1705                         textpos--;
1706                     }
1707                     elem = dtd.getElement("unknown");
1708                     elem.name = elemStr;
1709                     unknown = true;
1710                 } else {
1711                     elem = dtd.getElement(elemStr);
1712                 }
1713                 break;
1714             }
1715 
1716 
1717             // If the stack is null, we're seeing end tags without any begin
1718             // tags.  Ignore them.
1719 
1720             if (stack == null) {
1721                 error("end.extra.tag", elem.getName());
1722                 return;
1723             }
1724 
1725             // Ignore RE before end tag
1726             if ((textpos > 0) && (text[textpos-1] == '\n')) {
1727                 // In a pre tag, if there are blank lines
1728                 // we do not want to remove the newline
1729                 // before the end tag.  Hence this code.
1730                 //
1731                 if (stack.pre) {
1732                     if ((textpos > 1) && (text[textpos-2] != '\n')) {
1733                         textpos--;
1734                     }
1735                 } else {
1736                     textpos--;
1737                 }
1738             }
1739 
1740             // If the end tag is a form, since we did not put it
1741             // on the tag stack, there is no corresponding start
1742             // start tag to find. Hence do not touch the tag stack.
1743             //
1744 
1745             /*
1746             if (!strict && elem.getName().equals("form")) {
1747                 if (lastFormSent != null) {
1748                     handleEndTag(lastFormSent);
1749                     return;
1750                 } else {
1751                     // do nothing.
1752                     return;
1753                 }
1754             }
1755             */
1756 
1757             if (unknown) {
1758                 // we will not see a corresponding start tag
1759                 // on the the stack.  If we are seeing an
1760                 // end tag, lets send this on as an empty
1761                 // tag with the end tag attribute set to
1762                 // true.
1763                 TagElement t = makeTag(elem);
1764                 handleText(t);
1765                 attributes.addAttribute(HTML.Attribute.ENDTAG, "true");
1766                 handleEmptyTag(makeTag(elem));
1767                 unknown = false;
1768                 return;
1769             }
1770 
1771             // find the corresponding start tag
1772 
1773             // A commonly occuring error appears to be the insertion
1774             // of extra end tags in a table.  The intent here is ignore
1775             // such extra end tags.
1776             //
1777             if (!strict) {
1778                 String stackElem = stack.elem.getName();
1779 
1780                 if (stackElem.equals("table")) {
1781                     // If it isnt a valid end tag ignore it and return
1782                     //
1783                     if (!elem.getName().equals(stackElem)) {
1784                         error("tag.ignore", elem.getName());
1785                         return;
1786                     }
1787                 }
1788 
1789 
1790 
1791                 if (stackElem.equals("tr") ||
1792                     stackElem.equals("td")) {
1793                     if ((!elem.getName().equals("table")) &&
1794                         (!elem.getName().equals(stackElem))) {
1795                         error("tag.ignore", elem.getName());
1796                         return;
1797                     }
1798                 }
1799             }
1800             TagStack sp = stack;
1801 
1802             while ((sp != null) && (elem != sp.elem)) {
1803                 sp = sp.next;
1804             }
1805             if (sp == null) {
1806                 error("unmatched.endtag", elem.getName());
1807                 return;
1808             }
1809 
1810             // People put font ending tags in the darndest places.
1811             // Don't close other contexts based on them being between
1812             // a font tag and the corresponding end tag.  Instead,
1813             // ignore the end tag like it doesn't exist and allow the end
1814             // of the document to close us out.
1815             String elemName = elem.getName();
1816             if (stack != sp &&
1817                 (elemName.equals("font") ||
1818                  elemName.equals("center"))) {
1819 
1820                 // Since closing out a center tag can have real wierd
1821                 // effects on the formatting,  make sure that tags
1822                 // for which omitting an end tag is legimitate
1823                 // get closed out.
1824                 //
1825                 if (elemName.equals("center")) {
1826                     while(stack.elem.omitEnd() && stack != sp) {
1827                         endTag(true);
1828                     }
1829                     if (stack.elem == elem) {
1830                         endTag(false);
1831                     }
1832                 }
1833                 return;
1834             }
1835             // People do the same thing with center tags.  In this
1836             // case we would like to close off the center tag but
1837             // not necessarily all enclosing tags.
1838 
1839 
1840 
1841             // end tags
1842             while (stack != sp) {
1843                 endTag(true);
1844             }
1845 
1846             endTag(false);
1847             return;
1848 
1849           case -1:
1850             error("eof");
1851             return;
1852         }
1853 
1854         // start tag [14] 314:1
1855         if (!parseIdentifier(true)) {
1856             elem = recent;
1857             if ((ch != '>') || (elem == null)) {
1858                 error("expected.tagname");
1859                 return;
1860             }
1861         } else {
1862             String elemStr = getString(0);
1863 
1864             if (elemStr.equals("image")) {
1865                 elemStr = "img";
1866             }
1867 
1868             /* determine if this element is part of the dtd. */
1869 
1870             if (!dtd.elementExists(elemStr)) {
1871                 //              parseInvalidTag();
1872                 error("tag.unrecognized ", elemStr);
1873                 elem = dtd.getElement("unknown");
1874                 elem.name = elemStr;
1875                 unknown = true;
1876             } else {
1877                 elem = dtd.getElement(elemStr);
1878             }
1879         }
1880 
1881         // Parse attributes
1882         parseAttributeSpecificationList(elem);
1883 
1884         switch (ch) {
1885           case '/':
1886             net = true;
1887           case '>':
1888             ch = readCh();
1889             if (ch == '>' && net) {
1890                 ch = readCh();
1891             }
1892           case '<':
1893             break;
1894 
1895           default:
1896             error("expected", "'>'");
1897             break;
1898         }
1899 
1900         if (!strict) {
1901           if (elem.getName().equals("script")) {
1902             error("javascript.unsupported");
1903           }
1904         }
1905 
1906         // ignore RE after start tag
1907         //
1908         if (!elem.isEmpty())  {
1909             if (ch == '\n') {
1910                 ln++;
1911                 lfCount++;
1912                 ch = readCh();
1913             } else if (ch == '\r') {
1914                 ln++;
1915                 if ((ch = readCh()) == '\n') {
1916                     ch = readCh();
1917                     crlfCount++;
1918                 }
1919                 else {
1920                     crCount++;
1921                 }
1922             }
1923         }
1924 
1925         // ensure a legal context for the tag
1926         TagElement tag = makeTag(elem, false);
1927 
1928 
1929         /** In dealing with forms, we have decided to treat
1930             them as legal in any context.  Also, even though
1931             they do have a start and an end tag, we will
1932             not put this tag on the stack.  This is to deal
1933             several pages in the web oasis that choose to
1934             start and end forms in any possible location. **/
1935 
1936         /*
1937         if (!strict && elem.getName().equals("form")) {
1938             if (lastFormSent == null) {
1939                 lastFormSent = tag;
1940             } else {
1941                 handleEndTag(lastFormSent);
1942                 lastFormSent = tag;
1943             }
1944         } else {
1945         */
1946             // Smlly, if a tag is unknown, we will apply
1947             // no legalTagContext logic to it.
1948             //
1949             if (!unknown) {
1950                 legalTagContext(tag);
1951 
1952                 // If skip tag is true,  this implies that
1953                 // the tag was illegal and that the error
1954                 // recovery strategy adopted is to ignore
1955                 // the tag.
1956                 if (!strict && skipTag) {
1957                     skipTag = false;
1958                     return;
1959                 }
1960             }
1961             /*
1962         }
1963             */
1964 
1965         startTag(tag);
1966 
1967         if (!elem.isEmpty()) {
1968             switch (elem.getType()) {
1969               case CDATA:
1970                 parseLiteral(false);
1971                 break;
1972               case RCDATA:
1973                 parseLiteral(true);
1974                 break;
1975               default:
1976                 if (stack != null) {
1977                     stack.net = net;
1978                 }
1979                 break;
1980             }
1981         }
1982     }
1983 
1984     private static final String START_COMMENT = "<!--";
1985     private static final String END_COMMENT = "-->";
1986     private static final char[] SCRIPT_END_TAG = "</script>".toCharArray();
1987     private static final char[] SCRIPT_END_TAG_UPPER_CASE =
1988                                         "</SCRIPT>".toCharArray();
1989 
1990     void parseScript() throws IOException {
1991         char[] charsToAdd = new char[SCRIPT_END_TAG.length];
1992 
1993         /* Here, ch should be the first character after <script> */
1994         while (true) {
1995             int i = 0;
1996             while (i < SCRIPT_END_TAG.length
1997                        && (SCRIPT_END_TAG[i] == ch
1998                            || SCRIPT_END_TAG_UPPER_CASE[i] == ch)) {
1999                 charsToAdd[i] = (char) ch;
2000                 ch = readCh();
2001                 i++;
2002             }
2003             if (i == SCRIPT_END_TAG.length) {
2004 
2005                 /*  '</script>' tag detected */
2006                 /* Here, ch == the first character after </script> */
2007                 return;
2008             } else {
2009 
2010                 /* To account for extra read()'s that happened */
2011                 for (int j = 0; j < i; j++) {
2012                     addString(charsToAdd[j]);
2013                 }
2014 
2015                 switch (ch) {
2016                 case -1:
2017                     error("eof.script");
2018                     return;
2019                 case '\n':
2020                     ln++;
2021                     ch = readCh();
2022                     lfCount++;
2023                     addString('\n');
2024                     break;
2025                 case '\r':
2026                     ln++;
2027                     if ((ch = readCh()) == '\n') {
2028                         ch = readCh();
2029                         crlfCount++;
2030                     } else {
2031                         crCount++;
2032                     }
2033                     addString('\n');
2034                     break;
2035                 default:
2036                     addString(ch);
2037                     ch = readCh();
2038                     break;
2039                 } // switch
2040             }
2041         } // while
2042     }
2043 
2044     /**
2045      * Parse Content. [24] 320:1
2046      */
2047     void parseContent() throws IOException {
2048         Thread curThread = Thread.currentThread();
2049 
2050         for (;;) {
2051             if (curThread.isInterrupted()) {
2052                 curThread.interrupt(); // resignal the interrupt
2053                 break;
2054             }
2055 
2056             int c = ch;
2057             currentBlockStartPos = currentPosition;
2058 
2059             if (recent == dtd.script) { // means: if after starting <script> tag
2060 
2061                 /* Here, ch has to be the first character after <script> */
2062                 parseScript();
2063                 last = makeTag(dtd.getElement("comment"), true);
2064 
2065                 /* Remove leading and trailing HTML comment declarations */
2066                 String str = new String(getChars(0)).trim();
2067                 int minLength = START_COMMENT.length() + END_COMMENT.length();
2068                 if (str.startsWith(START_COMMENT) && str.endsWith(END_COMMENT)
2069                        && str.length() >= (minLength)) {
2070                     str = str.substring(START_COMMENT.length(),
2071                                       str.length() - END_COMMENT.length());
2072                 }
2073 
2074                 /* Handle resulting chars as comment */
2075                 handleComment(str.toCharArray());
2076                 endTag(false);
2077                 lastBlockStartPos = currentPosition;
2078 
2079                 continue;
2080             } else {
2081                 switch (c) {
2082                   case '<':
2083                     parseTag();
2084                     lastBlockStartPos = currentPosition;
2085                     continue;
2086 
2087                   case '/':
2088                     ch = readCh();
2089                     if ((stack != null) && stack.net) {
2090                         // null end tag.
2091                         endTag(false);
2092                         continue;
2093                     }
2094                     break;
2095 
2096                   case -1:
2097                     return;
2098 
2099                   case '&':
2100                     if (textpos == 0) {
2101                         if (!legalElementContext(dtd.pcdata)) {
2102                             error("unexpected.pcdata");
2103                         }
2104                         if (last.breaksFlow()) {
2105                             space = false;
2106                         }
2107                     }
2108                     char data[] = parseEntityReference();
2109                     if (textpos + data.length + 1 > text.length) {
2110                         char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
2111                         System.arraycopy(text, 0, newtext, 0, text.length);
2112                         text = newtext;
2113                     }
2114                     if (space) {
2115                         space = false;
2116                         text[textpos++] = ' ';
2117                     }
2118                     System.arraycopy(data, 0, text, textpos, data.length);
2119                     textpos += data.length;
2120                     ignoreSpace = false;
2121                     continue;
2122 
2123                   case '\n':
2124                     ln++;
2125                     lfCount++;
2126                     ch = readCh();
2127                     if ((stack != null) && stack.pre) {
2128                         break;
2129                     }
2130                     if (textpos == 0) {
2131                         lastBlockStartPos = currentPosition;
2132                     }
2133                     if (!ignoreSpace) {
2134                         space = true;
2135                     }
2136                     continue;
2137 
2138                   case '\r':
2139                     ln++;
2140                     c = '\n';
2141                     if ((ch = readCh()) == '\n') {
2142                         ch = readCh();
2143                         crlfCount++;
2144                     }
2145                     else {
2146                         crCount++;
2147                     }
2148                     if ((stack != null) && stack.pre) {
2149                         break;
2150                     }
2151                     if (textpos == 0) {
2152                         lastBlockStartPos = currentPosition;
2153                     }
2154                     if (!ignoreSpace) {
2155                         space = true;
2156                     }
2157                     continue;
2158 
2159 
2160                   case '\t':
2161                   case ' ':
2162                     ch = readCh();
2163                     if ((stack != null) && stack.pre) {
2164                         break;
2165                     }
2166                     if (textpos == 0) {
2167                         lastBlockStartPos = currentPosition;
2168                     }
2169                     if (!ignoreSpace) {
2170                         space = true;
2171                     }
2172                     continue;
2173 
2174                   default:
2175                     if (textpos == 0) {
2176                         if (!legalElementContext(dtd.pcdata)) {
2177                             error("unexpected.pcdata");
2178                         }
2179                         if (last.breaksFlow()) {
2180                             space = false;
2181                         }
2182                     }
2183                     ch = readCh();
2184                     break;
2185                 }
2186             }
2187 
2188             // enlarge buffer if needed
2189             if (textpos + 2 > text.length) {
2190                 char newtext[] = new char[text.length + 128];
2191                 System.arraycopy(text, 0, newtext, 0, text.length);
2192                 text = newtext;
2193             }
2194 
2195             // output pending space
2196             if (space) {
2197                 if (textpos == 0) {
2198                     lastBlockStartPos--;
2199                 }
2200                 text[textpos++] = ' ';
2201                 space = false;
2202             }
2203             text[textpos++] = (char)c;
2204             ignoreSpace = false;
2205         }
2206     }
2207 
2208     /**
2209      * Returns the end of line string. This will return the end of line
2210      * string that has been encountered the most, one of \r, \n or \r\n.
2211      */
2212     String getEndOfLineString() {
2213         if (crlfCount >= crCount) {
2214             if (lfCount >= crlfCount) {
2215                 return "\n";
2216             }
2217             else {
2218                 return "\r\n";
2219             }
2220         }
2221         else {
2222             if (crCount > lfCount) {
2223                 return "\r";
2224             }
2225             else {
2226                 return "\n";
2227             }
2228         }
2229     }
2230 
2231     /**
2232      * Parse an HTML stream, given a DTD.
2233      */
2234     public synchronized void parse(Reader in) throws IOException {
2235         this.in = in;
2236 
2237         this.ln = 1;
2238 
2239         seenHtml = false;
2240         seenHead = false;
2241         seenBody = false;
2242 
2243         crCount = lfCount = crlfCount = 0;
2244 
2245         try {
2246             ch = readCh();
2247             text = new char[1024];
2248             str = new char[128];
2249 
2250             parseContent();
2251             // NOTE: interruption may have occurred.  Control flows out
2252             // of here normally.
2253             while (stack != null) {
2254                 endTag(true);
2255             }
2256             in.close();
2257         } catch (IOException e) {
2258             errorContext();
2259             error("ioexception");
2260             throw e;
2261         } catch (Exception e) {
2262             errorContext();
2263             error("exception", e.getClass().getName(), e.getMessage());
2264             e.printStackTrace();
2265         } catch (ThreadDeath e) {
2266             errorContext();
2267             error("terminated");
2268             e.printStackTrace();
2269             throw e;
2270         } finally {
2271             for (; stack != null ; stack = stack.next) {
2272                 handleEndTag(stack.tag);
2273             }
2274 
2275             text = null;
2276             str = null;
2277         }
2278 
2279     }
2280 
2281 
2282     /*
2283      * Input cache.  This is much faster than calling down to a synchronized
2284      * method of BufferedReader for each byte.  Measurements done 5/30/97
2285      * show that there's no point in having a bigger buffer:  Increasing
2286      * the buffer to 8192 had no measurable impact for a program discarding
2287      * one character at a time (reading from an http URL to a local machine).
2288      * NOTE: If the current encoding is bogus, and we read too much
2289      * (past the content-type) we may suffer a MalformedInputException. For
2290      * this reason the initial size is 1 and when the body is encountered the
2291      * size is adjusted to 256.
2292      */
2293     private char buf[] = new char[1];
2294     private int pos;
2295     private int len;
2296     /*
2297         tracks position relative to the beginning of the
2298         document.
2299     */
2300     private int currentPosition;
2301 
2302 
2303     private final int readCh() throws IOException {
2304 
2305         if (pos >= len) {
2306 
2307             // This loop allows us to ignore interrupts if the flag
2308             // says so
2309             for (;;) {
2310                 try {
2311                     len = in.read(buf);
2312                     break;
2313                 } catch (InterruptedIOException ex) {
2314                     throw ex;
2315                 }
2316             }
2317 
2318             if (len <= 0) {
2319                 return -1;      // eof
2320             }
2321             pos = 0;
2322         }
2323         ++currentPosition;
2324 
2325         return buf[pos++];
2326     }
2327 
2328 
2329     protected int getCurrentPos() {
2330         return currentPosition;
2331     }
2332 }