Old src/share/classes/javax/swing/text/html/parser/Parser.java

   1 /*
   2  * Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package javax.swing.text.html.parser;
  27 
  28 import javax.swing.text.SimpleAttributeSet;
  29 import javax.swing.text.html.HTML;
  30 import javax.swing.text.ChangedCharSetException;
  31 import java.io.*;
  32 import java.util.Hashtable;
  33 import java.util.Properties;
  34 import java.util.Vector;
  35 import java.util.Enumeration;
  36 import java.net.URL;
  37 
  38 import sun.misc.MessageUtils;
  39 
  40 /**
  41  * A simple DTD-driven HTML parser. The parser reads an
  42  * HTML file from an InputStream and calls various methods
  43  * (which should be overridden in a subclass) when tags and
  44  * data are encountered.
  45  * <p>
  46  * Unfortunately there are many badly implemented HTML parsers
  47  * out there, and as a result there are many badly formatted
  48  * HTML files. This parser attempts to parse most HTML files.
  49  * This means that the implementation sometimes deviates from
  50  * the SGML specification in favor of HTML.
  51  * <p>
  52  * The parser treats \r and \r\n as \n. Newlines after starttags
  53  * and before end tags are ignored just as specified in the SGML/HTML
  54  * specification.
  55  * <p>
  56  * The html spec does not specify how spaces are to be coalesced very well.
  57  * Specifically, the following scenarios are not discussed (note that a
  58  * space should be used here, but I am using &amp;nbsp to force the space to
  59  * be displayed):
  60  * <p>
  61  * '&lt;b&gt;blah&nbsp;&lt;i&gt;&nbsp;&lt;strike&gt;&nbsp;foo' which can be treated as:
  62  * '&lt;b&gt;blah&nbsp;&lt;i&gt;&lt;strike&gt;foo'
  63  * <p>as well as:
  64  * '&lt;p&gt;&lt;a href="xx"&gt;&nbsp;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
  65  * which appears to be treated as:
  66  * '&lt;p&gt;&lt;a href="xx"&gt;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
  67  * <p>
  68  * If <code>strict</code> is false, when a tag that breaks flow,
  69  * (<code>TagElement.breaksFlows</code>) or trailing whitespace is
  70  * encountered, all whitespace will be ignored until a non whitespace
  71  * character is encountered. This appears to give behavior closer to
  72  * the popular browsers.
  73  *
  74  * @see DTD
  75  * @see TagElement
  76  * @see SimpleAttributeSet
  77  * @author Arthur van Hoff
  78  * @author Sunita Mani
  79  */
  80 public
  81 class Parser implements DTDConstants {
  82 
  83     private char text[] = new char[1024];
  84     private int textpos = 0;
  85     private TagElement last;
  86     private boolean space;
  87 
  88     private char str[] = new char[128];
  89     private int strpos = 0;
  90 
  91     protected DTD dtd = null;
  92 
  93     private int ch;
  94     private int ln;
  95     private Reader in;
  96 
  97     private Element recent;
  98     private TagStack stack;
  99     private boolean skipTag = false;
 100     private TagElement lastFormSent = null;
 101     private SimpleAttributeSet attributes = new SimpleAttributeSet();
 102 
 103     // State for <html>, <head> and <body>.  Since people like to slap
 104     // together HTML documents without thinking, occasionally they
 105     // have multiple instances of these tags.  These booleans track
 106     // the first sightings of these tags so they can be safely ignored
 107     // by the parser if repeated.
 108     private boolean seenHtml = false;
 109     private boolean seenHead = false;
 110     private boolean seenBody = false;
 111 
 112     /**
 113      * The html spec does not specify how spaces are coalesced very well.
 114      * If strict == false, ignoreSpace is used to try and mimic the behavior
 115      * of the popular browsers.
 116      * <p>
 117      * The problematic scenarios are:
 118      * '&lt;b>blah &lt;i> &lt;strike> foo' which can be treated as:
 119      * '&lt;b>blah &lt;i>&lt;strike>foo'
 120      * as well as:
 121      * '&lt;p>&lt;a href="xx"> &lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
 122      * which appears to be treated as:
 123      * '&lt;p>&lt;a href="xx">&lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
 124      * <p>
 125      * When a tag that breaks flow, or trailing whitespace is encountered
 126      * ignoreSpace is set to true. From then on, all whitespace will be
 127      * ignored.
 128      * ignoreSpace will be set back to false the first time a
 129      * non whitespace character is encountered. This appears to give
 130      * behavior closer to the popular browsers.
 131      */
 132     private boolean ignoreSpace;
 133 
 134     /**
 135      * This flag determines whether or not the Parser will be strict
 136      * in enforcing SGML compatibility.  If false, it will be lenient
 137      * with certain common classes of erroneous HTML constructs.
 138      * Strict or not, in either case an error will be recorded.
 139      *
 140      */
 141     protected boolean strict = false;
 142 
 143 
 144     /** Number of \r\n's encountered. */
 145     private int crlfCount;
 146     /** Number of \r's encountered. A \r\n will not increment this. */
 147     private int crCount;
 148     /** Number of \n's encountered. A \r\n will not increment this. */
 149     private int lfCount;
 150 
 151     //
 152     // To correctly identify the start of a tag/comment/text we need two
 153     // ivars. Two are needed as handleText isn't invoked until the tag
 154     // after the text has been parsed, that is the parser parses the text,
 155     // then a tag, then invokes handleText followed by handleStart.
 156     //
 157     /** The start position of the current block. Block is overloaded here,
 158      * it really means the current start position for the current comment,
 159      * tag, text. Use getBlockStartPosition to access this. */
 160     private int currentBlockStartPos;
 161     /** Start position of the last block. */
 162     private int lastBlockStartPos;
 163 
 164     /**
 165      * array for mapping numeric references in range
 166      * 130-159 to displayable Unicode characters.
 167      */
 168     private static final char[] cp1252Map = {
 169         8218,  // ‚
 170         402,   // ƒ
 171         8222,  // „
 172         8230,  // …
 173         8224,  // †
 174         8225,  // ‡
 175         710,   // ˆ
 176         8240,  // ‰
 177         352,   // Š
 178         8249,  // ‹
 179         338,   // Œ
 180         141,   // 
 181         142,   // Ž
 182         143,   // 
 183         144,   // 
 184         8216,  // ‘
 185         8217,  // ’
 186         8220,  // “
 187         8221,  // ”
 188         8226,  // •
 189         8211,  // –
 190         8212,  // —
 191         732,   // ˜
 192         8482,  // ™
 193         353,   // š
 194         8250,  // ›
 195         339,   // œ
 196         157,   // 
 197         158,   // ž
 198         376    // Ÿ
 199     };
 200 
 201     public Parser(DTD dtd) {
 202         this.dtd = dtd;
 203     }
 204 
 205 
 206     /**
 207      * @return the line number of the line currently being parsed
 208      */
 209     protected int getCurrentLine() {
 210         return ln;
 211     }
 212 
 213     /**
 214      * Returns the start position of the current block. Block is
 215      * overloaded here, it really means the current start position for
 216      * the current comment tag, text, block.... This is provided for
 217      * subclassers that wish to know the start of the current block when
 218      * called with one of the handleXXX methods.
 219      */
 220     int getBlockStartPosition() {
 221         return Math.max(0, lastBlockStartPos - 1);
 222     }
 223 
 224     /**
 225      * Makes a TagElement.
 226      */
 227     protected TagElement makeTag(Element elem, boolean fictional) {
 228         return new TagElement(elem, fictional);
 229     }
 230 
 231     protected TagElement makeTag(Element elem) {
 232         return makeTag(elem, false);
 233     }
 234 
 235     protected SimpleAttributeSet getAttributes() {
 236         return attributes;
 237     }
 238 
 239     protected void flushAttributes() {
 240         attributes.removeAttributes(attributes);
 241     }
 242 
 243     /**
 244      * Called when PCDATA is encountered.
 245      */
 246     protected void handleText(char text[]) {
 247     }
 248 
 249     /**
 250      * Called when an HTML title tag is encountered.
 251      */
 252     protected void handleTitle(char text[]) {
 253         // default behavior is to call handleText. Subclasses
 254         // can override if necessary.
 255         handleText(text);
 256     }
 257 
 258     /**
 259      * Called when an HTML comment is encountered.
 260      */
 261     protected void handleComment(char text[]) {
 262     }
 263 
 264     protected void handleEOFInComment() {
 265         // We've reached EOF.  Our recovery strategy is to
 266         // see if we have more than one line in the comment;
 267         // if so, we pretend that the comment was an unterminated
 268         // single line comment, and reparse the lines after the
 269         // first line as normal HTML content.
 270 
 271         int commentEndPos = strIndexOf('\n');
 272         if (commentEndPos >= 0) {
 273             handleComment(getChars(0, commentEndPos));
 274             try {
 275                 in.close();
 276                 in = new CharArrayReader(getChars(commentEndPos + 1));
 277                 ch = '>';
 278             } catch (IOException e) {
 279                 error("ioexception");
 280             }
 281 
 282             resetStrBuffer();
 283         } else {
 284             // no newline, so signal an error
 285             error("eof.comment");
 286         }
 287     }
 288 
 289     /**
 290      * Called when an empty tag is encountered.
 291      */
 292     protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
 293     }
 294 
 295     /**
 296      * Called when a start tag is encountered.
 297      */
 298     protected void handleStartTag(TagElement tag) {
 299     }
 300 
 301     /**
 302      * Called when an end tag is encountered.
 303      */
 304     protected void handleEndTag(TagElement tag) {
 305     }
 306 
 307     /**
 308      * An error has occurred.
 309      */
 310     protected void handleError(int ln, String msg) {
 311         /*
 312         Thread.dumpStack();
 313         System.out.println("**** " + stack);
 314         System.out.println("line " + ln + ": error: " + msg);
 315         System.out.println();
 316         */
 317     }
 318 
 319     /**
 320      * Output text.
 321      */
 322     void handleText(TagElement tag) {
 323         if (tag.breaksFlow()) {
 324             space = false;
 325             if (!strict) {
 326                 ignoreSpace = true;
 327             }
 328         }
 329         if (textpos == 0) {
 330             if ((!space) || (stack == null) || last.breaksFlow() ||
 331                 !stack.advance(dtd.pcdata)) {
 332                 last = tag;
 333                 space = false;
 334                 lastBlockStartPos = currentBlockStartPos;
 335                 return;
 336             }
 337         }
 338         if (space) {
 339             if (!ignoreSpace) {
 340                 // enlarge buffer if needed
 341                 if (textpos + 1 > text.length) {
 342                     char newtext[] = new char[text.length + 200];
 343                     System.arraycopy(text, 0, newtext, 0, text.length);
 344                     text = newtext;
 345                 }
 346 
 347                 // output pending space
 348                 text[textpos++] = ' ';
 349                 if (!strict && !tag.getElement().isEmpty()) {
 350                     ignoreSpace = true;
 351                 }
 352             }
 353             space = false;
 354         }
 355         char newtext[] = new char[textpos];
 356         System.arraycopy(text, 0, newtext, 0, textpos);
 357         // Handles cases of bad html where the title tag
 358         // was getting lost when we did error recovery.
 359         if (tag.getElement().getName().equals("title")) {
 360             handleTitle(newtext);
 361         } else {
 362             handleText(newtext);
 363         }
 364         lastBlockStartPos = currentBlockStartPos;
 365         textpos = 0;
 366         last = tag;
 367         space = false;
 368     }
 369 
 370     /**
 371      * Invoke the error handler.
 372      */
 373     protected void error(String err, String arg1, String arg2,
 374         String arg3) {
 375         handleError(ln, err + " " + arg1 + " " + arg2 + " " + arg3);
 376     }
 377 
 378     protected void error(String err, String arg1, String arg2) {
 379         error(err, arg1, arg2, "?");
 380     }
 381     protected void error(String err, String arg1) {
 382         error(err, arg1, "?", "?");
 383     }
 384     protected void error(String err) {
 385         error(err, "?", "?", "?");
 386     }
 387 
 388 
 389     /**
 390      * Handle a start tag. The new tag is pushed
 391      * onto the tag stack. The attribute list is
 392      * checked for required attributes.
 393      */
 394     protected void startTag(TagElement tag) throws ChangedCharSetException {
 395         Element elem = tag.getElement();
 396 
 397         // If the tag is an empty tag and texpos != 0
 398         // this implies that there is text before the
 399         // start tag that needs to be processed before
 400         // handling the tag.
 401         //
 402         if (!elem.isEmpty() ||
 403                     ((last != null) && !last.breaksFlow()) ||
 404                     (textpos != 0)) {
 405             handleText(tag);
 406         } else {
 407             // this variable gets updated in handleText().
 408             // Since in this case we do not call handleText()
 409             // we need to update it here.
 410             //
 411             last = tag;
 412             // Note that we should really check last.breakFlows before
 413             // assuming this should be false.
 414             space = false;
 415         }
 416         lastBlockStartPos = currentBlockStartPos;
 417 
 418         // check required attributes
 419         for (AttributeList a = elem.atts ; a != null ; a = a.next) {
 420             if ((a.modifier == REQUIRED) &&
 421                 ((attributes.isEmpty()) ||
 422                  ((!attributes.isDefined(a.name)) &&
 423                   (!attributes.isDefined(HTML.getAttributeKey(a.name)))))) {
 424                 error("req.att ", a.getName(), elem.getName());
 425             }
 426         }
 427 
 428         if (elem.isEmpty()) {
 429             handleEmptyTag(tag);
 430             /*
 431         } else if (elem.getName().equals("form")) {
 432             handleStartTag(tag);
 433             */
 434         } else {
 435             recent = elem;
 436             stack = new TagStack(tag, stack);
 437             handleStartTag(tag);
 438         }
 439     }
 440 
 441     /**
 442      * Handle an end tag. The end tag is popped
 443      * from the tag stack.
 444      */
 445     protected void endTag(boolean omitted) {
 446         handleText(stack.tag);
 447 
 448         if (omitted && !stack.elem.omitEnd()) {
 449             error("end.missing", stack.elem.getName());
 450         } else if (!stack.terminate()) {
 451             error("end.unexpected", stack.elem.getName());
 452         }
 453 
 454         // handle the tag
 455         handleEndTag(stack.tag);
 456         stack = stack.next;
 457         recent = (stack != null) ? stack.elem : null;
 458     }
 459 
 460 
 461     boolean ignoreElement(Element elem) {
 462 
 463         String stackElement = stack.elem.getName();
 464         String elemName = elem.getName();
 465         /* We ignore all elements that are not valid in the context of
 466            a table except <td>, <th> (these we handle in
 467            legalElementContext()) and #pcdata.  We also ignore the
 468            <font> tag in the context of <ul> and <ol> We additonally
 469            ignore the <meta> and the <style> tag if the body tag has
 470            been seen. **/
 471         if ((elemName.equals("html") && seenHtml) ||
 472             (elemName.equals("head") && seenHead) ||
 473             (elemName.equals("body") && seenBody)) {
 474             return true;
 475         }
 476         if (elemName.equals("dt") || elemName.equals("dd")) {
 477             TagStack s = stack;
 478             while (s != null && !s.elem.getName().equals("dl")) {
 479                 s = s.next;
 480             }
 481             if (s == null) {
 482                 return true;
 483             }
 484         }
 485 
 486         if (((stackElement.equals("table")) &&
 487              (!elemName.equals("#pcdata")) && (!elemName.equals("input"))) ||
 488             ((elemName.equals("font")) &&
 489              (stackElement.equals("ul") || stackElement.equals("ol"))) ||
 490             (elemName.equals("meta") && stack != null) ||
 491             (elemName.equals("style") && seenBody) ||
 492             (stackElement.equals("table") && elemName.equals("a"))) {
 493             return true;
 494         }
 495         return false;
 496     }
 497 
 498 
 499     /**
 500      * Marks the first time a tag has been seen in a document
 501      */
 502 
 503     protected void markFirstTime(Element elem) {
 504         String elemName = elem.getName();
 505         if (elemName.equals("html")) {
 506             seenHtml = true;
 507         } else if (elemName.equals("head")) {
 508             seenHead = true;
 509         } else if (elemName.equals("body")) {
 510             if (buf.length == 1) {
 511                 // Refer to note in definition of buf for details on this.
 512                 char[] newBuf = new char[256];
 513 
 514                 newBuf[0] = buf[0];
 515                 buf = newBuf;
 516             }
 517             seenBody = true;
 518         }
 519     }
 520 
 521     /**
 522      * Create a legal content for an element.
 523      */
 524     boolean legalElementContext(Element elem) throws ChangedCharSetException {
 525 
 526         // System.out.println("-- legalContext -- " + elem);
 527 
 528         // Deal with the empty stack
 529         if (stack == null) {
 530             // System.out.println("-- stack is empty");
 531             if (elem != dtd.html) {
 532                 // System.out.println("-- pushing html");
 533                 startTag(makeTag(dtd.html, true));
 534                 return legalElementContext(elem);
 535             }
 536             return true;
 537         }
 538 
 539         // Is it allowed in the current context
 540         if (stack.advance(elem)) {
 541             // System.out.println("-- legal context");
 542             markFirstTime(elem);
 543             return true;
 544         }
 545         boolean insertTag = false;
 546 
 547         // The use of all error recovery strategies are contingent
 548         // on the value of the strict property.
 549         //
 550         // These are commonly occurring errors.  if insertTag is true,
 551         // then we want to adopt an error recovery strategy that
 552         // involves attempting to insert an additional tag to
 553         // legalize the context.  The two errors addressed here
 554         // are:
 555         // 1) when a <td> or <th> is seen soon after a <table> tag.
 556         //    In this case we insert a <tr>.
 557         // 2) when any other tag apart from a <tr> is seen
 558         //    in the context of a <tr>.  In this case we would
 559         //    like to add a <td>.  If a <tr> is seen within a
 560         //    <tr> context, then we will close out the current
 561         //    <tr>.
 562         //
 563         // This insertion strategy is handled later in the method.
 564         // The reason for checking this now, is that in other cases
 565         // we would like to apply other error recovery strategies for example
 566         // ignoring tags.
 567         //
 568         // In certain cases it is better to ignore a tag than try to
 569         // fix the situation.  So the first test is to see if this
 570         // is what we need to do.
 571         //
 572         String stackElemName = stack.elem.getName();
 573         String elemName = elem.getName();
 574 
 575 
 576         if (!strict &&
 577             ((stackElemName.equals("table") && elemName.equals("td")) ||
 578              (stackElemName.equals("table") && elemName.equals("th")) ||
 579              (stackElemName.equals("tr") && !elemName.equals("tr")))){
 580              insertTag = true;
 581         }
 582 
 583 
 584         if (!strict && !insertTag && (stack.elem.getName() != elem.getName() ||
 585                                       elem.getName().equals("body"))) {
 586             if (skipTag = ignoreElement(elem)) {
 587                 error("tag.ignore", elem.getName());
 588                 return skipTag;
 589             }
 590         }
 591 
 592         // Check for anything after the start of the table besides tr, td, th
 593         // or caption, and if those aren't there, insert the <tr> and call
 594         // legalElementContext again.
 595         if (!strict && stackElemName.equals("table") &&
 596             !elemName.equals("tr") && !elemName.equals("td") &&
 597             !elemName.equals("th") && !elemName.equals("caption")) {
 598             Element e = dtd.getElement("tr");
 599             TagElement t = makeTag(e, true);
 600             legalTagContext(t);
 601             startTag(t);
 602             error("start.missing", elem.getName());
 603             return legalElementContext(elem);
 604         }
 605 
 606         // They try to find a legal context by checking if the current
 607         // tag is valid in an enclosing context.  If so
 608         // close out the tags by outputing end tags and then
 609         // insert the current tag.  If the tags that are
 610         // being closed out do not have an optional end tag
 611         // specification in the DTD then an html error is
 612         // reported.
 613         //
 614         if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) {
 615             for (TagStack s = stack.next ; s != null ; s = s.next) {
 616                 if (s.advance(elem)) {
 617                     while (stack != s) {
 618                         endTag(true);
 619                     }
 620                     return true;
 621                 }
 622                 if (!s.terminate() || (strict && !s.elem.omitEnd())) {
 623                     break;
 624                 }
 625             }
 626         }
 627 
 628         // Check if we know what tag is expected next.
 629         // If so insert the tag.  Report an error if the
 630         // tag does not have its start tag spec in the DTD as optional.
 631         //
 632         Element next = stack.first();
 633         if (next != null && (!strict || next.omitStart()) &&
 634            !(next==dtd.head && elem==dtd.pcdata) ) {
 635             // System.out.println("-- omitting start tag: " + next);
 636             TagElement t = makeTag(next, true);
 637             legalTagContext(t);
 638             startTag(t);
 639             if (!next.omitStart()) {
 640                 error("start.missing", elem.getName());
 641             }
 642             return legalElementContext(elem);
 643         }
 644 
 645 
 646         // Traverse the list of expected elements and determine if adding
 647         // any of these elements would make for a legal context.
 648         //
 649 
 650         if (!strict) {
 651             ContentModel content = stack.contentModel();
 652             Vector<Element> elemVec = new Vector<Element>();
 653             if (content != null) {
 654                 content.getElements(elemVec);
 655                 for (Element e : elemVec) {
 656                     // Ensure that this element has not been included as
 657                     // part of the exclusions in the DTD.
 658                     //
 659                     if (stack.excluded(e.getIndex())) {
 660                         continue;
 661                     }
 662 
 663                     boolean reqAtts = false;
 664 
 665                     for (AttributeList a = e.getAttributes(); a != null ; a = a.next) {
 666                         if (a.modifier == REQUIRED) {
 667                             reqAtts = true;
 668                             break;
 669                         }
 670                     }
 671                     // Ensure that no tag that has required attributes
 672                     // gets inserted.
 673                     //
 674                     if (reqAtts) {
 675                         continue;
 676                     }
 677 
 678                     ContentModel m = e.getContent();
 679                     if (m != null && m.first(elem)) {
 680                         // System.out.println("-- adding a legal tag: " + e);
 681                         TagElement t = makeTag(e, true);
 682                         legalTagContext(t);
 683                         startTag(t);
 684                         error("start.missing", e.getName());
 685                         return legalElementContext(elem);
 686                     }
 687                 }
 688             }
 689         }
 690 
 691         // Check if the stack can be terminated.  If so add the appropriate
 692         // end tag.  Report an error if the tag being ended does not have its
 693         // end tag spec in the DTD as optional.
 694         //
 695         if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) {
 696             // System.out.println("-- omitting end tag: " + stack.elem);
 697             if (!stack.elem.omitEnd()) {
 698                 error("end.missing", elem.getName());
 699             }
 700 
 701             endTag(true);
 702             return legalElementContext(elem);
 703         }
 704 
 705         // At this point we know that something is screwed up.
 706         return false;
 707     }
 708 
 709     /**
 710      * Create a legal context for a tag.
 711      */
 712     void legalTagContext(TagElement tag) throws ChangedCharSetException {
 713         if (legalElementContext(tag.getElement())) {
 714             markFirstTime(tag.getElement());
 715             return;
 716         }
 717 
 718         // Avoid putting a block tag in a flow tag.
 719         if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) {
 720             endTag(true);
 721             legalTagContext(tag);
 722             return;
 723         }
 724 
 725         // Avoid putting something wierd in the head of the document.
 726         for (TagStack s = stack ; s != null ; s = s.next) {
 727             if (s.tag.getElement() == dtd.head) {
 728                 while (stack != s) {
 729                     endTag(true);
 730                 }
 731                 endTag(true);
 732                 legalTagContext(tag);
 733                 return;
 734             }
 735         }
 736 
 737         // Everything failed
 738         error("tag.unexpected", tag.getElement().getName());
 739     }
 740 
 741     /**
 742      * Error context. Something went wrong, make sure we are in
 743      * the document's body context
 744      */
 745     void errorContext() throws ChangedCharSetException {
 746         for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) {
 747             handleEndTag(stack.tag);
 748         }
 749         if (stack == null) {
 750             legalElementContext(dtd.body);
 751             startTag(makeTag(dtd.body, true));
 752         }
 753     }
 754 
 755     /**
 756      * Add a char to the string buffer.
 757      */
 758     void addString(int c) {
 759         if (strpos  == str.length) {
 760             char newstr[] = new char[str.length + 128];
 761             System.arraycopy(str, 0, newstr, 0, str.length);
 762             str = newstr;
 763         }
 764         str[strpos++] = (char)c;
 765     }
 766 
 767     /**
 768      * Get the string that's been accumulated.
 769      */
 770     String getString(int pos) {
 771         char newStr[] = new char[strpos - pos];
 772         System.arraycopy(str, pos, newStr, 0, strpos - pos);
 773         strpos = pos;
 774         return new String(newStr);
 775     }
 776 
 777     char[] getChars(int pos) {
 778         char newStr[] = new char[strpos - pos];
 779         System.arraycopy(str, pos, newStr, 0, strpos - pos);
 780         strpos = pos;
 781         return newStr;
 782     }
 783 
 784     char[] getChars(int pos, int endPos) {
 785         char newStr[] = new char[endPos - pos];
 786         System.arraycopy(str, pos, newStr, 0, endPos - pos);
 787         // REMIND: it's not clear whether this version should set strpos or not
 788         // strpos = pos;
 789         return newStr;
 790     }
 791 
 792     void resetStrBuffer() {
 793         strpos = 0;
 794     }
 795 
 796     int strIndexOf(char target) {
 797         for (int i = 0; i < strpos; i++) {
 798             if (str[i] == target) {
 799                 return i;
 800             }
 801         }
 802 
 803         return -1;
 804     }
 805 
 806     /**
 807      * Skip space.
 808      * [5] 297:5
 809      */
 810     void skipSpace() throws IOException {
 811         while (true) {
 812             switch (ch) {
 813               case '\n':
 814                 ln++;
 815                 ch = readCh();
 816                 lfCount++;
 817                 break;
 818 
 819               case '\r':
 820                 ln++;
 821                 if ((ch = readCh()) == '\n') {
 822                     ch = readCh();
 823                     crlfCount++;
 824                 }
 825                 else {
 826                     crCount++;
 827                 }
 828                 break;
 829               case ' ':
 830               case '\t':
 831                 ch = readCh();
 832                 break;
 833 
 834               default:
 835                 return;
 836             }
 837         }
 838     }
 839 
 840     /**
 841      * Parse identifier. Uppercase characters are folded
 842      * to lowercase when lower is true. Returns falsed if
 843      * no identifier is found. [55] 346:17
 844      */
 845     boolean parseIdentifier(boolean lower) throws IOException {
 846         switch (ch) {
 847           case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 848           case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 849           case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 850           case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 851           case 'Y': case 'Z':
 852             if (lower) {
 853                 ch = 'a' + (ch - 'A');
 854             }
 855 
 856           case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 857           case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 858           case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 859           case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 860           case 'y': case 'z':
 861             break;
 862 
 863           default:
 864             return false;
 865         }
 866 
 867         while (true) {
 868             addString(ch);
 869 
 870             switch (ch = readCh()) {
 871               case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 872               case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 873               case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 874               case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 875               case 'Y': case 'Z':
 876                 if (lower) {
 877                     ch = 'a' + (ch - 'A');
 878                 }
 879 
 880               case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 881               case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 882               case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 883               case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 884               case 'y': case 'z':
 885 
 886               case '0': case '1': case '2': case '3': case '4':
 887               case '5': case '6': case '7': case '8': case '9':
 888 
 889               case '.': case '-':
 890 
 891               case '_': // not officially allowed
 892                 break;
 893 
 894               default:
 895                 return true;
 896             }
 897         }
 898     }
 899 
    /**
     * Parse an entity reference. [59] 350:17
     *
     * Both callers visible in this file invoke this method from their
     * ampersand case, so on entry the current character is the ampersand
     * that starts the reference; the method begins by reading the
     * character after it.
     *
     * The returned characters depend on what follows the ampersand:
     *  - '#' and decimal digits, or '#', 'x'/'X' and hexadecimal digits:
     *    the result of mapNumericReference() for the parsed value;
     *  - '#' followed by something that is neither of the above nor an
     *    identifier: the two characters ampersand and '#', after
     *    "ident.expected" has been reported;
     *  - no identifier at all: the ampersand alone;
     *  - a name for which the DTD has no general entity: the reference
     *    text restored (ampersand, name, and ';' if one was consumed);
     *  - otherwise the data of the matching entity.
     *
     * One line terminator or one ';' directly after the reference is
     * consumed as part of it.
     */
    private char[] parseEntityReference() throws IOException {
        // remember where the name starts so it can be extracted/discarded
        int pos = strpos;

        if ((ch = readCh()) == '#') {
            // NOTE(review): n is a plain int and is not range-checked while
            // digits are accumulated.
            int n = 0;
            ch = readCh();
            if ((ch >= '0') && (ch <= '9') ||
                    ch == 'x' || ch == 'X') {

                if ((ch >= '0') && (ch <= '9')) {
                    // parse decimal reference
                    while ((ch >= '0') && (ch <= '9')) {
                        n = (n * 10) + ch - '0';
                        ch = readCh();
                    }
                } else {
                    // parse hexadecimal reference
                    // skip the 'x'/'X' marker itself
                    ch = readCh();
                    char lch = (char) Character.toLowerCase(ch);
                    while ((lch >= '0') && (lch <= '9') ||
                            (lch >= 'a') && (lch <= 'f')) {
                        if (lch >= '0' && lch <= '9') {
                            n = (n * 16) + lch - '0';
                        } else {
                            n = (n * 16) + lch - 'a' + 10;
                        }
                        ch = readCh();
                        lch = (char) Character.toLowerCase(ch);
                    }
                }
                // consume the terminator of the numeric reference, if any
                switch (ch) {
                    case '\n':
                        ln++;
                        ch = readCh();
                        lfCount++;
                        break;

                    case '\r':
                        ln++;
                        if ((ch = readCh()) == '\n') {
                            ch = readCh();
                            crlfCount++;
                        }
                        else {
                            crCount++;
                        }
                        break;

                    case ';':
                        ch = readCh();
                        break;
                }
                char data[] = mapNumericReference(n);
                return data;
            }
            // '#' followed by a name: keep the '#' as part of the name
            addString('#');
            if (!parseIdentifier(false)) {
                error("ident.expected");
                // drop the '#' that was just buffered
                strpos = pos;
                char data[] = {'&', '#'};
                return data;
            }
        } else if (!parseIdentifier(false)) {
            // a lone ampersand is passed through as text
            char data[] = {'&'};
            return data;
        }

        // records whether a ';' was consumed, so it can be restored below
        boolean semicolon = false;

        switch (ch) {
          case '\n':
            ln++;
            ch = readCh();
            lfCount++;
            break;

          case '\r':
            ln++;
            if ((ch = readCh()) == '\n') {
                ch = readCh();
                crlfCount++;
            }
            else {
                crCount++;
            }
            break;

          case ';':
            semicolon = true;

            ch = readCh();
            break;
        }

        // extract the name and remove it from the string buffer
        String nm = getString(pos);
        Entity ent = dtd.getEntity(nm);

        // entities are case sensitive - however if strict
        // is false then we will try to make a match by
        // converting the string to all lowercase.
        //
        if (!strict && (ent == null)) {
            ent = dtd.getEntity(nm.toLowerCase());
        }
        if ((ent == null) || !ent.isGeneral()) {

            if (nm.length() == 0) {
                error("invalid.entref", nm);
                return new char[0];
            }
            /* given that there is not a match restore the entity reference */
            // note: this local shadows the str field used as string buffer
            String str = "&" + nm + (semicolon ? ";" : "");

            char b[] = new char[str.length()];
            str.getChars(0, b.length, b, 0);
            return b;
        }
        return ent.getData();
    }
1022 
1023     /**
1024      * Converts numeric character reference to char array.
1025      *
1026      * Normally the code in a reference should be always converted
1027      * to the Unicode character with the same code, but due to
1028      * wide usage of Cp1252 charset most browsers map numeric references
1029      * in the range 130-159 (which are control chars in Unicode set)
1030      * to displayable characters with other codes.
1031      *
1032      * @param c the code of numeric character reference.
1033      * @return a char array corresponding to the reference code.
1034      */
1035     private char[] mapNumericReference(int c) {
1036         char[] data;
1037         if (c >= 0xffff) { // outside unicode BMP.
1038             try {
1039                 data = Character.toChars(c);
1040             } catch (IllegalArgumentException e) {
1041                 data = new char[0];
1042             }
1043         } else {
1044             data = new char[1];
1045             data[0] = (c < 130 || c > 159) ? (char) c : cp1252Map[c - 130];
1046         }
1047         return data;
1048     }
1049 
    /**
     * Parse a comment. [92] 391:7
     *
     * Consumes input starting at the current character and appends the
     * comment text to the string buffer through addString(). A CR or
     * CR LF line end is appended as a single '\n'. The method returns
     * once a terminator is recognised (see the '-' case below) or after
     * handleEOFInComment() has been called at end of input.
     */
    void parseComment() throws IOException {

        while (true) {
            // c is the character that will be appended after the switch;
            // ch is advanced to the look-ahead character inside each case
            int c = ch;
            switch (c) {
              case '-':
                  /** Presuming that the start string of a comment "<!--" has
                      already been parsed, the '-' character is valid only as
                      part of a comment termination and further more it must
                      be present in even numbers. Hence if strict is true, we
                      presume the comment has been terminated and return.
                      However if strict is false, then there is no even number
                      requirement and this character can appear anywhere in the
                      comment.  The parser reads on until it sees the following
                      pattern: "-->" or "--!>".
                   **/
                // lenient mode, and the previously buffered char is also '-'
                if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) {
                    if ((ch = readCh()) == '>') {
                        return;
                    }
                    if (ch == '!') {
                        if ((ch = readCh()) == '>') {
                            return;
                        } else {
                            /* to account for extra read()'s that happened */
                            addString('-');
                            addString('!');
                            continue;
                        }
                    }
                    break;
                }

                // first '-' of a possible "--" pair
                if ((ch = readCh()) == '-') {
                    ch = readCh();
                    if (strict || ch == '>') {
                        return;
                    }
                    if (ch == '!') {
                        if ((ch = readCh()) == '>') {
                            return;
                        } else {
                            /* to account for extra read()'s that happened */
                            addString('-');
                            addString('!');
                            continue;
                        }
                    }
                    /* to account for the extra read() */
                    addString('-');
                }
                break;

              case -1:
                  handleEOFInComment();
                  return;

              case '\n':
                ln++;
                ch = readCh();
                lfCount++;
                break;

              case '>':
                // a '>' not preceded by "--" is ordinary comment text
                ch = readCh();
                break;

              case '\r':
                ln++;
                if ((ch = readCh()) == '\n') {
                    ch = readCh();
                    crlfCount++;
                }
                else {
                    crCount++;
                }
                // store the line end in normalized form
                c = '\n';
                break;
              default:
                ch = readCh();
                break;
            }

            addString(c);
        }
    }
1139 
    /**
     * Parse literal content. [46] 343:1 and [47] 344:1
     *
     * Copies input characters into the text buffer until a '>' is seen
     * that closes an end tag for the element on top of the tag stack,
     * i.e. the buffered text ends with "&lt;/" followed by that element's
     * name (buffered characters are lower-cased for the comparison). The
     * end tag text, and a '\n' directly before it, are then removed from
     * the buffer and endTag(false) is called. Entity references are
     * expanded through parseEntityReference(); CR and CR LF are stored
     * as '\n'. At end of input "eof.literal" is reported and endTag(true)
     * is called.
     *
     * @param replace not referenced by the body of this method
     */
    void parseLiteral(boolean replace) throws IOException {
        while (true) {
            // c is the character stored after the switch; ch becomes the
            // look-ahead character
            int c = ch;
            switch (c) {
              case -1:
                error("eof.literal", stack.elem.getName());
                endTag(true);
                return;

              case '>':
                ch = readCh();
                // i: where "</name" would start if the buffer ends with it
                // (the '>' itself has not been stored yet); j: index into
                // the element name
                int i = textpos - (stack.elem.name.length() + 2), j = 0;

                // match end tag
                if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) {
                    // empty-bodied loop: advance while the name matches
                    while ((++i < textpos) &&
                           (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++)));
                    if (i == textpos) {
                        // whole name matched: drop "</name" from the text
                        textpos -= (stack.elem.name.length() + 2);
                        if ((textpos > 0) && (text[textpos-1] == '\n')) {
                            textpos--;
                        }
                        endTag(false);
                        return;
                    }
                }
                break;

              case '&':
                char data[] = parseEntityReference();
                // grow the text buffer if the expansion does not fit
                if (textpos + data.length > text.length) {
                    char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
                    System.arraycopy(text, 0, newtext, 0, text.length);
                    text = newtext;
                }
                System.arraycopy(data, 0, text, textpos, data.length);
                textpos += data.length;
                // expansion already stored; skip the single-char output below
                continue;

              case '\n':
                ln++;
                ch = readCh();
                lfCount++;
                break;

              case '\r':
                ln++;
                if ((ch = readCh()) == '\n') {
                    ch = readCh();
                    crlfCount++;
                }
                else {
                    crCount++;
                }
                // store the line end in normalized form
                c = '\n';
                break;
              default:
                ch = readCh();
                break;
            }

            // output character
            if (textpos == text.length) {
                char newtext[] = new char[text.length + 128];
                System.arraycopy(text, 0, newtext, 0, text.length);
                text = newtext;
            }
            text[textpos++] = (char)c;
        }
    }
1213 
    /**
     * Parse attribute value. [33] 331:1
     *
     * The value may be delimited by single or double quotes, or be
     * undelimited. Characters are collected in the string buffer and the
     * whole buffer content is returned (and removed) via getString(0).
     * An undelimited value ends at white space, a line terminator, '&lt;'
     * or '&gt;'; a delimited value ends at the matching quote. End of
     * input ends either form. Entity references are expanded through
     * parseEntityReference(), except for undelimited values in strict
     * mode, where the ampersand is kept as an ordinary character.
     *
     * @param lower when true, uppercase ASCII letters are folded to
     *              lowercase before being stored
     * @return the attribute value text
     */
    String parseAttributeValue(boolean lower) throws IOException {
        // -1 means "no delimiter"; otherwise the opening quote character
        int delim = -1;

        // Check for a delimiter
        switch(ch) {
          case '\'':
          case '"':
            delim = ch;
            ch = readCh();
            break;
        }

        // Parse the rest of the value
        while (true) {
            // c is the character stored after the switch; ch becomes the
            // look-ahead character
            int c = ch;

            switch (c) {
              case '\n':
                ln++;
                ch = readCh();
                lfCount++;
                if (delim < 0) {
                    return getString(0);
                }
                break;

              case '\r':
                ln++;

                if ((ch = readCh()) == '\n') {
                    ch = readCh();
                    crlfCount++;
                }
                else {
                    crCount++;
                }
                if (delim < 0) {
                    return getString(0);
                }
                // NOTE(review): unlike parseComment/parseLiteral, c is not
                // normalized to '\n' here, so a '\r' is stored.
                break;

              case '\t':
                  // NOTE(review): this assignment has no visible effect; in
                  // the undelimited case the method returns below before c
                  // is stored.
                  if (delim < 0)
                      c = ' ';
                  // falls through
              case ' ':
                ch = readCh();
                if (delim < 0) {
                    return getString(0);
                }
                break;

              case '>':
              case '<':
                // ends an undelimited value without being consumed
                if (delim < 0) {
                    return getString(0);
                }
                ch = readCh();
                break;

              case '\'':
              case '"':
                ch = readCh();
                if (c == delim) {
                    // closing quote
                    return getString(0);
                } else if (delim == -1) {
                    // quote inside an undelimited value
                    error("attvalerr");
                    if (strict || ch == ' ') {
                        return getString(0);
                    } else {
                        // drop the quote and keep going
                        continue;
                    }
                }
                // the other kind of quote inside a delimited value: keep it
                break;

            case '=':
                if (delim < 0) {
                    /* In SGML a construct like <img src=/cgi-bin/foo?x=1>
                       is considered invalid since an = sign can only be contained
                       in an attributes value if the string is quoted.
                       */
                    error("attvalerr");
                    /* If strict is true then we return with the string we have thus far.
                       Otherwise we accept the = sign as part of the attribute's value and
                       process the rest of the img tag. */
                    if (strict) {
                        return getString(0);
                    }
                }
                ch = readCh();
                break;

              case '&':
                if (strict && delim < 0) {
                    ch = readCh();
                    break;
                }

                char data[] = parseEntityReference();
                for (int i = 0 ; i < data.length ; i++) {
                    c = data[i];
                    addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c);
                }
                // expansion already stored; skip addString(c) below
                continue;

              case -1:
                return getString(0);

              default:
                if (lower && (c >= 'A') && (c <= 'Z')) {
                    c = 'a' + c - 'A';
                }
                ch = readCh();
                break;
            }
            addString(c);
        }
    }
1334 
1335 
    /**
     * Parse attribute specification List. [31] 327:17
     *
     * Reads attribute specifications of a tag until '/', '&gt;', '&lt;' or
     * end of input is reached, and stores each one in the attributes
     * set of this parser. Besides the regular "name", "name=value" and
     * bare-value forms, several malformed forms are tolerated when strict
     * is false (comma separators, quoted attribute names, a leading
     * '=' with no name). Problems are reported through error().
     *
     * @param elem the element whose tag is being parsed; used to look up
     *             attribute declarations and for error messages
     */
    void parseAttributeSpecificationList(Element elem) throws IOException {

        while (true) {
            skipSpace();

            switch (ch) {
              case '/':
              case '>':
              case '<':
              case -1:
                // end of the attribute list; the character is not consumed
                return;

              case '-':
                if ((ch = readCh()) == '-') {
                    // "--" starts a comment inside the tag: parse it and
                    // throw away its text
                    ch = readCh();
                    parseComment();
                    strpos = 0;
                } else {
                    error("invalid.tagchar", "-", elem.getName());
                    ch = readCh();
                }
                continue;
            }

            AttributeList att;
            String attname;
            String attvalue;

            if (parseIdentifier(true)) {
                attname = getString(0);
                skipSpace();
                if (ch == '=') {
                    // name=value
                    ch = readCh();
                    skipSpace();
                    att = elem.getAttribute(attname);
//  Bug ID 4102750
//  Load the NAME of an Attribute Case Sensitive
//  The case of the NAME  must be intact
//  MG 021898
                    attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME));
//                  attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION));
                } else {
                    // no '=': the identifier is first tried as a value of
                    // one of the element's attributes, then as a name
                    attvalue = attname;
                    att = elem.getAttributeByValue(attvalue);
                    if (att == null) {
                        att = elem.getAttribute(attname);
                        if (att != null) {
                            attvalue = att.getValue();
                        }
                        else {
                            // Make it null so that NULL_ATTRIBUTE_VALUE is
                            // used
                            attvalue = null;
                        }
                    }
                }
            } else if (!strict && ch == ',') { // allows for comma separated attribute-value pairs
                ch = readCh();
                continue;
            } else if (!strict && ch == '"') { // allows for quoted attributes
                ch = readCh();
                skipSpace();
                if (parseIdentifier(true)) {
                    attname = getString(0);
                    if (ch == '"') {
                        ch = readCh();
                    }
                    skipSpace();
                    if (ch == '=') {
                        ch = readCh();
                        skipSpace();
                        att = elem.getAttribute(attname);
                        // NOTE(review): unlike the unquoted branch above,
                        // this call does not exclude NAME-typed attributes
                        // from lower-casing - confirm whether intended.
                        attvalue = parseAttributeValue((att != null) &&
                                                (att.type != CDATA) &&
                                                (att.type != NOTATION));
                    } else {
                        attvalue = attname;
                        att = elem.getAttributeByValue(attvalue);
                        if (att == null) {
                            att = elem.getAttribute(attname);
                            if (att != null) {
                                attvalue = att.getValue();
                            }
                            // NOTE(review): unlike the unquoted branch,
                            // attvalue is not reset to null when no
                            // attribute matches - confirm whether intended.
                        }
                    }
                } else {
                    char str[] = {(char)ch};
                    error("invalid.tagchar", new String(str), elem.getName());
                    ch = readCh();
                    continue;
                }
            } else if (!strict && (attributes.isEmpty()) && (ch == '=')) {
                // "=value" as the first thing in the tag: the element's own
                // name is used as the attribute name
                ch = readCh();
                skipSpace();
                attname = elem.getName();
                att = elem.getAttribute(attname);
                attvalue = parseAttributeValue((att != null) &&
                                               (att.type != CDATA) &&
                                               (att.type != NOTATION));
            } else if (!strict && (ch == '=')) {
                // stray "=value" after other attributes: the value is
                // consumed and discarded, and parsing of the list stops
                ch = readCh();
                skipSpace();
                attvalue = parseAttributeValue(true);
                error("attvalerr");
                return;
            } else {
                char str[] = {(char)ch};
                error("invalid.tagchar", new String(str), elem.getName());
                if (!strict) {
                    // skip the offending character and carry on
                    ch = readCh();
                    continue;
                } else {
                    return;
                }
            }

            if (att != null) {
                // use the name as declared for the element
                attname = att.getName();
            } else {
                error("invalid.tagatt", attname, elem.getName());
            }

            // Check out the value
            if (attributes.isDefined(attname)) {
                error("multi.tagatt", attname, elem.getName());
            }
            if (attvalue == null) {
                // no value given: use the declared one if there is any
                attvalue = ((att != null) && (att.value != null)) ? att.value :
                    HTML.NULL_ATTRIBUTE_VALUE;
            } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) {
                error("invalid.tagattval", attname, elem.getName());
            }
            // store under the HTML.Attribute key when one exists for the
            // name, otherwise under the plain name string
            HTML.Attribute attkey = HTML.getAttributeKey(attname);
            if (attkey == null) {
                attributes.addAttribute(attname, attvalue);
            } else {
                attributes.addAttribute(attkey, attvalue);
            }
        }
    }
1479 
1480     /**
1481      * Parses th Document Declaration Type markup declaration.
1482      * Currently ignores it.
1483      */
1484     public String parseDTDMarkup() throws IOException {
1485 
1486         StringBuilder strBuff = new StringBuilder();
1487         ch = readCh();
1488         while(true) {
1489             switch (ch) {
1490             case '>':
1491                 ch = readCh();
1492                 return strBuff.toString();
1493             case -1:
1494                 error("invalid.markup");
1495                 return strBuff.toString();
1496             case '\n':
1497                 ln++;
1498                 ch = readCh();
1499                 lfCount++;
1500                 break;
1501             case '"':
1502                 ch = readCh();
1503                 break;
1504             case '\r':
1505                 ln++;
1506                 if ((ch = readCh()) == '\n') {
1507                     ch = readCh();
1508                     crlfCount++;
1509                 }
1510                 else {
1511                     crCount++;
1512                 }
1513                 break;
1514             default:
1515                 strBuff.append((char)(ch & 0xFF));
1516                 ch = readCh();
1517                 break;
1518             }
1519         }
1520     }
1521 
1522     /**
1523      * Parse markup declarations.
1524      * Currently only handles the Document Type Declaration markup.
1525      * Returns true if it is a markup declaration false otherwise.
1526      */
1527     protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException {
1528 
1529         /* Currently handles only the DOCTYPE */
1530         if ((strBuff.length() == "DOCTYPE".length()) &&
1531             (strBuff.toString().toUpperCase().equals("DOCTYPE"))) {
1532             parseDTDMarkup();
1533             return true;
1534         }
1535         return false;
1536     }
1537 
1538     /**
1539      * Parse an invalid tag.
1540      */
1541     void parseInvalidTag() throws IOException {
1542         // ignore all data upto the close bracket '>'
1543         while (true) {
1544             skipSpace();
1545             switch (ch) {
1546               case '>':
1547               case -1:
1548                   ch = readCh();
1549                 return;
1550               case '<':
1551                   return;
1552               default:
1553                   ch = readCh();
1554 
1555             }
1556         }
1557     }
1558 
1559     /**
1560      * Parse a start or end tag.
1561      */
1562     void parseTag() throws IOException {
1563         Element elem;
1564         boolean net = false;
1565         boolean warned = false;
1566         boolean unknown = false;
1567 
1568         switch (ch = readCh()) {
1569           case '!':
1570             switch (ch = readCh()) {
1571               case '-':
1572                 // Parse comment. [92] 391:7
1573                 while (true) {
1574                     if (ch == '-') {
1575                         if (!strict || ((ch = readCh()) == '-')) {
1576                             ch = readCh();
1577                             if (!strict && ch == '-') {
1578                                 ch = readCh();
1579                             }
1580                             // send over any text you might see
1581                             // before parsing and sending the
1582                             // comment
1583                             if (textpos != 0) {
1584                                 char newtext[] = new char[textpos];
1585                                 System.arraycopy(text, 0, newtext, 0, textpos);
1586                                 handleText(newtext);
1587                                 lastBlockStartPos = currentBlockStartPos;
1588                                 textpos = 0;
1589                             }
1590                             parseComment();
1591                             last = makeTag(dtd.getElement("comment"), true);
1592                             handleComment(getChars(0));
1593                             continue;
1594                         } else if (!warned) {
1595                             warned = true;
1596                             error("invalid.commentchar", "-");
1597                         }
1598                     }
1599                     skipSpace();
1600                     switch (ch) {
1601                       case '-':
1602                         continue;
1603                       case '>':
1604                         ch = readCh();
1605                       case -1:
1606                         return;
1607                       default:
1608                         ch = readCh();
1609                         if (!warned) {
1610                             warned = true;
1611                             error("invalid.commentchar",
1612                                   String.valueOf((char)ch));
1613                         }
1614                         break;
1615                     }
1616                 }
1617 
1618               default:
1619                 // deal with marked sections
1620                 StringBuffer strBuff = new StringBuffer();
1621                 while (true) {
1622                     strBuff.append((char)ch);
1623                     if (parseMarkupDeclarations(strBuff)) {
1624                         return;
1625                     }
1626                     switch(ch) {
1627                       case '>':
1628                         ch = readCh();
1629                       case -1:
1630                         error("invalid.markup");
1631                         return;
1632                       case '\n':
1633                         ln++;
1634                         ch = readCh();
1635                         lfCount++;
1636                         break;
1637                       case '\r':
1638                         ln++;
1639                         if ((ch = readCh()) == '\n') {
1640                             ch = readCh();
1641                             crlfCount++;
1642                         }
1643                         else {
1644                             crCount++;
1645                         }
1646                         break;
1647 
1648                       default:
1649                         ch = readCh();
1650                         break;
1651                     }
1652                 }
1653             }
1654 
1655           case '/':
1656             // parse end tag [19] 317:4
1657             switch (ch = readCh()) {
1658               case '>':
1659                 ch = readCh();
1660               case '<':
1661                 // empty end tag. either </> or </<
1662                 if (recent == null) {
1663                     error("invalid.shortend");
1664                     return;
1665                 }
1666                 elem = recent;
1667                 break;
1668 
1669               default:
1670                 if (!parseIdentifier(true)) {
1671                     error("expected.endtagname");
1672                     return;
1673                 }
1674                 skipSpace();
1675                 switch (ch) {
1676                   case '>':
1677                     ch = readCh();
1678                   case '<':
1679                     break;
1680 
1681                   default:
1682                     error("expected", "'>'");
1683                     while ((ch != -1) && (ch != '\n') && (ch != '>')) {
1684                         ch = readCh();
1685                     }
1686                     if (ch == '>') {
1687                         ch = readCh();
1688                     }
1689                     break;
1690                 }
1691                 String elemStr = getString(0);
1692                 if (!dtd.elementExists(elemStr)) {
1693                     error("end.unrecognized", elemStr);
1694                     // Ignore RE before end tag
1695                     if ((textpos > 0) && (text[textpos-1] == '\n')) {
1696                         textpos--;
1697                     }
1698                     elem = dtd.getElement("unknown");
1699                     elem.name = elemStr;
1700                     unknown = true;
1701                 } else {
1702                     elem = dtd.getElement(elemStr);
1703                 }
1704                 break;
1705             }
1706 
1707 
1708             // If the stack is null, we're seeing end tags without any begin
1709             // tags.  Ignore them.
1710 
1711             if (stack == null) {
1712                 error("end.extra.tag", elem.getName());
1713                 return;
1714             }
1715 
1716             // Ignore RE before end tag
1717             if ((textpos > 0) && (text[textpos-1] == '\n')) {
1718                 // In a pre tag, if there are blank lines
1719                 // we do not want to remove the newline
1720                 // before the end tag.  Hence this code.
1721                 //
1722                 if (stack.pre) {
1723                     if ((textpos > 1) && (text[textpos-2] != '\n')) {
1724                         textpos--;
1725                     }
1726                 } else {
1727                     textpos--;
1728                 }
1729             }
1730 
1731             // If the end tag is a form, since we did not put it
1732             // on the tag stack, there is no corresponding start
1733             // tag to find. Hence do not touch the tag stack.
1734             //
1735 
1736             /*
1737             if (!strict && elem.getName().equals("form")) {
1738                 if (lastFormSent != null) {
1739                     handleEndTag(lastFormSent);
1740                     return;
1741                 } else {
1742                     // do nothing.
1743                     return;
1744                 }
1745             }
1746             */
1747 
1748             if (unknown) {
1749                 // we will not see a corresponding start tag
1750                 // on the stack.  If we are seeing an
1751                 // end tag, lets send this on as an empty
1752                 // tag with the end tag attribute set to
1753                 // true.
1754                 TagElement t = makeTag(elem);
1755                 handleText(t);
1756                 attributes.addAttribute(HTML.Attribute.ENDTAG, "true");
1757                 handleEmptyTag(makeTag(elem));
1758                 unknown = false;
1759                 return;
1760             }
1761 
1762             // find the corresponding start tag
1763 
1764             // A commonly occurring error appears to be the insertion
1765             // of extra end tags in a table.  The intent here is ignore
1766             // such extra end tags.
1767             //
1768             if (!strict) {
1769                 String stackElem = stack.elem.getName();
1770 
1771                 if (stackElem.equals("table")) {
1772                     // If it is not a valid end tag ignore it and return
1773                     //
1774                     if (!elem.getName().equals(stackElem)) {
1775                         error("tag.ignore", elem.getName());
1776                         return;
1777                     }
1778                 }
1779 
1780 
1781 
1782                 if (stackElem.equals("tr") ||
1783                     stackElem.equals("td")) {
1784                     if ((!elem.getName().equals("table")) &&
1785                         (!elem.getName().equals(stackElem))) {
1786                         error("tag.ignore", elem.getName());
1787                         return;
1788                     }
1789                 }
1790             }
1791             TagStack sp = stack;
1792 
1793             while ((sp != null) && (elem != sp.elem)) {
1794                 sp = sp.next;
1795             }
1796             if (sp == null) {
1797                 error("unmatched.endtag", elem.getName());
1798                 return;
1799             }
1800 
1801             // People put font ending tags in the darndest places.
1802             // Don't close other contexts based on them being between
1803             // a font tag and the corresponding end tag.  Instead,
1804             // ignore the end tag like it doesn't exist and allow the end
1805             // of the document to close us out.
1806             String elemName = elem.getName();
1807             if (stack != sp &&
1808                 (elemName.equals("font") ||
1809                  elemName.equals("center"))) {
1810 
1811                 // Since closing out a center tag can have real weird
1812                 // effects on the formatting,  make sure that tags
1813                 // for which omitting an end tag is legitimate
1814                 // get closed out.
1815                 //
1816                 if (elemName.equals("center")) {
1817                     while(stack.elem.omitEnd() && stack != sp) {
1818                         endTag(true);
1819                     }
1820                     if (stack.elem == elem) {
1821                         endTag(false);
1822                     }
1823                 }
1824                 return;
1825             }
1826             // People do the same thing with center tags.  In this
1827             // case we would like to close off the center tag but
1828             // not necessarily all enclosing tags.
1829 
1830 
1831 
1832             // end tags
1833             while (stack != sp) {
1834                 endTag(true);
1835             }
1836 
1837             endTag(false);
1838             return;
1839 
1840           case -1:
1841             error("eof");
1842             return;
1843         }
1844 
1845         // start tag [14] 314:1
1846         if (!parseIdentifier(true)) {
1847             elem = recent;
1848             if ((ch != '>') || (elem == null)) {
1849                 error("expected.tagname");
1850                 return;
1851             }
1852         } else {
1853             String elemStr = getString(0);
1854 
1855             if (elemStr.equals("image")) {
1856                 elemStr = "img";
1857             }
1858 
1859             /* determine if this element is part of the dtd. */
1860 
1861             if (!dtd.elementExists(elemStr)) {
1862                 //              parseInvalidTag();
1863                 error("tag.unrecognized ", elemStr);
1864                 elem = dtd.getElement("unknown");
1865                 elem.name = elemStr;
1866                 unknown = true;
1867             } else {
1868                 elem = dtd.getElement(elemStr);
1869             }
1870         }
1871 
1872         // Parse attributes
1873         parseAttributeSpecificationList(elem);
1874 
1875         switch (ch) {
1876           case '/':
1877             net = true;
1878           case '>':
1879             ch = readCh();
1880             if (ch == '>' && net) {
1881                 ch = readCh();
1882             }
1883           case '<':
1884             break;
1885 
1886           default:
1887             error("expected", "'>'");
1888             break;
1889         }
1890 
1891         if (!strict) {
1892           if (elem.getName().equals("script")) {
1893             error("javascript.unsupported");
1894           }
1895         }
1896 
1897         // ignore RE after start tag
1898         //
1899         if (!elem.isEmpty())  {
1900             if (ch == '\n') {
1901                 ln++;
1902                 lfCount++;
1903                 ch = readCh();
1904             } else if (ch == '\r') {
1905                 ln++;
1906                 if ((ch = readCh()) == '\n') {
1907                     ch = readCh();
1908                     crlfCount++;
1909                 }
1910                 else {
1911                     crCount++;
1912                 }
1913             }
1914         }
1915 
1916         // ensure a legal context for the tag
1917         TagElement tag = makeTag(elem, false);
1918 
1919 
1920         /** In dealing with forms, we have decided to treat
1921             them as legal in any context.  Also, even though
1922             they do have a start and an end tag, we will
1923             not put this tag on the stack.  This is to deal
1924             several pages in the web oasis that choose to
1925             start and end forms in any possible location. **/
1926 
1927         /*
1928         if (!strict && elem.getName().equals("form")) {
1929             if (lastFormSent == null) {
1930                 lastFormSent = tag;
1931             } else {
1932                 handleEndTag(lastFormSent);
1933                 lastFormSent = tag;
1934             }
1935         } else {
1936         */
1937             // Similarly, if a tag is unknown, we will apply
1938             // no legalTagContext logic to it.
1939             //
1940             if (!unknown) {
1941                 legalTagContext(tag);
1942 
1943                 // If skip tag is true,  this implies that
1944                 // the tag was illegal and that the error
1945                 // recovery strategy adopted is to ignore
1946                 // the tag.
1947                 if (!strict && skipTag) {
1948                     skipTag = false;
1949                     return;
1950                 }
1951             }
1952             /*
1953         }
1954             */
1955 
1956         startTag(tag);
1957 
1958         if (!elem.isEmpty()) {
1959             switch (elem.getType()) {
1960               case CDATA:
1961                 parseLiteral(false);
1962                 break;
1963               case RCDATA:
1964                 parseLiteral(true);
1965                 break;
1966               default:
1967                 if (stack != null) {
1968                     stack.net = net;
1969                 }
1970                 break;
1971             }
1972         }
1973     }
1974 
1975     private static final String START_COMMENT = "<!--"; // opening delimiter of an HTML comment
1976     private static final String END_COMMENT = "-->"; // closing delimiter of an HTML comment
1977     private static final char[] SCRIPT_END_TAG = "</script>".toCharArray(); // end tag that parseScript() scans for
1978     private static final char[] SCRIPT_END_TAG_UPPER_CASE = // upper-case twin; parseScript() accepts either case at each index
1979                                         "</SCRIPT>".toCharArray();
1980 
1981     /**
1982      * Consumes a script body through its end tag, passing the text to addString().
1983      * On entry ch is the first character after the start tag; on return it is the
1984      * first one after the end tag, or -1 at end of input ("eof.script" error). The
1985      * end tag is ignored inside an HTML comment; line breaks are added as '\n'.
1986      */
1987     void parseScript() throws IOException {
1988         char[] matched = new char[SCRIPT_END_TAG.length];
1989         boolean inComment = false;
1990         for (;;) {
1991             int n = 0;    // how much of the end tag matches, in either case
1992             while (!inComment && n < SCRIPT_END_TAG.length
1993                    && (ch == SCRIPT_END_TAG[n] || ch == SCRIPT_END_TAG_UPPER_CASE[n])) {
1994                 matched[n++] = (char) ch;
1995                 ch = readCh();
1996             }
1997             if (n == SCRIPT_END_TAG.length) {
1998                 return;   // whole end tag consumed
1999             }
2000             if (n > 0) {
2001                 for (int k = 0; k < n; k++) {
2002                     addString(matched[k]);
2003                 }
2004                 // Only a prefix matched: it is script text. Retry the match at ch rather
2005                 // than consuming it, since ch may open the real end tag ("<</script>").
2006                 continue;
2007             }
2008             switch (ch) {
2009             case -1:
2010                 error("eof.script");
2011                 return;
2012             case '\n':
2013                 ln++;
2014                 ch = readCh();
2015                 lfCount++;
2016                 addString('\n');
2017                 break;
2018             case '\r':
2019                 ln++;
2020                 if ((ch = readCh()) == '\n') {
2021                     ch = readCh();
2022                     crlfCount++;
2023                 } else {
2024                     crCount++;
2025                 }
2026                 addString('\n');
2027                 break;
2028             default:
2029                 addString(ch);
2030                 String tail = new String(getChars(0, strpos));
2031                 if (!inComment && tail.endsWith(START_COMMENT)) {
2032                     inComment = true;
2033                 }
2034                 if (inComment && tail.endsWith(END_COMMENT)) {
2035                     inComment = false;
2036                 }
2037                 ch = readCh();
2038                 break;
2039             }
2040         }
2041     }
2042 
2043     /**
2044      * Parse Content. [24] 320:1  Handles one character or construct per pass until end of input or thread interruption.
2045      */
2046     void parseContent() throws IOException {
2047         Thread curThread = Thread.currentThread();
2048 
2049         for (;;) {
2050             if (curThread.isInterrupted()) { // leave the loop once this thread has been interrupted
2051                 curThread.interrupt(); // resignal the interrupt
2052                 break;
2053             }
2054 
2055             int c = ch; // the character this pass handles; ch itself is advanced inside the cases below
2056             currentBlockStartPos = currentPosition;
2057 
2058             if (recent == dtd.script) { // means: if after starting <script> tag
2059 
2060                 /* Here, ch has to be the first character after <script> */
2061                 parseScript();
2062                 last = makeTag(dtd.getElement("comment"), true);
2063 
2064                 /* Remove leading and trailing HTML comment declarations */
2065                 String str = new String(getChars(0)).trim();
2066                 int minLength = START_COMMENT.length() + END_COMMENT.length();
2067                 if (str.startsWith(START_COMMENT) && str.endsWith(END_COMMENT)
2068                        && str.length() >= (minLength)) { // length test keeps the two delimiters from overlapping
2069                     str = str.substring(START_COMMENT.length(),
2070                                       str.length() - END_COMMENT.length());
2071                 }
2072 
2073                 /* Handle resulting chars as comment */
2074                 handleComment(str.toCharArray());
2075                 endTag(false);
2076                 lastBlockStartPos = currentPosition;
2077 
2078                 continue;
2079             } else {
2080                 switch (c) {
2081                   case '<':
2082                     parseTag();
2083                     lastBlockStartPos = currentPosition;
2084                     continue;
2085 
2086                   case '/':
2087                     ch = readCh();
2088                     if ((stack != null) && stack.net) {
2089                         // null end tag.
2090                         endTag(false);
2091                         continue;
2092                     } else if (textpos == 0) { // this '/' starts a new run of text
2093                         if (!legalElementContext(dtd.pcdata)) {
2094                             error("unexpected.pcdata");
2095                         }
2096                         if (last.breaksFlow()) {
2097                             space = false;
2098                         }
2099                     }
2100                     break; // '/' is ordinary text here: it is appended after the switch
2101 
2102                   case -1:
2103                     return; // end of input
2104 
2105                   case '&':
2106                     if (textpos == 0) {
2107                         if (!legalElementContext(dtd.pcdata)) {
2108                             error("unexpected.pcdata");
2109                         }
2110                         if (last.breaksFlow()) {
2111                             space = false;
2112                         }
2113                     }
2114                     char data[] = parseEntityReference();
2115                     if (textpos + data.length + 1 > text.length) { // need room for data plus one pending space
2116                         char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
2117                         System.arraycopy(text, 0, newtext, 0, text.length);
2118                         text = newtext;
2119                     }
2120                     if (space) {
2121                         space = false;
2122                         text[textpos++] = ' ';
2123                     }
2124                     System.arraycopy(data, 0, text, textpos, data.length);
2125                     textpos += data.length;
2126                     ignoreSpace = false;
2127                     continue; // data is already stored, so skip the single-character append below
2128 
2129                   case '\n':
2130                     ln++;
2131                     lfCount++;
2132                     ch = readCh();
2133                     if ((stack != null) && stack.pre) {
2134                         break; // stack.pre is set: keep the newline as text
2135                     }
2136                     if (textpos == 0) {
2137                         lastBlockStartPos = currentPosition;
2138                     }
2139                     if (!ignoreSpace) {
2140                         space = true; // only remember a pending space; it is written before the next text character
2141                     }
2142                     continue;
2143 
2144                   case '\r':
2145                     ln++;
2146                     c = '\n'; // a CR or CR-LF is appended as '\n' when it is kept
2147                     if ((ch = readCh()) == '\n') {
2148                         ch = readCh();
2149                         crlfCount++;
2150                     }
2151                     else {
2152                         crCount++;
2153                     }
2154                     if ((stack != null) && stack.pre) {
2155                         break;
2156                     }
2157                     if (textpos == 0) {
2158                         lastBlockStartPos = currentPosition;
2159                     }
2160                     if (!ignoreSpace) {
2161                         space = true;
2162                     }
2163                     continue;
2164 
2165 
2166                   case '\t':
2167                   case ' ':
2168                     ch = readCh();
2169                     if ((stack != null) && stack.pre) {
2170                         break; // stack.pre is set: keep the tab or space as text
2171                     }
2172                     if (textpos == 0) {
2173                         lastBlockStartPos = currentPosition;
2174                     }
2175                     if (!ignoreSpace) {
2176                         space = true;
2177                     }
2178                     continue;
2179 
2180                   default:
2181                     if (textpos == 0) {
2182                         if (!legalElementContext(dtd.pcdata)) {
2183                             error("unexpected.pcdata");
2184                         }
2185                         if (last.breaksFlow()) {
2186                             space = false;
2187                         }
2188                     }
2189                     ch = readCh();
2190                     break;
2191                 }
2192             }
2193 
2194             // enlarge buffer if needed
2195             if (textpos + 2 > text.length) { // up to two slots are written below: a pending space and c
2196                 char newtext[] = new char[text.length + 128];
2197                 System.arraycopy(text, 0, newtext, 0, text.length);
2198                 text = newtext;
2199             }
2200 
2201             // output pending space
2202             if (space) {
2203                 if (textpos == 0) {
2204                     lastBlockStartPos--;
2205                 }
2206                 text[textpos++] = ' ';
2207                 space = false;
2208             }
2209             text[textpos++] = (char)c; // only cases that ended with break reach this append
2210             ignoreSpace = false;
2211         }
2212     }
2213 
2214     /**
2215      * Returns the end of line string. This will return the end of line
2216      * string that has been encountered the most, one of \r, \n or \r\n.
2217      */
2218     String getEndOfLineString() {
2219         if (crlfCount >= crCount) {
2220             if (lfCount >= crlfCount) {
2221                 return "\n";
2222             }
2223             else {
2224                 return "\r\n";
2225             }
2226         }
2227         else {
2228             if (crCount > lfCount) {
2229                 return "\r";
2230             }
2231             else {
2232                 return "\n";
2233             }
2234         }
2235     }
2236 
2237     /**
2238      * Parse an HTML stream, given a DTD.
2239      */
2240     public synchronized void parse(Reader in) throws IOException {
2241         this.in = in;
2242 
2243         this.ln = 1;
2244 
2245         seenHtml = false;
2246         seenHead = false;
2247         seenBody = false;
2248 
2249         crCount = lfCount = crlfCount = 0;
2250 
2251         try {
2252             ch = readCh();
2253             text = new char[1024];
2254             str = new char[128];
2255 
2256             parseContent();
2257             // NOTE: interruption may have occurred.  Control flows out
2258             // of here normally.
2259             while (stack != null) {
2260                 endTag(true);
2261             }
2262             in.close();
2263         } catch (IOException e) {
2264             errorContext();
2265             error("ioexception");
2266             throw e;
2267         } catch (Exception e) {
2268             errorContext();
2269             error("exception", e.getClass().getName(), e.getMessage());
2270             e.printStackTrace();
2271         } catch (ThreadDeath e) {
2272             errorContext();
2273             error("terminated");
2274             e.printStackTrace();
2275             throw e;
2276         } finally {
2277             for (; stack != null ; stack = stack.next) {
2278                 handleEndTag(stack.tag);
2279             }
2280 
2281             text = null;
2282             str = null;
2283         }
2284 
2285     }
2286 
2287 
2288     /*
2289      * Input cache.  This is much faster than calling down to a synchronized
2290      * method of BufferedReader for each character.  Measurements done 5/30/97
2291      * show that there's no point in having a bigger buffer:  Increasing
2292      * the buffer to 8192 had no measurable impact for a program discarding
2293      * one character at a time (reading from an http URL to a local machine).
2294      * NOTE: If the current encoding is bogus, and we read too much
2295      * (past the content-type) we may suffer a MalformedInputException. For
2296      * this reason the initial size is 1 and when the body is encountered the
2297      * size is adjusted to 256.
2298      */
2299     private char buf[] = new char[1]; // the cache itself; readCh() refills it with in.read(buf)
2300     private int pos; // index in buf of the next character readCh() will return
2301     private int len; // result of the last in.read(buf): how many characters of buf are valid
2302     /*
2303         Position relative to the beginning of the document: readCh()
2304         adds one for every character it returns (not for end of input).
2305     */
2306     private int currentPosition;
2307 
2308 
2309     /**
2310      * Returns the next character of the input, refilling the cache from
2311      * the reader whenever it has been used up.  Every character handed
2312      * out advances currentPosition; hitting end of input does not.
2313      *
2314      * @return the next character, or -1 once the reader has no more data
2315      * @throws IOException if the reader fails, including an
2316      *         InterruptedIOException, which is passed on unchanged
2317      */
2318     private final int readCh() throws IOException {
2319         if (pos >= len) {
2320             // One refill attempt per call; nothing is retried here.
2321             len = in.read(buf);
2322             if (len <= 0) {
2323                 return -1;      // eof
2324             }
2325             pos = 0;
2326         }
2327 
2328         currentPosition++;
2329         final char next = buf[pos];
2330         pos++;
2331         return next;
2332     }
2333 
2334     /** Returns currentPosition, which readCh() advances for every character it hands out. */
2335     protected int getCurrentPos() {
2336         return this.currentPosition;
2337     }
2338 }