1 /*
   2  * Copyright (c) 1998, 2017, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package javax.swing.text.html.parser;
  27 
  28 import javax.swing.text.SimpleAttributeSet;
  29 import javax.swing.text.html.HTML;
  30 import javax.swing.text.ChangedCharSetException;
  31 import java.io.*;
  32 import java.util.Hashtable;
  33 import java.util.Properties;
  34 import java.util.Vector;
  35 import java.util.Enumeration;
  36 import java.net.URL;
  37 
  38 /**
  39  * A simple DTD-driven HTML parser. The parser reads an
  40  * HTML file from an InputStream and calls various methods
  41  * (which should be overridden in a subclass) when tags and
  42  * data are encountered.
  43  * <p>
  44  * Unfortunately there are many badly implemented HTML parsers
  45  * out there, and as a result there are many badly formatted
  46  * HTML files. This parser attempts to parse most HTML files.
  47  * This means that the implementation sometimes deviates from
  48  * the SGML specification in favor of HTML.
  49  * <p>
  50  * The parser treats \r and \r\n as \n. Newlines after starttags
  51  * and before end tags are ignored just as specified in the SGML/HTML
  52  * specification.
  53  * <p>
  54  * The html spec does not specify how spaces are to be coalesced very well.
  55  * Specifically, the following scenarios are not discussed (note that a
  56  * space should be used here, but I am using &amp;nbsp to force the space to
  57  * be displayed):
  58  * <p>
  59  * '&lt;b&gt;blah&nbsp;&lt;i&gt;&nbsp;&lt;strike&gt;&nbsp;foo' which can be treated as:
  60  * '&lt;b&gt;blah&nbsp;&lt;i&gt;&lt;strike&gt;foo'
  61  * <p>as well as:
  62  * '&lt;p&gt;&lt;a href="xx"&gt;&nbsp;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
  63  * which appears to be treated as:
  64  * '&lt;p&gt;&lt;a href="xx"&gt;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
  65  * <p>
 * If <code>strict</code> is false, when a tag that breaks flow
 * (<code>TagElement.breaksFlow</code>) or trailing whitespace is
 * encountered, all whitespace will be ignored until a non-whitespace
 * character is encountered. This appears to give behavior closer to
 * the popular browsers.
  71  *
  72  * @see DTD
  73  * @see TagElement
  74  * @see SimpleAttributeSet
  75  * @author Arthur van Hoff
  76  * @author Sunita Mani
  77  */
  78 public
  79 class Parser implements DTDConstants {
  80 
  81     private char text[] = new char[1024];
  82     private int textpos = 0;
  83     private TagElement last;
  84     private boolean space;
  85 
  86     private char str[] = new char[128];
  87     private int strpos = 0;
  88 
  89     /**
  90      * The dtd.
  91      */
  92     protected DTD dtd = null;
  93 
  94     private int ch;
  95     private int ln;
  96     private Reader in;
  97 
  98     private Element recent;
  99     private TagStack stack;
 100     private boolean skipTag = false;
 101     private TagElement lastFormSent = null;
 102     private SimpleAttributeSet attributes = new SimpleAttributeSet();
 103 
 104     // State for <html>, <head> and <body>.  Since people like to slap
 105     // together HTML documents without thinking, occasionally they
 106     // have multiple instances of these tags.  These booleans track
 107     // the first sightings of these tags so they can be safely ignored
 108     // by the parser if repeated.
 109     private boolean seenHtml = false;
 110     private boolean seenHead = false;
 111     private boolean seenBody = false;
 112 
 113     /**
 114      * The html spec does not specify how spaces are coalesced very well.
 115      * If strict == false, ignoreSpace is used to try and mimic the behavior
 116      * of the popular browsers.
 117      * <p>
 118      * The problematic scenarios are:
 119      * '&lt;b>blah &lt;i> &lt;strike> foo' which can be treated as:
 120      * '&lt;b>blah &lt;i>&lt;strike>foo'
 121      * as well as:
 122      * '&lt;p>&lt;a href="xx"> &lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
 123      * which appears to be treated as:
 124      * '&lt;p>&lt;a href="xx">&lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
 125      * <p>
 126      * When a tag that breaks flow, or trailing whitespace is encountered
 127      * ignoreSpace is set to true. From then on, all whitespace will be
 128      * ignored.
 129      * ignoreSpace will be set back to false the first time a
 130      * non whitespace character is encountered. This appears to give
 131      * behavior closer to the popular browsers.
 132      */
 133     private boolean ignoreSpace;
 134 
 135     /**
 136      * This flag determines whether or not the Parser will be strict
 137      * in enforcing SGML compatibility.  If false, it will be lenient
 138      * with certain common classes of erroneous HTML constructs.
 139      * Strict or not, in either case an error will be recorded.
 140      *
 141      */
 142     protected boolean strict = false;
 143 
 144 
 145     /** Number of \r\n's encountered. */
 146     private int crlfCount;
 147     /** Number of \r's encountered. A \r\n will not increment this. */
 148     private int crCount;
 149     /** Number of \n's encountered. A \r\n will not increment this. */
 150     private int lfCount;
 151 
 152     //
 153     // To correctly identify the start of a tag/comment/text we need two
 154     // ivars. Two are needed as handleText isn't invoked until the tag
 155     // after the text has been parsed, that is the parser parses the text,
 156     // then a tag, then invokes handleText followed by handleStart.
 157     //
 158     /** The start position of the current block. Block is overloaded here,
 159      * it really means the current start position for the current comment,
 160      * tag, text. Use getBlockStartPosition to access this. */
 161     private int currentBlockStartPos;
 162     /** Start position of the last block. */
 163     private int lastBlockStartPos;
 164 
 165     /**
 166      * array for mapping numeric references in range
 167      * 130-159 to displayable Unicode characters.
 168      */
 169     private static final char[] cp1252Map = {
 170         8218,  // ‚
 171         402,   // ƒ
 172         8222,  // „
 173         8230,  // …
 174         8224,  // †
 175         8225,  // ‡
 176         710,   // ˆ
 177         8240,  // ‰
 178         352,   // Š
 179         8249,  // ‹
 180         338,   // Œ
 181         141,   // 
 182         142,   // Ž
 183         143,   // 
 184         144,   // 
 185         8216,  // ‘
 186         8217,  // ’
 187         8220,  // “
 188         8221,  // ”
 189         8226,  // •
 190         8211,  // –
 191         8212,  // —
 192         732,   // ˜
 193         8482,  // ™
 194         353,   // š
 195         8250,  // ›
 196         339,   // œ
 197         157,   // 
 198         158,   // ž
 199         376    // Ÿ
 200     };
 201 
 202     /**
 203      * Creates parser with the specified {@code dtd}.
 204      *
 205      * @param dtd the dtd.
 206      */
 207     public Parser(DTD dtd) {
 208         this.dtd = dtd;
 209     }
 210 
 211 
 212     /**
 213      * @return the line number of the line currently being parsed
 214      */
 215     protected int getCurrentLine() {
 216         return ln;
 217     }
 218 
 219     /**
 220      * Returns the start position of the current block. Block is
 221      * overloaded here, it really means the current start position for
 222      * the current comment tag, text, block.... This is provided for
 223      * subclassers that wish to know the start of the current block when
 224      * called with one of the handleXXX methods.
 225      *
 226      * @return the start position of the current block
 227      */
 228     int getBlockStartPosition() {
 229         return Math.max(0, lastBlockStartPos - 1);
 230     }
 231 
 232     /**
 233      * Makes a TagElement.
 234      *
 235      * @param elem       the element storing the tag definition
 236      * @param fictional  the value of the flag "{@code fictional}" to be set for the tag
 237      *
 238      * @return the created {@code TagElement}
 239      */
 240     protected TagElement makeTag(Element elem, boolean fictional) {
 241         return new TagElement(elem, fictional);
 242     }
 243 
 244     /**
 245      * Makes a TagElement.
 246      *
 247      * @param elem  the element storing the tag definition
 248      *
 249      * @return the created {@code TagElement}
 250      */
 251     protected TagElement makeTag(Element elem) {
 252         return makeTag(elem, false);
 253     }
 254 
 255     /**
 256      * Returns attributes for the current tag.
 257      *
 258      * @return {@code SimpleAttributeSet} containing the attributes
 259      */
 260     protected SimpleAttributeSet getAttributes() {
 261         return attributes;
 262     }
 263 
 264     /**
 265      * Removes the current attributes.
 266      */
 267     protected void flushAttributes() {
 268         attributes.removeAttributes(attributes);
 269     }
 270 
 271     /**
 272      * Called when PCDATA is encountered.
 273      *
 274      * @param text  the section text
 275      */
 276     protected void handleText(char text[]) {
 277     }
 278 
 279     /**
 280      * Called when an HTML title tag is encountered.
 281      *
 282      * @param text  the title text
 283      */
 284     protected void handleTitle(char text[]) {
 285         // default behavior is to call handleText. Subclasses
 286         // can override if necessary.
 287         handleText(text);
 288     }
 289 
 290     /**
 291      * Called when an HTML comment is encountered.
 292      *
 293      * @param text  the comment being handled
 294      */
 295     protected void handleComment(char text[]) {
 296     }
 297 
 298     /**
 299      * Called when the content terminates without closing the HTML comment.
 300      */
 301     protected void handleEOFInComment() {
 302         // We've reached EOF.  Our recovery strategy is to
 303         // see if we have more than one line in the comment;
 304         // if so, we pretend that the comment was an unterminated
 305         // single line comment, and reparse the lines after the
 306         // first line as normal HTML content.
 307 
 308         int commentEndPos = strIndexOf('\n');
 309         if (commentEndPos >= 0) {
 310             handleComment(getChars(0, commentEndPos));
 311             try {
 312                 in.close();
 313                 in = new CharArrayReader(getChars(commentEndPos + 1));
 314                 ch = '>';
 315             } catch (IOException e) {
 316                 error("ioexception");
 317             }
 318 
 319             resetStrBuffer();
 320         } else {
 321             // no newline, so signal an error
 322             error("eof.comment");
 323         }
 324     }
 325 
 326     /**
 327      * Called when an empty tag is encountered.
 328      *
 329      * @param tag  the tag being handled
 330      * @throws ChangedCharSetException if the document charset was changed
 331      */
 332     protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
 333     }
 334 
 335     /**
 336      * Called when a start tag is encountered.
 337      *
 338      * @param tag  the tag being handled
 339      */
 340     protected void handleStartTag(TagElement tag) {
 341     }
 342 
 343     /**
 344      * Called when an end tag is encountered.
 345      *
 346      * @param tag  the tag being handled
 347      */
 348     protected void handleEndTag(TagElement tag) {
 349     }
 350 
 351     /**
 352      * An error has occurred.
 353      *
 354      * @param ln   the number of line containing the error
 355      * @param msg  the error message
 356      */
 357     protected void handleError(int ln, String msg) {
 358         /*
 359         Thread.dumpStack();
 360         System.out.println("**** " + stack);
 361         System.out.println("line " + ln + ": error: " + msg);
 362         System.out.println();
 363         */
 364     }
 365 
 366     /**
 367      * Output text.
 368      */
 369     void handleText(TagElement tag) {
 370         if (tag.breaksFlow()) {
 371             space = false;
 372             if (!strict) {
 373                 ignoreSpace = true;
 374             }
 375         }
 376         if (textpos == 0) {
 377             if ((!space) || (stack == null) || last.breaksFlow() ||
 378                 !stack.advance(dtd.pcdata)) {
 379                 last = tag;
 380                 space = false;
 381                 lastBlockStartPos = currentBlockStartPos;
 382                 return;
 383             }
 384         }
 385         if (space) {
 386             if (!ignoreSpace) {
 387                 // enlarge buffer if needed
 388                 if (textpos + 1 > text.length) {
 389                     char newtext[] = new char[text.length + 200];
 390                     System.arraycopy(text, 0, newtext, 0, text.length);
 391                     text = newtext;
 392                 }
 393 
 394                 // output pending space
 395                 text[textpos++] = ' ';
 396                 if (!strict && !tag.getElement().isEmpty()) {
 397                     ignoreSpace = true;
 398                 }
 399             }
 400             space = false;
 401         }
 402         char newtext[] = new char[textpos];
 403         System.arraycopy(text, 0, newtext, 0, textpos);
 404         // Handles cases of bad html where the title tag
 405         // was getting lost when we did error recovery.
 406         if (tag.getElement().getName().equals("title")) {
 407             handleTitle(newtext);
 408         } else {
 409             handleText(newtext);
 410         }
 411         lastBlockStartPos = currentBlockStartPos;
 412         textpos = 0;
 413         last = tag;
 414         space = false;
 415     }
 416 
 417     /**
 418      * Invokes the error handler.
 419      *
 420      * @param err   the error type
 421      * @param arg1  the 1st error message argument
 422      * @param arg2  the 2nd error message argument
 423      * @param arg3  the 3rd error message argument
 424      */
 425     protected void error(String err, String arg1, String arg2,
 426         String arg3) {
 427         handleError(ln, err + " " + arg1 + " " + arg2 + " " + arg3);
 428     }
 429 
 430     /**
 431      * Invokes the error handler with the 3rd error message argument "?".
 432      *
 433      * @param err   the error type
 434      * @param arg1  the 1st error message argument
 435      * @param arg2  the 2nd error message argument
 436      */
 437     protected void error(String err, String arg1, String arg2) {
 438         error(err, arg1, arg2, "?");
 439     }
 440 
 441     /**
 442      * Invokes the error handler with the 2nd and 3rd error message argument "?".
 443      *
 444      * @param err   the error type
 445      * @param arg1  the 1st error message argument
 446      */
 447     protected void error(String err, String arg1) {
 448         error(err, arg1, "?", "?");
 449     }
 450 
 451     /**
 452      * Invokes the error handler with the 1st, 2nd and 3rd error message argument "?".
 453      *
 454      * @param err   the error type
 455      */
 456     protected void error(String err) {
 457         error(err, "?", "?", "?");
 458     }
 459 
 460 
 461     /**
 462      * Handle a start tag. The new tag is pushed
 463      * onto the tag stack. The attribute list is
 464      * checked for required attributes.
 465      *
 466      * @param tag  the tag
 467      * @throws ChangedCharSetException if the document charset was changed
 468      */
 469     protected void startTag(TagElement tag) throws ChangedCharSetException {
 470         Element elem = tag.getElement();
 471 
 472         // If the tag is an empty tag and texpos != 0
 473         // this implies that there is text before the
 474         // start tag that needs to be processed before
 475         // handling the tag.
 476         //
 477         if (!elem.isEmpty() ||
 478                     ((last != null) && !last.breaksFlow()) ||
 479                     (textpos != 0)) {
 480             handleText(tag);
 481         } else {
 482             // this variable gets updated in handleText().
 483             // Since in this case we do not call handleText()
 484             // we need to update it here.
 485             //
 486             last = tag;
 487             // Note that we should really check last.breakFlows before
 488             // assuming this should be false.
 489             space = false;
 490         }
 491         lastBlockStartPos = currentBlockStartPos;
 492 
 493         // check required attributes
 494         for (AttributeList a = elem.atts ; a != null ; a = a.next) {
 495             if ((a.modifier == REQUIRED) &&
 496                 ((attributes.isEmpty()) ||
 497                  ((!attributes.isDefined(a.name)) &&
 498                   (!attributes.isDefined(HTML.getAttributeKey(a.name)))))) {
 499                 error("req.att ", a.getName(), elem.getName());
 500             }
 501         }
 502 
 503         if (elem.isEmpty()) {
 504             handleEmptyTag(tag);
 505             /*
 506         } else if (elem.getName().equals("form")) {
 507             handleStartTag(tag);
 508             */
 509         } else {
 510             recent = elem;
 511             stack = new TagStack(tag, stack);
 512             handleStartTag(tag);
 513         }
 514     }
 515 
 516     /**
 517      * Handle an end tag. The end tag is popped
 518      * from the tag stack.
 519      *
 520      * @param omitted  {@code true} if the tag is no actually present in the
 521      *                 document, but is supposed by the parser
 522      */
 523     protected void endTag(boolean omitted) {
 524         handleText(stack.tag);
 525 
 526         if (omitted && !stack.elem.omitEnd()) {
 527             error("end.missing", stack.elem.getName());
 528         } else if (!stack.terminate()) {
 529             error("end.unexpected", stack.elem.getName());
 530         }
 531 
 532         // handle the tag
 533         handleEndTag(stack.tag);
 534         stack = stack.next;
 535         recent = (stack != null) ? stack.elem : null;
 536     }
 537 
 538 
 539     boolean ignoreElement(Element elem) {
 540 
 541         String stackElement = stack.elem.getName();
 542         String elemName = elem.getName();
 543         /* We ignore all elements that are not valid in the context of
 544            a table except <td>, <th> (these we handle in
 545            legalElementContext()) and #pcdata.  We also ignore the
 546            <font> tag in the context of <ul> and <ol> We additonally
 547            ignore the <meta> and the <style> tag if the body tag has
 548            been seen. **/
 549         if ((elemName.equals("html") && seenHtml) ||
 550             (elemName.equals("head") && seenHead) ||
 551             (elemName.equals("body") && seenBody)) {
 552             return true;
 553         }
 554         if (elemName.equals("dt") || elemName.equals("dd")) {
 555             TagStack s = stack;
 556             while (s != null && !s.elem.getName().equals("dl")) {
 557                 s = s.next;
 558             }
 559             if (s == null) {
 560                 return true;
 561             }
 562         }
 563 
 564         if (((stackElement.equals("table")) &&
 565              (!elemName.equals("#pcdata")) && (!elemName.equals("input"))) ||
 566             ((elemName.equals("font")) &&
 567              (stackElement.equals("ul") || stackElement.equals("ol"))) ||
 568             (elemName.equals("meta") && stack != null) ||
 569             (elemName.equals("style") && seenBody) ||
 570             (stackElement.equals("table") && elemName.equals("a"))) {
 571             return true;
 572         }
 573         return false;
 574     }
 575 
 576 
 577     /**
 578      * Marks the first time a tag has been seen in a document
 579      *
 580      * @param elem  the element represented by the tag
 581      */
 582 
 583     protected void markFirstTime(Element elem) {
 584         String elemName = elem.getName();
 585         if (elemName.equals("html")) {
 586             seenHtml = true;
 587         } else if (elemName.equals("head")) {
 588             seenHead = true;
 589         } else if (elemName.equals("body")) {
 590             if (buf.length == 1) {
 591                 // Refer to note in definition of buf for details on this.
 592                 char[] newBuf = new char[256];
 593 
 594                 newBuf[0] = buf[0];
 595                 buf = newBuf;
 596             }
 597             seenBody = true;
 598         }
 599     }
 600 
 601     /**
 602      * Create a legal content for an element.
 603      */
 604     boolean legalElementContext(Element elem) throws ChangedCharSetException {
 605 
 606         // System.out.println("-- legalContext -- " + elem);
 607 
 608         // Deal with the empty stack
 609         if (stack == null) {
 610             // System.out.println("-- stack is empty");
 611             if (elem != dtd.html) {
 612                 // System.out.println("-- pushing html");
 613                 startTag(makeTag(dtd.html, true));
 614                 return legalElementContext(elem);
 615             }
 616             return true;
 617         }
 618 
 619         // Is it allowed in the current context
 620         if (stack.advance(elem)) {
 621             // System.out.println("-- legal context");
 622             markFirstTime(elem);
 623             return true;
 624         }
 625         boolean insertTag = false;
 626 
 627         // The use of all error recovery strategies are contingent
 628         // on the value of the strict property.
 629         //
 630         // These are commonly occurring errors.  if insertTag is true,
 631         // then we want to adopt an error recovery strategy that
 632         // involves attempting to insert an additional tag to
 633         // legalize the context.  The two errors addressed here
 634         // are:
 635         // 1) when a <td> or <th> is seen soon after a <table> tag.
 636         //    In this case we insert a <tr>.
 637         // 2) when any other tag apart from a <tr> is seen
 638         //    in the context of a <tr>.  In this case we would
 639         //    like to add a <td>.  If a <tr> is seen within a
 640         //    <tr> context, then we will close out the current
 641         //    <tr>.
 642         //
 643         // This insertion strategy is handled later in the method.
 644         // The reason for checking this now, is that in other cases
 645         // we would like to apply other error recovery strategies for example
 646         // ignoring tags.
 647         //
 648         // In certain cases it is better to ignore a tag than try to
 649         // fix the situation.  So the first test is to see if this
 650         // is what we need to do.
 651         //
 652         String stackElemName = stack.elem.getName();
 653         String elemName = elem.getName();
 654 
 655 
 656         if (!strict &&
 657             ((stackElemName.equals("table") && elemName.equals("td")) ||
 658              (stackElemName.equals("table") && elemName.equals("th")) ||
 659              (stackElemName.equals("tr") && !elemName.equals("tr")))){
 660              insertTag = true;
 661         }
 662 
 663 
 664         if (!strict && !insertTag && (stack.elem.getName() != elem.getName() ||
 665                                       elem.getName().equals("body"))) {
 666             if (skipTag = ignoreElement(elem)) {
 667                 error("tag.ignore", elem.getName());
 668                 return skipTag;
 669             }
 670         }
 671 
 672         // Check for anything after the start of the table besides tr, td, th
 673         // or caption, and if those aren't there, insert the <tr> and call
 674         // legalElementContext again.
 675         if (!strict && stackElemName.equals("table") &&
 676             !elemName.equals("tr") && !elemName.equals("td") &&
 677             !elemName.equals("th") && !elemName.equals("caption")) {
 678             Element e = dtd.getElement("tr");
 679             TagElement t = makeTag(e, true);
 680             legalTagContext(t);
 681             startTag(t);
 682             error("start.missing", elem.getName());
 683             return legalElementContext(elem);
 684         }
 685 
 686         // They try to find a legal context by checking if the current
 687         // tag is valid in an enclosing context.  If so
 688         // close out the tags by outputing end tags and then
 689         // insert the current tag.  If the tags that are
 690         // being closed out do not have an optional end tag
 691         // specification in the DTD then an html error is
 692         // reported.
 693         //
 694         if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) {
 695             for (TagStack s = stack.next ; s != null ; s = s.next) {
 696                 if (s.advance(elem)) {
 697                     while (stack != s) {
 698                         endTag(true);
 699                     }
 700                     return true;
 701                 }
 702                 if (!s.terminate() || (strict && !s.elem.omitEnd())) {
 703                     break;
 704                 }
 705             }
 706         }
 707 
 708         // Check if we know what tag is expected next.
 709         // If so insert the tag.  Report an error if the
 710         // tag does not have its start tag spec in the DTD as optional.
 711         //
 712         Element next = stack.first();
 713         if (next != null && (!strict || next.omitStart()) &&
 714            !(next==dtd.head && elem==dtd.pcdata) ) {
 715             // System.out.println("-- omitting start tag: " + next);
 716             TagElement t = makeTag(next, true);
 717             legalTagContext(t);
 718             startTag(t);
 719             if (!next.omitStart()) {
 720                 error("start.missing", elem.getName());
 721             }
 722             return legalElementContext(elem);
 723         }
 724 
 725 
 726         // Traverse the list of expected elements and determine if adding
 727         // any of these elements would make for a legal context.
 728         //
 729 
 730         if (!strict) {
 731             ContentModel content = stack.contentModel();
 732             Vector<Element> elemVec = new Vector<Element>();
 733             if (content != null) {
 734                 content.getElements(elemVec);
 735                 for (Element e : elemVec) {
 736                     // Ensure that this element has not been included as
 737                     // part of the exclusions in the DTD.
 738                     //
 739                     if (stack.excluded(e.getIndex())) {
 740                         continue;
 741                     }
 742 
 743                     boolean reqAtts = false;
 744 
 745                     for (AttributeList a = e.getAttributes(); a != null ; a = a.next) {
 746                         if (a.modifier == REQUIRED) {
 747                             reqAtts = true;
 748                             break;
 749                         }
 750                     }
 751                     // Ensure that no tag that has required attributes
 752                     // gets inserted.
 753                     //
 754                     if (reqAtts) {
 755                         continue;
 756                     }
 757 
 758                     ContentModel m = e.getContent();
 759                     if (m != null && m.first(elem)) {
 760                         // System.out.println("-- adding a legal tag: " + e);
 761                         TagElement t = makeTag(e, true);
 762                         legalTagContext(t);
 763                         startTag(t);
 764                         error("start.missing", e.getName());
 765                         return legalElementContext(elem);
 766                     }
 767                 }
 768             }
 769         }
 770 
 771         // Check if the stack can be terminated.  If so add the appropriate
 772         // end tag.  Report an error if the tag being ended does not have its
 773         // end tag spec in the DTD as optional.
 774         //
 775         if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) {
 776             // System.out.println("-- omitting end tag: " + stack.elem);
 777             if (!stack.elem.omitEnd()) {
 778                 error("end.missing", elem.getName());
 779             }
 780 
 781             endTag(true);
 782             return legalElementContext(elem);
 783         }
 784 
 785         // At this point we know that something is screwed up.
 786         return false;
 787     }
 788 
 789     /**
 790      * Create a legal context for a tag.
 791      */
 792     void legalTagContext(TagElement tag) throws ChangedCharSetException {
 793         if (legalElementContext(tag.getElement())) {
 794             markFirstTime(tag.getElement());
 795             return;
 796         }
 797 
 798         // Avoid putting a block tag in a flow tag.
 799         if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) {
 800             endTag(true);
 801             legalTagContext(tag);
 802             return;
 803         }
 804 
 805         // Avoid putting something wierd in the head of the document.
 806         for (TagStack s = stack ; s != null ; s = s.next) {
 807             if (s.tag.getElement() == dtd.head) {
 808                 while (stack != s) {
 809                     endTag(true);
 810                 }
 811                 endTag(true);
 812                 legalTagContext(tag);
 813                 return;
 814             }
 815         }
 816 
 817         // Everything failed
 818         error("tag.unexpected", tag.getElement().getName());
 819     }
 820 
 821     /**
 822      * Error context. Something went wrong, make sure we are in
 823      * the document's body context
 824      */
 825     void errorContext() throws ChangedCharSetException {
 826         for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) {
 827             handleEndTag(stack.tag);
 828         }
 829         if (stack == null) {
 830             legalElementContext(dtd.body);
 831             startTag(makeTag(dtd.body, true));
 832         }
 833     }
 834 
 835     /**
 836      * Add a char to the string buffer.
 837      */
 838     void addString(int c) {
 839         if (strpos  == str.length) {
 840             char newstr[] = new char[str.length + 128];
 841             System.arraycopy(str, 0, newstr, 0, str.length);
 842             str = newstr;
 843         }
 844         str[strpos++] = (char)c;
 845     }
 846 
 847     /**
 848      * Get the string that's been accumulated.
 849      */
 850     String getString(int pos) {
 851         char newStr[] = new char[strpos - pos];
 852         System.arraycopy(str, pos, newStr, 0, strpos - pos);
 853         strpos = pos;
 854         return new String(newStr);
 855     }
 856 
 857     char[] getChars(int pos) {
 858         char newStr[] = new char[strpos - pos];
 859         System.arraycopy(str, pos, newStr, 0, strpos - pos);
 860         strpos = pos;
 861         return newStr;
 862     }
 863 
 864     char[] getChars(int pos, int endPos) {
 865         char newStr[] = new char[endPos - pos];
 866         System.arraycopy(str, pos, newStr, 0, endPos - pos);
 867         // REMIND: it's not clear whether this version should set strpos or not
 868         // strpos = pos;
 869         return newStr;
 870     }
 871 
 872     void resetStrBuffer() {
 873         strpos = 0;
 874     }
 875 
 876     int strIndexOf(char target) {
 877         for (int i = 0; i < strpos; i++) {
 878             if (str[i] == target) {
 879                 return i;
 880             }
 881         }
 882 
 883         return -1;
 884     }
 885 
 886     /**
 887      * Skip space.
 888      * [5] 297:5
 889      */
 890     void skipSpace() throws IOException {
 891         while (true) {
 892             switch (ch) {
 893               case '\n':
 894                 ln++;
 895                 ch = readCh();
 896                 lfCount++;
 897                 break;
 898 
 899               case '\r':
 900                 ln++;
 901                 if ((ch = readCh()) == '\n') {
 902                     ch = readCh();
 903                     crlfCount++;
 904                 }
 905                 else {
 906                     crCount++;
 907                 }
 908                 break;
 909               case ' ':
 910               case '\t':
 911                 ch = readCh();
 912                 break;
 913 
 914               default:
 915                 return;
 916             }
 917         }
 918     }
 919 
 920     /**
 921      * Parse identifier. Uppercase characters are folded
 922      * to lowercase when lower is true. Returns falsed if
 923      * no identifier is found. [55] 346:17
 924      */
 925     boolean parseIdentifier(boolean lower) throws IOException {
 926         switch (ch) {
 927           case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 928           case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 929           case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 930           case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 931           case 'Y': case 'Z':
 932             if (lower) {
 933                 ch = 'a' + (ch - 'A');
 934             }
 935             break;
 936 
 937           case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 938           case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 939           case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 940           case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 941           case 'y': case 'z':
 942             break;
 943 
 944           default:
 945             return false;
 946         }
 947 
 948         while (true) {
 949             addString(ch);
 950 
 951             switch (ch = readCh()) {
 952               case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 953               case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 954               case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 955               case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 956               case 'Y': case 'Z':
 957                 if (lower) {
 958                     ch = 'a' + (ch - 'A');
 959                 }
 960                 break;
 961 
 962               case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 963               case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 964               case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 965               case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 966               case 'y': case 'z':
 967 
 968               case '0': case '1': case '2': case '3': case '4':
 969               case '5': case '6': case '7': case '8': case '9':
 970 
 971               case '.': case '-':
 972 
 973               case '_': // not officially allowed
 974                 break;
 975 
 976               default:
 977                 return true;
 978             }
 979         }
 980     }
 981 
 982     /**
 983      * Parse an entity reference. [59] 350:17
 984      */
 985     private char[] parseEntityReference() throws IOException {
 986         int pos = strpos;
 987 
 988         if ((ch = readCh()) == '#') {
 989             int n = 0;
 990             ch = readCh();
 991             if ((ch >= '0') && (ch <= '9') ||
 992                     ch == 'x' || ch == 'X') {
 993 
 994                 if ((ch >= '0') && (ch <= '9')) {
 995                     // parse decimal reference
 996                     while ((ch >= '0') && (ch <= '9')) {
 997                         n = (n * 10) + ch - '0';
 998                         ch = readCh();
 999                     }
1000                 } else {
1001                     // parse hexadecimal reference
1002                     ch = readCh();
1003                     char lch = (char) Character.toLowerCase(ch);
1004                     while ((lch >= '0') && (lch <= '9') ||
1005                             (lch >= 'a') && (lch <= 'f')) {
1006                         if (lch >= '0' && lch <= '9') {
1007                             n = (n * 16) + lch - '0';
1008                         } else {
1009                             n = (n * 16) + lch - 'a' + 10;
1010                         }
1011                         ch = readCh();
1012                         lch = (char) Character.toLowerCase(ch);
1013                     }
1014                 }
1015                 switch (ch) {
1016                     case '\n':
1017                         ln++;
1018                         ch = readCh();
1019                         lfCount++;
1020                         break;
1021 
1022                     case '\r':
1023                         ln++;
1024                         if ((ch = readCh()) == '\n') {
1025                             ch = readCh();
1026                             crlfCount++;
1027                         }
1028                         else {
1029                             crCount++;
1030                         }
1031                         break;
1032 
1033                     case ';':
1034                         ch = readCh();
1035                         break;
1036                 }
1037                 char data[] = mapNumericReference(n);
1038                 return data;
1039             }
1040             addString('#');
1041             if (!parseIdentifier(false)) {
1042                 error("ident.expected");
1043                 strpos = pos;
1044                 char data[] = {'&', '#'};
1045                 return data;
1046             }
1047         } else if (!parseIdentifier(false)) {
1048             char data[] = {'&'};
1049             return data;
1050         }
1051 
1052         boolean semicolon = false;
1053 
1054         switch (ch) {
1055           case '\n':
1056             ln++;
1057             ch = readCh();
1058             lfCount++;
1059             break;
1060 
1061           case '\r':
1062             ln++;
1063             if ((ch = readCh()) == '\n') {
1064                 ch = readCh();
1065                 crlfCount++;
1066             }
1067             else {
1068                 crCount++;
1069             }
1070             break;
1071 
1072           case ';':
1073             semicolon = true;
1074 
1075             ch = readCh();
1076             break;
1077         }
1078 
1079         String nm = getString(pos);
1080         Entity ent = dtd.getEntity(nm);
1081 
1082         // entities are case sensitive - however if strict
1083         // is false then we will try to make a match by
1084         // converting the string to all lowercase.
1085         //
1086         if (!strict && (ent == null)) {
1087             ent = dtd.getEntity(nm.toLowerCase());
1088         }
1089         if ((ent == null) || !ent.isGeneral()) {
1090 
1091             if (nm.length() == 0) {
1092                 error("invalid.entref", nm);
1093                 return new char[0];
1094             }
1095             /* given that there is not a match restore the entity reference */
1096             String str = "&" + nm + (semicolon ? ";" : "");
1097 
1098             char b[] = new char[str.length()];
1099             str.getChars(0, b.length, b, 0);
1100             return b;
1101         }
1102         return ent.getData();
1103     }
1104 
1105     /**
1106      * Converts numeric character reference to char array.
1107      *
1108      * Normally the code in a reference should be always converted
1109      * to the Unicode character with the same code, but due to
1110      * wide usage of Cp1252 charset most browsers map numeric references
1111      * in the range 130-159 (which are control chars in Unicode set)
1112      * to displayable characters with other codes.
1113      *
1114      * @param c the code of numeric character reference.
1115      * @return a char array corresponding to the reference code.
1116      */
1117     private char[] mapNumericReference(int c) {
1118         char[] data;
1119         if (c >= 0xffff) { // outside unicode BMP.
1120             try {
1121                 data = Character.toChars(c);
1122             } catch (IllegalArgumentException e) {
1123                 data = new char[0];
1124             }
1125         } else {
1126             data = new char[1];
1127             data[0] = (c < 130 || c > 159) ? (char) c : cp1252Map[c - 130];
1128         }
1129         return data;
1130     }
1131 
    /**
     * Parse a comment. [92] 391:7
     * <p>
     * Reads characters starting at the current {@code ch} and accumulates
     * the comment text in the string buffer through {@code addString}
     * until a comment terminator is recognised or the end of input is
     * reached. Line terminators update {@code ln} and the line-ending
     * counters; CR and CR LF are stored in the buffer as a single '\n'.
     * At end of input {@code handleEOFInComment} is called.
     *
     * @throws IOException if reading the next character fails
     */
    void parseComment() throws IOException {

        while (true) {
            // c is the character that is appended to the buffer at the
            // bottom of the loop; ch is advanced inside the switch.
            int c = ch;
            switch (c) {
              case '-':
                  /** Presuming that the start string of a comment "<!--" has
                      already been parsed, the '-' character is valid only as
                      part of a comment termination and further more it must
                      be present in even numbers. Hence if strict is true, we
                      presume the comment has been terminated and return.
                      However if strict is false, then there is no even number
                      requirement and this character can appear anywhere in the
                      comment.  The parser reads on until it sees the following
                      pattern: "-->" or "--!>".
                   **/
                // Non-strict mode, and the last buffered character is
                // already a '-': this '-' may complete "-->" or "--!>".
                if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) {
                    if ((ch = readCh()) == '>') {
                        return;
                    }
                    if (ch == '!') {
                        if ((ch = readCh()) == '>') {
                            return;
                        } else {
                            /* to account for extra read()'s that happened */
                            addString('-');
                            addString('!');
                            // ch already holds the next unprocessed
                            // character, so restart without appending c.
                            continue;
                        }
                    }
                    // Not a terminator: the '-' in c is appended below.
                    break;
                }

                // Look ahead for a second '-'.
                if ((ch = readCh()) == '-') {
                    ch = readCh();
                    // In strict mode "--" alone ends the comment;
                    // otherwise a '>' must follow.
                    if (strict || ch == '>') {
                        return;
                    }
                    if (ch == '!') {
                        if ((ch = readCh()) == '>') {
                            return;
                        } else {
                            /* to account for extra read()'s that happened */
                            addString('-');
                            addString('!');
                            continue;
                        }
                    }
                    /* to account for the extra read() */
                    addString('-');
                }
                break;

              case -1:
                  // End of input inside the comment.
                  handleEOFInComment();
                  return;

              case '\n':
                ln++;
                ch = readCh();
                lfCount++;
                break;

              case '>':
                // A '>' that is not part of a terminator is ordinary text.
                ch = readCh();
                break;

              case '\r':
                ln++;
                if ((ch = readCh()) == '\n') {
                    ch = readCh();
                    crlfCount++;
                }
                else {
                    crCount++;
                }
                // Store CR and CR LF as a single '\n'.
                c = '\n';
                break;
              default:
                ch = readCh();
                break;
            }

            addString(c);
        }
    }
1221 
    /**
     * Parse literal content. [46] 343:1 and [47] 344:1
     * <p>
     * Copies characters into the {@code text} buffer until the end tag of
     * the element on top of the stack ({@code stack.elem}) is recognised
     * or the end of input is reached. Entity references are expanded via
     * {@code parseEntityReference}; CR and CR LF are stored as '\n'.
     *
     * @param replace not referenced in the method body
     * @throws IOException if reading the next character fails
     */
    void parseLiteral(boolean replace) throws IOException {
        while (true) {
            // c is the character appended to the text buffer at the bottom
            // of the loop; ch is advanced inside the switch.
            int c = ch;
            switch (c) {
              case -1:
                // End of input inside the literal: report it and end the tag.
                error("eof.literal", stack.elem.getName());
                endTag(true);
                return;

              case '>':
                ch = readCh();
                // i: where "</name" would start if the buffered text ends
                // with it; j: index into the element name.
                int i = textpos - (stack.elem.name.length() + 2), j = 0;

                // match end tag
                if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) {
                    // Compare the lowercased buffered text with the element
                    // name; i reaches textpos only if every char matched.
                    // NOTE(review): this presumes the element name itself
                    // is lowercase - confirm against the DTD.
                    while ((++i < textpos) &&
                           (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++)));
                    if (i == textpos) {
                        // Remove "</name" from the buffered text, plus a
                        // newline directly in front of it.
                        textpos -= (stack.elem.name.length() + 2);
                        if ((textpos > 0) && (text[textpos-1] == '\n')) {
                            textpos--;
                        }
                        endTag(false);
                        return;
                    }
                }
                // Not the end tag: the '>' is ordinary text.
                break;

              case '&':
                char data[] = parseEntityReference();
                // Grow the text buffer if the replacement does not fit.
                if (textpos + data.length > text.length) {
                    char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
                    System.arraycopy(text, 0, newtext, 0, text.length);
                    text = newtext;
                }
                System.arraycopy(data, 0, text, textpos, data.length);
                textpos += data.length;
                // The replacement is already stored; skip the append below.
                continue;

              case '\n':
                ln++;
                ch = readCh();
                lfCount++;
                break;

              case '\r':
                ln++;
                if ((ch = readCh()) == '\n') {
                    ch = readCh();
                    crlfCount++;
                }
                else {
                    crCount++;
                }
                // Store CR and CR LF as a single '\n'.
                c = '\n';
                break;
              default:
                ch = readCh();
                break;
            }

            // output character
            if (textpos == text.length) {
                char newtext[] = new char[text.length + 128];
                System.arraycopy(text, 0, newtext, 0, text.length);
                text = newtext;
            }
            text[textpos++] = (char)c;
        }
    }
1295 
    /**
     * Parse attribute value. [33] 331:1
     * <p>
     * If the value starts with a single or double quote, that character is
     * the delimiter and the value runs up to the matching quote. Otherwise
     * the value is unquoted and ends at the first space, tab, line
     * terminator, '&lt;' or '&gt;'. End of input ends the value in both
     * cases. The value is accumulated in the string buffer and returned
     * through {@code getString(0)}, i.e. everything buffered from index 0.
     *
     * @param lower whether uppercase ASCII letters are folded to lowercase
     * @return the attribute value
     * @throws IOException if reading the next character fails
     */
    @SuppressWarnings("fallthrough")
    String parseAttributeValue(boolean lower) throws IOException {
        // -1 means "no delimiter", i.e. an unquoted value.
        int delim = -1;

        // Check for a delimiter
        switch(ch) {
          case '\'':
          case '"':
            delim = ch;
            ch = readCh();
            break;
        }

        // Parse the rest of the value
        while (true) {
            // c is the character appended to the buffer at the bottom of
            // the loop; ch is advanced inside the switch.
            int c = ch;

            switch (c) {
              case '\n':
                ln++;
                ch = readCh();
                lfCount++;
                // A line terminator ends an unquoted value.
                if (delim < 0) {
                    return getString(0);
                }
                break;

              case '\r':
                ln++;

                if ((ch = readCh()) == '\n') {
                    ch = readCh();
                    crlfCount++;
                }
                else {
                    crCount++;
                }
                if (delim < 0) {
                    return getString(0);
                }
                break;

              case '\t':
                  // NOTE(review): this assignment has no visible effect;
                  // for an unquoted value the shared code below returns
                  // before c is appended.
                  if (delim < 0)
                      c = ' ';
                  // Fall through
              case ' ':
                ch = readCh();
                // Whitespace ends an unquoted value and is kept inside a
                // quoted one.
                if (delim < 0) {
                    return getString(0);
                }
                break;

              case '>':
              case '<':
                // Ends an unquoted value; the character is left in ch.
                if (delim < 0) {
                    return getString(0);
                }
                ch = readCh();
                break;

              case '\'':
              case '"':
                ch = readCh();
                if (c == delim) {
                    // Closing delimiter of a quoted value.
                    return getString(0);
                } else if (delim == -1) {
                    // A quote inside an unquoted value: report it, then
                    // either end the value or drop the quote and go on.
                    error("attvalerr");
                    if (strict || ch == ' ') {
                        return getString(0);
                    } else {
                        continue;
                    }
                }
                // The other quote character inside a quoted value is
                // ordinary text.
                break;

            case '=':
                if (delim < 0) {
                    /* In SGML a construct like <img src=/cgi-bin/foo?x=1>
                       is considered invalid since an = sign can only be contained
                       in an attributes value if the string is quoted.
                       */
                    error("attvalerr");
                    /* If strict is true then we return with the string we have thus far.
                       Otherwise we accept the = sign as part of the attribute's value and
                       process the rest of the img tag. */
                    if (strict) {
                        return getString(0);
                    }
                }
                ch = readCh();
                break;

              case '&':
                // In strict mode an '&' in an unquoted value is kept as a
                // literal character and no entity reference is parsed.
                if (strict && delim < 0) {
                    ch = readCh();
                    break;
                }

                // Expand the reference and append its characters, applying
                // the same case folding as for ordinary characters.
                char data[] = parseEntityReference();
                for (int i = 0 ; i < data.length ; i++) {
                    c = data[i];
                    addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c);
                }
                continue;

              case -1:
                // End of input ends the value.
                return getString(0);

              default:
                if (lower && (c >= 'A') && (c <= 'Z')) {
                    c = 'a' + c - 'A';
                }
                ch = readCh();
                break;
            }
            addString(c);
        }
    }
1418 
1419 
1420     /**
1421      * Parse attribute specification List. [31] 327:17
1422      */
1423     void parseAttributeSpecificationList(Element elem) throws IOException {
1424 
1425         while (true) {
1426             skipSpace();
1427 
1428             switch (ch) {
1429               case '/':
1430               case '>':
1431               case '<':
1432               case -1:
1433                 return;
1434 
1435               case '-':
1436                 if ((ch = readCh()) == '-') {
1437                     ch = readCh();
1438                     parseComment();
1439                     strpos = 0;
1440                 } else {
1441                     error("invalid.tagchar", "-", elem.getName());
1442                     ch = readCh();
1443                 }
1444                 continue;
1445             }
1446 
1447             AttributeList att;
1448             String attname;
1449             String attvalue;
1450 
1451             if (parseIdentifier(true)) {
1452                 attname = getString(0);
1453                 skipSpace();
1454                 if (ch == '=') {
1455                     ch = readCh();
1456                     skipSpace();
1457                     att = elem.getAttribute(attname);
1458 //  Bug ID 4102750
1459 //  Load the NAME of an Attribute Case Sensitive
1460 //  The case of the NAME  must be intact
1461 //  MG 021898
1462                     attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME));
1463 //                  attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION));
1464                 } else {
1465                     attvalue = attname;
1466                     att = elem.getAttributeByValue(attvalue);
1467                     if (att == null) {
1468                         att = elem.getAttribute(attname);
1469                         if (att != null) {
1470                             attvalue = att.getValue();
1471                         }
1472                         else {
1473                             // Make it null so that NULL_ATTRIBUTE_VALUE is
1474                             // used
1475                             attvalue = null;
1476                         }
1477                     }
1478                 }
1479             } else if (!strict && ch == ',') { // allows for comma separated attribute-value pairs
1480                 ch = readCh();
1481                 continue;
1482             } else if (!strict && ch == '"') { // allows for quoted attributes
1483                 ch = readCh();
1484                 skipSpace();
1485                 if (parseIdentifier(true)) {
1486                     attname = getString(0);
1487                     if (ch == '"') {
1488                         ch = readCh();
1489                     }
1490                     skipSpace();
1491                     if (ch == '=') {
1492                         ch = readCh();
1493                         skipSpace();
1494                         att = elem.getAttribute(attname);
1495                         attvalue = parseAttributeValue((att != null) &&
1496                                                 (att.type != CDATA) &&
1497                                                 (att.type != NOTATION));
1498                     } else {
1499                         attvalue = attname;
1500                         att = elem.getAttributeByValue(attvalue);
1501                         if (att == null) {
1502                             att = elem.getAttribute(attname);
1503                             if (att != null) {
1504                                 attvalue = att.getValue();
1505                             }
1506                         }
1507                     }
1508                 } else {
1509                     char str[] = {(char)ch};
1510                     error("invalid.tagchar", new String(str), elem.getName());
1511                     ch = readCh();
1512                     continue;
1513                 }
1514             } else if (!strict && (attributes.isEmpty()) && (ch == '=')) {
1515                 ch = readCh();
1516                 skipSpace();
1517                 attname = elem.getName();
1518                 att = elem.getAttribute(attname);
1519                 attvalue = parseAttributeValue((att != null) &&
1520                                                (att.type != CDATA) &&
1521                                                (att.type != NOTATION));
1522             } else if (!strict && (ch == '=')) {
1523                 ch = readCh();
1524                 skipSpace();
1525                 attvalue = parseAttributeValue(true);
1526                 error("attvalerr");
1527                 return;
1528             } else {
1529                 char str[] = {(char)ch};
1530                 error("invalid.tagchar", new String(str), elem.getName());
1531                 if (!strict) {
1532                     ch = readCh();
1533                     continue;
1534                 } else {
1535                     return;
1536                 }
1537             }
1538 
1539             if (att != null) {
1540                 attname = att.getName();
1541             } else {
1542                 error("invalid.tagatt", attname, elem.getName());
1543             }
1544 
1545             // Check out the value
1546             if (attributes.isDefined(attname)) {
1547                 error("multi.tagatt", attname, elem.getName());
1548             }
1549             if (attvalue == null) {
1550                 attvalue = ((att != null) && (att.value != null)) ? att.value :
1551                     HTML.NULL_ATTRIBUTE_VALUE;
1552             } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) {
1553                 error("invalid.tagattval", attname, elem.getName());
1554             }
1555             HTML.Attribute attkey = HTML.getAttributeKey(attname);
1556             if (attkey == null) {
1557                 attributes.addAttribute(attname, attvalue);
1558             } else {
1559                 attributes.addAttribute(attkey, attvalue);
1560             }
1561         }
1562     }
1563 
1564     /**
1565      * Parses the Document Type Declaration markup declaration.
1566      * Currently ignores it.
1567      *
1568      * @return the string representation of the markup declaration
1569      * @throws IOException if an I/O error occurs
1570      */
1571     public String parseDTDMarkup() throws IOException {
1572 
1573         StringBuilder strBuff = new StringBuilder();
1574         ch = readCh();
1575         while(true) {
1576             switch (ch) {
1577             case '>':
1578                 ch = readCh();
1579                 return strBuff.toString();
1580             case -1:
1581                 error("invalid.markup");
1582                 return strBuff.toString();
1583             case '\n':
1584                 ln++;
1585                 ch = readCh();
1586                 lfCount++;
1587                 break;
1588             case '"':
1589                 ch = readCh();
1590                 break;
1591             case '\r':
1592                 ln++;
1593                 if ((ch = readCh()) == '\n') {
1594                     ch = readCh();
1595                     crlfCount++;
1596                 }
1597                 else {
1598                     crCount++;
1599                 }
1600                 break;
1601             default:
1602                 strBuff.append((char)(ch & 0xFF));
1603                 ch = readCh();
1604                 break;
1605             }
1606         }
1607     }
1608 
1609     /**
1610      * Parse markup declarations.
1611      * Currently only handles the Document Type Declaration markup.
1612      * Returns true if it is a markup declaration false otherwise.
1613      *
1614      * @param strBuff  the markup declaration
1615      * @return {@code true} if this is a valid markup declaration;
1616      *         otherwise {@code false}
1617      * @throws IOException if an I/O error occurs
1618      */
1619     protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException {
1620 
1621         /* Currently handles only the DOCTYPE */
1622         if ((strBuff.length() == "DOCTYPE".length()) &&
1623             (strBuff.toString().toUpperCase().equals("DOCTYPE"))) {
1624             parseDTDMarkup();
1625             return true;
1626         }
1627         return false;
1628     }
1629 
1630     /**
1631      * Parse an invalid tag.
1632      */
1633     void parseInvalidTag() throws IOException {
1634         // ignore all data upto the close bracket '>'
1635         while (true) {
1636             skipSpace();
1637             switch (ch) {
1638               case '>':
1639               case -1:
1640                   ch = readCh();
1641                 return;
1642               case '<':
1643                   return;
1644               default:
1645                   ch = readCh();
1646 
1647             }
1648         }
1649     }
1650 
    /**
     * Parse a start or end tag.
     * <p>
     * The method begins by reading the character that follows the
     * {@code '<'} and dispatches on it:
     * <ul>
     * <li>{@code '!'} followed by {@code '-'}: one or more comments; any
     *     pending text is flushed with {@code handleText} first, then each
     *     comment is delivered through {@code handleComment}.</li>
     * <li>{@code '!'} followed by anything else: handed to
     *     {@code parseMarkupDeclarations}; if that does not recognize the
     *     input, {@code "invalid.markup"} is reported at the next
     *     {@code '>'} or at end of input.</li>
     * <li>{@code '/'}: an end tag, matched against the tag stack.</li>
     * <li>{@code -1}: end of input, reported as {@code "eof"}.</li>
     * <li>anything else: a start tag with its attribute list.</li>
     * </ul>
     *
     * @throws IOException if reading from the input fails
     */
    @SuppressWarnings("fallthrough")
    void parseTag() throws IOException {
        Element elem;
        // true when the start tag was terminated with '/' (null end tag form)
        boolean net = false;
        // ensures "invalid.commentchar" is reported at most once per call
        boolean warned = false;
        // true when the tag name is not defined in the DTD
        boolean unknown = false;

        switch (ch = readCh()) {
          case '!':
            switch (ch = readCh()) {
              case '-':
                // Parse comment. [92] 391:7
                while (true) {
                    if (ch == '-') {
                        // In strict mode a second '-' is required to open
                        // the comment; otherwise a single '-' is enough and
                        // an optional second '-' is skipped below.
                        if (!strict || ((ch = readCh()) == '-')) {
                            ch = readCh();
                            if (!strict && ch == '-') {
                                ch = readCh();
                            }
                            // send over any text you might see
                            // before parsing and sending the
                            // comment
                            if (textpos != 0) {
                                char newtext[] = new char[textpos];
                                System.arraycopy(text, 0, newtext, 0, textpos);
                                handleText(newtext);
                                lastBlockStartPos = currentBlockStartPos;
                                textpos = 0;
                            }
                            parseComment();
                            last = makeTag(dtd.getElement("comment"), true);
                            handleComment(getChars(0));
                            // loop again: another comment may follow
                            // before the closing '>'
                            continue;
                        } else if (!warned) {
                            warned = true;
                            error("invalid.commentchar", "-");
                        }
                    }
                    skipSpace();
                    switch (ch) {
                      case '-':
                        continue;
                      case '>':
                        ch = readCh();
                        return;
                      case -1:
                        return;
                      default:
                        // NOTE(review): ch is advanced before it is
                        // reported, so the message names the character
                        // after the unexpected one -- confirm intent.
                        ch = readCh();
                        if (!warned) {
                            warned = true;
                            error("invalid.commentchar",
                                  String.valueOf((char)ch));
                        }
                        break;
                    }
                }

              default:
                // deal with marked sections
                StringBuffer strBuff = new StringBuffer();
                while (true) {
                    strBuff.append((char)ch);
                    if (parseMarkupDeclarations(strBuff)) {
                        return;
                    }
                    switch(ch) {
                      case '>':
                        ch = readCh();
                        // Fall through
                      case -1:
                        error("invalid.markup");
                        return;
                      case '\n':
                        ln++;
                        ch = readCh();
                        lfCount++;
                        break;
                      case '\r':
                        ln++;
                        if ((ch = readCh()) == '\n') {
                            ch = readCh();
                            crlfCount++;
                        }
                        else {
                            crCount++;
                        }
                        break;

                      default:
                        ch = readCh();
                        break;
                    }
                }
            }

          case '/':
            // parse end tag [19] 317:4
            switch (ch = readCh()) {
              case '>':
                ch = readCh();
                // Fall through
              case '<':
                // empty end tag. either </> or </<
                if (recent == null) {
                    error("invalid.shortend");
                    return;
                }
                elem = recent;
                break;

              default:
                if (!parseIdentifier(true)) {
                    error("expected.endtagname");
                    return;
                }
                skipSpace();
                switch (ch) {
                  case '>':
                    ch = readCh();
                    break;
                  case '<':
                    break;

                  default:
                    // Recover by skipping to the end of the tag, the end
                    // of the line, or the end of input, whichever is first.
                    error("expected", "'>'");
                    while ((ch != -1) && (ch != '\n') && (ch != '>')) {
                        ch = readCh();
                    }
                    if (ch == '>') {
                        ch = readCh();
                    }
                    break;
                }
                String elemStr = getString(0);
                if (!dtd.elementExists(elemStr)) {
                    error("end.unrecognized", elemStr);
                    // Ignore RE before end tag
                    if ((textpos > 0) && (text[textpos-1] == '\n')) {
                        textpos--;
                    }
                    // The DTD's "unknown" element is reused and renamed
                    // to the name that was actually read.
                    elem = dtd.getElement("unknown");
                    elem.name = elemStr;
                    unknown = true;
                } else {
                    elem = dtd.getElement(elemStr);
                }
                break;
            }


            // If the stack is null, we're seeing end tags without any begin
            // tags.  Ignore them.

            if (stack == null) {
                error("end.extra.tag", elem.getName());
                return;
            }

            // Ignore RE before end tag
            if ((textpos > 0) && (text[textpos-1] == '\n')) {
                // In a pre tag, if there are blank lines
                // we do not want to remove the newline
                // before the end tag.  Hence this code.
                //
                if (stack.pre) {
                    if ((textpos > 1) && (text[textpos-2] != '\n')) {
                        textpos--;
                    }
                } else {
                    textpos--;
                }
            }

            // If the end tag is a form, since we did not put it
            // on the tag stack, there is no corresponding
            // start tag to find. Hence do not touch the tag stack.
            //

            /*
            if (!strict && elem.getName().equals("form")) {
                if (lastFormSent != null) {
                    handleEndTag(lastFormSent);
                    return;
                } else {
                    // do nothing.
                    return;
                }
            }
            */

            if (unknown) {
                // we will not see a corresponding start tag
                // on the stack.  If we are seeing an
                // end tag, let's send this on as an empty
                // tag with the end tag attribute set to
                // true.
                TagElement t = makeTag(elem);
                handleText(t);
                attributes.addAttribute(HTML.Attribute.ENDTAG, "true");
                handleEmptyTag(makeTag(elem));
                unknown = false;
                return;
            }

            // find the corresponding start tag

            // A commonly occurring error appears to be the insertion
            // of extra end tags in a table.  The intent here is to ignore
            // such extra end tags.
            //
            if (!strict) {
                String stackElem = stack.elem.getName();

                if (stackElem.equals("table")) {
                    // If it is not a valid end tag ignore it and return
                    //
                    if (!elem.getName().equals(stackElem)) {
                        error("tag.ignore", elem.getName());
                        return;
                    }
                }



                // Inside a row or cell, only the matching end tag or
                // </table> is honored; every other end tag is ignored.
                if (stackElem.equals("tr") ||
                    stackElem.equals("td")) {
                    if ((!elem.getName().equals("table")) &&
                        (!elem.getName().equals(stackElem))) {
                        error("tag.ignore", elem.getName());
                        return;
                    }
                }
            }
            TagStack sp = stack;

            // Walk down the tag stack looking for the entry opened by elem.
            while ((sp != null) && (elem != sp.elem)) {
                sp = sp.next;
            }
            if (sp == null) {
                error("unmatched.endtag", elem.getName());
                return;
            }

            // People put font ending tags in the darndest places.
            // Don't close other contexts based on them being between
            // a font tag and the corresponding end tag.  Instead,
            // ignore the end tag like it doesn't exist and allow the end
            // of the document to close us out.
            String elemName = elem.getName();
            if (stack != sp &&
                (elemName.equals("font") ||
                 elemName.equals("center"))) {

                // Since closing out a center tag can have really weird
                // effects on the formatting, make sure that tags
                // for which omitting an end tag is legitimate
                // get closed out.
                //
                if (elemName.equals("center")) {
                    while(stack.elem.omitEnd() && stack != sp) {
                        endTag(true);
                    }
                    if (stack.elem == elem) {
                        endTag(false);
                    }
                }
                return;
            }
            // People do the same thing with center tags.  In this
            // case we would like to close off the center tag but
            // not necessarily all enclosing tags.



            // end tags: implicitly close everything opened after the
            // matching start tag, then close the matching element itself
            while (stack != sp) {
                endTag(true);
            }

            endTag(false);
            return;

          case -1:
            error("eof");
            return;
        }

        // start tag [14] 314:1
        if (!parseIdentifier(true)) {
            // No tag name: only "<>" is accepted, and it reuses the most
            // recent element.
            elem = recent;
            if ((ch != '>') || (elem == null)) {
                error("expected.tagname");
                return;
            }
        } else {
            String elemStr = getString(0);

            // "image" is treated as an alias for "img"
            if (elemStr.equals("image")) {
                elemStr = "img";
            }

            /* determine if this element is part of the dtd. */

            if (!dtd.elementExists(elemStr)) {
                //              parseInvalidTag();
                // NOTE(review): the error key below ends with a space,
                // unlike every other key in this method -- confirm whether
                // that is intentional before changing it.
                error("tag.unrecognized ", elemStr);
                elem = dtd.getElement("unknown");
                elem.name = elemStr;
                unknown = true;
            } else {
                elem = dtd.getElement(elemStr);
            }
        }

        // Parse attributes
        parseAttributeSpecificationList(elem);

        switch (ch) {
          case '/':
            net = true;
            // Fall through
          case '>':
            ch = readCh();
            // After a '/', also consume a directly following '>' ("/>").
            if (ch == '>' && net) {
                ch = readCh();
            }
            // Fall through
          case '<':
            break;

          default:
            error("expected", "'>'");
            break;
        }

        if (!strict) {
          if (elem.getName().equals("script")) {
            error("javascript.unsupported");
          }
        }

        // ignore RE after start tag
        //
        if (!elem.isEmpty())  {
            if (ch == '\n') {
                ln++;
                lfCount++;
                ch = readCh();
            } else if (ch == '\r') {
                ln++;
                if ((ch = readCh()) == '\n') {
                    ch = readCh();
                    crlfCount++;
                }
                else {
                    crCount++;
                }
            }
        }

        // ensure a legal context for the tag
        TagElement tag = makeTag(elem, false);


        /* In dealing with forms, we have decided to treat
           them as legal in any context.  Also, even though
           they do have a start and an end tag, we will
           not put this tag on the stack.  This is to deal
           with several pages in the web oasis that choose to
           start and end forms in any possible location. */

        /*
        if (!strict && elem.getName().equals("form")) {
            if (lastFormSent == null) {
                lastFormSent = tag;
            } else {
                handleEndTag(lastFormSent);
                lastFormSent = tag;
            }
        } else {
        */
            // Similarly, if a tag is unknown, we will apply
            // no legalTagContext logic to it.
            //
            if (!unknown) {
                legalTagContext(tag);

                // If skip tag is true,  this implies that
                // the tag was illegal and that the error
                // recovery strategy adopted is to ignore
                // the tag.
                if (!strict && skipTag) {
                    skipTag = false;
                    return;
                }
            }
            /*
        }
            */

        startTag(tag);

        if (!elem.isEmpty()) {
            switch (elem.getType()) {
              case CDATA:
                parseLiteral(false);
                break;
              case RCDATA:
                parseLiteral(true);
                break;
              default:
                // Record on the new stack entry whether the tag was
                // terminated with '/'; parseContent checks stack.net
                // when it encounters a later '/'.
                if (stack != null) {
                    stack.net = net;
                }
                break;
            }
        }
    }
2072 
    // HTML comment delimiters; used by parseScript() to track comments
    // inside script content and by parseContent() to strip them.
    private static final String START_COMMENT = "<!--";
    private static final String END_COMMENT = "-->";
    // Lower- and upper-case spellings of the script end tag; parseScript()
    // accepts either spelling at each character position.
    private static final char[] SCRIPT_END_TAG = "</script>".toCharArray();
    private static final char[] SCRIPT_END_TAG_UPPER_CASE =
                                        "</SCRIPT>".toCharArray();
2078 
2079     void parseScript() throws IOException {
2080         char[] charsToAdd = new char[SCRIPT_END_TAG.length];
2081         boolean insideComment = false;
2082 
2083         /* Here, ch should be the first character after <script> */
2084         while (true) {
2085             int i = 0;
2086             while (!insideComment && i < SCRIPT_END_TAG.length
2087                     && (SCRIPT_END_TAG[i] == ch
2088                     || SCRIPT_END_TAG_UPPER_CASE[i] == ch)) {
2089                 charsToAdd[i] = (char) ch;
2090                 ch = readCh();
2091                 i++;
2092             }
2093             if (i == SCRIPT_END_TAG.length) {
2094                 return;
2095             }
2096 
2097             if (!insideComment && i == 1 && charsToAdd[0] == START_COMMENT.charAt(0)) {
2098                 // it isn't end script tag, but may be it's start comment tag?
2099                 while (i < START_COMMENT.length()
2100                         && START_COMMENT.charAt(i) == ch) {
2101                     charsToAdd[i] = (char) ch;
2102                     ch = readCh();
2103                     i++;
2104                 }
2105                 if (i == START_COMMENT.length()) {
2106                     insideComment = true;
2107                 }
2108             }
2109             if (insideComment) {
2110                 while (i < END_COMMENT.length()
2111                         && END_COMMENT.charAt(i) == ch) {
2112                     charsToAdd[i] = (char) ch;
2113                     ch = readCh();
2114                     i++;
2115                 }
2116                 if (i == END_COMMENT.length()) {
2117                     insideComment = false;
2118                 }
2119             }
2120 
2121             /* To account for extra read()'s that happened */
2122             if (i > 0) {
2123                 for (int j = 0; j < i; j++) {
2124                     addString(charsToAdd[j]);
2125                 }
2126                 continue;
2127             }
2128             switch (ch) {
2129             case -1:
2130                 error("eof.script");
2131                 return;
2132             case '\n':
2133                 ln++;
2134                 ch = readCh();
2135                 lfCount++;
2136                 addString('\n');
2137                 break;
2138             case '\r':
2139                 ln++;
2140                 if ((ch = readCh()) == '\n') {
2141                     ch = readCh();
2142                     crlfCount++;
2143                 } else {
2144                     crCount++;
2145                 }
2146                 addString('\n');
2147                 break;
2148             default:
2149                 addString(ch);
2150                 ch = readCh();
2151                 break;
2152             } // switch
2153         } // while
2154     }
2155 
    /**
     * Parse Content. [24] 320:1
     * <p>
     * Main parsing loop. Each iteration examines the current character
     * {@code ch}: tags are delegated to {@code parseTag}, script content
     * to {@code parseScript}, entity references to
     * {@code parseEntityReference}, and ordinary characters are
     * accumulated in the {@code text} buffer. Outside of a {@code pre}
     * context, whitespace is not stored directly; it sets the
     * {@code space} flag, which becomes a single blank in front of the
     * next character stored. The loop ends at end of input or when the
     * current thread has been interrupted.
     *
     * @throws IOException if reading from the input fails
     */
    void parseContent() throws IOException {
        Thread curThread = Thread.currentThread();

        for (;;) {
            if (curThread.isInterrupted()) {
                curThread.interrupt(); // resignal the interrupt
                break;
            }

            // Remember the character being handled; ch itself is advanced
            // inside the cases below.
            int c = ch;
            currentBlockStartPos = currentPosition;

            if (recent == dtd.script) { // means: if after starting <script> tag

                /* Here, ch has to be the first character after <script> */
                parseScript();
                last = makeTag(dtd.getElement("comment"), true);

                /* Remove leading and trailing HTML comment declarations */
                String str = new String(getChars(0)).trim();
                int minLength = START_COMMENT.length() + END_COMMENT.length();
                if (str.startsWith(START_COMMENT) && str.endsWith(END_COMMENT)
                       && str.length() >= (minLength)) {
                    str = str.substring(START_COMMENT.length(),
                                      str.length() - END_COMMENT.length());
                }

                /* Handle resulting chars as comment */
                handleComment(str.toCharArray());
                endTag(false);
                lastBlockStartPos = currentPosition;

                continue;
            } else {
                switch (c) {
                  case '<':
                    parseTag();
                    lastBlockStartPos = currentPosition;
                    continue;

                  case '/':
                    ch = readCh();
                    if ((stack != null) && stack.net) {
                        // null end tag.
                        endTag(false);
                        continue;
                    } else if (textpos == 0) {
                        // first character of a new text run
                        if (!legalElementContext(dtd.pcdata)) {
                            error("unexpected.pcdata");
                        }
                        if (last.breaksFlow()) {
                            space = false;
                        }
                    }
                    // otherwise '/' is ordinary text and is stored below
                    break;

                  case -1:
                    return;

                  case '&':
                    if (textpos == 0) {
                        if (!legalElementContext(dtd.pcdata)) {
                            error("unexpected.pcdata");
                        }
                        if (last.breaksFlow()) {
                            space = false;
                        }
                    }
                    char data[] = parseEntityReference();
                    // make room for the entity text plus a pending space
                    if (textpos + data.length + 1 > text.length) {
                        char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
                        System.arraycopy(text, 0, newtext, 0, text.length);
                        text = newtext;
                    }
                    if (space) {
                        space = false;
                        text[textpos++] = ' ';
                    }
                    System.arraycopy(data, 0, text, textpos, data.length);
                    textpos += data.length;
                    ignoreSpace = false;
                    continue;

                  case '\n':
                    ln++;
                    lfCount++;
                    ch = readCh();
                    // in a pre context the newline is stored as text
                    if ((stack != null) && stack.pre) {
                        break;
                    }
                    if (textpos == 0) {
                        lastBlockStartPos = currentPosition;
                    }
                    if (!ignoreSpace) {
                        space = true;
                    }
                    continue;

                  case '\r':
                    ln++;
                    // a CR or CR LF pair is stored as a single '\n'
                    c = '\n';
                    if ((ch = readCh()) == '\n') {
                        ch = readCh();
                        crlfCount++;
                    }
                    else {
                        crCount++;
                    }
                    if ((stack != null) && stack.pre) {
                        break;
                    }
                    if (textpos == 0) {
                        lastBlockStartPos = currentPosition;
                    }
                    if (!ignoreSpace) {
                        space = true;
                    }
                    continue;


                  case '\t':
                  case ' ':
                    ch = readCh();
                    // in a pre context the blank or tab is stored as text
                    if ((stack != null) && stack.pre) {
                        break;
                    }
                    if (textpos == 0) {
                        lastBlockStartPos = currentPosition;
                    }
                    if (!ignoreSpace) {
                        space = true;
                    }
                    continue;

                  default:
                    if (textpos == 0) {
                        if (!legalElementContext(dtd.pcdata)) {
                            error("unexpected.pcdata");
                        }
                        if (last.breaksFlow()) {
                            space = false;
                        }
                    }
                    ch = readCh();
                    break;
                }
            }

            // enlarge buffer if needed (room for a pending space plus c)
            if (textpos + 2 > text.length) {
                char newtext[] = new char[text.length + 128];
                System.arraycopy(text, 0, newtext, 0, text.length);
                text = newtext;
            }

            // output pending space
            if (space) {
                if (textpos == 0) {
                    lastBlockStartPos--;
                }
                text[textpos++] = ' ';
                space = false;
            }
            text[textpos++] = (char)c;
            ignoreSpace = false;
        }
    }
2326 
2327     /**
2328      * Returns the end of line string. This will return the end of line
2329      * string that has been encountered the most, one of \r, \n or \r\n.
2330      */
2331     String getEndOfLineString() {
2332         if (crlfCount >= crCount) {
2333             if (lfCount >= crlfCount) {
2334                 return "\n";
2335             }
2336             else {
2337                 return "\r\n";
2338             }
2339         }
2340         else {
2341             if (crCount > lfCount) {
2342                 return "\r";
2343             }
2344             else {
2345                 return "\n";
2346             }
2347         }
2348     }
2349 
2350     /**
2351      * Parse an HTML stream, given a DTD.
2352      *
2353      * @param in  the reader to read the source from
2354      * @throws IOException if an I/O error occurs
2355      */
2356     public synchronized void parse(Reader in) throws IOException {
2357         this.in = in;
2358 
2359         this.ln = 1;
2360 
2361         seenHtml = false;
2362         seenHead = false;
2363         seenBody = false;
2364 
2365         crCount = lfCount = crlfCount = 0;
2366 
2367         try {
2368             ch = readCh();
2369             text = new char[1024];
2370             str = new char[128];
2371 
2372             parseContent();
2373             // NOTE: interruption may have occurred.  Control flows out
2374             // of here normally.
2375             while (stack != null) {
2376                 endTag(true);
2377             }
2378             in.close();
2379         } catch (IOException e) {
2380             errorContext();
2381             error("ioexception");
2382             throw e;
2383         } catch (Exception e) {
2384             errorContext();
2385             error("exception", e.getClass().getName(), e.getMessage());
2386             e.printStackTrace();
2387         } catch (ThreadDeath e) {
2388             errorContext();
2389             error("terminated");
2390             e.printStackTrace();
2391             throw e;
2392         } finally {
2393             for (; stack != null ; stack = stack.next) {
2394                 handleEndTag(stack.tag);
2395             }
2396 
2397             text = null;
2398             str = null;
2399         }
2400 
2401     }
2402 
2403 
    /*
     * Input cache.  This is much faster than calling down to a synchronized
     * method of BufferedReader for each byte.  Measurements done 5/30/97
     * show that there's no point in having a bigger buffer:  Increasing
     * the buffer to 8192 had no measurable impact for a program discarding
     * one character at a time (reading from an http URL to a local machine).
     * NOTE: If the current encoding is bogus, and we read too much
     * (past the content-type) we may suffer a MalformedInputException. For
     * this reason the initial size is 1 and when the body is encountered the
     * size is adjusted to 256.
     */
    private char buf[] = new char[1];
    // index in buf of the next character readCh() will hand out
    private int pos;
    // result of the most recent in.read(buf) call; a value <= 0 makes
    // readCh() report end of input
    private int len;
    /*
        Tracks the position relative to the beginning of the
        document; incremented by readCh() for every character
        it returns.
    */
    private int currentPosition;
2423 
2424 
2425     private int readCh() throws IOException {
2426 
2427         if (pos >= len) {
2428 
2429             // This loop allows us to ignore interrupts if the flag
2430             // says so
2431             for (;;) {
2432                 try {
2433                     len = in.read(buf);
2434                     break;
2435                 } catch (InterruptedIOException ex) {
2436                     throw ex;
2437                 }
2438             }
2439 
2440             if (len <= 0) {
2441                 return -1;      // eof
2442             }
2443             pos = 0;
2444         }
2445         ++currentPosition;
2446 
2447         return buf[pos++];
2448     }
2449 
2450 
2451     /**
2452      * Returns the current position.
2453      *
2454      * @return the current position
2455      */
2456     protected int getCurrentPos() {
2457         return currentPosition;
2458     }
2459 }