1 /*
   2  * Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package javax.swing.text.html.parser;
  27 
  28 import javax.swing.text.SimpleAttributeSet;
  29 import javax.swing.text.html.HTML;
  30 import javax.swing.text.ChangedCharSetException;
  31 import java.io.*;
  32 import java.util.Hashtable;
  33 import java.util.Properties;
  34 import java.util.Vector;
  35 import java.util.Enumeration;
  36 import java.net.URL;
  37 
  38 import sun.misc.MessageUtils;
  39 
  40 /**
  41  * A simple DTD-driven HTML parser. The parser reads an
  42  * HTML file from an InputStream and calls various methods
  43  * (which should be overridden in a subclass) when tags and
  44  * data are encountered.
  45  * <p>
  46  * Unfortunately there are many badly implemented HTML parsers
  47  * out there, and as a result there are many badly formatted
  48  * HTML files. This parser attempts to parse most HTML files.
  49  * This means that the implementation sometimes deviates from
  50  * the SGML specification in favor of HTML.
  51  * <p>
  52  * The parser treats \r and \r\n as \n. Newlines after starttags
  53  * and before end tags are ignored just as specified in the SGML/HTML
  54  * specification.
  55  * <p>
  56  * The html spec does not specify how spaces are to be coalesced very well.
  57  * Specifically, the following scenarios are not discussed (note that a
  58  * space should be used here, but I am using &amp;nbsp to force the space to
  59  * be displayed):
  60  * <p>
  61  * '&lt;b&gt;blah&nbsp;&lt;i&gt;&nbsp;&lt;strike&gt;&nbsp;foo' which can be treated as:
  62  * '&lt;b&gt;blah&nbsp;&lt;i&gt;&lt;strike&gt;foo'
  63  * <p>as well as:
  64  * '&lt;p&gt;&lt;a href="xx"&gt;&nbsp;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
  65  * which appears to be treated as:
  66  * '&lt;p&gt;&lt;a href="xx"&gt;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
  67  * <p>
 * If <code>strict</code> is false, when a tag that breaks flow
 * (<code>TagElement.breaksFlow</code>) or trailing whitespace is
 * encountered, all whitespace will be ignored until a non-whitespace
 * character is encountered. This appears to give behavior closer to
 * the popular browsers.
  73  *
  74  * @see DTD
  75  * @see TagElement
  76  * @see SimpleAttributeSet
  77  * @author Arthur van Hoff
  78  * @author Sunita Mani
  79  */
  80 public
  81 class Parser implements DTDConstants {
  82 
  83     private char text[] = new char[1024];
  84     private int textpos = 0;
  85     private TagElement last;
  86     private boolean space;
  87 
  88     private char str[] = new char[128];
  89     private int strpos = 0;
  90 
  91     /**
  92      * The dtd.
  93      */
  94     protected DTD dtd = null;
  95 
  96     private int ch;
  97     private int ln;
  98     private Reader in;
  99 
 100     private Element recent;
 101     private TagStack stack;
 102     private boolean skipTag = false;
 103     private TagElement lastFormSent = null;
 104     private SimpleAttributeSet attributes = new SimpleAttributeSet();
 105 
 106     // State for <html>, <head> and <body>.  Since people like to slap
 107     // together HTML documents without thinking, occasionally they
 108     // have multiple instances of these tags.  These booleans track
 109     // the first sightings of these tags so they can be safely ignored
 110     // by the parser if repeated.
 111     private boolean seenHtml = false;
 112     private boolean seenHead = false;
 113     private boolean seenBody = false;
 114 
 115     /**
 116      * The html spec does not specify how spaces are coalesced very well.
 117      * If strict == false, ignoreSpace is used to try and mimic the behavior
 118      * of the popular browsers.
 119      * <p>
 120      * The problematic scenarios are:
 121      * '&lt;b>blah &lt;i> &lt;strike> foo' which can be treated as:
 122      * '&lt;b>blah &lt;i>&lt;strike>foo'
 123      * as well as:
 124      * '&lt;p>&lt;a href="xx"> &lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
 125      * which appears to be treated as:
 126      * '&lt;p>&lt;a href="xx">&lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
 127      * <p>
 128      * When a tag that breaks flow, or trailing whitespace is encountered
 129      * ignoreSpace is set to true. From then on, all whitespace will be
 130      * ignored.
 131      * ignoreSpace will be set back to false the first time a
 132      * non whitespace character is encountered. This appears to give
 133      * behavior closer to the popular browsers.
 134      */
 135     private boolean ignoreSpace;
 136 
 137     /**
 138      * This flag determines whether or not the Parser will be strict
 139      * in enforcing SGML compatibility.  If false, it will be lenient
 140      * with certain common classes of erroneous HTML constructs.
 141      * Strict or not, in either case an error will be recorded.
 142      *
 143      */
 144     protected boolean strict = false;
 145 
 146 
 147     /** Number of \r\n's encountered. */
 148     private int crlfCount;
 149     /** Number of \r's encountered. A \r\n will not increment this. */
 150     private int crCount;
 151     /** Number of \n's encountered. A \r\n will not increment this. */
 152     private int lfCount;
 153 
 154     //
 155     // To correctly identify the start of a tag/comment/text we need two
 156     // ivars. Two are needed as handleText isn't invoked until the tag
 157     // after the text has been parsed, that is the parser parses the text,
 158     // then a tag, then invokes handleText followed by handleStart.
 159     //
 160     /** The start position of the current block. Block is overloaded here,
 161      * it really means the current start position for the current comment,
 162      * tag, text. Use getBlockStartPosition to access this. */
 163     private int currentBlockStartPos;
 164     /** Start position of the last block. */
 165     private int lastBlockStartPos;
 166 
 167     /**
 168      * array for mapping numeric references in range
 169      * 130-159 to displayable Unicode characters.
 170      */
 171     private static final char[] cp1252Map = {
 172         8218,  // ‚
 173         402,   // ƒ
 174         8222,  // „
 175         8230,  // …
 176         8224,  // †
 177         8225,  // ‡
 178         710,   // ˆ
 179         8240,  // ‰
 180         352,   // Š
 181         8249,  // ‹
 182         338,   // Œ
 183         141,   // 
 184         142,   // Ž
 185         143,   // 
 186         144,   // 
 187         8216,  // ‘
 188         8217,  // ’
 189         8220,  // “
 190         8221,  // ”
 191         8226,  // •
 192         8211,  // –
 193         8212,  // —
 194         732,   // ˜
 195         8482,  // ™
 196         353,   // š
 197         8250,  // ›
 198         339,   // œ
 199         157,   // 
 200         158,   // ž
 201         376    // Ÿ
 202     };
 203 
 204     /**
 205      * Creates parser with the specified {@code dtd}.
 206      *
 207      * @param dtd the dtd.
 208      */
 209     public Parser(DTD dtd) {
 210         this.dtd = dtd;
 211     }
 212 
 213 
 214     /**
 215      * @return the line number of the line currently being parsed
 216      */
 217     protected int getCurrentLine() {
 218         return ln;
 219     }
 220 
 221     /**
 222      * Returns the start position of the current block. Block is
 223      * overloaded here, it really means the current start position for
 224      * the current comment tag, text, block.... This is provided for
 225      * subclassers that wish to know the start of the current block when
 226      * called with one of the handleXXX methods.
 227      *
 228      * @return the start position of the current block
 229      */
 230     int getBlockStartPosition() {
 231         return Math.max(0, lastBlockStartPos - 1);
 232     }
 233 
 234     /**
 235      * Makes a TagElement.
 236      *
 237      * @param elem       the element storing the tag definition
 238      * @param fictional  the value of the flag "{@code fictional}" to be set for the tag
 239      *
 240      * @return the created {@code TagElement}
 241      */
 242     protected TagElement makeTag(Element elem, boolean fictional) {
 243         return new TagElement(elem, fictional);
 244     }
 245 
 246     /**
 247      * Makes a TagElement.
 248      *
 249      * @param elem  the element storing the tag definition
 250      *
 251      * @return the created {@code TagElement}
 252      */
 253     protected TagElement makeTag(Element elem) {
 254         return makeTag(elem, false);
 255     }
 256 
 257     /**
 258      * Returns attributes for the current tag.
 259      *
 260      * @return {@code SimpleAttributeSet} containing the attributes
 261      */
 262     protected SimpleAttributeSet getAttributes() {
 263         return attributes;
 264     }
 265 
 266     /**
 267      * Removes the current attributes.
 268      */
 269     protected void flushAttributes() {
 270         attributes.removeAttributes(attributes);
 271     }
 272 
 273     /**
 274      * Called when PCDATA is encountered.
 275      *
 276      * @param text  the section text
 277      */
 278     protected void handleText(char text[]) {
 279     }
 280 
 281     /**
 282      * Called when an HTML title tag is encountered.
 283      *
 284      * @param text  the title text
 285      */
 286     protected void handleTitle(char text[]) {
 287         // default behavior is to call handleText. Subclasses
 288         // can override if necessary.
 289         handleText(text);
 290     }
 291 
 292     /**
 293      * Called when an HTML comment is encountered.
 294      *
 295      * @param text  the comment being handled
 296      */
 297     protected void handleComment(char text[]) {
 298     }
 299 
 300     /**
 301      * Called when the content terminates without closing the HTML comment.
 302      */
 303     protected void handleEOFInComment() {
 304         // We've reached EOF.  Our recovery strategy is to
 305         // see if we have more than one line in the comment;
 306         // if so, we pretend that the comment was an unterminated
 307         // single line comment, and reparse the lines after the
 308         // first line as normal HTML content.
 309 
 310         int commentEndPos = strIndexOf('\n');
 311         if (commentEndPos >= 0) {
 312             handleComment(getChars(0, commentEndPos));
 313             try {
 314                 in.close();
 315                 in = new CharArrayReader(getChars(commentEndPos + 1));
 316                 ch = '>';
 317             } catch (IOException e) {
 318                 error("ioexception");
 319             }
 320 
 321             resetStrBuffer();
 322         } else {
 323             // no newline, so signal an error
 324             error("eof.comment");
 325         }
 326     }
 327 
 328     /**
 329      * Called when an empty tag is encountered.
 330      *
 331      * @param tag  the tag being handled
 332      * @throws ChangedCharSetException if the document charset was changed
 333      */
 334     protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
 335     }
 336 
 337     /**
 338      * Called when a start tag is encountered.
 339      *
 340      * @param tag  the tag being handled
 341      */
 342     protected void handleStartTag(TagElement tag) {
 343     }
 344 
 345     /**
 346      * Called when an end tag is encountered.
 347      *
 348      * @param tag  the tag being handled
 349      */
 350     protected void handleEndTag(TagElement tag) {
 351     }
 352 
 353     /**
 354      * An error has occurred.
 355      *
 356      * @param ln   the number of line containing the error
 357      * @param msg  the error message
 358      */
 359     protected void handleError(int ln, String msg) {
 360         /*
 361         Thread.dumpStack();
 362         System.out.println("**** " + stack);
 363         System.out.println("line " + ln + ": error: " + msg);
 364         System.out.println();
 365         */
 366     }
 367 
 368     /**
 369      * Output text.
 370      */
 371     void handleText(TagElement tag) {
 372         if (tag.breaksFlow()) {
 373             space = false;
 374             if (!strict) {
 375                 ignoreSpace = true;
 376             }
 377         }
 378         if (textpos == 0) {
 379             if ((!space) || (stack == null) || last.breaksFlow() ||
 380                 !stack.advance(dtd.pcdata)) {
 381                 last = tag;
 382                 space = false;
 383                 lastBlockStartPos = currentBlockStartPos;
 384                 return;
 385             }
 386         }
 387         if (space) {
 388             if (!ignoreSpace) {
 389                 // enlarge buffer if needed
 390                 if (textpos + 1 > text.length) {
 391                     char newtext[] = new char[text.length + 200];
 392                     System.arraycopy(text, 0, newtext, 0, text.length);
 393                     text = newtext;
 394                 }
 395 
 396                 // output pending space
 397                 text[textpos++] = ' ';
 398                 if (!strict && !tag.getElement().isEmpty()) {
 399                     ignoreSpace = true;
 400                 }
 401             }
 402             space = false;
 403         }
 404         char newtext[] = new char[textpos];
 405         System.arraycopy(text, 0, newtext, 0, textpos);
 406         // Handles cases of bad html where the title tag
 407         // was getting lost when we did error recovery.
 408         if (tag.getElement().getName().equals("title")) {
 409             handleTitle(newtext);
 410         } else {
 411             handleText(newtext);
 412         }
 413         lastBlockStartPos = currentBlockStartPos;
 414         textpos = 0;
 415         last = tag;
 416         space = false;
 417     }
 418 
 419     /**
 420      * Invokes the error handler.
 421      *
 422      * @param err   the error type
 423      * @param arg1  the 1st error message argument
 424      * @param arg2  the 2nd error message argument
 425      * @param arg3  the 3rd error message argument
 426      */
 427     protected void error(String err, String arg1, String arg2,
 428         String arg3) {
 429         handleError(ln, err + " " + arg1 + " " + arg2 + " " + arg3);
 430     }
 431 
 432     /**
 433      * Invokes the error handler with the 3rd error message argument "?".
 434      *
 435      * @param err   the error type
 436      * @param arg1  the 1st error message argument
 437      * @param arg2  the 2nd error message argument
 438      */
 439     protected void error(String err, String arg1, String arg2) {
 440         error(err, arg1, arg2, "?");
 441     }
 442 
 443     /**
 444      * Invokes the error handler with the 2nd and 3rd error message argument "?".
 445      *
 446      * @param err   the error type
 447      * @param arg1  the 1st error message argument
 448      */
 449     protected void error(String err, String arg1) {
 450         error(err, arg1, "?", "?");
 451     }
 452 
 453     /**
 454      * Invokes the error handler with the 1st, 2nd and 3rd error message argument "?".
 455      *
 456      * @param err   the error type
 457      */
 458     protected void error(String err) {
 459         error(err, "?", "?", "?");
 460     }
 461 
 462 
 463     /**
 464      * Handle a start tag. The new tag is pushed
 465      * onto the tag stack. The attribute list is
 466      * checked for required attributes.
 467      *
 468      * @param tag  the tag
 469      * @throws ChangedCharSetException if the document charset was changed
 470      */
 471     protected void startTag(TagElement tag) throws ChangedCharSetException {
 472         Element elem = tag.getElement();
 473 
 474         // If the tag is an empty tag and texpos != 0
 475         // this implies that there is text before the
 476         // start tag that needs to be processed before
 477         // handling the tag.
 478         //
 479         if (!elem.isEmpty() ||
 480                     ((last != null) && !last.breaksFlow()) ||
 481                     (textpos != 0)) {
 482             handleText(tag);
 483         } else {
 484             // this variable gets updated in handleText().
 485             // Since in this case we do not call handleText()
 486             // we need to update it here.
 487             //
 488             last = tag;
 489             // Note that we should really check last.breakFlows before
 490             // assuming this should be false.
 491             space = false;
 492         }
 493         lastBlockStartPos = currentBlockStartPos;
 494 
 495         // check required attributes
 496         for (AttributeList a = elem.atts ; a != null ; a = a.next) {
 497             if ((a.modifier == REQUIRED) &&
 498                 ((attributes.isEmpty()) ||
 499                  ((!attributes.isDefined(a.name)) &&
 500                   (!attributes.isDefined(HTML.getAttributeKey(a.name)))))) {
 501                 error("req.att ", a.getName(), elem.getName());
 502             }
 503         }
 504 
 505         if (elem.isEmpty()) {
 506             handleEmptyTag(tag);
 507             /*
 508         } else if (elem.getName().equals("form")) {
 509             handleStartTag(tag);
 510             */
 511         } else {
 512             recent = elem;
 513             stack = new TagStack(tag, stack);
 514             handleStartTag(tag);
 515         }
 516     }
 517 
 518     /**
 519      * Handle an end tag. The end tag is popped
 520      * from the tag stack.
 521      *
 522      * @param omitted  {@code true} if the tag is no actually present in the
 523      *                 document, but is supposed by the parser
 524      */
 525     protected void endTag(boolean omitted) {
 526         handleText(stack.tag);
 527 
 528         if (omitted && !stack.elem.omitEnd()) {
 529             error("end.missing", stack.elem.getName());
 530         } else if (!stack.terminate()) {
 531             error("end.unexpected", stack.elem.getName());
 532         }
 533 
 534         // handle the tag
 535         handleEndTag(stack.tag);
 536         stack = stack.next;
 537         recent = (stack != null) ? stack.elem : null;
 538     }
 539 
 540 
 541     boolean ignoreElement(Element elem) {
 542 
 543         String stackElement = stack.elem.getName();
 544         String elemName = elem.getName();
 545         /* We ignore all elements that are not valid in the context of
 546            a table except <td>, <th> (these we handle in
 547            legalElementContext()) and #pcdata.  We also ignore the
 548            <font> tag in the context of <ul> and <ol> We additonally
 549            ignore the <meta> and the <style> tag if the body tag has
 550            been seen. **/
 551         if ((elemName.equals("html") && seenHtml) ||
 552             (elemName.equals("head") && seenHead) ||
 553             (elemName.equals("body") && seenBody)) {
 554             return true;
 555         }
 556         if (elemName.equals("dt") || elemName.equals("dd")) {
 557             TagStack s = stack;
 558             while (s != null && !s.elem.getName().equals("dl")) {
 559                 s = s.next;
 560             }
 561             if (s == null) {
 562                 return true;
 563             }
 564         }
 565 
 566         if (((stackElement.equals("table")) &&
 567              (!elemName.equals("#pcdata")) && (!elemName.equals("input"))) ||
 568             ((elemName.equals("font")) &&
 569              (stackElement.equals("ul") || stackElement.equals("ol"))) ||
 570             (elemName.equals("meta") && stack != null) ||
 571             (elemName.equals("style") && seenBody) ||
 572             (stackElement.equals("table") && elemName.equals("a"))) {
 573             return true;
 574         }
 575         return false;
 576     }
 577 
 578 
 579     /**
 580      * Marks the first time a tag has been seen in a document
 581      *
 582      * @param elem  the element represented by the tag
 583      */
 584 
 585     protected void markFirstTime(Element elem) {
 586         String elemName = elem.getName();
 587         if (elemName.equals("html")) {
 588             seenHtml = true;
 589         } else if (elemName.equals("head")) {
 590             seenHead = true;
 591         } else if (elemName.equals("body")) {
 592             if (buf.length == 1) {
 593                 // Refer to note in definition of buf for details on this.
 594                 char[] newBuf = new char[256];
 595 
 596                 newBuf[0] = buf[0];
 597                 buf = newBuf;
 598             }
 599             seenBody = true;
 600         }
 601     }
 602 
 603     /**
 604      * Create a legal content for an element.
 605      */
 606     boolean legalElementContext(Element elem) throws ChangedCharSetException {
 607 
 608         // System.out.println("-- legalContext -- " + elem);
 609 
 610         // Deal with the empty stack
 611         if (stack == null) {
 612             // System.out.println("-- stack is empty");
 613             if (elem != dtd.html) {
 614                 // System.out.println("-- pushing html");
 615                 startTag(makeTag(dtd.html, true));
 616                 return legalElementContext(elem);
 617             }
 618             return true;
 619         }
 620 
 621         // Is it allowed in the current context
 622         if (stack.advance(elem)) {
 623             // System.out.println("-- legal context");
 624             markFirstTime(elem);
 625             return true;
 626         }
 627         boolean insertTag = false;
 628 
 629         // The use of all error recovery strategies are contingent
 630         // on the value of the strict property.
 631         //
 632         // These are commonly occurring errors.  if insertTag is true,
 633         // then we want to adopt an error recovery strategy that
 634         // involves attempting to insert an additional tag to
 635         // legalize the context.  The two errors addressed here
 636         // are:
 637         // 1) when a <td> or <th> is seen soon after a <table> tag.
 638         //    In this case we insert a <tr>.
 639         // 2) when any other tag apart from a <tr> is seen
 640         //    in the context of a <tr>.  In this case we would
 641         //    like to add a <td>.  If a <tr> is seen within a
 642         //    <tr> context, then we will close out the current
 643         //    <tr>.
 644         //
 645         // This insertion strategy is handled later in the method.
 646         // The reason for checking this now, is that in other cases
 647         // we would like to apply other error recovery strategies for example
 648         // ignoring tags.
 649         //
 650         // In certain cases it is better to ignore a tag than try to
 651         // fix the situation.  So the first test is to see if this
 652         // is what we need to do.
 653         //
 654         String stackElemName = stack.elem.getName();
 655         String elemName = elem.getName();
 656 
 657 
 658         if (!strict &&
 659             ((stackElemName.equals("table") && elemName.equals("td")) ||
 660              (stackElemName.equals("table") && elemName.equals("th")) ||
 661              (stackElemName.equals("tr") && !elemName.equals("tr")))){
 662              insertTag = true;
 663         }
 664 
 665 
 666         if (!strict && !insertTag && (stack.elem.getName() != elem.getName() ||
 667                                       elem.getName().equals("body"))) {
 668             if (skipTag = ignoreElement(elem)) {
 669                 error("tag.ignore", elem.getName());
 670                 return skipTag;
 671             }
 672         }
 673 
 674         // Check for anything after the start of the table besides tr, td, th
 675         // or caption, and if those aren't there, insert the <tr> and call
 676         // legalElementContext again.
 677         if (!strict && stackElemName.equals("table") &&
 678             !elemName.equals("tr") && !elemName.equals("td") &&
 679             !elemName.equals("th") && !elemName.equals("caption")) {
 680             Element e = dtd.getElement("tr");
 681             TagElement t = makeTag(e, true);
 682             legalTagContext(t);
 683             startTag(t);
 684             error("start.missing", elem.getName());
 685             return legalElementContext(elem);
 686         }
 687 
 688         // They try to find a legal context by checking if the current
 689         // tag is valid in an enclosing context.  If so
 690         // close out the tags by outputing end tags and then
 691         // insert the current tag.  If the tags that are
 692         // being closed out do not have an optional end tag
 693         // specification in the DTD then an html error is
 694         // reported.
 695         //
 696         if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) {
 697             for (TagStack s = stack.next ; s != null ; s = s.next) {
 698                 if (s.advance(elem)) {
 699                     while (stack != s) {
 700                         endTag(true);
 701                     }
 702                     return true;
 703                 }
 704                 if (!s.terminate() || (strict && !s.elem.omitEnd())) {
 705                     break;
 706                 }
 707             }
 708         }
 709 
 710         // Check if we know what tag is expected next.
 711         // If so insert the tag.  Report an error if the
 712         // tag does not have its start tag spec in the DTD as optional.
 713         //
 714         Element next = stack.first();
 715         if (next != null && (!strict || next.omitStart()) &&
 716            !(next==dtd.head && elem==dtd.pcdata) ) {
 717             // System.out.println("-- omitting start tag: " + next);
 718             TagElement t = makeTag(next, true);
 719             legalTagContext(t);
 720             startTag(t);
 721             if (!next.omitStart()) {
 722                 error("start.missing", elem.getName());
 723             }
 724             return legalElementContext(elem);
 725         }
 726 
 727 
 728         // Traverse the list of expected elements and determine if adding
 729         // any of these elements would make for a legal context.
 730         //
 731 
 732         if (!strict) {
 733             ContentModel content = stack.contentModel();
 734             Vector<Element> elemVec = new Vector<Element>();
 735             if (content != null) {
 736                 content.getElements(elemVec);
 737                 for (Element e : elemVec) {
 738                     // Ensure that this element has not been included as
 739                     // part of the exclusions in the DTD.
 740                     //
 741                     if (stack.excluded(e.getIndex())) {
 742                         continue;
 743                     }
 744 
 745                     boolean reqAtts = false;
 746 
 747                     for (AttributeList a = e.getAttributes(); a != null ; a = a.next) {
 748                         if (a.modifier == REQUIRED) {
 749                             reqAtts = true;
 750                             break;
 751                         }
 752                     }
 753                     // Ensure that no tag that has required attributes
 754                     // gets inserted.
 755                     //
 756                     if (reqAtts) {
 757                         continue;
 758                     }
 759 
 760                     ContentModel m = e.getContent();
 761                     if (m != null && m.first(elem)) {
 762                         // System.out.println("-- adding a legal tag: " + e);
 763                         TagElement t = makeTag(e, true);
 764                         legalTagContext(t);
 765                         startTag(t);
 766                         error("start.missing", e.getName());
 767                         return legalElementContext(elem);
 768                     }
 769                 }
 770             }
 771         }
 772 
 773         // Check if the stack can be terminated.  If so add the appropriate
 774         // end tag.  Report an error if the tag being ended does not have its
 775         // end tag spec in the DTD as optional.
 776         //
 777         if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) {
 778             // System.out.println("-- omitting end tag: " + stack.elem);
 779             if (!stack.elem.omitEnd()) {
 780                 error("end.missing", elem.getName());
 781             }
 782 
 783             endTag(true);
 784             return legalElementContext(elem);
 785         }
 786 
 787         // At this point we know that something is screwed up.
 788         return false;
 789     }
 790 
 791     /**
 792      * Create a legal context for a tag.
 793      */
 794     void legalTagContext(TagElement tag) throws ChangedCharSetException {
 795         if (legalElementContext(tag.getElement())) {
 796             markFirstTime(tag.getElement());
 797             return;
 798         }
 799 
 800         // Avoid putting a block tag in a flow tag.
 801         if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) {
 802             endTag(true);
 803             legalTagContext(tag);
 804             return;
 805         }
 806 
 807         // Avoid putting something wierd in the head of the document.
 808         for (TagStack s = stack ; s != null ; s = s.next) {
 809             if (s.tag.getElement() == dtd.head) {
 810                 while (stack != s) {
 811                     endTag(true);
 812                 }
 813                 endTag(true);
 814                 legalTagContext(tag);
 815                 return;
 816             }
 817         }
 818 
 819         // Everything failed
 820         error("tag.unexpected", tag.getElement().getName());
 821     }
 822 
 823     /**
 824      * Error context. Something went wrong, make sure we are in
 825      * the document's body context
 826      */
 827     void errorContext() throws ChangedCharSetException {
 828         for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) {
 829             handleEndTag(stack.tag);
 830         }
 831         if (stack == null) {
 832             legalElementContext(dtd.body);
 833             startTag(makeTag(dtd.body, true));
 834         }
 835     }
 836 
 837     /**
 838      * Add a char to the string buffer.
 839      */
 840     void addString(int c) {
 841         if (strpos  == str.length) {
 842             char newstr[] = new char[str.length + 128];
 843             System.arraycopy(str, 0, newstr, 0, str.length);
 844             str = newstr;
 845         }
 846         str[strpos++] = (char)c;
 847     }
 848 
 849     /**
 850      * Get the string that's been accumulated.
 851      */
 852     String getString(int pos) {
 853         char newStr[] = new char[strpos - pos];
 854         System.arraycopy(str, pos, newStr, 0, strpos - pos);
 855         strpos = pos;
 856         return new String(newStr);
 857     }
 858 
 859     char[] getChars(int pos) {
 860         char newStr[] = new char[strpos - pos];
 861         System.arraycopy(str, pos, newStr, 0, strpos - pos);
 862         strpos = pos;
 863         return newStr;
 864     }
 865 
 866     char[] getChars(int pos, int endPos) {
 867         char newStr[] = new char[endPos - pos];
 868         System.arraycopy(str, pos, newStr, 0, endPos - pos);
 869         // REMIND: it's not clear whether this version should set strpos or not
 870         // strpos = pos;
 871         return newStr;
 872     }
 873 
 874     void resetStrBuffer() {
 875         strpos = 0;
 876     }
 877 
 878     int strIndexOf(char target) {
 879         for (int i = 0; i < strpos; i++) {
 880             if (str[i] == target) {
 881                 return i;
 882             }
 883         }
 884 
 885         return -1;
 886     }
 887 
 888     /**
 889      * Skip space.
 890      * [5] 297:5
 891      */
 892     void skipSpace() throws IOException {
 893         while (true) {
 894             switch (ch) {
 895               case '\n':
 896                 ln++;
 897                 ch = readCh();
 898                 lfCount++;
 899                 break;
 900 
 901               case '\r':
 902                 ln++;
 903                 if ((ch = readCh()) == '\n') {
 904                     ch = readCh();
 905                     crlfCount++;
 906                 }
 907                 else {
 908                     crCount++;
 909                 }
 910                 break;
 911               case ' ':
 912               case '\t':
 913                 ch = readCh();
 914                 break;
 915 
 916               default:
 917                 return;
 918             }
 919         }
 920     }
 921 
 922     /**
 923      * Parse identifier. Uppercase characters are folded
 924      * to lowercase when lower is true. Returns falsed if
 925      * no identifier is found. [55] 346:17
 926      */
 927     boolean parseIdentifier(boolean lower) throws IOException {
 928         switch (ch) {
 929           case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 930           case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 931           case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 932           case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 933           case 'Y': case 'Z':
 934             if (lower) {
 935                 ch = 'a' + (ch - 'A');
 936             }
 937             break;
 938 
 939           case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 940           case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 941           case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 942           case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 943           case 'y': case 'z':
 944             break;
 945 
 946           default:
 947             return false;
 948         }
 949 
 950         while (true) {
 951             addString(ch);
 952 
 953             switch (ch = readCh()) {
 954               case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 955               case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 956               case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 957               case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 958               case 'Y': case 'Z':
 959                 if (lower) {
 960                     ch = 'a' + (ch - 'A');
 961                 }
 962                 break;
 963 
 964               case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 965               case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 966               case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 967               case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 968               case 'y': case 'z':
 969 
 970               case '0': case '1': case '2': case '3': case '4':
 971               case '5': case '6': case '7': case '8': case '9':
 972 
 973               case '.': case '-':
 974 
 975               case '_': // not officially allowed
 976                 break;
 977 
 978               default:
 979                 return true;
 980             }
 981         }
 982     }
 983 
 984     /**
 985      * Parse an entity reference. [59] 350:17
 986      */
 987     private char[] parseEntityReference() throws IOException {
 988         int pos = strpos;
 989 
 990         if ((ch = readCh()) == '#') {
 991             int n = 0;
 992             ch = readCh();
 993             if ((ch >= '0') && (ch <= '9') ||
 994                     ch == 'x' || ch == 'X') {
 995 
 996                 if ((ch >= '0') && (ch <= '9')) {
 997                     // parse decimal reference
 998                     while ((ch >= '0') && (ch <= '9')) {
 999                         n = (n * 10) + ch - '0';
1000                         ch = readCh();
1001                     }
1002                 } else {
1003                     // parse hexadecimal reference
1004                     ch = readCh();
1005                     char lch = (char) Character.toLowerCase(ch);
1006                     while ((lch >= '0') && (lch <= '9') ||
1007                             (lch >= 'a') && (lch <= 'f')) {
1008                         if (lch >= '0' && lch <= '9') {
1009                             n = (n * 16) + lch - '0';
1010                         } else {
1011                             n = (n * 16) + lch - 'a' + 10;
1012                         }
1013                         ch = readCh();
1014                         lch = (char) Character.toLowerCase(ch);
1015                     }
1016                 }
1017                 switch (ch) {
1018                     case '\n':
1019                         ln++;
1020                         ch = readCh();
1021                         lfCount++;
1022                         break;
1023 
1024                     case '\r':
1025                         ln++;
1026                         if ((ch = readCh()) == '\n') {
1027                             ch = readCh();
1028                             crlfCount++;
1029                         }
1030                         else {
1031                             crCount++;
1032                         }
1033                         break;
1034 
1035                     case ';':
1036                         ch = readCh();
1037                         break;
1038                 }
1039                 char data[] = mapNumericReference(n);
1040                 return data;
1041             }
1042             addString('#');
1043             if (!parseIdentifier(false)) {
1044                 error("ident.expected");
1045                 strpos = pos;
1046                 char data[] = {'&', '#'};
1047                 return data;
1048             }
1049         } else if (!parseIdentifier(false)) {
1050             char data[] = {'&'};
1051             return data;
1052         }
1053 
1054         boolean semicolon = false;
1055 
1056         switch (ch) {
1057           case '\n':
1058             ln++;
1059             ch = readCh();
1060             lfCount++;
1061             break;
1062 
1063           case '\r':
1064             ln++;
1065             if ((ch = readCh()) == '\n') {
1066                 ch = readCh();
1067                 crlfCount++;
1068             }
1069             else {
1070                 crCount++;
1071             }
1072             break;
1073 
1074           case ';':
1075             semicolon = true;
1076 
1077             ch = readCh();
1078             break;
1079         }
1080 
1081         String nm = getString(pos);
1082         Entity ent = dtd.getEntity(nm);
1083 
1084         // entities are case sensitive - however if strict
1085         // is false then we will try to make a match by
1086         // converting the string to all lowercase.
1087         //
1088         if (!strict && (ent == null)) {
1089             ent = dtd.getEntity(nm.toLowerCase());
1090         }
1091         if ((ent == null) || !ent.isGeneral()) {
1092 
1093             if (nm.length() == 0) {
1094                 error("invalid.entref", nm);
1095                 return new char[0];
1096             }
1097             /* given that there is not a match restore the entity reference */
1098             String str = "&" + nm + (semicolon ? ";" : "");
1099 
1100             char b[] = new char[str.length()];
1101             str.getChars(0, b.length, b, 0);
1102             return b;
1103         }
1104         return ent.getData();
1105     }
1106 
1107     /**
1108      * Converts numeric character reference to char array.
1109      *
1110      * Normally the code in a reference should be always converted
1111      * to the Unicode character with the same code, but due to
1112      * wide usage of Cp1252 charset most browsers map numeric references
1113      * in the range 130-159 (which are control chars in Unicode set)
1114      * to displayable characters with other codes.
1115      *
1116      * @param c the code of numeric character reference.
1117      * @return a char array corresponding to the reference code.
1118      */
1119     private char[] mapNumericReference(int c) {
1120         char[] data;
1121         if (c >= 0xffff) { // outside unicode BMP.
1122             try {
1123                 data = Character.toChars(c);
1124             } catch (IllegalArgumentException e) {
1125                 data = new char[0];
1126             }
1127         } else {
1128             data = new char[1];
1129             data[0] = (c < 130 || c > 159) ? (char) c : cp1252Map[c - 130];
1130         }
1131         return data;
1132     }
1133 
1134     /**
1135      * Parse a comment. [92] 391:7
1136      */
1137     void parseComment() throws IOException {
1138 
1139         while (true) {
1140             int c = ch;
1141             switch (c) {
1142               case '-':
1143                   /** Presuming that the start string of a comment "<!--" has
1144                       already been parsed, the '-' character is valid only as
1145                       part of a comment termination and further more it must
1146                       be present in even numbers. Hence if strict is true, we
1147                       presume the comment has been terminated and return.
1148                       However if strict is false, then there is no even number
1149                       requirement and this character can appear anywhere in the
1150                       comment.  The parser reads on until it sees the following
1151                       pattern: "-->" or "--!>".
1152                    **/
1153                 if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) {
1154                     if ((ch = readCh()) == '>') {
1155                         return;
1156                     }
1157                     if (ch == '!') {
1158                         if ((ch = readCh()) == '>') {
1159                             return;
1160                         } else {
1161                             /* to account for extra read()'s that happened */
1162                             addString('-');
1163                             addString('!');
1164                             continue;
1165                         }
1166                     }
1167                     break;
1168                 }
1169 
1170                 if ((ch = readCh()) == '-') {
1171                     ch = readCh();
1172                     if (strict || ch == '>') {
1173                         return;
1174                     }
1175                     if (ch == '!') {
1176                         if ((ch = readCh()) == '>') {
1177                             return;
1178                         } else {
1179                             /* to account for extra read()'s that happened */
1180                             addString('-');
1181                             addString('!');
1182                             continue;
1183                         }
1184                     }
1185                     /* to account for the extra read() */
1186                     addString('-');
1187                 }
1188                 break;
1189 
1190               case -1:
1191                   handleEOFInComment();
1192                   return;
1193 
1194               case '\n':
1195                 ln++;
1196                 ch = readCh();
1197                 lfCount++;
1198                 break;
1199 
1200               case '>':
1201                 ch = readCh();
1202                 break;
1203 
1204               case '\r':
1205                 ln++;
1206                 if ((ch = readCh()) == '\n') {
1207                     ch = readCh();
1208                     crlfCount++;
1209                 }
1210                 else {
1211                     crCount++;
1212                 }
1213                 c = '\n';
1214                 break;
1215               default:
1216                 ch = readCh();
1217                 break;
1218             }
1219 
1220             addString(c);
1221         }
1222     }
1223 
1224     /**
1225      * Parse literal content. [46] 343:1 and [47] 344:1
1226      */
1227     void parseLiteral(boolean replace) throws IOException {
1228         while (true) {
1229             int c = ch;
1230             switch (c) {
1231               case -1:
1232                 error("eof.literal", stack.elem.getName());
1233                 endTag(true);
1234                 return;
1235 
1236               case '>':
1237                 ch = readCh();
1238                 int i = textpos - (stack.elem.name.length() + 2), j = 0;
1239 
1240                 // match end tag
1241                 if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) {
1242                     while ((++i < textpos) &&
1243                            (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++)));
1244                     if (i == textpos) {
1245                         textpos -= (stack.elem.name.length() + 2);
1246                         if ((textpos > 0) && (text[textpos-1] == '\n')) {
1247                             textpos--;
1248                         }
1249                         endTag(false);
1250                         return;
1251                     }
1252                 }
1253                 break;
1254 
1255               case '&':
1256                 char data[] = parseEntityReference();
1257                 if (textpos + data.length > text.length) {
1258                     char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
1259                     System.arraycopy(text, 0, newtext, 0, text.length);
1260                     text = newtext;
1261                 }
1262                 System.arraycopy(data, 0, text, textpos, data.length);
1263                 textpos += data.length;
1264                 continue;
1265 
1266               case '\n':
1267                 ln++;
1268                 ch = readCh();
1269                 lfCount++;
1270                 break;
1271 
1272               case '\r':
1273                 ln++;
1274                 if ((ch = readCh()) == '\n') {
1275                     ch = readCh();
1276                     crlfCount++;
1277                 }
1278                 else {
1279                     crCount++;
1280                 }
1281                 c = '\n';
1282                 break;
1283               default:
1284                 ch = readCh();
1285                 break;
1286             }
1287 
1288             // output character
1289             if (textpos == text.length) {
1290                 char newtext[] = new char[text.length + 128];
1291                 System.arraycopy(text, 0, newtext, 0, text.length);
1292                 text = newtext;
1293             }
1294             text[textpos++] = (char)c;
1295         }
1296     }
1297 
1298     /**
1299      * Parse attribute value. [33] 331:1
1300      */
1301     @SuppressWarnings("fallthrough")
1302     String parseAttributeValue(boolean lower) throws IOException {
1303         int delim = -1;
1304 
1305         // Check for a delimiter
1306         switch(ch) {
1307           case '\'':
1308           case '"':
1309             delim = ch;
1310             ch = readCh();
1311             break;
1312         }
1313 
1314         // Parse the rest of the value
1315         while (true) {
1316             int c = ch;
1317 
1318             switch (c) {
1319               case '\n':
1320                 ln++;
1321                 ch = readCh();
1322                 lfCount++;
1323                 if (delim < 0) {
1324                     return getString(0);
1325                 }
1326                 break;
1327 
1328               case '\r':
1329                 ln++;
1330 
1331                 if ((ch = readCh()) == '\n') {
1332                     ch = readCh();
1333                     crlfCount++;
1334                 }
1335                 else {
1336                     crCount++;
1337                 }
1338                 if (delim < 0) {
1339                     return getString(0);
1340                 }
1341                 break;
1342 
1343               case '\t':
1344                   if (delim < 0)
1345                       c = ' ';
1346                   // Fall through
1347               case ' ':
1348                 ch = readCh();
1349                 if (delim < 0) {
1350                     return getString(0);
1351                 }
1352                 break;
1353 
1354               case '>':
1355               case '<':
1356                 if (delim < 0) {
1357                     return getString(0);
1358                 }
1359                 ch = readCh();
1360                 break;
1361 
1362               case '\'':
1363               case '"':
1364                 ch = readCh();
1365                 if (c == delim) {
1366                     return getString(0);
1367                 } else if (delim == -1) {
1368                     error("attvalerr");
1369                     if (strict || ch == ' ') {
1370                         return getString(0);
1371                     } else {
1372                         continue;
1373                     }
1374                 }
1375                 break;
1376 
1377             case '=':
1378                 if (delim < 0) {
1379                     /* In SGML a construct like <img src=/cgi-bin/foo?x=1>
1380                        is considered invalid since an = sign can only be contained
1381                        in an attributes value if the string is quoted.
1382                        */
1383                     error("attvalerr");
1384                     /* If strict is true then we return with the string we have thus far.
1385                        Otherwise we accept the = sign as part of the attribute's value and
1386                        process the rest of the img tag. */
1387                     if (strict) {
1388                         return getString(0);
1389                     }
1390                 }
1391                 ch = readCh();
1392                 break;
1393 
1394               case '&':
1395                 if (strict && delim < 0) {
1396                     ch = readCh();
1397                     break;
1398                 }
1399 
1400                 char data[] = parseEntityReference();
1401                 for (int i = 0 ; i < data.length ; i++) {
1402                     c = data[i];
1403                     addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c);
1404                 }
1405                 continue;
1406 
1407               case -1:
1408                 return getString(0);
1409 
1410               default:
1411                 if (lower && (c >= 'A') && (c <= 'Z')) {
1412                     c = 'a' + c - 'A';
1413                 }
1414                 ch = readCh();
1415                 break;
1416             }
1417             addString(c);
1418         }
1419     }
1420 
1421 
1422     /**
1423      * Parse attribute specification List. [31] 327:17
1424      */
1425     void parseAttributeSpecificationList(Element elem) throws IOException {
1426 
1427         while (true) {
1428             skipSpace();
1429 
1430             switch (ch) {
1431               case '/':
1432               case '>':
1433               case '<':
1434               case -1:
1435                 return;
1436 
1437               case '-':
1438                 if ((ch = readCh()) == '-') {
1439                     ch = readCh();
1440                     parseComment();
1441                     strpos = 0;
1442                 } else {
1443                     error("invalid.tagchar", "-", elem.getName());
1444                     ch = readCh();
1445                 }
1446                 continue;
1447             }
1448 
1449             AttributeList att;
1450             String attname;
1451             String attvalue;
1452 
1453             if (parseIdentifier(true)) {
1454                 attname = getString(0);
1455                 skipSpace();
1456                 if (ch == '=') {
1457                     ch = readCh();
1458                     skipSpace();
1459                     att = elem.getAttribute(attname);
1460 //  Bug ID 4102750
1461 //  Load the NAME of an Attribute Case Sensitive
1462 //  The case of the NAME  must be intact
1463 //  MG 021898
1464                     attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME));
1465 //                  attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION));
1466                 } else {
1467                     attvalue = attname;
1468                     att = elem.getAttributeByValue(attvalue);
1469                     if (att == null) {
1470                         att = elem.getAttribute(attname);
1471                         if (att != null) {
1472                             attvalue = att.getValue();
1473                         }
1474                         else {
1475                             // Make it null so that NULL_ATTRIBUTE_VALUE is
1476                             // used
1477                             attvalue = null;
1478                         }
1479                     }
1480                 }
1481             } else if (!strict && ch == ',') { // allows for comma separated attribute-value pairs
1482                 ch = readCh();
1483                 continue;
1484             } else if (!strict && ch == '"') { // allows for quoted attributes
1485                 ch = readCh();
1486                 skipSpace();
1487                 if (parseIdentifier(true)) {
1488                     attname = getString(0);
1489                     if (ch == '"') {
1490                         ch = readCh();
1491                     }
1492                     skipSpace();
1493                     if (ch == '=') {
1494                         ch = readCh();
1495                         skipSpace();
1496                         att = elem.getAttribute(attname);
1497                         attvalue = parseAttributeValue((att != null) &&
1498                                                 (att.type != CDATA) &&
1499                                                 (att.type != NOTATION));
1500                     } else {
1501                         attvalue = attname;
1502                         att = elem.getAttributeByValue(attvalue);
1503                         if (att == null) {
1504                             att = elem.getAttribute(attname);
1505                             if (att != null) {
1506                                 attvalue = att.getValue();
1507                             }
1508                         }
1509                     }
1510                 } else {
1511                     char str[] = {(char)ch};
1512                     error("invalid.tagchar", new String(str), elem.getName());
1513                     ch = readCh();
1514                     continue;
1515                 }
1516             } else if (!strict && (attributes.isEmpty()) && (ch == '=')) {
1517                 ch = readCh();
1518                 skipSpace();
1519                 attname = elem.getName();
1520                 att = elem.getAttribute(attname);
1521                 attvalue = parseAttributeValue((att != null) &&
1522                                                (att.type != CDATA) &&
1523                                                (att.type != NOTATION));
1524             } else if (!strict && (ch == '=')) {
1525                 ch = readCh();
1526                 skipSpace();
1527                 attvalue = parseAttributeValue(true);
1528                 error("attvalerr");
1529                 return;
1530             } else {
1531                 char str[] = {(char)ch};
1532                 error("invalid.tagchar", new String(str), elem.getName());
1533                 if (!strict) {
1534                     ch = readCh();
1535                     continue;
1536                 } else {
1537                     return;
1538                 }
1539             }
1540 
1541             if (att != null) {
1542                 attname = att.getName();
1543             } else {
1544                 error("invalid.tagatt", attname, elem.getName());
1545             }
1546 
1547             // Check out the value
1548             if (attributes.isDefined(attname)) {
1549                 error("multi.tagatt", attname, elem.getName());
1550             }
1551             if (attvalue == null) {
1552                 attvalue = ((att != null) && (att.value != null)) ? att.value :
1553                     HTML.NULL_ATTRIBUTE_VALUE;
1554             } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) {
1555                 error("invalid.tagattval", attname, elem.getName());
1556             }
1557             HTML.Attribute attkey = HTML.getAttributeKey(attname);
1558             if (attkey == null) {
1559                 attributes.addAttribute(attname, attvalue);
1560             } else {
1561                 attributes.addAttribute(attkey, attvalue);
1562             }
1563         }
1564     }
1565 
1566     /**
1567      * Parses the Document Type Declaration markup declaration.
1568      * Currently ignores it.
1569      *
1570      * @return the string representation of the markup declaration
1571      * @throws IOException if an I/O error occurs
1572      */
1573     public String parseDTDMarkup() throws IOException {
1574 
1575         StringBuilder strBuff = new StringBuilder();
1576         ch = readCh();
1577         while(true) {
1578             switch (ch) {
1579             case '>':
1580                 ch = readCh();
1581                 return strBuff.toString();
1582             case -1:
1583                 error("invalid.markup");
1584                 return strBuff.toString();
1585             case '\n':
1586                 ln++;
1587                 ch = readCh();
1588                 lfCount++;
1589                 break;
1590             case '"':
1591                 ch = readCh();
1592                 break;
1593             case '\r':
1594                 ln++;
1595                 if ((ch = readCh()) == '\n') {
1596                     ch = readCh();
1597                     crlfCount++;
1598                 }
1599                 else {
1600                     crCount++;
1601                 }
1602                 break;
1603             default:
1604                 strBuff.append((char)(ch & 0xFF));
1605                 ch = readCh();
1606                 break;
1607             }
1608         }
1609     }
1610 
1611     /**
1612      * Parse markup declarations.
1613      * Currently only handles the Document Type Declaration markup.
1614      * Returns true if it is a markup declaration false otherwise.
1615      *
1616      * @param strBuff  the markup declaration
1617      * @return {@code true} if this is a valid markup declaration;
1618      *         otherwise {@code false}
1619      * @throws IOException if an I/O error occurs
1620      */
1621     protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException {
1622 
1623         /* Currently handles only the DOCTYPE */
1624         if ((strBuff.length() == "DOCTYPE".length()) &&
1625             (strBuff.toString().toUpperCase().equals("DOCTYPE"))) {
1626             parseDTDMarkup();
1627             return true;
1628         }
1629         return false;
1630     }
1631 
1632     /**
1633      * Parse an invalid tag.
1634      */
1635     void parseInvalidTag() throws IOException {
1636         // ignore all data upto the close bracket '>'
1637         while (true) {
1638             skipSpace();
1639             switch (ch) {
1640               case '>':
1641               case -1:
1642                   ch = readCh();
1643                 return;
1644               case '<':
1645                   return;
1646               default:
1647                   ch = readCh();
1648 
1649             }
1650         }
1651     }
1652 
    /**
     * Parse a start or end tag.
     * <p>
     * The method begins by reading the next character into {@code ch}
     * and dispatches on it:
     * <ul>
     * <li>{@code '!'} followed by {@code '-'}: one or more comments,
     *     each reported through {@code handleComment}; any other
     *     character after {@code '!'} is handed to
     *     {@code parseMarkupDeclarations}.</li>
     * <li>{@code '/'}: an end tag, including the short forms
     *     {@code </>} and {@code </<} which close {@code recent}.</li>
     * <li>{@code -1}: end of input, reported as the "eof" error.</li>
     * <li>anything else: a start tag, followed by its attribute
     *     specification list.</li>
     * </ul>
     *
     * @throws IOException if reading from the underlying input fails
     */
    @SuppressWarnings("fallthrough")
    void parseTag() throws IOException {
        Element elem;
        // true when the start tag ended with '/' (null end tag form)
        boolean net = false;
        // ensures the invalid-comment-character error is reported only once
        boolean warned = false;
        // true when the tag name is not defined in the DTD
        boolean unknown = false;

        switch (ch = readCh()) {
          case '!':
            switch (ch = readCh()) {
              case '-':
                // Parse comment. [92] 391:7
                while (true) {
                    if (ch == '-') {
                        // In strict mode a second '-' is required to open
                        // a comment; otherwise a single '-' is accepted.
                        if (!strict || ((ch = readCh()) == '-')) {
                            ch = readCh();
                            // In non-strict mode, swallow the second '-'
                            // of "--" if it is present.
                            if (!strict && ch == '-') {
                                ch = readCh();
                            }
                            // send over any text you might see
                            // before parsing and sending the
                            // comment
                            if (textpos != 0) {
                                char newtext[] = new char[textpos];
                                System.arraycopy(text, 0, newtext, 0, textpos);
                                handleText(newtext);
                                lastBlockStartPos = currentBlockStartPos;
                                textpos = 0;
                            }
                            parseComment();
                            last = makeTag(dtd.getElement("comment"), true);
                            handleComment(getChars(0));
                            // Loop again: another comment may follow
                            // inside the same declaration.
                            continue;
                        } else if (!warned) {
                            warned = true;
                            error("invalid.commentchar", "-");
                        }
                    }
                    skipSpace();
                    switch (ch) {
                      case '-':
                        // start of another comment
                        continue;
                      case '>':
                        // end of the comment declaration
                        ch = readCh();
                        return;
                      case -1:
                        // end of input inside the declaration
                        return;
                      default:
                        // NOTE(review): ch is advanced before the message
                        // is built, so the character reported is the one
                        // following the offending one -- confirm intended.
                        ch = readCh();
                        if (!warned) {
                            warned = true;
                            error("invalid.commentchar",
                                  String.valueOf((char)ch));
                        }
                        break;
                    }
                }

              default:
                // deal with marked sections
                StringBuffer strBuff = new StringBuffer();
                while (true) {
                    strBuff.append((char)ch);
                    // Stop as soon as the accumulated text is recognized
                    // and consumed as a markup declaration.
                    if (parseMarkupDeclarations(strBuff)) {
                        return;
                    }
                    switch(ch) {
                      case '>':
                        ch = readCh();
                        // Fall through
                      case -1:
                        // declaration ended without being recognized
                        error("invalid.markup");
                        return;
                      case '\n':
                        ln++;
                        ch = readCh();
                        lfCount++;
                        break;
                      case '\r':
                        ln++;
                        // Treat CR LF as a single line terminator.
                        if ((ch = readCh()) == '\n') {
                            ch = readCh();
                            crlfCount++;
                        }
                        else {
                            crCount++;
                        }
                        break;

                      default:
                        ch = readCh();
                        break;
                    }
                }
            }

          case '/':
            // parse end tag [19] 317:4
            switch (ch = readCh()) {
              case '>':
                ch = readCh();
                // Fall through
              case '<':
                // empty end tag. either </> or </<
                // It closes the most recently seen element, if any.
                if (recent == null) {
                    error("invalid.shortend");
                    return;
                }
                elem = recent;
                break;

              default:
                if (!parseIdentifier(true)) {
                    error("expected.endtagname");
                    return;
                }
                skipSpace();
                switch (ch) {
                  case '>':
                    ch = readCh();
                    break;
                  case '<':
                    // the next tag starts immediately; leave '<' in ch
                    break;

                  default:
                    // Recover by skipping to the end of the tag, the end
                    // of the line, or the end of input.
                    error("expected", "'>'");
                    while ((ch != -1) && (ch != '\n') && (ch != '>')) {
                        ch = readCh();
                    }
                    if (ch == '>') {
                        ch = readCh();
                    }
                    break;
                }
                String elemStr = getString(0);
                if (!dtd.elementExists(elemStr)) {
                    error("end.unrecognized", elemStr);
                    // Ignore RE before end tag
                    if ((textpos > 0) && (text[textpos-1] == '\n')) {
                        textpos--;
                    }
                    // The DTD's "unknown" element is reused and renamed
                    // to carry the unrecognized tag name.
                    elem = dtd.getElement("unknown");
                    elem.name = elemStr;
                    unknown = true;
                } else {
                    elem = dtd.getElement(elemStr);
                }
                break;
            }


            // If the stack is null, we're seeing end tags without any begin
            // tags.  Ignore them.

            if (stack == null) {
                error("end.extra.tag", elem.getName());
                return;
            }

            // Ignore RE before end tag
            if ((textpos > 0) && (text[textpos-1] == '\n')) {
                // In a pre tag, if there are blank lines
                // we do not want to remove the newline
                // before the end tag.  Hence this code.
                //
                if (stack.pre) {
                    if ((textpos > 1) && (text[textpos-2] != '\n')) {
                        textpos--;
                    }
                } else {
                    textpos--;
                }
            }

            // If the end tag is a form, since we did not put it
            // on the tag stack, there is no corresponding
            // start tag to find. Hence do not touch the tag stack.
            //

            /*
            if (!strict && elem.getName().equals("form")) {
                if (lastFormSent != null) {
                    handleEndTag(lastFormSent);
                    return;
                } else {
                    // do nothing.
                    return;
                }
            }
            */

            if (unknown) {
                // we will not see a corresponding start tag
                // on the stack.  If we are seeing an
                // end tag, lets send this on as an empty
                // tag with the end tag attribute set to
                // true.
                TagElement t = makeTag(elem);
                handleText(t);
                attributes.addAttribute(HTML.Attribute.ENDTAG, "true");
                handleEmptyTag(makeTag(elem));
                unknown = false;
                return;
            }

            // find the corresponding start tag

            // A commonly occurring error appears to be the insertion
            // of extra end tags in a table.  The intent here is ignore
            // such extra end tags.
            //
            if (!strict) {
                String stackElem = stack.elem.getName();

                if (stackElem.equals("table")) {
                    // If it is not a valid end tag ignore it and return
                    //
                    if (!elem.getName().equals(stackElem)) {
                        error("tag.ignore", elem.getName());
                        return;
                    }
                }



                // Inside a row or cell, only the matching end tag or
                // </table> is honoured; anything else is ignored.
                if (stackElem.equals("tr") ||
                    stackElem.equals("td")) {
                    if ((!elem.getName().equals("table")) &&
                        (!elem.getName().equals(stackElem))) {
                        error("tag.ignore", elem.getName());
                        return;
                    }
                }
            }
            // Walk down the tag stack looking for the element being closed.
            TagStack sp = stack;

            while ((sp != null) && (elem != sp.elem)) {
                sp = sp.next;
            }
            if (sp == null) {
                error("unmatched.endtag", elem.getName());
                return;
            }

            // People put font ending tags in the darndest places.
            // Don't close other contexts based on them being between
            // a font tag and the corresponding end tag.  Instead,
            // ignore the end tag like it doesn't exist and allow the end
            // of the document to close us out.
            String elemName = elem.getName();
            if (stack != sp &&
                (elemName.equals("font") ||
                 elemName.equals("center"))) {

                // Since closing out a center tag can have real weird
                // effects on the formatting,  make sure that tags
                // for which omitting an end tag is legitimate
                // get closed out.
                //
                if (elemName.equals("center")) {
                    while(stack.elem.omitEnd() && stack != sp) {
                        endTag(true);
                    }
                    // Close the center element itself only if it is now
                    // on top of the stack.
                    if (stack.elem == elem) {
                        endTag(false);
                    }
                }
                return;
            }
            // People do the same thing with center tags.  In this
            // case we would like to close off the center tag but
            // not necessarily all enclosing tags.



            // end tags
            // Close every element above the match as an omitted end tag,
            // then close the matched element itself.
            while (stack != sp) {
                endTag(true);
            }

            endTag(false);
            return;

          case -1:
            error("eof");
            return;
        }

        // start tag [14] 314:1
        if (!parseIdentifier(true)) {
            // No tag name: only "<>" is accepted, and it reuses the most
            // recently seen element.
            elem = recent;
            if ((ch != '>') || (elem == null)) {
                error("expected.tagname");
                return;
            }
        } else {
            String elemStr = getString(0);

            // "image" is accepted as an alias for "img"
            if (elemStr.equals("image")) {
                elemStr = "img";
            }

            /* determine if this element is part of the dtd. */

            if (!dtd.elementExists(elemStr)) {
                //              parseInvalidTag();
                // NOTE(review): this error key carries a trailing space,
                // unlike the other keys in this method -- confirm whether
                // that is intentional.
                error("tag.unrecognized ", elemStr);
                elem = dtd.getElement("unknown");
                elem.name = elemStr;
                unknown = true;
            } else {
                elem = dtd.getElement(elemStr);
            }
        }

        // Parse attributes
        parseAttributeSpecificationList(elem);

        switch (ch) {
          case '/':
            net = true;
            // Fall through
          case '>':
            ch = readCh();
            // For "/>" also consume the '>' that follows the '/'.
            if (ch == '>' && net) {
                ch = readCh();
            }
            // Fall through
          case '<':
            break;

          default:
            error("expected", "'>'");
            break;
        }

        if (!strict) {
          if (elem.getName().equals("script")) {
            error("javascript.unsupported");
          }
        }

        // ignore RE after start tag
        //
        if (!elem.isEmpty())  {
            if (ch == '\n') {
                ln++;
                lfCount++;
                ch = readCh();
            } else if (ch == '\r') {
                ln++;
                if ((ch = readCh()) == '\n') {
                    ch = readCh();
                    crlfCount++;
                }
                else {
                    crCount++;
                }
            }
        }

        // ensure a legal context for the tag
        TagElement tag = makeTag(elem, false);


        /** In dealing with forms, we have decided to treat
            them as legal in any context.  Also, even though
            they do have a start and an end tag, we will
            not put this tag on the stack.  This is to deal
            several pages in the web oasis that choose to
            start and end forms in any possible location. **/

        /*
        if (!strict && elem.getName().equals("form")) {
            if (lastFormSent == null) {
                lastFormSent = tag;
            } else {
                handleEndTag(lastFormSent);
                lastFormSent = tag;
            }
        } else {
        */
            // Similarly, if a tag is unknown, we will apply
            // no legalTagContext logic to it.
            //
            if (!unknown) {
                legalTagContext(tag);

                // If skip tag is true,  this implies that
                // the tag was illegal and that the error
                // recovery strategy adopted is to ignore
                // the tag.
                if (!strict && skipTag) {
                    skipTag = false;
                    return;
                }
            }
            /*
        }
            */

        startTag(tag);

        if (!elem.isEmpty()) {
            // CDATA and RCDATA content is consumed here as a literal;
            // for any other content type the '/' (net) flag is recorded
            // on the new stack entry.
            switch (elem.getType()) {
              case CDATA:
                parseLiteral(false);
                break;
              case RCDATA:
                parseLiteral(true);
                break;
              default:
                if (stack != null) {
                    stack.net = net;
                }
                break;
            }
        }
    }
2074 
    // Delimiters of an HTML comment; used while scanning script content
    // and when stripping a comment wrapper from the script text.
    private static final String START_COMMENT = "<!--";
    private static final String END_COMMENT = "-->";
    // The script end tag in all-lower and all-upper case. parseScript
    // compares each input character against both arrays position by position.
    private static final char[] SCRIPT_END_TAG = "</script>".toCharArray();
    private static final char[] SCRIPT_END_TAG_UPPER_CASE =
                                        "</SCRIPT>".toCharArray();
2080 
2081     void parseScript() throws IOException {
2082         char[] charsToAdd = new char[SCRIPT_END_TAG.length];
2083         boolean insideComment = false;
2084 
2085         /* Here, ch should be the first character after <script> */
2086         while (true) {
2087             int i = 0;
2088             while (!insideComment && i < SCRIPT_END_TAG.length
2089                        && (SCRIPT_END_TAG[i] == ch
2090                            || SCRIPT_END_TAG_UPPER_CASE[i] == ch)) {
2091                 charsToAdd[i] = (char) ch;
2092                 ch = readCh();
2093                 i++;
2094             }
2095             if (i == SCRIPT_END_TAG.length) {
2096 
2097                 /*  '</script>' tag detected */
2098                 /* Here, ch == the first character after </script> */
2099                 return;
2100             } else {
2101 
2102                 /* To account for extra read()'s that happened */
2103                 for (int j = 0; j < i; j++) {
2104                     addString(charsToAdd[j]);
2105                 }
2106 
2107                 switch (ch) {
2108                 case -1:
2109                     error("eof.script");
2110                     return;
2111                 case '\n':
2112                     ln++;
2113                     ch = readCh();
2114                     lfCount++;
2115                     addString('\n');
2116                     break;
2117                 case '\r':
2118                     ln++;
2119                     if ((ch = readCh()) == '\n') {
2120                         ch = readCh();
2121                         crlfCount++;
2122                     } else {
2123                         crCount++;
2124                     }
2125                     addString('\n');
2126                     break;
2127                 default:
2128                     addString(ch);
2129                     String str = new String(getChars(0, strpos));
2130                     if (!insideComment && str.endsWith(START_COMMENT)) {
2131                         insideComment = true;
2132                     }
2133                     if (insideComment && str.endsWith(END_COMMENT)) {
2134                         insideComment = false;
2135                     }
2136                     ch = readCh();
2137                     break;
2138                 } // switch
2139             }
2140         } // while
2141     }
2142 
2143     /**
2144      * Parse Content. [24] 320:1
2145      */
2146     void parseContent() throws IOException {
2147         Thread curThread = Thread.currentThread();
2148 
2149         for (;;) {
2150             if (curThread.isInterrupted()) {
2151                 curThread.interrupt(); // resignal the interrupt
2152                 break;
2153             }
2154 
2155             int c = ch;
2156             currentBlockStartPos = currentPosition;
2157 
2158             if (recent == dtd.script) { // means: if after starting <script> tag
2159 
2160                 /* Here, ch has to be the first character after <script> */
2161                 parseScript();
2162                 last = makeTag(dtd.getElement("comment"), true);
2163 
2164                 /* Remove leading and trailing HTML comment declarations */
2165                 String str = new String(getChars(0)).trim();
2166                 int minLength = START_COMMENT.length() + END_COMMENT.length();
2167                 if (str.startsWith(START_COMMENT) && str.endsWith(END_COMMENT)
2168                        && str.length() >= (minLength)) {
2169                     str = str.substring(START_COMMENT.length(),
2170                                       str.length() - END_COMMENT.length());
2171                 }
2172 
2173                 /* Handle resulting chars as comment */
2174                 handleComment(str.toCharArray());
2175                 endTag(false);
2176                 lastBlockStartPos = currentPosition;
2177 
2178                 continue;
2179             } else {
2180                 switch (c) {
2181                   case '<':
2182                     parseTag();
2183                     lastBlockStartPos = currentPosition;
2184                     continue;
2185 
2186                   case '/':
2187                     ch = readCh();
2188                     if ((stack != null) && stack.net) {
2189                         // null end tag.
2190                         endTag(false);
2191                         continue;
2192                     } else if (textpos == 0) {
2193                         if (!legalElementContext(dtd.pcdata)) {
2194                             error("unexpected.pcdata");
2195                         }
2196                         if (last.breaksFlow()) {
2197                             space = false;
2198                         }
2199                     }
2200                     break;
2201 
2202                   case -1:
2203                     return;
2204 
2205                   case '&':
2206                     if (textpos == 0) {
2207                         if (!legalElementContext(dtd.pcdata)) {
2208                             error("unexpected.pcdata");
2209                         }
2210                         if (last.breaksFlow()) {
2211                             space = false;
2212                         }
2213                     }
2214                     char data[] = parseEntityReference();
2215                     if (textpos + data.length + 1 > text.length) {
2216                         char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
2217                         System.arraycopy(text, 0, newtext, 0, text.length);
2218                         text = newtext;
2219                     }
2220                     if (space) {
2221                         space = false;
2222                         text[textpos++] = ' ';
2223                     }
2224                     System.arraycopy(data, 0, text, textpos, data.length);
2225                     textpos += data.length;
2226                     ignoreSpace = false;
2227                     continue;
2228 
2229                   case '\n':
2230                     ln++;
2231                     lfCount++;
2232                     ch = readCh();
2233                     if ((stack != null) && stack.pre) {
2234                         break;
2235                     }
2236                     if (textpos == 0) {
2237                         lastBlockStartPos = currentPosition;
2238                     }
2239                     if (!ignoreSpace) {
2240                         space = true;
2241                     }
2242                     continue;
2243 
2244                   case '\r':
2245                     ln++;
2246                     c = '\n';
2247                     if ((ch = readCh()) == '\n') {
2248                         ch = readCh();
2249                         crlfCount++;
2250                     }
2251                     else {
2252                         crCount++;
2253                     }
2254                     if ((stack != null) && stack.pre) {
2255                         break;
2256                     }
2257                     if (textpos == 0) {
2258                         lastBlockStartPos = currentPosition;
2259                     }
2260                     if (!ignoreSpace) {
2261                         space = true;
2262                     }
2263                     continue;
2264 
2265 
2266                   case '\t':
2267                   case ' ':
2268                     ch = readCh();
2269                     if ((stack != null) && stack.pre) {
2270                         break;
2271                     }
2272                     if (textpos == 0) {
2273                         lastBlockStartPos = currentPosition;
2274                     }
2275                     if (!ignoreSpace) {
2276                         space = true;
2277                     }
2278                     continue;
2279 
2280                   default:
2281                     if (textpos == 0) {
2282                         if (!legalElementContext(dtd.pcdata)) {
2283                             error("unexpected.pcdata");
2284                         }
2285                         if (last.breaksFlow()) {
2286                             space = false;
2287                         }
2288                     }
2289                     ch = readCh();
2290                     break;
2291                 }
2292             }
2293 
2294             // enlarge buffer if needed
2295             if (textpos + 2 > text.length) {
2296                 char newtext[] = new char[text.length + 128];
2297                 System.arraycopy(text, 0, newtext, 0, text.length);
2298                 text = newtext;
2299             }
2300 
2301             // output pending space
2302             if (space) {
2303                 if (textpos == 0) {
2304                     lastBlockStartPos--;
2305                 }
2306                 text[textpos++] = ' ';
2307                 space = false;
2308             }
2309             text[textpos++] = (char)c;
2310             ignoreSpace = false;
2311         }
2312     }
2313 
2314     /**
2315      * Returns the end of line string. This will return the end of line
2316      * string that has been encountered the most, one of \r, \n or \r\n.
2317      */
2318     String getEndOfLineString() {
2319         if (crlfCount >= crCount) {
2320             if (lfCount >= crlfCount) {
2321                 return "\n";
2322             }
2323             else {
2324                 return "\r\n";
2325             }
2326         }
2327         else {
2328             if (crCount > lfCount) {
2329                 return "\r";
2330             }
2331             else {
2332                 return "\n";
2333             }
2334         }
2335     }
2336 
2337     /**
2338      * Parse an HTML stream, given a DTD.
2339      *
2340      * @param in  the reader to read the source from
2341      * @throws IOException if an I/O error occurs
2342      */
2343     public synchronized void parse(Reader in) throws IOException {
2344         this.in = in;
2345 
2346         this.ln = 1;
2347 
2348         seenHtml = false;
2349         seenHead = false;
2350         seenBody = false;
2351 
2352         crCount = lfCount = crlfCount = 0;
2353 
2354         try {
2355             ch = readCh();
2356             text = new char[1024];
2357             str = new char[128];
2358 
2359             parseContent();
2360             // NOTE: interruption may have occurred.  Control flows out
2361             // of here normally.
2362             while (stack != null) {
2363                 endTag(true);
2364             }
2365             in.close();
2366         } catch (IOException e) {
2367             errorContext();
2368             error("ioexception");
2369             throw e;
2370         } catch (Exception e) {
2371             errorContext();
2372             error("exception", e.getClass().getName(), e.getMessage());
2373             e.printStackTrace();
2374         } catch (ThreadDeath e) {
2375             errorContext();
2376             error("terminated");
2377             e.printStackTrace();
2378             throw e;
2379         } finally {
2380             for (; stack != null ; stack = stack.next) {
2381                 handleEndTag(stack.tag);
2382             }
2383 
2384             text = null;
2385             str = null;
2386         }
2387 
2388     }
2389 
2390 
    /*
     * Input cache.  This is much faster than calling down to a synchronized
     * method of BufferedReader for each byte.  Measurements done 5/30/97
     * show that there's no point in having a bigger buffer:  Increasing
     * the buffer to 8192 had no measurable impact for a program discarding
     * one character at a time (reading from an http URL to a local machine).
     * NOTE: If the current encoding is bogus, and we read too much
     * (past the content-type) we may suffer a MalformedInputException. For
     * this reason the initial size is 1 and when the body is encountered the
     * size is adjusted to 256.
     */
    private char buf[] = new char[1];
    // Index in buf of the next character readCh() will return.
    private int pos;
    // Result of the most recent in.read(buf) call in readCh().
    private int len;
    /*
        Tracks the position relative to the beginning of the
        document; incremented by readCh() for every character
        it returns.
    */
    private int currentPosition;
2410 
2411 
2412     private final int readCh() throws IOException {
2413 
2414         if (pos >= len) {
2415 
2416             // This loop allows us to ignore interrupts if the flag
2417             // says so
2418             for (;;) {
2419                 try {
2420                     len = in.read(buf);
2421                     break;
2422                 } catch (InterruptedIOException ex) {
2423                     throw ex;
2424                 }
2425             }
2426 
2427             if (len <= 0) {
2428                 return -1;      // eof
2429             }
2430             pos = 0;
2431         }
2432         ++currentPosition;
2433 
2434         return buf[pos++];
2435     }
2436 
2437 
2438     /**
2439      * Returns the current position.
2440      *
2441      * @return the current position
2442      */
2443     protected int getCurrentPos() {
2444         return currentPosition;
2445     }
2446 }