/* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ package javax.swing.text.html.parser; import javax.swing.text.SimpleAttributeSet; import javax.swing.text.html.HTML; import javax.swing.text.ChangedCharSetException; import java.io.*; import java.util.Hashtable; import java.util.Properties; import java.util.Vector; import java.util.Enumeration; import java.net.URL; import sun.misc.MessageUtils; /** * A simple DTD-driven HTML parser. The parser reads an * HTML file from an InputStream and calls various methods * (which should be overridden in a subclass) when tags and * data are encountered. *

* Unfortunately there are many badly implemented HTML parsers * out there, and as a result there are many badly formatted * HTML files. This parser attempts to parse most HTML files. * This means that the implementation sometimes deviates from * the SGML specification in favor of HTML. *

* The parser treats \r and \r\n as \n. Newlines after starttags * and before end tags are ignored just as specified in the SGML/HTML * specification. *

* The html spec does not specify how spaces are to be coalesced very well. * Specifically, the following scenarios are not discussed (note that a * space should be used here, but I am using &nbsp to force the space to * be displayed): *

* 'blah <strike> foo' which can be treated as: * 'blah <strike>foo' *

* as well as: * '<a href="xx"> Using</a>' * which appears to be treated as: * '<a href="xx">Using</a>' *

* If strict is false, when a tag that breaks flow, * (TagElement.breaksFlows) or trailing whitespace is * encountered, all whitespace will be ignored until a non whitespace * character is encountered. This appears to give behavior closer to * the popular browsers. * * @see DTD * @see TagElement * @see SimpleAttributeSet * @author Arthur van Hoff * @author Sunita Mani */ public class Parser implements DTDConstants { private char text[] = new char[1024]; private int textpos = 0; private TagElement last; private boolean space; private char str[] = new char[128]; private int strpos = 0; protected DTD dtd = null; private int ch; private int ln; private Reader in; private Element recent; private TagStack stack; private boolean skipTag = false; private TagElement lastFormSent = null; private SimpleAttributeSet attributes = new SimpleAttributeSet(); // State for , and . Since people like to slap // together HTML documents without thinking, occasionally they // have multiple instances of these tags. These booleans track // the first sightings of these tags so they can be safely ignored // by the parser if repeated. private boolean seenHtml = false; private boolean seenHead = false; private boolean seenBody = false; /** * The html spec does not specify how spaces are coalesced very well. * If strict == false, ignoreSpace is used to try and mimic the behavior * of the popular browsers. *

* The problematic scenarios are: * 'blah <strike> foo' which can be treated as: * 'blah <strike>foo' * as well as: * '<a href="xx"> Using</a>' * which appears to be treated as: * '<a href="xx">Using</a>' *

* When a tag that breaks flow, or trailing whitespace is encountered * ignoreSpace is set to true. From then on, all whitespace will be * ignored. * ignoreSpace will be set back to false the first time a * non whitespace character is encountered. This appears to give * behavior closer to the popular browsers. */ private boolean ignoreSpace; /** * This flag determines whether or not the Parser will be strict * in enforcing SGML compatibility. If false, it will be lenient * with certain common classes of erroneous HTML constructs. * Strict or not, in either case an error will be recorded. * */ protected boolean strict = false; /** Number of \r\n's encountered. */ private int crlfCount; /** Number of \r's encountered. A \r\n will not increment this. */ private int crCount; /** Number of \n's encountered. A \r\n will not increment this. */ private int lfCount; // // To correctly identify the start of a tag/comment/text we need two // ivars. Two are needed as handleText isn't invoked until the tag // after the text has been parsed, that is the parser parses the text, // then a tag, then invokes handleText followed by handleStart. // /** The start position of the current block. Block is overloaded here, * it really means the current start position for the current comment, * tag, text. Use getBlockStartPosition to access this. */ private int currentBlockStartPos; /** Start position of the last block. */ private int lastBlockStartPos; /** * array for mapping numeric references in range * 130-159 to displayable Unicode characters. 
*/ private static final char[] cp1252Map = { 8218, // ‚ 402, // ƒ 8222, // „ 8230, // … 8224, // † 8225, // ‡ 710, // ˆ 8240, // ‰ 352, // Š 8249, // ‹ 338, // Œ 141, // 142, // Ž 143, // 144, // 8216, // ‘ 8217, // ’ 8220, // “ 8221, // ” 8226, // • 8211, // – 8212, // — 732, // ˜ 8482, // ™ 353, // š 8250, // › 339, // œ 157, // 158, // ž 376 // Ÿ }; public Parser(DTD dtd) { this.dtd = dtd; } /** * @return the line number of the line currently being parsed */ protected int getCurrentLine() { return ln; } /** * Returns the start position of the current block. Block is * overloaded here, it really means the current start position for * the current comment tag, text, block.... This is provided for * subclassers that wish to know the start of the current block when * called with one of the handleXXX methods. */ int getBlockStartPosition() { return Math.max(0, lastBlockStartPos - 1); } /** * Makes a TagElement. */ protected TagElement makeTag(Element elem, boolean fictional) { return new TagElement(elem, fictional); } protected TagElement makeTag(Element elem) { return makeTag(elem, false); } protected SimpleAttributeSet getAttributes() { return attributes; } protected void flushAttributes() { attributes.removeAttributes(attributes); } /** * Called when PCDATA is encountered. */ protected void handleText(char text[]) { } /** * Called when an HTML title tag is encountered. */ protected void handleTitle(char text[]) { // default behavior is to call handleText. Subclasses // can override if necessary. handleText(text); } /** * Called when an HTML comment is encountered. */ protected void handleComment(char text[]) { } protected void handleEOFInComment() { // We've reached EOF. Our recovery strategy is to // see if we have more than one line in the comment; // if so, we pretend that the comment was an unterminated // single line comment, and reparse the lines after the // first line as normal HTML content. 
int commentEndPos = strIndexOf('\n'); if (commentEndPos >= 0) { handleComment(getChars(0, commentEndPos)); try { in.close(); in = new CharArrayReader(getChars(commentEndPos + 1)); ch = '>'; } catch (IOException e) { error("ioexception"); } resetStrBuffer(); } else { // no newline, so signal an error error("eof.comment"); } } /** * Called when an empty tag is encountered. */ protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException { } /** * Called when a start tag is encountered. */ protected void handleStartTag(TagElement tag) { } /** * Called when an end tag is encountered. */ protected void handleEndTag(TagElement tag) { } /** * An error has occurred. */ protected void handleError(int ln, String msg) { /* Thread.dumpStack(); System.out.println("**** " + stack); System.out.println("line " + ln + ": error: " + msg); System.out.println(); */ } /** * Output text. */ void handleText(TagElement tag) { if (tag.breaksFlow()) { space = false; if (!strict) { ignoreSpace = true; } } if (textpos == 0) { if ((!space) || (stack == null) || last.breaksFlow() || !stack.advance(dtd.pcdata)) { last = tag; space = false; lastBlockStartPos = currentBlockStartPos; return; } } if (space) { if (!ignoreSpace) { // enlarge buffer if needed if (textpos + 1 > text.length) { char newtext[] = new char[text.length + 200]; System.arraycopy(text, 0, newtext, 0, text.length); text = newtext; } // output pending space text[textpos++] = ' '; if (!strict && !tag.getElement().isEmpty()) { ignoreSpace = true; } } space = false; } char newtext[] = new char[textpos]; System.arraycopy(text, 0, newtext, 0, textpos); // Handles cases of bad html where the title tag // was getting lost when we did error recovery. if (tag.getElement().getName().equals("title")) { handleTitle(newtext); } else { handleText(newtext); } lastBlockStartPos = currentBlockStartPos; textpos = 0; last = tag; space = false; } /** * Invoke the error handler. 
*/ protected void error(String err, String arg1, String arg2, String arg3) { handleError(ln, err + " " + arg1 + " " + arg2 + " " + arg3); } protected void error(String err, String arg1, String arg2) { error(err, arg1, arg2, "?"); } protected void error(String err, String arg1) { error(err, arg1, "?", "?"); } protected void error(String err) { error(err, "?", "?", "?"); } /** * Handle a start tag. The new tag is pushed * onto the tag stack. The attribute list is * checked for required attributes. */ protected void startTag(TagElement tag) throws ChangedCharSetException { Element elem = tag.getElement(); // If the tag is an empty tag and texpos != 0 // this implies that there is text before the // start tag that needs to be processed before // handling the tag. // if (!elem.isEmpty() || ((last != null) && !last.breaksFlow()) || (textpos != 0)) { handleText(tag); } else { // this variable gets updated in handleText(). // Since in this case we do not call handleText() // we need to update it here. // last = tag; // Note that we should really check last.breakFlows before // assuming this should be false. space = false; } lastBlockStartPos = currentBlockStartPos; // check required attributes for (AttributeList a = elem.atts ; a != null ; a = a.next) { if ((a.modifier == REQUIRED) && ((attributes.isEmpty()) || ((!attributes.isDefined(a.name)) && (!attributes.isDefined(HTML.getAttributeKey(a.name)))))) { error("req.att ", a.getName(), elem.getName()); } } if (elem.isEmpty()) { handleEmptyTag(tag); /* } else if (elem.getName().equals("form")) { handleStartTag(tag); */ } else { recent = elem; stack = new TagStack(tag, stack); handleStartTag(tag); } } /** * Handle an end tag. The end tag is popped * from the tag stack. 
*/ protected void endTag(boolean omitted) { handleText(stack.tag); if (omitted && !stack.elem.omitEnd()) { error("end.missing", stack.elem.getName()); } else if (!stack.terminate()) { error("end.unexpected", stack.elem.getName()); } // handle the tag handleEndTag(stack.tag); stack = stack.next; recent = (stack != null) ? stack.elem : null; } boolean ignoreElement(Element elem) { String stackElement = stack.elem.getName(); String elemName = elem.getName(); /* We ignore all elements that are not valid in the context of a table except , (these we handle in legalElementContext()) and #pcdata. We also ignore the tag in the context of

and