/* * Copyright (c) 1998, 2018, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ package javax.swing.text.html.parser; import javax.swing.text.SimpleAttributeSet; import javax.swing.text.html.HTMLEditorKit; import javax.swing.text.html.HTML; import javax.swing.text.ChangedCharSetException; import java.util.*; import java.io.*; import java.net.*; /** * A Parser for HTML Documents (actually, you can specify a DTD, but * you should really only use this class with the html dtd in swing). * Reads an InputStream of HTML and * invokes the appropriate methods in the ParserCallback class. This * is the default parser used by HTMLEditorKit to parse HTML url's. *

This will message the callback for all valid tags, as well as * tags that are implied but not explicitly specified. For example, the * html string (<p>blah) only has a p tag defined. The callback * will see the following methods: *

  1. handleStartTag(html, ...)
  2. *
  3. handleStartTag(head, ...)
  4. *
  5. handleEndTag(head)
  6. *
  7. handleStartTag(body, ...)
  8. *
  9. handleStartTag(p, ...)
  10. *
  11. handleText(...)
  12. *
  13. handleEndTag(p)
  14. *
  15. handleEndTag(body)
  16. *
  17. handleEndTag(html)
  18. *
* The items in italic are implied, that is, although they were not * explicitly specified, to be correct html they should have been present * (head isn't necessary, but it is still generated). For tags that * are implied, the AttributeSet argument will have a value of * Boolean.TRUE for the key * HTMLEditorKit.ParserCallback.IMPLIED. *

HTML.Attributes defines a type safe enumeration of html attributes. * If an attribute key of a tag is defined in HTML.Attribute, the * HTML.Attribute will be used as the key, otherwise a String will be used. * For example <p foo=bar class=neat> has two attributes. foo is * not defined in HTML.Attribute, where as class is, therefore the * AttributeSet will have two values in it, HTML.Attribute.CLASS with * a String value of 'neat' and the String key 'foo' with a String value of * 'bar'. *

The position argument will indicate the start of the tag, comment * or text. Similar to arrays, the first character in the stream has a * position of 0. For tags that are * implied the position will indicate * the location of the next encountered tag. In the first example, * the implied start body and html tags will have the same position as the * p tag, and the implied end p, html and body tags will all have the same * position. *

As html skips whitespace the position for text will be the position * of the first valid character, eg in the string '\n\n\nblah' * the text 'blah' will have a position of 3, the newlines are skipped. *

* For attributes that do not have a value, eg in the html * string <foo blah> the attribute blah * does not have a value, there are two possible values that will be * placed in the AttributeSet's value: *

*

* Once the stream has been parsed, the callback is notified of the most * likely end of line string. The end of line string will be one of * \n, \r or \r\n, which ever is encountered the most in parsing the * stream. * * @author Sunita Mani */ public class DocumentParser extends javax.swing.text.html.parser.Parser { private int inbody; private int intitle; private int inhead; private int instyle; private int inscript; private boolean seentitle; private HTMLEditorKit.ParserCallback callback = null; private boolean ignoreCharSet = false; private static final boolean debugFlag = false; /** * Creates document parser with the specified {@code dtd}. * * @param dtd the dtd. */ public DocumentParser(DTD dtd) { super(dtd); } /** * Parse an HTML stream, given a DTD. * * @param in the reader to read the source from * @param callback the callback * @param ignoreCharSet if {@code true} the charset is ignored * @throws IOException if an I/O error occurs */ public void parse(Reader in, HTMLEditorKit.ParserCallback callback, boolean ignoreCharSet) throws IOException { this.ignoreCharSet = ignoreCharSet; this.callback = callback; parse(in); // end of line callback.handleEndOfLineString(getEndOfLineString()); } /** * Handle Start Tag. */ protected void handleStartTag(TagElement tag) { Element elem = tag.getElement(); if (elem == dtd.body) { inbody++; } else if (elem == dtd.html) { } else if (elem == dtd.head) { inhead++; } else if (elem == dtd.title) { intitle++; } else if (elem == dtd.style) { instyle++; } else if (elem == dtd.script) { inscript++; } if (debugFlag) { if (tag.fictional()) { debug("Start Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos()); } else { debug("Start Tag: " + tag.getHTMLTag() + " attributes: " + getAttributes() + " pos: " + getCurrentPos()); } } if (tag.fictional()) { SimpleAttributeSet attrs = new SimpleAttributeSet(); attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED, Boolean.TRUE); callback.handleStartTag(tag.getHTMLTag(), attrs, getBlockStartPosition()); } else { callback.handleStartTag(tag.getHTMLTag(), getAttributes(), getBlockStartPosition()); flushAttributes(); } } protected void handleComment(char[] text) { if (debugFlag) { debug("comment: ->" + new String(text) + "<-" + " pos: " + getCurrentPos()); } callback.handleComment(text, getBlockStartPosition()); } /** * Handle Empty Tag. */ protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException { Element elem = tag.getElement(); if (elem == dtd.meta && !ignoreCharSet) { SimpleAttributeSet atts = getAttributes(); if (atts != null) { String content = (String)atts.getAttribute(HTML.Attribute.CONTENT); if (content != null) { if ("content-type".equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) { if (!content.equalsIgnoreCase("text/html") && !content.equalsIgnoreCase("text/plain")) { throw new ChangedCharSetException(content, false); } } else if ("charset" .equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) { throw new ChangedCharSetException(content, true); } } } } if (inbody != 0 || elem == dtd.meta || elem == dtd.base || elem == dtd.isindex || elem == dtd.style || elem == dtd.link) { if (debugFlag) { if (tag.fictional()) { debug("Empty Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos()); } else { debug("Empty Tag: " + tag.getHTMLTag() + " attributes: " + getAttributes() + " pos: " + getCurrentPos()); } } if (tag.fictional()) { SimpleAttributeSet attrs = new SimpleAttributeSet(); attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED, Boolean.TRUE); callback.handleSimpleTag(tag.getHTMLTag(), attrs, getBlockStartPosition()); } else { callback.handleSimpleTag(tag.getHTMLTag(), getAttributes(), getBlockStartPosition()); flushAttributes(); } } } /** * Handle End Tag. */ protected void handleEndTag(TagElement tag) { Element elem = tag.getElement(); if (elem == dtd.body) { inbody--; } else if (elem == dtd.title) { intitle--; seentitle = true; } else if (elem == dtd.head) { inhead--; } else if (elem == dtd.style) { instyle--; } else if (elem == dtd.script) { inscript--; } if (debugFlag) { debug("End Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos()); } callback.handleEndTag(tag.getHTMLTag(), getBlockStartPosition()); } /** * Handle Text. */ protected void handleText(char[] data) { if (data != null) { if (inscript != 0) { callback.handleComment(data, getBlockStartPosition()); return; } if (inbody != 0 || ((instyle != 0) || ((intitle != 0) && !seentitle))) { if (debugFlag) { debug("text: ->" + new String(data) + "<-" + " pos: " + getCurrentPos()); } callback.handleText(data, getBlockStartPosition()); } } } /* * Error handling. */ protected void handleError(int ln, String errorMsg) { if (debugFlag) { debug("Error: ->" + errorMsg + "<-" + " pos: " + getCurrentPos()); } /* PENDING: need to improve the error string. */ callback.handleError(errorMsg, getCurrentPos()); } /* * debug messages */ private void debug(String msg) { System.out.println(msg); } }