1 /*
   2  * Copyright (c) 1998, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package javax.swing.text.html.parser;
  27 
  28 import javax.swing.text.SimpleAttributeSet;
  29 import javax.swing.text.html.HTMLEditorKit;
  30 import javax.swing.text.html.HTML;
  31 import javax.swing.text.ChangedCharSetException;
  32 
  33 import java.util.*;
  34 import java.io.*;
  35 import java.net.*;
  36 
  37 /**
  38  * A Parser for HTML Documents (actually, you can specify a DTD, but
  39  * you should really only use this class with the html dtd in swing).
  40  * Reads an InputStream of HTML and
  41  * invokes the appropriate methods in the ParserCallback class. This
  42  * is the default parser used by HTMLEditorKit to parse HTML URLs.
  43  * <p>This will message the callback for all valid tags, as well as
  44  * tags that are implied but not explicitly specified. For example, the
  45  * html string (&lt;p&gt;blah) only has a p tag defined. The callback
  46  * will see the following methods:
  47  * <ol><li><i>handleStartTag(html, ...)</i></li>
  48  *     <li><i>handleStartTag(head, ...)</i></li>
  49  *     <li><i>handleEndTag(head)</i></li>
  50  *     <li><i>handleStartTag(body, ...)</i></li>
  51  *     <li><i>handleStartTag(p, ...)</i></li>
  52  *     <li><i>handleText(...)</i></li>
  53  *     <li><i>handleEndTag(p)</i></li>
  54  *     <li><i>handleEndTag(body)</i></li>
  55  *     <li><i>handleEndTag(html)</i></li>
  56  * </ol>
  57  * The items in <i>italic</i> are implied, that is, although they were not
  58  * explicitly specified, to be correct html they should have been present
  59  * (head isn't necessary, but it is still generated). For tags that
  60  * are implied, the AttributeSet argument will have a value of
  61  * <code>Boolean.TRUE</code> for the key
  62  * <code>HTMLEditorKit.ParserCallback.IMPLIED</code>.
  63  * <p>HTML.Attributes defines a type safe enumeration of html attributes.
  64  * If an attribute key of a tag is defined in HTML.Attribute, the
  65  * HTML.Attribute will be used as the key, otherwise a String will be used.
  66  * For example &lt;p foo=bar class=neat&gt; has two attributes. foo is
  67  * not defined in HTML.Attribute, whereas class is, therefore the
  68  * AttributeSet will have two values in it, HTML.Attribute.CLASS with
  69  * a String value of 'neat' and the String key 'foo' with a String value of
  70  * 'bar'.
  71  * <p>The position argument will indicate the start of the tag, comment
  72  * or text. Similar to arrays, the first character in the stream has a
  73  * position of 0. For tags that are
  74  * implied the position will indicate
  75  * the location of the next encountered tag. In the first example,
  76  * the implied start body and html tags will have the same position as the
  77  * p tag, and the implied end p, html and body tags will all have the same
  78  * position.
  79  * <p>As html skips whitespace the position for text will be the position
  80  * of the first valid character, eg in the string '\n\n\nblah'
  81  * the text 'blah' will have a position of 3, the newlines are skipped.
  82  * <p>
  83  * For attributes that do not have a value, eg in the html
  84  * string <code>&lt;foo blah&gt;</code> the attribute <code>blah</code>
  85  * does not have a value, there are two possible values that will be
  86  * placed in the AttributeSet's value:
  87  * <ul>
  88  * <li>If the DTD does not contain a definition for the element, or the
  89  *     definition does not have an explicit value then the value in the
  90  *     AttributeSet will be <code>HTML.NULL_ATTRIBUTE_VALUE</code>.
  91  * <li>If the DTD contains an explicit value, as in:
  92  *     <code>&lt;!ATTLIST OPTION selected (selected) #IMPLIED&gt;</code>
  93  *     this value from the dtd (in this case selected) will be used.
  94  * </ul>
  95  * <p>
  96  * Once the stream has been parsed, the callback is notified of the most
  97  * likely end of line string. The end of line string will be one of
  98  * \n, \r or \r\n, whichever is encountered the most in parsing the
  99  * stream.
 100  *
 101  * @author      Sunita Mani
 102  */
 103 public class DocumentParser extends javax.swing.text.html.parser.Parser {
 104 
 105     private int inbody;
 106     private int intitle;
 107     private int inhead;
 108     private int instyle;
 109     private int inscript;
 110     private boolean seentitle;
 111     private HTMLEditorKit.ParserCallback callback = null;
 112     private boolean ignoreCharSet = false;
 113     private static final boolean debugFlag = false;
 114 
 115     /**
 116      * Creates document parser with the specified {@code dtd}.
 117      *
 118      * @param dtd the dtd.
 119      */
 120     public DocumentParser(DTD dtd) {
 121         super(dtd);
 122     }
 123 
 124     /**
 125      * Parse an HTML stream, given a DTD.
 126      *
 127      * @param in the reader to read the source from
 128      * @param callback the callback
 129      * @param ignoreCharSet if {@code true} the charset is ignored
 130      * @throws IOException if an I/O error occurs
 131      */
 132     public void parse(Reader in, HTMLEditorKit.ParserCallback callback, boolean ignoreCharSet) throws IOException {
 133         this.ignoreCharSet = ignoreCharSet;
 134         this.callback = callback;
 135         parse(in);
 136         // end of line
 137         callback.handleEndOfLineString(getEndOfLineString());
 138     }
 139 
 140     /**
 141      * Handle Start Tag.
 142      */
 143     protected void handleStartTag(TagElement tag) {
 144 
 145         Element elem = tag.getElement();
 146         if (elem == dtd.body) {
 147             inbody++;
 148         } else if (elem == dtd.html) {
 149         } else if (elem == dtd.head) {
 150             inhead++;
 151         } else if (elem == dtd.title) {
 152             intitle++;
 153         } else if (elem == dtd.style) {
 154             instyle++;
 155         } else if (elem == dtd.script) {
 156             inscript++;
 157         }
 158         if (debugFlag) {
 159             if (tag.fictional()) {
 160                 debug("Start Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
 161             } else {
 162                 debug("Start Tag: " + tag.getHTMLTag() + " attributes: " +
 163                       getAttributes() + " pos: " + getCurrentPos());
 164             }
 165         }
 166         if (tag.fictional()) {
 167             SimpleAttributeSet attrs = new SimpleAttributeSet();
 168             attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,
 169                                Boolean.TRUE);
 170             callback.handleStartTag(tag.getHTMLTag(), attrs,
 171                                     getBlockStartPosition());
 172         } else {
 173             callback.handleStartTag(tag.getHTMLTag(), getAttributes(),
 174                                     getBlockStartPosition());
 175             flushAttributes();
 176         }
 177     }
 178 
 179 
 180     protected void handleComment(char[] text) {
 181         if (debugFlag) {
 182             debug("comment: ->" + new String(text) + "<-"
 183                   + " pos: " + getCurrentPos());
 184         }
 185         callback.handleComment(text, getBlockStartPosition());
 186     }
 187 
 188     /**
 189      * Handle Empty Tag.
 190      */
 191     protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
 192 
 193         Element elem = tag.getElement();
 194         if (elem == dtd.meta && !ignoreCharSet) {
 195             SimpleAttributeSet atts = getAttributes();
 196             if (atts != null) {
 197                 String content = (String)atts.getAttribute(HTML.Attribute.CONTENT);
 198                 if (content != null) {
 199                     if ("content-type".equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) {
 200                         if (!content.equalsIgnoreCase("text/html") &&
 201                                 !content.equalsIgnoreCase("text/plain")) {
 202                             throw new ChangedCharSetException(content, false);
 203                         }
 204                     } else if ("charset" .equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) {
 205                         throw new ChangedCharSetException(content, true);
 206                     }
 207                 }
 208             }
 209         }
 210         if (inbody != 0 || elem == dtd.meta || elem == dtd.base || elem == dtd.isindex || elem == dtd.style || elem == dtd.link) {
 211             if (debugFlag) {
 212                 if (tag.fictional()) {
 213                     debug("Empty Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
 214                 } else {
 215                     debug("Empty Tag: " + tag.getHTMLTag() + " attributes: "
 216                           + getAttributes() + " pos: " + getCurrentPos());
 217                 }
 218             }
 219             if (tag.fictional()) {
 220                 SimpleAttributeSet attrs = new SimpleAttributeSet();
 221                 attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,
 222                                    Boolean.TRUE);
 223                 callback.handleSimpleTag(tag.getHTMLTag(), attrs,
 224                                          getBlockStartPosition());
 225             } else {
 226                 callback.handleSimpleTag(tag.getHTMLTag(), getAttributes(),
 227                                          getBlockStartPosition());
 228                 flushAttributes();
 229             }
 230         }
 231     }
 232 
 233     /**
 234      * Handle End Tag.
 235      */
 236     protected void handleEndTag(TagElement tag) {
 237         Element elem = tag.getElement();
 238         if (elem == dtd.body) {
 239             inbody--;
 240         } else if (elem == dtd.title) {
 241             intitle--;
 242             seentitle = true;
 243         } else if (elem == dtd.head) {
 244             inhead--;
 245         } else if (elem == dtd.style) {
 246             instyle--;
 247         } else if (elem == dtd.script) {
 248             inscript--;
 249         }
 250         if (debugFlag) {
 251             debug("End Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
 252         }
 253         callback.handleEndTag(tag.getHTMLTag(), getBlockStartPosition());
 254 
 255     }
 256 
 257     /**
 258      * Handle Text.
 259      */
 260     protected void handleText(char[] data) {
 261         if (data != null) {
 262             if (inscript != 0) {
 263                 callback.handleComment(data, getBlockStartPosition());
 264                 return;
 265             }
 266             if (inbody != 0 || ((instyle != 0) ||
 267                                 ((intitle != 0) && !seentitle))) {
 268                 if (debugFlag) {
 269                     debug("text:  ->" + new String(data) + "<-" + " pos: " + getCurrentPos());
 270                 }
 271                 callback.handleText(data, getBlockStartPosition());
 272             }
 273         }
 274     }
 275 
 276     /*
 277      * Error handling.
 278      */
 279     protected void handleError(int ln, String errorMsg) {
 280         if (debugFlag) {
 281             debug("Error: ->" + errorMsg + "<-" + " pos: " + getCurrentPos());
 282         }
 283         /* PENDING: need to improve the error string. */
 284         callback.handleError(errorMsg, getCurrentPos());
 285     }
 286 
 287 
 288     /*
 289      * debug messages
 290      */
 291     private void debug(String msg) {
 292         System.out.println(msg);
 293     }
 294 }