/*
 * Copyright (c) 1998, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package javax.swing.text.html.parser;

import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.HTML;
import javax.swing.text.ChangedCharSetException;

import java.util.*;
import java.io.*;
import java.net.*;

/**
 * A Parser for HTML Documents (actually, you can specify a DTD, but
 * you should really only use this class with the html dtd in swing).
 * Reads HTML from a {@code Reader} and invokes the appropriate methods
 * in the ParserCallback class. This is the default parser used by
 * HTMLEditorKit to parse HTML url's.
 * <p>This will message the callback for all valid tags, as well as
 * tags that are implied but not explicitly specified. For example, the
 * html string (&lt;p&gt;blah) only has a p tag defined. The callback
 * will see the following methods:
 * <ol><li><i>handleStartTag(html, ...)</i></li>
 *     <li><i>handleStartTag(head, ...)</i></li>
 *     <li><i>handleEndTag(head)</i></li>
 *     <li><i>handleStartTag(body, ...)</i></li>
 *     <li>handleStartTag(p, ...)</li>
 *     <li>handleText(...)</li>
 *     <li><i>handleEndTag(p)</i></li>
 *     <li><i>handleEndTag(body)</i></li>
 *     <li><i>handleEndTag(html)</i></li>
 * </ol>
 * The items in <i>italic</i> are implied, that is, although they were not
 * explicitly specified, to be correct html they should have been present
 * (head isn't necessary, but it is still generated). For tags that
 * are implied, the AttributeSet argument will have a value of
 * <code>Boolean.TRUE</code> for the key
 * <code>HTMLEditorKit.ParserCallback.IMPLIED</code>.
 * <p>HTML.Attribute defines a type safe enumeration of html attributes.
 * If an attribute key of a tag is defined in HTML.Attribute, the
 * HTML.Attribute will be used as the key, otherwise a String will be used.
 * For example &lt;p foo=bar class=neat&gt; has two attributes. foo is
 * not defined in HTML.Attribute, whereas class is, therefore the
 * AttributeSet will have two values in it, HTML.Attribute.CLASS with
 * a String value of 'neat' and the String key 'foo' with a String value of
 * 'bar'.
 * <p>The position argument will indicate the start of the tag, comment
 * or text. Similar to arrays, the first character in the stream has a
 * position of 0. For tags that are implied the position will indicate
 * the location of the next encountered tag. In the first example,
 * the implied start body and html tags will have the same position as the
 * p tag, and the implied end p, html and body tags will all have the same
 * position.
 * <p>As html skips whitespace the position for text will be the position
 * of the first valid character, eg in the string '\n\n\nblah'
 * the text 'blah' will have a position of 3, the newlines are skipped.
 * <p>
 * For attributes that do not have a value, eg in the html
 * string <code>&lt;foo blah&gt;</code> the attribute <code>blah</code>
 * does not have a value, there are two possible values that will be
 * placed in the AttributeSet's value:
 * <ul>
 * <li>If the DTD does not contain a definition for the element, or the
 *     definition does not have an explicit value then the value in the
 *     AttributeSet will be <code>HTML.NULL_ATTRIBUTE_VALUE</code>.
 * <li>If the DTD contains an explicit value, as in:
 *     <code>&lt;!ATTLIST OPTION selected (selected) #IMPLIED&gt;</code>
 *     this value from the dtd (in this case selected) will be used.
 * </ul>
 * <p>
 * Once the stream has been parsed, the callback is notified of the most
 * likely end of line string. The end of line string will be one of
 * \n, \r or \r\n, whichever is encountered the most in parsing the
 * stream.
 *
 * @author Sunita Mani
 */
public class DocumentParser extends javax.swing.text.html.parser.Parser {

    // Nesting counters: incremented on a start tag and decremented on the
    // matching end tag of the respective element.
    private int inbody;
    private int intitle;
    private int inhead;
    private int instyle;
    private int inscript;
    // Set once a title end tag has been handled; title text seen after
    // that point is no longer forwarded by handleText.
    private boolean seentitle;
    // Receiver of all parse events; assigned by parse(Reader, ..., boolean).
    private HTMLEditorKit.ParserCallback callback;
    // When true, meta tags are not inspected for charset declarations.
    private boolean ignoreCharSet;
    // Compile-time switch for the System.out tracing done by debug(String).
    private static final boolean debugFlag = false;

    /**
     * Creates document parser with the specified {@code dtd}.
     *
     * @param dtd the dtd.
     */
    public DocumentParser(DTD dtd) {
        super(dtd);
    }

    /**
     * Parse an HTML stream, given a DTD.
     * <p>
     * After the reader has been parsed without an exception, the callback
     * is handed the end of line string via
     * {@code handleEndOfLineString}.
     *
     * @param in the reader to read the source from
     * @param callback the callback
     * @param ignoreCharSet if {@code true} the charset is ignored
     * @throws IOException if an I/O error occurs
     */
    public void parse(Reader in, HTMLEditorKit.ParserCallback callback,
                      boolean ignoreCharSet) throws IOException {
        this.ignoreCharSet = ignoreCharSet;
        this.callback = callback;
        parse(in);
        // end of line
        callback.handleEndOfLineString(getEndOfLineString());
    }

    /**
     * Handle Start Tag.
     * <p>
     * Bumps the nesting counter of the element (body, head, title, style
     * or script) and forwards the tag to the callback. Implied tags are
     * forwarded with an attribute set containing only the
     * {@code IMPLIED} marker; explicit tags are forwarded with the
     * current attributes, which are flushed afterwards.
     *
     * @param tag the start tag being opened
     */
    @Override
    protected void handleStartTag(TagElement tag) {
        Element elem = tag.getElement();
        if (elem == dtd.body) {
            inbody++;
        } else if (elem == dtd.html) {
            // The html element has no nesting counter.
        } else if (elem == dtd.head) {
            inhead++;
        } else if (elem == dtd.title) {
            intitle++;
        } else if (elem == dtd.style) {
            instyle++;
        } else if (elem == dtd.script) {
            inscript++;
        }

        boolean implied = tag.fictional();
        if (debugFlag) {
            debugTag("Start Tag: ", tag, implied);
        }
        if (implied) {
            callback.handleStartTag(tag.getHTMLTag(), impliedAttributes(),
                                    getBlockStartPosition());
        } else {
            callback.handleStartTag(tag.getHTMLTag(), getAttributes(),
                                    getBlockStartPosition());
            flushAttributes();
        }
    }

    /**
     * Handle Comment. Forwards the comment text to the callback together
     * with the block start position.
     *
     * @param text the characters of the comment
     */
    @Override
    protected void handleComment(char[] text) {
        if (debugFlag) {
            debug("comment: ->" + new String(text) + "<-"
                  + " pos: " + getCurrentPos());
        }
        callback.handleComment(text, getBlockStartPosition());
    }

    /**
     * Handle Empty Tag.
     * <p>
     * Unless the charset is being ignored, a meta tag is first checked for
     * a charset declaration (see {@code checkForCharSetChange}). The tag is
     * then forwarded to the callback as a simple tag when the parser is
     * inside the body, or when the element is one of meta, base, isindex,
     * style or link; any other empty tag outside the body is dropped.
     *
     * @param tag the empty tag
     * @throws ChangedCharSetException if a meta tag declares a charset and
     *         the charset is not being ignored
     */
    @Override
    protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
        Element elem = tag.getElement();
        if (elem == dtd.meta && !ignoreCharSet) {
            checkForCharSetChange(getAttributes());
        }

        boolean forwarded = inbody != 0
                || elem == dtd.meta
                || elem == dtd.base
                || elem == dtd.isindex
                || elem == dtd.style
                || elem == dtd.link;
        if (!forwarded) {
            return;
        }

        boolean implied = tag.fictional();
        if (debugFlag) {
            debugTag("Empty Tag: ", tag, implied);
        }
        if (implied) {
            callback.handleSimpleTag(tag.getHTMLTag(), impliedAttributes(),
                                     getBlockStartPosition());
        } else {
            callback.handleSimpleTag(tag.getHTMLTag(), getAttributes(),
                                     getBlockStartPosition());
            flushAttributes();
        }
    }

    /**
     * Handle End Tag.
     * <p>
     * Decrements the nesting counter of the element (body, title, head,
     * style or script), records that a title has been completed, and
     * forwards the end tag to the callback.
     *
     * @param tag the tag being closed
     */
    @Override
    protected void handleEndTag(TagElement tag) {
        Element elem = tag.getElement();
        if (elem == dtd.body) {
            inbody--;
        } else if (elem == dtd.title) {
            intitle--;
            seentitle = true;
        } else if (elem == dtd.head) {
            inhead--;
        } else if (elem == dtd.style) {
            instyle--;
        } else if (elem == dtd.script) {
            inscript--;
        }
        if (debugFlag) {
            debug("End Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
        }
        callback.handleEndTag(tag.getHTMLTag(), getBlockStartPosition());
    }

    /**
     * Handle Text.
     * <p>
     * {@code null} data is ignored. Text found inside a script element is
     * reported to the callback as a comment. Otherwise the text is
     * reported only while inside the body, inside a style element, or
     * inside a title element before any title end tag has been seen.
     *
     * @param data the characters of the text run, may be {@code null}
     */
    @Override
    protected void handleText(char[] data) {
        if (data == null) {
            return;
        }
        if (inscript != 0) {
            callback.handleComment(data, getBlockStartPosition());
            return;
        }
        boolean inFirstTitle = (intitle != 0) && !seentitle;
        if (inbody != 0 || instyle != 0 || inFirstTitle) {
            if (debugFlag) {
                debug("text: ->" + new String(data) + "<-" + " pos: " + getCurrentPos());
            }
            callback.handleText(data, getBlockStartPosition());
        }
    }

    /**
     * Error handling. Forwards the message and the current position to
     * the callback.
     *
     * @param ln not used by this implementation
     * @param errorMsg the error message
     */
    @Override
    protected void handleError(int ln, String errorMsg) {
        if (debugFlag) {
            debug("Error: ->" + errorMsg + "<-" + " pos: " + getCurrentPos());
        }
        /* PENDING: need to improve the error string. */
        callback.handleError(errorMsg, getCurrentPos());
    }

    /**
     * Inspects the attributes of a meta tag for a charset declaration.
     * Nothing happens when there are no attributes or no content
     * attribute. With {@code http-equiv} equal (ignoring case) to
     * "content-type", any content other than exactly "text/html" or
     * "text/plain" is signalled; with {@code http-equiv} equal to
     * "charset" the content is always signalled.
     *
     * @param atts the attributes of the meta tag, may be {@code null}
     * @throws ChangedCharSetException carrying the content attribute value
     */
    private static void checkForCharSetChange(SimpleAttributeSet atts)
            throws ChangedCharSetException {
        if (atts == null) {
            return;
        }
        String content = (String) atts.getAttribute(HTML.Attribute.CONTENT);
        if (content == null) {
            return;
        }
        String httpEquiv = (String) atts.getAttribute(HTML.Attribute.HTTPEQUIV);
        if ("content-type".equalsIgnoreCase(httpEquiv)) {
            if (!content.equalsIgnoreCase("text/html")
                    && !content.equalsIgnoreCase("text/plain")) {
                throw new ChangedCharSetException(content, false);
            }
        } else if ("charset".equalsIgnoreCase(httpEquiv)) {
            throw new ChangedCharSetException(content, true);
        }
    }

    /**
     * Builds the attribute set handed to the callback for implied tags:
     * a fresh set whose only entry maps
     * {@code HTMLEditorKit.ParserCallback.IMPLIED} to {@code Boolean.TRUE}.
     *
     * @return a new attribute set marking a tag as implied
     */
    private static SimpleAttributeSet impliedAttributes() {
        SimpleAttributeSet attrs = new SimpleAttributeSet();
        attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED, Boolean.TRUE);
        return attrs;
    }

    /**
     * Emits the debug line for a start or empty tag; the current
     * attributes are included only for tags that are not implied.
     *
     * @param label the line prefix, e.g. "Start Tag: "
     * @param tag the tag being reported
     * @param implied whether the tag is implied
     */
    private void debugTag(String label, TagElement tag, boolean implied) {
        if (implied) {
            debug(label + tag.getHTMLTag() + " pos: " + getCurrentPos());
        } else {
            debug(label + tag.getHTMLTag() + " attributes: "
                  + getAttributes() + " pos: " + getCurrentPos());
        }
    }

    /**
     * debug messages: prints the message to {@code System.out}.
     *
     * @param msg the message to print
     */
    private void debug(String msg) {
        System.out.println(msg);
    }
}