48 * HTML files. This parser attempts to parse most HTML files. 49 * This means that the implementation sometimes deviates from 50 * the SGML specification in favor of HTML. 51 * <p> 52 * The parser treats \r and \r\n as \n. Newlines after start tags 53 * and before end tags are ignored just as specified in the SGML/HTML 54 * specification. 55 * <p> 56 * The HTML spec does not specify how spaces are to be coalesced very well. 57 * Specifically, the following scenarios are not discussed (note that a 58 * space should be used here, but I am using &nbsp; to force the space to 59 * be displayed): 60 * <p> 61 * '<b>blah <i> <strike> foo' which can be treated as: 62 * '<b>blah <i><strike>foo' 63 * <p>as well as: 64 * '<p><a href="xx"> <em>Using</em></a></p>' 65 * which appears to be treated as: 66 * '<p><a href="xx"><em>Using</em></a></p>' 67 * <p> 68 * If <code>strict</code> is false, when a tag that breaks flow 69 * (<code>TagElement.breaksFlow</code>) or trailing whitespace is 70 * encountered, all whitespace will be ignored until a non-whitespace 71 * character is encountered. This appears to give behavior closer to 72 * the popular browsers. 73 * 74 * @see DTD 75 * @see TagElement 76 * @see SimpleAttributeSet 77 * @author Arthur van Hoff 78 * @author Sunita Mani 79 */ 80 public 81 class Parser implements DTDConstants { 82 83 private char text[] = new char[1024]; 84 private int textpos = 0; 85 private TagElement last; 86 private boolean space; 87 88 private char str[] = new char[128]; 89 private int strpos = 0; | 48 * HTML files. This parser attempts to parse most HTML files. 49 * This means that the implementation sometimes deviates from 50 * the SGML specification in favor of HTML. 51 * <p> 52 * The parser treats \r and \r\n as \n. Newlines after start tags 53 * and before end tags are ignored just as specified in the SGML/HTML 54 * specification. 55 * <p> 56 * The HTML spec does not specify how spaces are to be coalesced very well. 
57 * Specifically, the following scenarios are not discussed (note that a 58 * space should be used here, but I am using &nbsp; to force the space to 59 * be displayed): 60 * <p> 61 * '<b>blah <i> <strike> foo' which can be treated as: 62 * '<b>blah <i><strike>foo' 63 * <p>as well as: 64 * '<p><a href="xx"> <em>Using</em></a></p>' 65 * which appears to be treated as: 66 * '<p><a href="xx"><em>Using</em></a></p>' 67 * <p> 68 * If {@code strict} is false, when a tag that breaks flow 69 * ({@code TagElement.breaksFlow}) or trailing whitespace is 70 * encountered, all whitespace will be ignored until a non-whitespace 71 * character is encountered. This appears to give behavior closer to 72 * the popular browsers. 73 * 74 * @see DTD 75 * @see TagElement 76 * @see SimpleAttributeSet 77 * @author Arthur van Hoff 78 * @author Sunita Mani 79 */ 80 public 81 class Parser implements DTDConstants { 82 83 private char text[] = new char[1024]; 84 private int textpos = 0; 85 private TagElement last; 86 private boolean space; 87 88 private char str[] = new char[128]; 89 private int strpos = 0; |