1 /*
2 * Copyright (c) 1998, 2017, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
61 * <p>as well as:
62 * '<p><a href="xx"> <em>Using</em></a></p>'
63 * which appears to be treated as:
64 * '<p><a href="xx"><em>Using</em></a></p>'
65 * <p>
66 * If <code>strict</code> is false, when a tag that breaks flow,
67 * (<code>TagElement.breaksFlow</code>) or trailing whitespace is
68 * encountered, all whitespace will be ignored until a non-whitespace
69 * character is encountered. This appears to give behavior closer to
70 * the popular browsers.
71 *
72 * @see DTD
73 * @see TagElement
74 * @see SimpleAttributeSet
75 * @author Arthur van Hoff
76 * @author Sunita Mani
77 */
78 public
79 class Parser implements DTDConstants {
80
81 private char text[] = new char[1024];
82 private int textpos = 0;
83 private TagElement last;
84 private boolean space;
85
86 private char str[] = new char[128];
87 private int strpos = 0;
88
89 /**
90 * The dtd.
91 */
92 protected DTD dtd = null;
93
94 private int ch;
95 private int ln;
96 private Reader in;
97
98 private Element recent;
99 private TagStack stack;
100 private boolean skipTag = false;
101 private TagElement lastFormSent = null;
102 private SimpleAttributeSet attributes = new SimpleAttributeSet();
103
104 // State for <html>, <head> and <body>. Since people like to slap
105 // together HTML documents without thinking, occasionally they
106 // have multiple instances of these tags. These booleans track
256 * Returns attributes for the current tag.
257 *
258 * @return {@code SimpleAttributeSet} containing the attributes
259 */
260 protected SimpleAttributeSet getAttributes() {
261 return attributes;
262 }
263
264 /**
265 * Removes the current attributes.
266 */
267 protected void flushAttributes() {
268 attributes.removeAttributes(attributes);
269 }
270
271 /**
272 * Called when PCDATA is encountered.
273 *
274 * @param text the section text
275 */
276 protected void handleText(char text[]) {
277 }
278
279 /**
280 * Called when an HTML title tag is encountered.
281 *
282 * @param text the title text
283 */
284 protected void handleTitle(char text[]) {
285 // default behavior is to call handleText. Subclasses
286 // can override if necessary.
287 handleText(text);
288 }
289
290 /**
291 * Called when an HTML comment is encountered.
292 *
293 * @param text the comment being handled
294 */
295 protected void handleComment(char text[]) {
296 }
297
298 /**
299 * Called when the content terminates without closing the HTML comment.
300 */
301 protected void handleEOFInComment() {
302 // We've reached EOF. Our recovery strategy is to
303 // see if we have more than one line in the comment;
304 // if so, we pretend that the comment was an unterminated
305 // single line comment, and reparse the lines after the
306 // first line as normal HTML content.
307
308 int commentEndPos = strIndexOf('\n');
309 if (commentEndPos >= 0) {
310 handleComment(getChars(0, commentEndPos));
311 try {
312 in.close();
313 in = new CharArrayReader(getChars(commentEndPos + 1));
314 ch = '>';
315 } catch (IOException e) {
369 void handleText(TagElement tag) {
370 if (tag.breaksFlow()) {
371 space = false;
372 if (!strict) {
373 ignoreSpace = true;
374 }
375 }
376 if (textpos == 0) {
377 if ((!space) || (stack == null) || last.breaksFlow() ||
378 !stack.advance(dtd.pcdata)) {
379 last = tag;
380 space = false;
381 lastBlockStartPos = currentBlockStartPos;
382 return;
383 }
384 }
385 if (space) {
386 if (!ignoreSpace) {
387 // enlarge buffer if needed
388 if (textpos + 1 > text.length) {
389 char newtext[] = new char[text.length + 200];
390 System.arraycopy(text, 0, newtext, 0, text.length);
391 text = newtext;
392 }
393
394 // output pending space
395 text[textpos++] = ' ';
396 if (!strict && !tag.getElement().isEmpty()) {
397 ignoreSpace = true;
398 }
399 }
400 space = false;
401 }
402 char newtext[] = new char[textpos];
403 System.arraycopy(text, 0, newtext, 0, textpos);
404 // Handles cases of bad html where the title tag
405 // was getting lost when we did error recovery.
406 if (tag.getElement().getName().equals("title")) {
407 handleTitle(newtext);
408 } else {
409 handleText(newtext);
410 }
411 lastBlockStartPos = currentBlockStartPos;
412 textpos = 0;
413 last = tag;
414 space = false;
415 }
416
417 /**
418 * Invokes the error handler.
419 *
420 * @param err the error type
421 * @param arg1 the 1st error message argument
422 * @param arg2 the 2nd error message argument
820
821 /**
822 * Error context. Something went wrong, make sure we are in
823 * the document's body context
824 */
825 void errorContext() throws ChangedCharSetException {
826 for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) {
827 handleEndTag(stack.tag);
828 }
829 if (stack == null) {
830 legalElementContext(dtd.body);
831 startTag(makeTag(dtd.body, true));
832 }
833 }
834
835 /**
836 * Add a char to the string buffer.
837 */
838 void addString(int c) {
839 if (strpos == str.length) {
840 char newstr[] = new char[str.length + 128];
841 System.arraycopy(str, 0, newstr, 0, str.length);
842 str = newstr;
843 }
844 str[strpos++] = (char)c;
845 }
846
847 /**
848 * Get the string that's been accumulated.
849 */
850 String getString(int pos) {
851 char newStr[] = new char[strpos - pos];
852 System.arraycopy(str, pos, newStr, 0, strpos - pos);
853 strpos = pos;
854 return new String(newStr);
855 }
856
857 char[] getChars(int pos) {
858 char newStr[] = new char[strpos - pos];
859 System.arraycopy(str, pos, newStr, 0, strpos - pos);
860 strpos = pos;
861 return newStr;
862 }
863
864 char[] getChars(int pos, int endPos) {
865 char newStr[] = new char[endPos - pos];
866 System.arraycopy(str, pos, newStr, 0, endPos - pos);
867 // REMIND: it's not clear whether this version should set strpos or not
868 // strpos = pos;
869 return newStr;
870 }
871
872 void resetStrBuffer() {
873 strpos = 0;
874 }
875
876 int strIndexOf(char target) {
877 for (int i = 0; i < strpos; i++) {
878 if (str[i] == target) {
879 return i;
880 }
881 }
882
883 return -1;
884 }
885
1017 ln++;
1018 ch = readCh();
1019 lfCount++;
1020 break;
1021
1022 case '\r':
1023 ln++;
1024 if ((ch = readCh()) == '\n') {
1025 ch = readCh();
1026 crlfCount++;
1027 }
1028 else {
1029 crCount++;
1030 }
1031 break;
1032
1033 case ';':
1034 ch = readCh();
1035 break;
1036 }
1037 char data[] = mapNumericReference(n);
1038 return data;
1039 }
1040 addString('#');
1041 if (!parseIdentifier(false)) {
1042 error("ident.expected");
1043 strpos = pos;
1044 char data[] = {'&', '#'};
1045 return data;
1046 }
1047 } else if (!parseIdentifier(false)) {
1048 char data[] = {'&'};
1049 return data;
1050 }
1051
1052 boolean semicolon = false;
1053
1054 switch (ch) {
1055 case '\n':
1056 ln++;
1057 ch = readCh();
1058 lfCount++;
1059 break;
1060
1061 case '\r':
1062 ln++;
1063 if ((ch = readCh()) == '\n') {
1064 ch = readCh();
1065 crlfCount++;
1066 }
1067 else {
1068 crCount++;
1078
1079 String nm = getString(pos);
1080 Entity ent = dtd.getEntity(nm);
1081
1082 // entities are case sensitive - however if strict
1083 // is false then we will try to make a match by
1084 // converting the string to all lowercase.
1085 //
1086 if (!strict && (ent == null)) {
1087 ent = dtd.getEntity(nm.toLowerCase());
1088 }
1089 if ((ent == null) || !ent.isGeneral()) {
1090
1091 if (nm.length() == 0) {
1092 error("invalid.entref", nm);
1093 return new char[0];
1094 }
1095 /* given that there is not a match restore the entity reference */
1096 String str = "&" + nm + (semicolon ? ";" : "");
1097
1098 char b[] = new char[str.length()];
1099 str.getChars(0, b.length, b, 0);
1100 return b;
1101 }
1102 return ent.getData();
1103 }
1104
1105 /**
1106 * Converts numeric character reference to char array.
1107 *
1108 * Normally the code in a reference should be always converted
1109 * to the Unicode character with the same code, but due to
1110 * wide usage of Cp1252 charset most browsers map numeric references
1111 * in the range 130-159 (which are control chars in Unicode set)
1112 * to displayable characters with other codes.
1113 *
1114 * @param c the code of numeric character reference.
1115 * @return a char array corresponding to the reference code.
1116 */
1117 private char[] mapNumericReference(int c) {
1118 char[] data;
1234 case '>':
1235 ch = readCh();
1236 int i = textpos - (stack.elem.name.length() + 2), j = 0;
1237
1238 // match end tag
1239 if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) {
1240 while ((++i < textpos) &&
1241 (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++)));
1242 if (i == textpos) {
1243 textpos -= (stack.elem.name.length() + 2);
1244 if ((textpos > 0) && (text[textpos-1] == '\n')) {
1245 textpos--;
1246 }
1247 endTag(false);
1248 return;
1249 }
1250 }
1251 break;
1252
1253 case '&':
1254 char data[] = parseEntityReference();
1255 if (textpos + data.length > text.length) {
1256 char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
1257 System.arraycopy(text, 0, newtext, 0, text.length);
1258 text = newtext;
1259 }
1260 System.arraycopy(data, 0, text, textpos, data.length);
1261 textpos += data.length;
1262 continue;
1263
1264 case '\n':
1265 ln++;
1266 ch = readCh();
1267 lfCount++;
1268 break;
1269
1270 case '\r':
1271 ln++;
1272 if ((ch = readCh()) == '\n') {
1273 ch = readCh();
1274 crlfCount++;
1275 }
1276 else {
1277 crCount++;
1278 }
1279 c = '\n';
1280 break;
1281 default:
1282 ch = readCh();
1283 break;
1284 }
1285
1286 // output character
1287 if (textpos == text.length) {
1288 char newtext[] = new char[text.length + 128];
1289 System.arraycopy(text, 0, newtext, 0, text.length);
1290 text = newtext;
1291 }
1292 text[textpos++] = (char)c;
1293 }
1294 }
1295
1296 /**
1297 * Parse attribute value. [33] 331:1
1298 */
1299 @SuppressWarnings("fallthrough")
1300 String parseAttributeValue(boolean lower) throws IOException {
1301 int delim = -1;
1302
1303 // Check for a delimiter
1304 switch(ch) {
1305 case '\'':
1306 case '"':
1307 delim = ch;
1308 ch = readCh();
1378 is considered invalid since an = sign can only be contained
1379 in an attributes value if the string is quoted.
1380 */
1381 error("attvalerr");
1382 /* If strict is true then we return with the string we have thus far.
1383 Otherwise we accept the = sign as part of the attribute's value and
1384 process the rest of the img tag. */
1385 if (strict) {
1386 return getString(0);
1387 }
1388 }
1389 ch = readCh();
1390 break;
1391
1392 case '&':
1393 if (strict && delim < 0) {
1394 ch = readCh();
1395 break;
1396 }
1397
1398 char data[] = parseEntityReference();
1399 for (int i = 0 ; i < data.length ; i++) {
1400 c = data[i];
1401 addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c);
1402 }
1403 continue;
1404
1405 case -1:
1406 return getString(0);
1407
1408 default:
1409 if (lower && (c >= 'A') && (c <= 'Z')) {
1410 c = 'a' + c - 'A';
1411 }
1412 ch = readCh();
1413 break;
1414 }
1415 addString(c);
1416 }
1417 }
1418
1489 }
1490 skipSpace();
1491 if (ch == '=') {
1492 ch = readCh();
1493 skipSpace();
1494 att = elem.getAttribute(attname);
1495 attvalue = parseAttributeValue((att != null) &&
1496 (att.type != CDATA) &&
1497 (att.type != NOTATION));
1498 } else {
1499 attvalue = attname;
1500 att = elem.getAttributeByValue(attvalue);
1501 if (att == null) {
1502 att = elem.getAttribute(attname);
1503 if (att != null) {
1504 attvalue = att.getValue();
1505 }
1506 }
1507 }
1508 } else {
1509 char str[] = {(char)ch};
1510 error("invalid.tagchar", new String(str), elem.getName());
1511 ch = readCh();
1512 continue;
1513 }
1514 } else if (!strict && (attributes.isEmpty()) && (ch == '=')) {
1515 ch = readCh();
1516 skipSpace();
1517 attname = elem.getName();
1518 att = elem.getAttribute(attname);
1519 attvalue = parseAttributeValue((att != null) &&
1520 (att.type != CDATA) &&
1521 (att.type != NOTATION));
1522 } else if (!strict && (ch == '=')) {
1523 ch = readCh();
1524 skipSpace();
1525 attvalue = parseAttributeValue(true);
1526 error("attvalerr");
1527 return;
1528 } else {
1529 char str[] = {(char)ch};
1530 error("invalid.tagchar", new String(str), elem.getName());
1531 if (!strict) {
1532 ch = readCh();
1533 continue;
1534 } else {
1535 return;
1536 }
1537 }
1538
1539 if (att != null) {
1540 attname = att.getName();
1541 } else {
1542 error("invalid.tagatt", attname, elem.getName());
1543 }
1544
1545 // Check out the value
1546 if (attributes.isDefined(attname)) {
1547 error("multi.tagatt", attname, elem.getName());
1548 }
1549 if (attvalue == null) {
1657 boolean net = false;
1658 boolean warned = false;
1659 boolean unknown = false;
1660
1661 switch (ch = readCh()) {
1662 case '!':
1663 switch (ch = readCh()) {
1664 case '-':
1665 // Parse comment. [92] 391:7
1666 while (true) {
1667 if (ch == '-') {
1668 if (!strict || ((ch = readCh()) == '-')) {
1669 ch = readCh();
1670 if (!strict && ch == '-') {
1671 ch = readCh();
1672 }
1673 // send over any text you might see
1674 // before parsing and sending the
1675 // comment
1676 if (textpos != 0) {
1677 char newtext[] = new char[textpos];
1678 System.arraycopy(text, 0, newtext, 0, textpos);
1679 handleText(newtext);
1680 lastBlockStartPos = currentBlockStartPos;
1681 textpos = 0;
1682 }
1683 parseComment();
1684 last = makeTag(dtd.getElement("comment"), true);
1685 handleComment(getChars(0));
1686 continue;
1687 } else if (!warned) {
1688 warned = true;
1689 error("invalid.commentchar", "-");
1690 }
1691 }
1692 skipSpace();
1693 switch (ch) {
1694 case '-':
1695 continue;
1696 case '>':
1697 ch = readCh();
2207 error("unexpected.pcdata");
2208 }
2209 if (last.breaksFlow()) {
2210 space = false;
2211 }
2212 }
2213 break;
2214
2215 case -1:
2216 return;
2217
2218 case '&':
2219 if (textpos == 0) {
2220 if (!legalElementContext(dtd.pcdata)) {
2221 error("unexpected.pcdata");
2222 }
2223 if (last.breaksFlow()) {
2224 space = false;
2225 }
2226 }
2227 char data[] = parseEntityReference();
2228 if (textpos + data.length + 1 > text.length) {
2229 char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
2230 System.arraycopy(text, 0, newtext, 0, text.length);
2231 text = newtext;
2232 }
2233 if (space) {
2234 space = false;
2235 text[textpos++] = ' ';
2236 }
2237 System.arraycopy(data, 0, text, textpos, data.length);
2238 textpos += data.length;
2239 ignoreSpace = false;
2240 continue;
2241
2242 case '\n':
2243 ln++;
2244 lfCount++;
2245 ch = readCh();
2246 if ((stack != null) && stack.pre) {
2247 break;
2248 }
2249 if (textpos == 0) {
2289 space = true;
2290 }
2291 continue;
2292
2293 default:
2294 if (textpos == 0) {
2295 if (!legalElementContext(dtd.pcdata)) {
2296 error("unexpected.pcdata");
2297 }
2298 if (last.breaksFlow()) {
2299 space = false;
2300 }
2301 }
2302 ch = readCh();
2303 break;
2304 }
2305 }
2306
2307 // enlarge buffer if needed
2308 if (textpos + 2 > text.length) {
2309 char newtext[] = new char[text.length + 128];
2310 System.arraycopy(text, 0, newtext, 0, text.length);
2311 text = newtext;
2312 }
2313
2314 // output pending space
2315 if (space) {
2316 if (textpos == 0) {
2317 lastBlockStartPos--;
2318 }
2319 text[textpos++] = ' ';
2320 space = false;
2321 }
2322 text[textpos++] = (char)c;
2323 ignoreSpace = false;
2324 }
2325 }
2326
2327 /**
2328 * Returns the end of line string. This will return the end of line
2329 * string that has been encountered the most, one of \r, \n or \r\n.
2395 }
2396
2397 text = null;
2398 str = null;
2399 }
2400
2401 }
2402
2403
2404 /*
2405 * Input cache. This is much faster than calling down to a synchronized
2406 * method of BufferedReader for each byte. Measurements done 5/30/97
2407 * show that there's no point in having a bigger buffer: Increasing
2408 * the buffer to 8192 had no measurable impact for a program discarding
2409 * one character at a time (reading from an http URL to a local machine).
2410 * NOTE: If the current encoding is bogus, and we read too much
2411 * (past the content-type) we may suffer a MalformedInputException. For
2412 * this reason the initial size is 1 and when the body is encountered the
2413 * size is adjusted to 256.
2414 */
2415 private char buf[] = new char[1];
2416 private int pos;
2417 private int len;
2418 /*
2419 tracks position relative to the beginning of the
2420 document.
2421 */
2422 private int currentPosition;
2423
2424
2425 private int readCh() throws IOException {
2426
2427 if (pos >= len) {
2428
2429 // This loop allows us to ignore interrupts if the flag
2430 // says so
2431 for (;;) {
2432 try {
2433 len = in.read(buf);
2434 break;
2435 } catch (InterruptedIOException ex) {
|
1 /*
2 * Copyright (c) 1998, 2018, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
61 * <p>as well as:
62 * '<p><a href="xx"> <em>Using</em></a></p>'
63 * which appears to be treated as:
64 * '<p><a href="xx"><em>Using</em></a></p>'
65 * <p>
66 * If <code>strict</code> is false, when a tag that breaks flow,
67 * (<code>TagElement.breaksFlow</code>) or trailing whitespace is
68 * encountered, all whitespace will be ignored until a non-whitespace
69 * character is encountered. This appears to give behavior closer to
70 * the popular browsers.
71 *
72 * @see DTD
73 * @see TagElement
74 * @see SimpleAttributeSet
75 * @author Arthur van Hoff
76 * @author Sunita Mani
77 */
78 public
79 class Parser implements DTDConstants {
80
81 private char[] text = new char[1024];
82 private int textpos = 0;
83 private TagElement last;
84 private boolean space;
85
86 private char[] str = new char[128];
87 private int strpos = 0;
88
89 /**
90 * The dtd.
91 */
92 protected DTD dtd = null;
93
94 private int ch;
95 private int ln;
96 private Reader in;
97
98 private Element recent;
99 private TagStack stack;
100 private boolean skipTag = false;
101 private TagElement lastFormSent = null;
102 private SimpleAttributeSet attributes = new SimpleAttributeSet();
103
104 // State for <html>, <head> and <body>. Since people like to slap
105 // together HTML documents without thinking, occasionally they
106 // have multiple instances of these tags. These booleans track
256 * Returns attributes for the current tag.
257 *
258 * @return {@code SimpleAttributeSet} containing the attributes
259 */
260 protected SimpleAttributeSet getAttributes() {
261 return attributes;
262 }
263
264 /**
265 * Removes the current attributes.
266 */
267 protected void flushAttributes() {
268 attributes.removeAttributes(attributes);
269 }
270
271 /**
272 * Called when PCDATA is encountered.
273 *
274 * @param text the section text
275 */
276 protected void handleText(char[] text) {
277 }
278
279 /**
280 * Called when an HTML title tag is encountered.
281 *
282 * @param text the title text
283 */
284 protected void handleTitle(char[] text) {
285 // default behavior is to call handleText. Subclasses
286 // can override if necessary.
287 handleText(text);
288 }
289
290 /**
291 * Called when an HTML comment is encountered.
292 *
293 * @param text the comment being handled
294 */
295 protected void handleComment(char[] text) {
296 }
297
298 /**
299 * Called when the content terminates without closing the HTML comment.
300 */
301 protected void handleEOFInComment() {
302 // We've reached EOF. Our recovery strategy is to
303 // see if we have more than one line in the comment;
304 // if so, we pretend that the comment was an unterminated
305 // single line comment, and reparse the lines after the
306 // first line as normal HTML content.
307
308 int commentEndPos = strIndexOf('\n');
309 if (commentEndPos >= 0) {
310 handleComment(getChars(0, commentEndPos));
311 try {
312 in.close();
313 in = new CharArrayReader(getChars(commentEndPos + 1));
314 ch = '>';
315 } catch (IOException e) {
369 void handleText(TagElement tag) {
370 if (tag.breaksFlow()) {
371 space = false;
372 if (!strict) {
373 ignoreSpace = true;
374 }
375 }
376 if (textpos == 0) {
377 if ((!space) || (stack == null) || last.breaksFlow() ||
378 !stack.advance(dtd.pcdata)) {
379 last = tag;
380 space = false;
381 lastBlockStartPos = currentBlockStartPos;
382 return;
383 }
384 }
385 if (space) {
386 if (!ignoreSpace) {
387 // enlarge buffer if needed
388 if (textpos + 1 > text.length) {
389 char[] newtext = new char[text.length + 200];
390 System.arraycopy(text, 0, newtext, 0, text.length);
391 text = newtext;
392 }
393
394 // output pending space
395 text[textpos++] = ' ';
396 if (!strict && !tag.getElement().isEmpty()) {
397 ignoreSpace = true;
398 }
399 }
400 space = false;
401 }
402 char[] newtext = new char[textpos];
403 System.arraycopy(text, 0, newtext, 0, textpos);
404 // Handles cases of bad html where the title tag
405 // was getting lost when we did error recovery.
406 if (tag.getElement().getName().equals("title")) {
407 handleTitle(newtext);
408 } else {
409 handleText(newtext);
410 }
411 lastBlockStartPos = currentBlockStartPos;
412 textpos = 0;
413 last = tag;
414 space = false;
415 }
416
417 /**
418 * Invokes the error handler.
419 *
420 * @param err the error type
421 * @param arg1 the 1st error message argument
422 * @param arg2 the 2nd error message argument
820
821 /**
822 * Error context. Something went wrong, make sure we are in
823 * the document's body context
824 */
825 void errorContext() throws ChangedCharSetException {
826 for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) {
827 handleEndTag(stack.tag);
828 }
829 if (stack == null) {
830 legalElementContext(dtd.body);
831 startTag(makeTag(dtd.body, true));
832 }
833 }
834
835 /**
836 * Add a char to the string buffer.
837 */
838 void addString(int c) {
839 if (strpos == str.length) {
840 char[] newstr = new char[str.length + 128];
841 System.arraycopy(str, 0, newstr, 0, str.length);
842 str = newstr;
843 }
844 str[strpos++] = (char)c;
845 }
846
847 /**
848 * Get the string that's been accumulated.
849 */
850 String getString(int pos) {
851 char[] newStr = new char[strpos - pos];
852 System.arraycopy(str, pos, newStr, 0, strpos - pos);
853 strpos = pos;
854 return new String(newStr);
855 }
856
857 char[] getChars(int pos) {
858 char[] newStr = new char[strpos - pos];
859 System.arraycopy(str, pos, newStr, 0, strpos - pos);
860 strpos = pos;
861 return newStr;
862 }
863
864 char[] getChars(int pos, int endPos) {
865 char[] newStr = new char[endPos - pos];
866 System.arraycopy(str, pos, newStr, 0, endPos - pos);
867 // REMIND: it's not clear whether this version should set strpos or not
868 // strpos = pos;
869 return newStr;
870 }
871
872 void resetStrBuffer() {
873 strpos = 0;
874 }
875
876 int strIndexOf(char target) {
877 for (int i = 0; i < strpos; i++) {
878 if (str[i] == target) {
879 return i;
880 }
881 }
882
883 return -1;
884 }
885
1017 ln++;
1018 ch = readCh();
1019 lfCount++;
1020 break;
1021
1022 case '\r':
1023 ln++;
1024 if ((ch = readCh()) == '\n') {
1025 ch = readCh();
1026 crlfCount++;
1027 }
1028 else {
1029 crCount++;
1030 }
1031 break;
1032
1033 case ';':
1034 ch = readCh();
1035 break;
1036 }
1037 char[] data = mapNumericReference(n);
1038 return data;
1039 }
1040 addString('#');
1041 if (!parseIdentifier(false)) {
1042 error("ident.expected");
1043 strpos = pos;
1044 char[] data = {'&', '#'};
1045 return data;
1046 }
1047 } else if (!parseIdentifier(false)) {
1048 char[] data = {'&'};
1049 return data;
1050 }
1051
1052 boolean semicolon = false;
1053
1054 switch (ch) {
1055 case '\n':
1056 ln++;
1057 ch = readCh();
1058 lfCount++;
1059 break;
1060
1061 case '\r':
1062 ln++;
1063 if ((ch = readCh()) == '\n') {
1064 ch = readCh();
1065 crlfCount++;
1066 }
1067 else {
1068 crCount++;
1078
1079 String nm = getString(pos);
1080 Entity ent = dtd.getEntity(nm);
1081
1082 // entities are case sensitive - however if strict
1083 // is false then we will try to make a match by
1084 // converting the string to all lowercase.
1085 //
1086 if (!strict && (ent == null)) {
1087 ent = dtd.getEntity(nm.toLowerCase());
1088 }
1089 if ((ent == null) || !ent.isGeneral()) {
1090
1091 if (nm.length() == 0) {
1092 error("invalid.entref", nm);
1093 return new char[0];
1094 }
1095 /* given that there is not a match restore the entity reference */
1096 String str = "&" + nm + (semicolon ? ";" : "");
1097
1098 char[] b = new char[str.length()];
1099 str.getChars(0, b.length, b, 0);
1100 return b;
1101 }
1102 return ent.getData();
1103 }
1104
1105 /**
1106 * Converts numeric character reference to char array.
1107 *
1108 * Normally the code in a reference should be always converted
1109 * to the Unicode character with the same code, but due to
1110 * wide usage of Cp1252 charset most browsers map numeric references
1111 * in the range 130-159 (which are control chars in Unicode set)
1112 * to displayable characters with other codes.
1113 *
1114 * @param c the code of numeric character reference.
1115 * @return a char array corresponding to the reference code.
1116 */
1117 private char[] mapNumericReference(int c) {
1118 char[] data;
1234 case '>':
1235 ch = readCh();
1236 int i = textpos - (stack.elem.name.length() + 2), j = 0;
1237
1238 // match end tag
1239 if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) {
1240 while ((++i < textpos) &&
1241 (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++)));
1242 if (i == textpos) {
1243 textpos -= (stack.elem.name.length() + 2);
1244 if ((textpos > 0) && (text[textpos-1] == '\n')) {
1245 textpos--;
1246 }
1247 endTag(false);
1248 return;
1249 }
1250 }
1251 break;
1252
1253 case '&':
1254 char[] data = parseEntityReference();
1255 if (textpos + data.length > text.length) {
1256 char[] newtext = new char[Math.max(textpos + data.length + 128, text.length * 2)];
1257 System.arraycopy(text, 0, newtext, 0, text.length);
1258 text = newtext;
1259 }
1260 System.arraycopy(data, 0, text, textpos, data.length);
1261 textpos += data.length;
1262 continue;
1263
1264 case '\n':
1265 ln++;
1266 ch = readCh();
1267 lfCount++;
1268 break;
1269
1270 case '\r':
1271 ln++;
1272 if ((ch = readCh()) == '\n') {
1273 ch = readCh();
1274 crlfCount++;
1275 }
1276 else {
1277 crCount++;
1278 }
1279 c = '\n';
1280 break;
1281 default:
1282 ch = readCh();
1283 break;
1284 }
1285
1286 // output character
1287 if (textpos == text.length) {
1288 char[] newtext = new char[text.length + 128];
1289 System.arraycopy(text, 0, newtext, 0, text.length);
1290 text = newtext;
1291 }
1292 text[textpos++] = (char)c;
1293 }
1294 }
1295
1296 /**
1297 * Parse attribute value. [33] 331:1
1298 */
1299 @SuppressWarnings("fallthrough")
1300 String parseAttributeValue(boolean lower) throws IOException {
1301 int delim = -1;
1302
1303 // Check for a delimiter
1304 switch(ch) {
1305 case '\'':
1306 case '"':
1307 delim = ch;
1308 ch = readCh();
1378 is considered invalid since an = sign can only be contained
1379 in an attributes value if the string is quoted.
1380 */
1381 error("attvalerr");
1382 /* If strict is true then we return with the string we have thus far.
1383 Otherwise we accept the = sign as part of the attribute's value and
1384 process the rest of the img tag. */
1385 if (strict) {
1386 return getString(0);
1387 }
1388 }
1389 ch = readCh();
1390 break;
1391
1392 case '&':
1393 if (strict && delim < 0) {
1394 ch = readCh();
1395 break;
1396 }
1397
1398 char[] data = parseEntityReference();
1399 for (int i = 0 ; i < data.length ; i++) {
1400 c = data[i];
1401 addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c);
1402 }
1403 continue;
1404
1405 case -1:
1406 return getString(0);
1407
1408 default:
1409 if (lower && (c >= 'A') && (c <= 'Z')) {
1410 c = 'a' + c - 'A';
1411 }
1412 ch = readCh();
1413 break;
1414 }
1415 addString(c);
1416 }
1417 }
1418
1489 }
1490 skipSpace();
1491 if (ch == '=') {
1492 ch = readCh();
1493 skipSpace();
1494 att = elem.getAttribute(attname);
1495 attvalue = parseAttributeValue((att != null) &&
1496 (att.type != CDATA) &&
1497 (att.type != NOTATION));
1498 } else {
1499 attvalue = attname;
1500 att = elem.getAttributeByValue(attvalue);
1501 if (att == null) {
1502 att = elem.getAttribute(attname);
1503 if (att != null) {
1504 attvalue = att.getValue();
1505 }
1506 }
1507 }
1508 } else {
1509 char[] str = {(char)ch};
1510 error("invalid.tagchar", new String(str), elem.getName());
1511 ch = readCh();
1512 continue;
1513 }
1514 } else if (!strict && (attributes.isEmpty()) && (ch == '=')) {
1515 ch = readCh();
1516 skipSpace();
1517 attname = elem.getName();
1518 att = elem.getAttribute(attname);
1519 attvalue = parseAttributeValue((att != null) &&
1520 (att.type != CDATA) &&
1521 (att.type != NOTATION));
1522 } else if (!strict && (ch == '=')) {
1523 ch = readCh();
1524 skipSpace();
1525 attvalue = parseAttributeValue(true);
1526 error("attvalerr");
1527 return;
1528 } else {
1529 char[] str = {(char)ch};
1530 error("invalid.tagchar", new String(str), elem.getName());
1531 if (!strict) {
1532 ch = readCh();
1533 continue;
1534 } else {
1535 return;
1536 }
1537 }
1538
1539 if (att != null) {
1540 attname = att.getName();
1541 } else {
1542 error("invalid.tagatt", attname, elem.getName());
1543 }
1544
1545 // Check out the value
1546 if (attributes.isDefined(attname)) {
1547 error("multi.tagatt", attname, elem.getName());
1548 }
1549 if (attvalue == null) {
1657 boolean net = false;
1658 boolean warned = false;
1659 boolean unknown = false;
1660
1661 switch (ch = readCh()) {
1662 case '!':
1663 switch (ch = readCh()) {
1664 case '-':
1665 // Parse comment. [92] 391:7
1666 while (true) {
1667 if (ch == '-') {
1668 if (!strict || ((ch = readCh()) == '-')) {
1669 ch = readCh();
1670 if (!strict && ch == '-') {
1671 ch = readCh();
1672 }
1673 // send over any text you might see
1674 // before parsing and sending the
1675 // comment
1676 if (textpos != 0) {
1677 char[] newtext = new char[textpos];
1678 System.arraycopy(text, 0, newtext, 0, textpos);
1679 handleText(newtext);
1680 lastBlockStartPos = currentBlockStartPos;
1681 textpos = 0;
1682 }
1683 parseComment();
1684 last = makeTag(dtd.getElement("comment"), true);
1685 handleComment(getChars(0));
1686 continue;
1687 } else if (!warned) {
1688 warned = true;
1689 error("invalid.commentchar", "-");
1690 }
1691 }
1692 skipSpace();
1693 switch (ch) {
1694 case '-':
1695 continue;
1696 case '>':
1697 ch = readCh();
2207 error("unexpected.pcdata");
2208 }
2209 if (last.breaksFlow()) {
2210 space = false;
2211 }
2212 }
2213 break;
2214
2215 case -1:
2216 return;
2217
2218 case '&':
2219 if (textpos == 0) {
2220 if (!legalElementContext(dtd.pcdata)) {
2221 error("unexpected.pcdata");
2222 }
2223 if (last.breaksFlow()) {
2224 space = false;
2225 }
2226 }
2227 char[] data = parseEntityReference();
2228 if (textpos + data.length + 1 > text.length) {
2229 char[] newtext = new char[Math.max(textpos + data.length + 128, text.length * 2)];
2230 System.arraycopy(text, 0, newtext, 0, text.length);
2231 text = newtext;
2232 }
2233 if (space) {
2234 space = false;
2235 text[textpos++] = ' ';
2236 }
2237 System.arraycopy(data, 0, text, textpos, data.length);
2238 textpos += data.length;
2239 ignoreSpace = false;
2240 continue;
2241
2242 case '\n':
2243 ln++;
2244 lfCount++;
2245 ch = readCh();
2246 if ((stack != null) && stack.pre) {
2247 break;
2248 }
2249 if (textpos == 0) {
2289 space = true;
2290 }
2291 continue;
2292
2293 default:
2294 if (textpos == 0) {
2295 if (!legalElementContext(dtd.pcdata)) {
2296 error("unexpected.pcdata");
2297 }
2298 if (last.breaksFlow()) {
2299 space = false;
2300 }
2301 }
2302 ch = readCh();
2303 break;
2304 }
2305 }
2306
2307 // enlarge buffer if needed
2308 if (textpos + 2 > text.length) {
2309 char[] newtext = new char[text.length + 128];
2310 System.arraycopy(text, 0, newtext, 0, text.length);
2311 text = newtext;
2312 }
2313
2314 // output pending space
2315 if (space) {
2316 if (textpos == 0) {
2317 lastBlockStartPos--;
2318 }
2319 text[textpos++] = ' ';
2320 space = false;
2321 }
2322 text[textpos++] = (char)c;
2323 ignoreSpace = false;
2324 }
2325 }
2326
2327 /**
2328 * Returns the end of line string. This will return the end of line
2329 * string that has been encountered the most, one of \r, \n or \r\n.
2395 }
2396
2397 text = null;
2398 str = null;
2399 }
2400
2401 }
2402
2403
2404 /*
2405 * Input cache. This is much faster than calling down to a synchronized
2406 * method of BufferedReader for each byte. Measurements done 5/30/97
2407 * show that there's no point in having a bigger buffer: Increasing
2408 * the buffer to 8192 had no measurable impact for a program discarding
2409 * one character at a time (reading from an http URL to a local machine).
2410 * NOTE: If the current encoding is bogus, and we read too much
2411 * (past the content-type) we may suffer a MalformedInputException. For
2412 * this reason the initial size is 1 and when the body is encountered the
2413 * size is adjusted to 256.
2414 */
2415 private char[] buf = new char[1];
2416 private int pos;
2417 private int len;
2418 /*
2419 tracks position relative to the beginning of the
2420 document.
2421 */
2422 private int currentPosition;
2423
2424
2425 private int readCh() throws IOException {
2426
2427 if (pos >= len) {
2428
2429 // This loop allows us to ignore interrupts if the flag
2430 // says so
2431 for (;;) {
2432 try {
2433 len = in.read(buf);
2434 break;
2435 } catch (InterruptedIOException ex) {
|