< prev index next >

src/java.desktop/share/classes/javax/swing/text/html/parser/Parser.java

Print this page


   1 /*
   2  * Copyright (c) 1998, 2017, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any


  61  * <p>as well as:
  62  * '&lt;p&gt;&lt;a href="xx"&gt;&nbsp;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
  63  * which appears to be treated as:
  64  * '&lt;p&gt;&lt;a href="xx"&gt;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
  65  * <p>
  66  * If <code>strict</code> is false, when a tag that breaks flow,
  67  * (<code>TagElement.breaksFlow</code>) or trailing whitespace is
  68  * encountered, all whitespace will be ignored until a non-whitespace
  69  * character is encountered. This appears to give behavior closer to
  70  * the popular browsers.
  71  *
  72  * @see DTD
  73  * @see TagElement
  74  * @see SimpleAttributeSet
  75  * @author Arthur van Hoff
  76  * @author Sunita Mani
  77  */
  78 public
  79 class Parser implements DTDConstants {
  80 
  81     private char text[] = new char[1024];
         // text[0..textpos) holds the character data collected so far; it is
         // copied out, and textpos reset to 0, when the text is handed to handleText.
  82     private int textpos = 0;
         // Tag most recently recorded by handleText(TagElement); its breaksFlow()
         // result is consulted when new text begins.
  83     private TagElement last;
         // True while one space is pending and has not yet been written into text.
  84     private boolean space;
  85 
         // str[0..strpos) is the scratch buffer appended to by addString and read
         // back through getString/getChars.
  86     private char str[] = new char[128];
  87     private int strpos = 0;
  88 
  89     /**
  90      * The DTD used for element and entity lookups while parsing.
  91      */
  92     protected DTD dtd = null;
  93 
         // Current input character, assigned from readCh(); the visible parse
         // loops return when they see -1 (presumably end of input).
  94     private int ch;
         // Line counter: incremented for each "\n", "\r" or "\r\n" consumed.
  95     private int ln;
         // Source of characters; readCh() refills its cache from this reader.
  96     private Reader in;
  97 
  98     private Element recent;
         // Top of the open-element stack; entries are linked through 'next'.
  99     private TagStack stack;
 100     private boolean skipTag = false;
 101     private TagElement lastFormSent = null;
         // Attribute set handed out by getAttributes() and emptied by flushAttributes().
 102     private SimpleAttributeSet attributes = new SimpleAttributeSet();
 103 
 104     // State for <html>, <head> and <body>.  Since people like to slap
 105     // together HTML documents without thinking, occasionally they
 106     // have multiple instances of these tags.  These booleans track


 256      * Returns attributes for the current tag.
 257      *
 258      * @return {@code SimpleAttributeSet} containing the attributes
 259      */
 260     protected SimpleAttributeSet getAttributes() {
             // Returns the parser's own attribute set, not a copy, so a later
             // flushAttributes() call empties the object the caller is holding.
 261         return attributes;
 262     }
 263 
 264     /**
 265      * Removes the current attributes.
          * The attribute set object itself is kept and reused; only its
          * contents are discarded.
 266      */
 267     protected void flushAttributes() {
             // The set is passed to itself, i.e. "remove everything you hold".
 268         attributes.removeAttributes(attributes);
 269     }
 270 
 271     /**
 272      * Called when PCDATA is encountered.
          * This implementation does nothing; subclasses override it
          * to receive the text.
 273      *
 274      * @param text  the section text
 275      */
 276     protected void handleText(char text[]) {
 277     }
 278 
 279     /**
 280      * Called when an HTML title tag is encountered.
          * The parser routes buffered text here, instead of to
          * {@code handleText}, when the element being handled is
          * named "title".
 281      *
 282      * @param text  the title text
 283      */
 284     protected void handleTitle(char text[]) {
 285         // default behavior is to call handleText. Subclasses
 286         // can override if necessary.
 287         handleText(text);
 288     }
 289 
 290     /**
 291      * Called when an HTML comment is encountered.
          * This implementation does nothing; subclasses override it
          * to receive the comment text.
 292      *
 293      * @param text  the comment being handled
 294      */
 295     protected void handleComment(char text[]) {
 296     }
 297 
 298     /**
 299      * Called when the content terminates without closing the HTML comment.
 300      */
 301     protected void handleEOFInComment() {
 302         // We've reached EOF.  Our recovery strategy is to
 303         // see if we have more than one line in the comment;
 304         // if so, we pretend that the comment was an unterminated
 305         // single line comment, and reparse the lines after the
 306         // first line as normal HTML content.
 307 
 308         int commentEndPos = strIndexOf('\n');
 309         if (commentEndPos >= 0) {
 310             handleComment(getChars(0, commentEndPos));
 311             try {
 312                 in.close();
 313                 in = new CharArrayReader(getChars(commentEndPos + 1));
 314                 ch = '>';
 315             } catch (IOException e) {


 369     void handleText(TagElement tag) {
             // Flushes the buffered text, text[0..textpos), to handleTitle/handleText
             // before 'tag' is processed, applying the pending-space rules on the way.

             // A flow-breaking tag cancels any pending space; in non-strict mode it
             // also switches on whitespace suppression.
 370         if (tag.breaksFlow()) {
 371             space = false;
 372             if (!strict) {
 373                 ignoreSpace = true;
 374             }
 375         }
             // Nothing is buffered. The only thing that could still be emitted is a
             // lone pending space, and that is dropped when no space is pending, there
             // is no open-element stack, the previous tag broke the flow, or
             // stack.advance(dtd.pcdata) fails (presumably: PCDATA is not allowed
             // here -- confirm against TagStack). In those cases just record the tag.
 376         if (textpos == 0) {
 377             if ((!space) || (stack == null) || last.breaksFlow() ||
 378                 !stack.advance(dtd.pcdata)) {
 379                 last = tag;
 380                 space = false;
 381                 lastBlockStartPos = currentBlockStartPos;
 382                 return;
 383             }
 384         }
             // A pending space is appended unless whitespace is being ignored; either
             // way the pending flag is cleared afterwards.
 385         if (space) {
 386             if (!ignoreSpace) {
 387                 // enlarge buffer if needed
 388                 if (textpos + 1 > text.length) {
 389                     char newtext[] = new char[text.length + 200];
 390                     System.arraycopy(text, 0, newtext, 0, text.length);
 391                     text = newtext;
 392                 }
 393 
 394                 // output pending space
 395                 text[textpos++] = ' ';
                     // In non-strict mode, once a space has been written ahead of a
                     // tag whose element is not empty, ignore the whitespace that follows.
 396                 if (!strict && !tag.getElement().isEmpty()) {
 397                     ignoreSpace = true;
 398                 }
 399             }
 400             space = false;
 401         }
             // Hand the callback an array sized exactly to the buffered text.
 402         char newtext[] = new char[textpos];
 403         System.arraycopy(text, 0, newtext, 0, textpos);
 404         // Handles cases of bad html where the title tag
 405         // was getting lost when we did error recovery.
 406         if (tag.getElement().getName().equals("title")) {
 407             handleTitle(newtext);
 408         } else {
 409             handleText(newtext);
 410         }
             // Reset the text buffer and remember this tag as the last one seen.
 411         lastBlockStartPos = currentBlockStartPos;
 412         textpos = 0;
 413         last = tag;
 414         space = false;
 415     }
 416 
 417     /**
 418      * Invokes the error handler.
 419      *
 420      * @param err   the error type
 421      * @param arg1  the 1st error message argument
 422      * @param arg2  the 2nd error message argument


 820 
 821     /**
 822      * Error context. Something went wrong, make sure we are in
 823      * the document's body context
          * <p>
          * Every open element above the body element is reported through
          * {@code handleEndTag} and dropped from the stack. If the stack
          * runs out without reaching a body element, a body tag is started.
 824      */
 825     void errorContext() throws ChangedCharSetException {
             // Pops directly via stack = stack.next; stops at the body element
             // (left on the stack) or when the stack is exhausted.
 826         for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) {
 827             handleEndTag(stack.tag);
 828         }
             // No body element was open: establish one.
 829         if (stack == null) {
 830             legalElementContext(dtd.body);
 831             startTag(makeTag(dtd.body, true));
 832         }
 833     }
 834 
 835     /**
 836      * Add a char to the string buffer.
          * When the buffer is full it is first replaced by a copy that is
          * 128 chars longer.
          *
          * @param c  the character to append; it is narrowed to {@code char}
 837      */
 838     void addString(int c) {
 839         if (strpos  == str.length) {
 840             char newstr[] = new char[str.length + 128];
 841             System.arraycopy(str, 0, newstr, 0, str.length);
 842             str = newstr;
 843         }
 844         str[strpos++] = (char)c;
 845     }
 846 
 847     /**
 848      * Get the string that's been accumulated.
          * The buffer is truncated back to {@code pos} afterwards, so the
          * returned characters are consumed.
          *
          * @param pos  index in the string buffer where the wanted text starts
          * @return the buffered chars from {@code pos} up to the current end
 849      */
 850     String getString(int pos) {
 851         char newStr[] = new char[strpos - pos];
 852         System.arraycopy(str, pos, newStr, 0, strpos - pos);
             // Consume what was just copied out.
 853         strpos = pos;
 854         return new String(newStr);
 855     }
 856 
 857     char[] getChars(int pos) {
             // Returns a copy of str[pos..strpos) and then truncates the string
             // buffer back to pos, so the returned characters are consumed.
 858         char newStr[] = new char[strpos - pos];
 859         System.arraycopy(str, pos, newStr, 0, strpos - pos);
 860         strpos = pos;
 861         return newStr;
 862     }
 863 
 864     char[] getChars(int pos, int endPos) {
             // Returns a copy of str[pos..endPos). Unlike getChars(int), this
             // overload leaves strpos untouched, so nothing is consumed.
 865         char newStr[] = new char[endPos - pos];
 866         System.arraycopy(str, pos, newStr, 0, endPos - pos);
 867         // REMIND: it's not clear whether this version should set strpos or not
 868         // strpos = pos;
 869         return newStr;
 870     }
 871 
 872     void resetStrBuffer() {
             // Discards the accumulated string by rewinding the fill index;
             // the backing array is kept as it is.
 873         strpos = 0;
 874     }
 875 
 876     int strIndexOf(char target) {
             // Linear scan of the filled part of the string buffer, str[0..strpos);
             // returns the index of the first match, or -1 when target is absent.
 877         for (int i = 0; i < strpos; i++) {
 878             if (str[i] == target) {
 879                 return i;
 880             }
 881         }
 882 
 883         return -1;
 884     }
 885 


1017                         ln++;
1018                         ch = readCh();
1019                         lfCount++;
1020                         break;
1021 
1022                     case '\r':
1023                         ln++;
1024                         if ((ch = readCh()) == '\n') {
1025                             ch = readCh();
1026                             crlfCount++;
1027                         }
1028                         else {
1029                             crCount++;
1030                         }
1031                         break;
1032 
1033                     case ';':
1034                         ch = readCh();
1035                         break;
1036                 }
1037                 char data[] = mapNumericReference(n);
1038                 return data;
1039             }
1040             addString('#');
1041             if (!parseIdentifier(false)) {
1042                 error("ident.expected");
1043                 strpos = pos;
1044                 char data[] = {'&', '#'};
1045                 return data;
1046             }
1047         } else if (!parseIdentifier(false)) {
1048             char data[] = {'&'};
1049             return data;
1050         }
1051 
1052         boolean semicolon = false;
1053 
1054         switch (ch) {
1055           case '\n':
1056             ln++;
1057             ch = readCh();
1058             lfCount++;
1059             break;
1060 
1061           case '\r':
1062             ln++;
1063             if ((ch = readCh()) == '\n') {
1064                 ch = readCh();
1065                 crlfCount++;
1066             }
1067             else {
1068                 crCount++;


1078 
1079         String nm = getString(pos);
1080         Entity ent = dtd.getEntity(nm);
1081 
1082         // entities are case sensitive - however if strict
1083         // is false then we will try to make a match by
1084         // converting the string to all lowercase.
1085         //
1086         if (!strict && (ent == null)) {
1087             ent = dtd.getEntity(nm.toLowerCase());
1088         }
1089         if ((ent == null) || !ent.isGeneral()) {
1090 
1091             if (nm.length() == 0) {
1092                 error("invalid.entref", nm);
1093                 return new char[0];
1094             }
1095             /* given that there is not a match restore the entity reference */
1096             String str = "&" + nm + (semicolon ? ";" : "");
1097 
1098             char b[] = new char[str.length()];
1099             str.getChars(0, b.length, b, 0);
1100             return b;
1101         }
1102         return ent.getData();
1103     }
1104 
1105     /**
1106      * Converts numeric character reference to char array.
1107      *
1108      * Normally the code in a reference should be always converted
1109      * to the Unicode character with the same code, but due to
1110      * wide usage of Cp1252 charset most browsers map numeric references
1111      * in the range 130-159 (which are control chars in Unicode set)
1112      * to displayable characters with other codes.
1113      *
1114      * @param c the code of numeric character reference.
1115      * @return a char array corresponding to the reference code.
1116      */
1117     private char[] mapNumericReference(int c) {
1118         char[] data;


1234               case '>':
1235                 ch = readCh();
1236                 int i = textpos - (stack.elem.name.length() + 2), j = 0;
1237 
1238                 // match end tag
1239                 if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) {
1240                     while ((++i < textpos) &&
1241                            (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++)));
1242                     if (i == textpos) {
1243                         textpos -= (stack.elem.name.length() + 2);
1244                         if ((textpos > 0) && (text[textpos-1] == '\n')) {
1245                             textpos--;
1246                         }
1247                         endTag(false);
1248                         return;
1249                     }
1250                 }
1251                 break;
1252 
1253               case '&':
1254                 char data[] = parseEntityReference();
1255                 if (textpos + data.length > text.length) {
1256                     char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
1257                     System.arraycopy(text, 0, newtext, 0, text.length);
1258                     text = newtext;
1259                 }
1260                 System.arraycopy(data, 0, text, textpos, data.length);
1261                 textpos += data.length;
1262                 continue;
1263 
1264               case '\n':
1265                 ln++;
1266                 ch = readCh();
1267                 lfCount++;
1268                 break;
1269 
1270               case '\r':
1271                 ln++;
1272                 if ((ch = readCh()) == '\n') {
1273                     ch = readCh();
1274                     crlfCount++;
1275                 }
1276                 else {
1277                     crCount++;
1278                 }
1279                 c = '\n';
1280                 break;
1281               default:
1282                 ch = readCh();
1283                 break;
1284             }
1285 
1286             // output character
1287             if (textpos == text.length) {
1288                 char newtext[] = new char[text.length + 128];
1289                 System.arraycopy(text, 0, newtext, 0, text.length);
1290                 text = newtext;
1291             }
1292             text[textpos++] = (char)c;
1293         }
1294     }
1295 
1296     /**
1297      * Parse attribute value. [33] 331:1
1298      */
1299     @SuppressWarnings("fallthrough")
1300     String parseAttributeValue(boolean lower) throws IOException {
1301         int delim = -1;
1302 
1303         // Check for a delimiter
1304         switch(ch) {
1305           case '\'':
1306           case '"':
1307             delim = ch;
1308             ch = readCh();


1378                        is considered invalid since an = sign can only be contained
1379                        in an attributes value if the string is quoted.
1380                        */
1381                     error("attvalerr");
1382                     /* If strict is true then we return with the string we have thus far.
1383                        Otherwise we accept the = sign as part of the attribute's value and
1384                        process the rest of the img tag. */
1385                     if (strict) {
1386                         return getString(0);
1387                     }
1388                 }
1389                 ch = readCh();
1390                 break;
1391 
1392               case '&':
1393                 if (strict && delim < 0) {
1394                     ch = readCh();
1395                     break;
1396                 }
1397 
1398                 char data[] = parseEntityReference();
1399                 for (int i = 0 ; i < data.length ; i++) {
1400                     c = data[i];
1401                     addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c);
1402                 }
1403                 continue;
1404 
1405               case -1:
1406                 return getString(0);
1407 
1408               default:
1409                 if (lower && (c >= 'A') && (c <= 'Z')) {
1410                     c = 'a' + c - 'A';
1411                 }
1412                 ch = readCh();
1413                 break;
1414             }
1415             addString(c);
1416         }
1417     }
1418 


1489                     }
1490                     skipSpace();
1491                     if (ch == '=') {
1492                         ch = readCh();
1493                         skipSpace();
1494                         att = elem.getAttribute(attname);
1495                         attvalue = parseAttributeValue((att != null) &&
1496                                                 (att.type != CDATA) &&
1497                                                 (att.type != NOTATION));
1498                     } else {
1499                         attvalue = attname;
1500                         att = elem.getAttributeByValue(attvalue);
1501                         if (att == null) {
1502                             att = elem.getAttribute(attname);
1503                             if (att != null) {
1504                                 attvalue = att.getValue();
1505                             }
1506                         }
1507                     }
1508                 } else {
1509                     char str[] = {(char)ch};
1510                     error("invalid.tagchar", new String(str), elem.getName());
1511                     ch = readCh();
1512                     continue;
1513                 }
1514             } else if (!strict && (attributes.isEmpty()) && (ch == '=')) {
1515                 ch = readCh();
1516                 skipSpace();
1517                 attname = elem.getName();
1518                 att = elem.getAttribute(attname);
1519                 attvalue = parseAttributeValue((att != null) &&
1520                                                (att.type != CDATA) &&
1521                                                (att.type != NOTATION));
1522             } else if (!strict && (ch == '=')) {
1523                 ch = readCh();
1524                 skipSpace();
1525                 attvalue = parseAttributeValue(true);
1526                 error("attvalerr");
1527                 return;
1528             } else {
1529                 char str[] = {(char)ch};
1530                 error("invalid.tagchar", new String(str), elem.getName());
1531                 if (!strict) {
1532                     ch = readCh();
1533                     continue;
1534                 } else {
1535                     return;
1536                 }
1537             }
1538 
1539             if (att != null) {
1540                 attname = att.getName();
1541             } else {
1542                 error("invalid.tagatt", attname, elem.getName());
1543             }
1544 
1545             // Check out the value
1546             if (attributes.isDefined(attname)) {
1547                 error("multi.tagatt", attname, elem.getName());
1548             }
1549             if (attvalue == null) {


1657         boolean net = false;
1658         boolean warned = false;
1659         boolean unknown = false;
1660 
1661         switch (ch = readCh()) {
1662           case '!':
1663             switch (ch = readCh()) {
1664               case '-':
1665                 // Parse comment. [92] 391:7
1666                 while (true) {
1667                     if (ch == '-') {
1668                         if (!strict || ((ch = readCh()) == '-')) {
1669                             ch = readCh();
1670                             if (!strict && ch == '-') {
1671                                 ch = readCh();
1672                             }
1673                             // send over any text you might see
1674                             // before parsing and sending the
1675                             // comment
1676                             if (textpos != 0) {
1677                                 char newtext[] = new char[textpos];
1678                                 System.arraycopy(text, 0, newtext, 0, textpos);
1679                                 handleText(newtext);
1680                                 lastBlockStartPos = currentBlockStartPos;
1681                                 textpos = 0;
1682                             }
1683                             parseComment();
1684                             last = makeTag(dtd.getElement("comment"), true);
1685                             handleComment(getChars(0));
1686                             continue;
1687                         } else if (!warned) {
1688                             warned = true;
1689                             error("invalid.commentchar", "-");
1690                         }
1691                     }
1692                     skipSpace();
1693                     switch (ch) {
1694                       case '-':
1695                         continue;
1696                       case '>':
1697                         ch = readCh();


2207                             error("unexpected.pcdata");
2208                         }
2209                         if (last.breaksFlow()) {
2210                             space = false;
2211                         }
2212                     }
2213                     break;
2214 
2215                   case -1:
2216                     return;
2217 
2218                   case '&':
2219                     if (textpos == 0) {
2220                         if (!legalElementContext(dtd.pcdata)) {
2221                             error("unexpected.pcdata");
2222                         }
2223                         if (last.breaksFlow()) {
2224                             space = false;
2225                         }
2226                     }
2227                     char data[] = parseEntityReference();
2228                     if (textpos + data.length + 1 > text.length) {
2229                         char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
2230                         System.arraycopy(text, 0, newtext, 0, text.length);
2231                         text = newtext;
2232                     }
2233                     if (space) {
2234                         space = false;
2235                         text[textpos++] = ' ';
2236                     }
2237                     System.arraycopy(data, 0, text, textpos, data.length);
2238                     textpos += data.length;
2239                     ignoreSpace = false;
2240                     continue;
2241 
2242                   case '\n':
2243                     ln++;
2244                     lfCount++;
2245                     ch = readCh();
2246                     if ((stack != null) && stack.pre) {
2247                         break;
2248                     }
2249                     if (textpos == 0) {


2289                         space = true;
2290                     }
2291                     continue;
2292 
2293                   default:
2294                     if (textpos == 0) {
2295                         if (!legalElementContext(dtd.pcdata)) {
2296                             error("unexpected.pcdata");
2297                         }
2298                         if (last.breaksFlow()) {
2299                             space = false;
2300                         }
2301                     }
2302                     ch = readCh();
2303                     break;
2304                 }
2305             }
2306 
2307             // enlarge buffer if needed
2308             if (textpos + 2 > text.length) {
2309                 char newtext[] = new char[text.length + 128];
2310                 System.arraycopy(text, 0, newtext, 0, text.length);
2311                 text = newtext;
2312             }
2313 
2314             // output pending space
2315             if (space) {
2316                 if (textpos == 0) {
2317                     lastBlockStartPos--;
2318                 }
2319                 text[textpos++] = ' ';
2320                 space = false;
2321             }
2322             text[textpos++] = (char)c;
2323             ignoreSpace = false;
2324         }
2325     }
2326 
2327     /**
2328      * Returns the end of line string. This will return the end of line
2329      * string that has been encountered the most, one of \r, \n or \r\n.


2395             }
2396 
2397             text = null;
2398             str = null;
2399         }
2400 
2401     }
2402 
2403 
2404     /*
2405      * Input cache.  This is much faster than calling down to a synchronized
2406      * method of BufferedReader for each byte.  Measurements done 5/30/97
2407      * show that there's no point in having a bigger buffer:  Increasing
2408      * the buffer to 8192 had no measurable impact for a program discarding
2409      * one character at a time (reading from an http URL to a local machine).
2410      * NOTE: If the current encoding is bogus, and we read too much
2411      * (past the content-type) we may suffer a MalformedInputException. For
2412      * this reason the initial size is 1 and when the body is encountered the
2413      * size is adjusted to 256.
2414      */
2415     private char buf[] = new char[1];
         // Read index into buf; readCh() refills the cache once pos >= len.
2416     private int pos;
         // Result of the most recent in.read(buf) call.
2417     private int len;
2418     /*
2419         tracks position relative to the beginning of the
2420         document.
2421     */
2422     private int currentPosition;
2423 
2424 
2425     private int readCh() throws IOException {
2426 
2427         if (pos >= len) {
2428 
2429             // This loop allows us to ignore interrupts if the flag
2430             // says so
2431             for (;;) {
2432                 try {
2433                     len = in.read(buf);
2434                     break;
2435                 } catch (InterruptedIOException ex) {


   1 /*
   2  * Copyright (c) 1998, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any


  61  * <p>as well as:
  62  * '&lt;p&gt;&lt;a href="xx"&gt;&nbsp;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
  63  * which appears to be treated as:
  64  * '&lt;p&gt;&lt;a href="xx"&gt;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
  65  * <p>
  66  * If <code>strict</code> is false, when a tag that breaks flow,
  67  * (<code>TagElement.breaksFlow</code>) or trailing whitespace is
  68  * encountered, all whitespace will be ignored until a non-whitespace
  69  * character is encountered. This appears to give behavior closer to
  70  * the popular browsers.
  71  *
  72  * @see DTD
  73  * @see TagElement
  74  * @see SimpleAttributeSet
  75  * @author Arthur van Hoff
  76  * @author Sunita Mani
  77  */
  78 public
  79 class Parser implements DTDConstants {
  80 
  81     private char[] text = new char[1024];
         // text[0..textpos) holds the character data collected so far; it is
         // copied out, and textpos reset to 0, when the text is handed to handleText.
  82     private int textpos = 0;
         // Tag most recently recorded by handleText(TagElement); its breaksFlow()
         // result is consulted when new text begins.
  83     private TagElement last;
         // True while one space is pending and has not yet been written into text.
  84     private boolean space;
  85 
         // Scratch string buffer; strpos counts the chars currently held in it.
  86     private char[] str = new char[128];
  87     private int strpos = 0;
  88 
  89     /**
  90      * The DTD used for element and entity lookups while parsing.
  91      */
  92     protected DTD dtd = null;
  93 
  94     private int ch;
  95     private int ln;
  96     private Reader in;
  97 
  98     private Element recent;
         // Top of the open-element stack; entries are linked through 'next'.
  99     private TagStack stack;
 100     private boolean skipTag = false;
 101     private TagElement lastFormSent = null;
         // Attribute set handed out by getAttributes() and emptied by flushAttributes().
 102     private SimpleAttributeSet attributes = new SimpleAttributeSet();
 103 
 104     // State for <html>, <head> and <body>.  Since people like to slap
 105     // together HTML documents without thinking, occasionally they
 106     // have multiple instances of these tags.  These booleans track


 256      * Returns attributes for the current tag.
 257      *
 258      * @return {@code SimpleAttributeSet} containing the attributes
 259      */
 260     protected SimpleAttributeSet getAttributes() {
             // Returns the parser's own attribute set, not a copy, so a later
             // flushAttributes() call empties the object the caller is holding.
 261         return attributes;
 262     }
 263 
 264     /**
 265      * Removes the current attributes.
          * The attribute set object itself is kept and reused; only its
          * contents are discarded.
 266      */
 267     protected void flushAttributes() {
             // The set is passed to itself, i.e. "remove everything you hold".
 268         attributes.removeAttributes(attributes);
 269     }
 270 
 271     /**
 272      * Called when PCDATA is encountered.
          * This implementation does nothing; subclasses override it
          * to receive the text.
 273      *
 274      * @param text  the section text
 275      */
 276     protected void handleText(char[] text) {
 277     }
 278 
 279     /**
 280      * Called when an HTML title tag is encountered.
          * The parser routes buffered text here, instead of to
          * {@code handleText}, when the element being handled is
          * named "title".
 281      *
 282      * @param text  the title text
 283      */
 284     protected void handleTitle(char[] text) {
 285         // default behavior is to call handleText. Subclasses
 286         // can override if necessary.
 287         handleText(text);
 288     }
 289 
 290     /**
 291      * Called when an HTML comment is encountered.
          * This implementation does nothing; subclasses override it
          * to receive the comment text.
 292      *
 293      * @param text  the comment being handled
 294      */
 295     protected void handleComment(char[] text) {
 296     }
 297 
 298     /**
 299      * Called when the content terminates without closing the HTML comment.
 300      */
 301     protected void handleEOFInComment() {
 302         // We've reached EOF.  Our recovery strategy is to
 303         // see if we have more than one line in the comment;
 304         // if so, we pretend that the comment was an unterminated
 305         // single line comment, and reparse the lines after the
 306         // first line as normal HTML content.
 307 
 308         int commentEndPos = strIndexOf('\n');
 309         if (commentEndPos >= 0) {
 310             handleComment(getChars(0, commentEndPos));
 311             try {
 312                 in.close();
 313                 in = new CharArrayReader(getChars(commentEndPos + 1));
 314                 ch = '>';
 315             } catch (IOException e) {


 369     void handleText(TagElement tag) {
             // Flushes the buffered text, text[0..textpos), to handleTitle/handleText
             // before 'tag' is processed, applying the pending-space rules on the way.

             // A flow-breaking tag cancels any pending space; in non-strict mode it
             // also switches on whitespace suppression.
 370         if (tag.breaksFlow()) {
 371             space = false;
 372             if (!strict) {
 373                 ignoreSpace = true;
 374             }
 375         }
             // Nothing is buffered. The only thing that could still be emitted is a
             // lone pending space, and that is dropped when no space is pending, there
             // is no open-element stack, the previous tag broke the flow, or
             // stack.advance(dtd.pcdata) fails (presumably: PCDATA is not allowed
             // here -- confirm against TagStack). In those cases just record the tag.
 376         if (textpos == 0) {
 377             if ((!space) || (stack == null) || last.breaksFlow() ||
 378                 !stack.advance(dtd.pcdata)) {
 379                 last = tag;
 380                 space = false;
 381                 lastBlockStartPos = currentBlockStartPos;
 382                 return;
 383             }
 384         }
             // A pending space is appended unless whitespace is being ignored; either
             // way the pending flag is cleared afterwards.
 385         if (space) {
 386             if (!ignoreSpace) {
 387                 // enlarge buffer if needed
 388                 if (textpos + 1 > text.length) {
 389                     char[] newtext = new char[text.length + 200];
 390                     System.arraycopy(text, 0, newtext, 0, text.length);
 391                     text = newtext;
 392                 }
 393 
 394                 // output pending space
 395                 text[textpos++] = ' ';
                     // In non-strict mode, once a space has been written ahead of a
                     // tag whose element is not empty, ignore the whitespace that follows.
 396                 if (!strict && !tag.getElement().isEmpty()) {
 397                     ignoreSpace = true;
 398                 }
 399             }
 400             space = false;
 401         }
             // Hand the callback an array sized exactly to the buffered text.
 402         char[] newtext = new char[textpos];
 403         System.arraycopy(text, 0, newtext, 0, textpos);
 404         // Handles cases of bad html where the title tag
 405         // was getting lost when we did error recovery.
 406         if (tag.getElement().getName().equals("title")) {
 407             handleTitle(newtext);
 408         } else {
 409             handleText(newtext);
 410         }
             // Reset the text buffer and remember this tag as the last one seen.
 411         lastBlockStartPos = currentBlockStartPos;
 412         textpos = 0;
 413         last = tag;
 414         space = false;
 415     }
 416 
 417     /**
 418      * Invokes the error handler.
 419      *
 420      * @param err   the error type
 421      * @param arg1  the 1st error message argument
 422      * @param arg2  the 2nd error message argument


 820 
 821     /**
 822      * Error context. Something went wrong, make sure we are in
 823      * the document's body context
 824      */
 825     void errorContext() throws ChangedCharSetException {
 826         for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) {
 827             handleEndTag(stack.tag);
 828         }
 829         if (stack == null) {
 830             legalElementContext(dtd.body);
 831             startTag(makeTag(dtd.body, true));
 832         }
 833     }
 834 
 835     /**
 836      * Add a char to the string buffer.
 837      */
 838     void addString(int c) {
 839         if (strpos  == str.length) {
 840             char[] newstr = new char[str.length + 128];
 841             System.arraycopy(str, 0, newstr, 0, str.length);
 842             str = newstr;
 843         }
 844         str[strpos++] = (char)c;
 845     }
 846 
 847     /**
 848      * Get the string that's been accumulated.
 849      */
 850     String getString(int pos) {
 851         char[] newStr = new char[strpos - pos];
 852         System.arraycopy(str, pos, newStr, 0, strpos - pos);
 853         strpos = pos;
 854         return new String(newStr);
 855     }
 856 
 857     char[] getChars(int pos) {
 858         char[] newStr = new char[strpos - pos];
 859         System.arraycopy(str, pos, newStr, 0, strpos - pos);
 860         strpos = pos;
 861         return newStr;
 862     }
 863 
 864     char[] getChars(int pos, int endPos) {
 865         char[] newStr = new char[endPos - pos];
 866         System.arraycopy(str, pos, newStr, 0, endPos - pos);
 867         // REMIND: it's not clear whether this version should set strpos or not
 868         // strpos = pos;
 869         return newStr;
 870     }
 871 
 872     void resetStrBuffer() {
 873         strpos = 0;
 874     }
 875 
 876     int strIndexOf(char target) {
 877         for (int i = 0; i < strpos; i++) {
 878             if (str[i] == target) {
 879                 return i;
 880             }
 881         }
 882 
 883         return -1;
 884     }
 885 


1017                         ln++;
1018                         ch = readCh();
1019                         lfCount++;
1020                         break;
1021 
1022                     case '\r':
1023                         ln++;
1024                         if ((ch = readCh()) == '\n') {
1025                             ch = readCh();
1026                             crlfCount++;
1027                         }
1028                         else {
1029                             crCount++;
1030                         }
1031                         break;
1032 
1033                     case ';':
1034                         ch = readCh();
1035                         break;
1036                 }
1037                 char[] data = mapNumericReference(n);
1038                 return data;
1039             }
1040             addString('#');
1041             if (!parseIdentifier(false)) {
1042                 error("ident.expected");
1043                 strpos = pos;
1044                 char[] data = {'&', '#'};
1045                 return data;
1046             }
1047         } else if (!parseIdentifier(false)) {
1048             char[] data = {'&'};
1049             return data;
1050         }
1051 
1052         boolean semicolon = false;
1053 
1054         switch (ch) {
1055           case '\n':
1056             ln++;
1057             ch = readCh();
1058             lfCount++;
1059             break;
1060 
1061           case '\r':
1062             ln++;
1063             if ((ch = readCh()) == '\n') {
1064                 ch = readCh();
1065                 crlfCount++;
1066             }
1067             else {
1068                 crCount++;


1078 
1079         String nm = getString(pos);
1080         Entity ent = dtd.getEntity(nm);
1081 
1082         // entities are case sensitive - however if strict
1083         // is false then we will try to make a match by
1084         // converting the string to all lowercase.
1085         //
1086         if (!strict && (ent == null)) {
1087             ent = dtd.getEntity(nm.toLowerCase());
1088         }
1089         if ((ent == null) || !ent.isGeneral()) {
1090 
1091             if (nm.length() == 0) {
1092                 error("invalid.entref", nm);
1093                 return new char[0];
1094             }
1095             /* given that there is not a match restore the entity reference */
1096             String str = "&" + nm + (semicolon ? ";" : "");
1097 
1098             char[] b = new char[str.length()];
1099             str.getChars(0, b.length, b, 0);
1100             return b;
1101         }
1102         return ent.getData();
1103     }
1104 
1105     /**
1106      * Converts numeric character reference to char array.
1107      *
1108      * Normally the code in a reference should be always converted
1109      * to the Unicode character with the same code, but due to
1110      * wide usage of Cp1252 charset most browsers map numeric references
1111      * in the range 130-159 (which are control chars in Unicode set)
1112      * to displayable characters with other codes.
1113      *
1114      * @param c the code of numeric character reference.
1115      * @return a char array corresponding to the reference code.
1116      */
1117     private char[] mapNumericReference(int c) {
1118         char[] data;


1234               case '>':
1235                 ch = readCh();
1236                 int i = textpos - (stack.elem.name.length() + 2), j = 0;
1237 
1238                 // match end tag
1239                 if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) {
1240                     while ((++i < textpos) &&
1241                            (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++)));
1242                     if (i == textpos) {
1243                         textpos -= (stack.elem.name.length() + 2);
1244                         if ((textpos > 0) && (text[textpos-1] == '\n')) {
1245                             textpos--;
1246                         }
1247                         endTag(false);
1248                         return;
1249                     }
1250                 }
1251                 break;
1252 
1253               case '&':
1254                 char[] data = parseEntityReference();
1255                 if (textpos + data.length > text.length) {
1256                     char[] newtext = new char[Math.max(textpos + data.length + 128, text.length * 2)];
1257                     System.arraycopy(text, 0, newtext, 0, text.length);
1258                     text = newtext;
1259                 }
1260                 System.arraycopy(data, 0, text, textpos, data.length);
1261                 textpos += data.length;
1262                 continue;
1263 
1264               case '\n':
1265                 ln++;
1266                 ch = readCh();
1267                 lfCount++;
1268                 break;
1269 
1270               case '\r':
1271                 ln++;
1272                 if ((ch = readCh()) == '\n') {
1273                     ch = readCh();
1274                     crlfCount++;
1275                 }
1276                 else {
1277                     crCount++;
1278                 }
1279                 c = '\n';
1280                 break;
1281               default:
1282                 ch = readCh();
1283                 break;
1284             }
1285 
1286             // output character
1287             if (textpos == text.length) {
1288                 char[] newtext = new char[text.length + 128];
1289                 System.arraycopy(text, 0, newtext, 0, text.length);
1290                 text = newtext;
1291             }
1292             text[textpos++] = (char)c;
1293         }
1294     }
1295 
1296     /**
1297      * Parse attribute value. [33] 331:1
1298      */
1299     @SuppressWarnings("fallthrough")
1300     String parseAttributeValue(boolean lower) throws IOException {
1301         int delim = -1;
1302 
1303         // Check for a delimiter
1304         switch(ch) {
1305           case '\'':
1306           case '"':
1307             delim = ch;
1308             ch = readCh();


1378                        is considered invalid since an = sign can only be contained
1379                        in an attributes value if the string is quoted.
1380                        */
1381                     error("attvalerr");
1382                     /* If strict is true then we return with the string we have thus far.
1383                        Otherwise we accept the = sign as part of the attribute's value and
1384                        process the rest of the img tag. */
1385                     if (strict) {
1386                         return getString(0);
1387                     }
1388                 }
1389                 ch = readCh();
1390                 break;
1391 
1392               case '&':
1393                 if (strict && delim < 0) {
1394                     ch = readCh();
1395                     break;
1396                 }
1397 
1398                 char[] data = parseEntityReference();
1399                 for (int i = 0 ; i < data.length ; i++) {
1400                     c = data[i];
1401                     addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c);
1402                 }
1403                 continue;
1404 
1405               case -1:
1406                 return getString(0);
1407 
1408               default:
1409                 if (lower && (c >= 'A') && (c <= 'Z')) {
1410                     c = 'a' + c - 'A';
1411                 }
1412                 ch = readCh();
1413                 break;
1414             }
1415             addString(c);
1416         }
1417     }
1418 


1489                     }
1490                     skipSpace();
1491                     if (ch == '=') {
1492                         ch = readCh();
1493                         skipSpace();
1494                         att = elem.getAttribute(attname);
1495                         attvalue = parseAttributeValue((att != null) &&
1496                                                 (att.type != CDATA) &&
1497                                                 (att.type != NOTATION));
1498                     } else {
1499                         attvalue = attname;
1500                         att = elem.getAttributeByValue(attvalue);
1501                         if (att == null) {
1502                             att = elem.getAttribute(attname);
1503                             if (att != null) {
1504                                 attvalue = att.getValue();
1505                             }
1506                         }
1507                     }
1508                 } else {
1509                     char[] str = {(char)ch};
1510                     error("invalid.tagchar", new String(str), elem.getName());
1511                     ch = readCh();
1512                     continue;
1513                 }
1514             } else if (!strict && (attributes.isEmpty()) && (ch == '=')) {
1515                 ch = readCh();
1516                 skipSpace();
1517                 attname = elem.getName();
1518                 att = elem.getAttribute(attname);
1519                 attvalue = parseAttributeValue((att != null) &&
1520                                                (att.type != CDATA) &&
1521                                                (att.type != NOTATION));
1522             } else if (!strict && (ch == '=')) {
1523                 ch = readCh();
1524                 skipSpace();
1525                 attvalue = parseAttributeValue(true);
1526                 error("attvalerr");
1527                 return;
1528             } else {
1529                 char[] str = {(char)ch};
1530                 error("invalid.tagchar", new String(str), elem.getName());
1531                 if (!strict) {
1532                     ch = readCh();
1533                     continue;
1534                 } else {
1535                     return;
1536                 }
1537             }
1538 
1539             if (att != null) {
1540                 attname = att.getName();
1541             } else {
1542                 error("invalid.tagatt", attname, elem.getName());
1543             }
1544 
1545             // Check out the value
1546             if (attributes.isDefined(attname)) {
1547                 error("multi.tagatt", attname, elem.getName());
1548             }
1549             if (attvalue == null) {


1657         boolean net = false;
1658         boolean warned = false;
1659         boolean unknown = false;
1660 
1661         switch (ch = readCh()) {
1662           case '!':
1663             switch (ch = readCh()) {
1664               case '-':
1665                 // Parse comment. [92] 391:7
1666                 while (true) {
1667                     if (ch == '-') {
1668                         if (!strict || ((ch = readCh()) == '-')) {
1669                             ch = readCh();
1670                             if (!strict && ch == '-') {
1671                                 ch = readCh();
1672                             }
1673                             // send over any text you might see
1674                             // before parsing and sending the
1675                             // comment
1676                             if (textpos != 0) {
1677                                 char[] newtext = new char[textpos];
1678                                 System.arraycopy(text, 0, newtext, 0, textpos);
1679                                 handleText(newtext);
1680                                 lastBlockStartPos = currentBlockStartPos;
1681                                 textpos = 0;
1682                             }
1683                             parseComment();
1684                             last = makeTag(dtd.getElement("comment"), true);
1685                             handleComment(getChars(0));
1686                             continue;
1687                         } else if (!warned) {
1688                             warned = true;
1689                             error("invalid.commentchar", "-");
1690                         }
1691                     }
1692                     skipSpace();
1693                     switch (ch) {
1694                       case '-':
1695                         continue;
1696                       case '>':
1697                         ch = readCh();


2207                             error("unexpected.pcdata");
2208                         }
2209                         if (last.breaksFlow()) {
2210                             space = false;
2211                         }
2212                     }
2213                     break;
2214 
2215                   case -1:
2216                     return;
2217 
2218                   case '&':
2219                     if (textpos == 0) {
2220                         if (!legalElementContext(dtd.pcdata)) {
2221                             error("unexpected.pcdata");
2222                         }
2223                         if (last.breaksFlow()) {
2224                             space = false;
2225                         }
2226                     }
2227                     char[] data = parseEntityReference();
2228                     if (textpos + data.length + 1 > text.length) {
2229                         char[] newtext = new char[Math.max(textpos + data.length + 128, text.length * 2)];
2230                         System.arraycopy(text, 0, newtext, 0, text.length);
2231                         text = newtext;
2232                     }
2233                     if (space) {
2234                         space = false;
2235                         text[textpos++] = ' ';
2236                     }
2237                     System.arraycopy(data, 0, text, textpos, data.length);
2238                     textpos += data.length;
2239                     ignoreSpace = false;
2240                     continue;
2241 
2242                   case '\n':
2243                     ln++;
2244                     lfCount++;
2245                     ch = readCh();
2246                     if ((stack != null) && stack.pre) {
2247                         break;
2248                     }
2249                     if (textpos == 0) {


2289                         space = true;
2290                     }
2291                     continue;
2292 
2293                   default:
2294                     if (textpos == 0) {
2295                         if (!legalElementContext(dtd.pcdata)) {
2296                             error("unexpected.pcdata");
2297                         }
2298                         if (last.breaksFlow()) {
2299                             space = false;
2300                         }
2301                     }
2302                     ch = readCh();
2303                     break;
2304                 }
2305             }
2306 
2307             // enlarge buffer if needed
2308             if (textpos + 2 > text.length) {
2309                 char[] newtext = new char[text.length + 128];
2310                 System.arraycopy(text, 0, newtext, 0, text.length);
2311                 text = newtext;
2312             }
2313 
2314             // output pending space
2315             if (space) {
2316                 if (textpos == 0) {
2317                     lastBlockStartPos--;
2318                 }
2319                 text[textpos++] = ' ';
2320                 space = false;
2321             }
2322             text[textpos++] = (char)c;
2323             ignoreSpace = false;
2324         }
2325     }
2326 
2327     /**
2328      * Returns the end of line string. This will return the end of line
2329      * string that has been encountered the most, one of \r, \n or \r\n.


2395             }
2396 
2397             text = null;
2398             str = null;
2399         }
2400 
2401     }
2402 
2403 
2404     /*
2405      * Input cache.  Reading through this buffer is much faster than calling
2406      * down to a synchronized method of BufferedReader for each character.
2407      * Measurements done 5/30/97 show that there's no point in having a bigger
2408      * buffer: increasing it to 8192 had no measurable impact for a program
2409      * discarding one character at a time (reading from an http URL to a local
2410      * machine).
2411      * NOTE: If the current encoding is bogus and we read too much (past the
2412      * content-type), we may suffer a MalformedInputException. For this reason
2413      * the initial size is 1; when the body is encountered it is adjusted to 256.
2414      */
2415     private char[] buf = new char[1];
2416     private int pos;   // compared with len in readCh to decide when buf must be refilled
2417     private int len;   // result of the most recent in.read(buf) in readCh
2418     /*
2419         Tracks the position relative to the beginning of the
2420         document.
2421     */
2422     private int currentPosition;
2423 
2424 
2425     private int readCh() throws IOException {
2426 
2427         if (pos >= len) {
2428 
2429             // This loop allows us to ignore interrupts if the flag
2430             // says so
2431             for (;;) {
2432                 try {
2433                     len = in.read(buf);
2434                     break;
2435                 } catch (InterruptedIOException ex) {


< prev index next >