1 /*
2 * Copyright (c) 2008, 2014, Oracle and/or its affiliates.
3 * All rights reserved. Use is subject to license terms.
4 *
5 * This file is available and licensed under the following license:
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * - Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * - Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the distribution.
16 * - Neither the name of Oracle Corporation nor the names of its
17 * contributors may be used to endorse or promote products derived
18 * from this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
98
99 // Remove malformed garbage from links
100 { "(?x) <a (?:\\s+ (?: (href \\s* = \\s* \\\"[^\\\"]*\\\") "
101 + "| (name \\s* = \\s* \\\"[^\\\"]*\\\") "
102 + "| \\w+\\s*=\\s*\\\"[^\\\"]*\\\" "
103 + "| \\w+\\s*=\\s*[^\\s\\\">]+))* \\s* >", "<a $1 $2>" },
104
105 // { "(?i)</?[a-z]+\b(<!div|h1|h2|a|img)[^>]*>", "" }, // Remove unnecessary tags
106 // { "</?(?:p|br)[^>]*>", "" }, // Remove unnecessary tags
107 // { "<meta [^>]+>", "" },
108 // { "[ \\t\\x0B\\f\\r]+", " " },
109 // { "[ \\t\\x0B\\f\\r\\n]+", "\\n" },
110 // // fix links like <a href=init-window-big.gif> to <a href="init-window-big.gif">
111 // { "<a href=([^\">]+)>", "<a href=\"$1\">" },
112 // { "type=\"text/css\" media=\"screen\"", "" },
113 // { "<input [^>]+>", "" },
114 // { "target=_top", "" },
115 // { "xWebsiteObjectType <Matches> `Data File`", "" },
116 // { "<hr>", "<hr/>" },
117 // { "&", "&" },
118 // { "<span class=red>", "<span class=\"red\">" },
119 };
/**
 * Pre-compiled form of the regular expressions held in {@code REPLACEMENTS}.
 * Slot {@code i} holds the compiled version of {@code REPLACEMENTS[i][0]}.
 */
private static final Pattern[] COMPILED_PATTERNS = new Pattern[REPLACEMENTS.length];

// Compile every replacement rule's regex once, at class-initialization time,
// keeping the compiled patterns in the same order as the rules.
static {
    int index = 0;
    for (final String[] rule : REPLACEMENTS) {
        final String regex = rule[0];
        COMPILED_PATTERNS[index] = Pattern.compile(regex);
        index++;
    }
}
127
128 public static DocPage parseDocsPage(final String url, String content) throws Exception {
129
130 for (int i = 0; i < REPLACEMENTS.length; i++) {
131 content = COMPILED_PATTERNS[i].matcher(content).replaceAll(REPLACEMENTS[i][1]);
132 }
133 try {
134 DocHandler handler = new DocHandler(url);
135 XMLReader xmlParser = XMLReaderFactory.createXMLReader();
136 xmlParser.setContentHandler(handler);
137 xmlParser.setEntityResolver(handler);
138 xmlParser.parse(new InputSource(new StringReader(content)));
|
1 /*
2 * Copyright (c) 2008, 2015, Oracle and/or its affiliates.
3 * All rights reserved. Use is subject to license terms.
4 *
5 * This file is available and licensed under the following license:
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * - Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * - Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the distribution.
16 * - Neither the name of Oracle Corporation nor the names of its
17 * contributors may be used to endorse or promote products derived
18 * from this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
98
99 // Remove malformed garbage from links
100 { "(?x) <a (?:\\s+ (?: (href \\s* = \\s* \\\"[^\\\"]*\\\") "
101 + "| (name \\s* = \\s* \\\"[^\\\"]*\\\") "
102 + "| \\w+\\s*=\\s*\\\"[^\\\"]*\\\" "
103 + "| \\w+\\s*=\\s*[^\\s\\\">]+))* \\s* >", "<a $1 $2>" },
104
105 // { "(?i)</?[a-z]+\b(<!div|h1|h2|a|img)[^>]*>", "" }, // Remove unnecessary tags
106 // { "</?(?:p|br)[^>]*>", "" }, // Remove unnecessary tags
107 // { "<meta [^>]+>", "" },
108 // { "[ \\t\\x0B\\f\\r]+", " " },
109 // { "[ \\t\\x0B\\f\\r\\n]+", "\\n" },
110 // // fix links like <a href=init-window-big.gif> to <a href="init-window-big.gif">
111 // { "<a href=([^\">]+)>", "<a href=\"$1\">" },
112 // { "type=\"text/css\" media=\"screen\"", "" },
113 // { "<input [^>]+>", "" },
114 // { "target=_top", "" },
115 // { "xWebsiteObjectType <Matches> `Data File`", "" },
116 // { "<hr>", "<hr/>" },
117 // { "&", "&" },
118 // { "<span class=red>", "<span class=\"red\">" },
119 };
/**
 * Pre-compiled form of the regular expressions held in {@code REPLACEMENTS}.
 * Slot {@code i} holds the compiled version of {@code REPLACEMENTS[i][0]}.
 */
private static final Pattern[] COMPILED_PATTERNS = new Pattern[REPLACEMENTS.length];

// Compile every replacement rule's regex once, at class-initialization time,
// keeping the compiled patterns in the same order as the rules.
static {
    int index = 0;
    for (final String[] rule : REPLACEMENTS) {
        final String regex = rule[0];
        COMPILED_PATTERNS[index] = Pattern.compile(regex);
        index++;
    }
}
127
128 public static DocPage parseDocsPage(final String url, String content) throws Exception {
129
130 for (int i = 0; i < REPLACEMENTS.length; i++) {
131 content = COMPILED_PATTERNS[i].matcher(content).replaceAll(REPLACEMENTS[i][1]);
132 }
133 try {
134 DocHandler handler = new DocHandler(url);
135 XMLReader xmlParser = XMLReaderFactory.createXMLReader();
136 xmlParser.setContentHandler(handler);
137 xmlParser.setEntityResolver(handler);
138 xmlParser.parse(new InputSource(new StringReader(content)));
|