/*
 * Copyright (c) 2008, 2015, Oracle and/or its affiliates.
 * All rights reserved. Use is subject to license terms.
 *
 * This file is available and licensed under the following license:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the distribution.
 *  - Neither the name of Oracle Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package ensemble.compiletime.search;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import javafx.collections.FXCollections;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

/**
 * Extracts the book title, chapter title, "next page" link and the text of
 * each section from the HTML of a documentation page.
 *
 * <p>The HTML is first reduced with the regular expressions in
 * {@link #REPLACEMENTS} so that only {@code html}, {@code div}, {@code h1},
 * {@code h2}, {@code a} and {@code img} tags remain, and the result is then
 * fed to a SAX parser driven by {@link DocHandler}.
 */
public class DocumentationIndexer {
    static {
        System.setProperty("java.net.useSystemProxies", "true");
    }

    /** Position of the SAX handler relative to the elements it tracks. */
    private enum State {
        /** Outside every tracked element. */
        DEFAULT,
        /** Inside {@code <div id="bookTitle">}. */
        BOOK_TITLE,
        /** Inside {@code <h1 class="chapter">}. */
        CHAPTER,
        /** Inside a {@code <div>} whose class is {@code sect1} or {@code refsect1}. */
        SECT1,
        /** Inside an {@code <h1>}/{@code <h2>} heading nested in a section div. */
        SECT_H1_H2
    }

    /** One section of a documentation page: heading, body text and URL. */
    public static class Section {
        public final String name;
        public final String content;
        public final String url;

        public Section(String name, String content, String url) {
            this.name = name;
            this.content = content;
            this.url = url;
        }

        @Override public String toString() {
            return "Section{" + "name=" + name + ", content=" + content + '}';
        }
    }

    /** Result of parsing one documentation page. */
    public static class DocPage {
        public final String bookTitle;
        public final String chapter;
        /** URL of the following page, or {@code null} when no "Next" link was found. */
        public final String nextUrl;
        public final List<Section> sections;

        public DocPage(String bookTitle, String chapter, String nextUrl, List<Section> sections) {
            this.bookTitle = bookTitle;
            this.chapter = chapter;
            this.nextUrl = nextUrl;
            this.sections = sections;
        }
    }

    /** Counter used to give each debug dump written on a parse failure its own file name. */
    private static int tmpIndex = 0;

    /**
     * Ordered {regex, replacement} pairs applied to the raw HTML before it is
     * handed to the XML parser.
     */
    private static final String[][] REPLACEMENTS = {

        // Remove any comments
        { "(?s)<!--.*?-->", "" },

        // Remove scripts and styles
        { "(?s)<(script|style).*?</\\1>", "" },

        // Remove unnecessary tags
        { "(?i)</?(?!html\\b|div\\b|h1\\b|h2\\b|a\\b|img\\b)(\\w+\\b)[^>]*>", "" },

        // Remove malformed garbage from links
        { "(?x) <a (?:\\s+ (?: (href \\s* = \\s* \\\"[^\\\"]*\\\") "
            + "| (name \\s* = \\s* \\\"[^\\\"]*\\\") "
            + "| \\w+\\s*=\\s*\\\"[^\\\"]*\\\" "
            + "| \\w+\\s*=\\s*[^\\s\\\">]+))* \\s* >", "<a $1 $2>" },

//        { "(?i)</?[a-z]+\b(<!div|h1|h2|a|img)[^>]*>", "" }, // Remove unnecessary tags
//        { "</?(?:p|br)[^>]*>", "" }, // Remove unnecessary tags
//        { "<meta [^>]+>", "" },
//        { "[ \\t\\x0B\\f\\r]+", " " },
//        { "[ \\t\\x0B\\f\\r\\n]+", "\\n" },
//        // fix links like <a href=init-window-big.gif> to <a href="init-window-big.gif">
//        { "<a href=([^\">]+)>", "<a href=\"$1\">" },
//        { "type=\"text/css\" media=\"screen\"", "" },
//        { "<input [^>]+>", "" },
//        { "target=_top", "" },
//        { "xWebsiteObjectType <Matches> `Data File`", "" },
//        { "<hr>", "<hr/>" },
//        { "&", "&" },
//        { "<span class=red>", "<span class=\"red\">" },
    };

    /** {@link #REPLACEMENTS} regexes compiled once, index-aligned with that table. */
    private static final Pattern[] COMPILED_PATTERNS = new Pattern[REPLACEMENTS.length];

    static {
        for (int i = 0; i < REPLACEMENTS.length; i++) {
            COMPILED_PATTERNS[i] = Pattern.compile(REPLACEMENTS[i][0]);
        }
    }

    /**
     * Cleans up the given HTML and parses it into a {@link DocPage}.
     *
     * @param url     address the content was loaded from; used to build the
     *                section URLs and the next-page URL
     * @param content raw HTML of the page
     * @return the parsed page
     * @throws RuntimeException if the cleaned content cannot be parsed; the
     *         cause is the parser's exception. The cleaned content is written
     *         to a {@code tmpN.txt} file for inspection; if that write fails
     *         too, its exception is attached to the cause as suppressed
     *         rather than replacing it.
     */
    public static DocPage parseDocsPage(final String url, String content) throws Exception {

        for (int i = 0; i < REPLACEMENTS.length; i++) {
            content = COMPILED_PATTERNS[i].matcher(content).replaceAll(REPLACEMENTS[i][1]);
        }
        try {
            DocHandler handler = new DocHandler(url);
            XMLReader xmlParser = XMLReaderFactory.createXMLReader();
            xmlParser.setContentHandler(handler);
            xmlParser.setEntityResolver(handler);
            xmlParser.parse(new InputSource(new StringReader(content)));
            return handler.getDocPage();
        } catch (SAXException | IOException e) {
            String filename = "tmp" + tmpIndex++ + ".txt";
            String hint;
            try {
                Files.write(new File(filename).toPath(), FXCollections.observableArrayList(content));
                hint = ", see content in " + filename + ".";
            } catch (IOException dumpFailure) {
                // The dump is only a debugging aid: never let it hide the parse error.
                e.addSuppressed(dumpFailure);
                hint = " (could not write content to " + filename + ").";
            }
            throw new RuntimeException("Failed to parse '" + url + "'" + hint, e);
        }
    }

    /** Manual smoke test: downloads one documentation page and parses it. */
    public static void main(String[] args) throws Exception {
//        final String url = "https://docs.oracle.com/javafx/2/layout/builtin_layouts.htm#sthref15";
        final String url = "https://docs.oracle.com/javafx/2/overview/jfxpub-overview.htm";
        parseDocsPage(url, BuildEnsembleSearchIndex.grabWebPage(url).toString());
    }

    /**
     * SAX handler that walks the cleaned-up page as a small state machine
     * (see {@link State}) and assembles the {@link DocPage} at end of document.
     * Elements are matched on {@code localName}, so the reader is expected to
     * report local names.
     */
    private static class DocHandler extends DefaultHandler {
        private final String url;
        private State state = State.DEFAULT;
        /** Number of divs currently open inside the section div being read. */
        private int divDepth = 0;
        /** Text collected for the element currently being read. */
        private final StringBuilder buf = new StringBuilder();
        private String bookTitle;
        private String chapter;
        private String sectName;
        /** {@code href} of the most recent {@code <a>} seen outside a section; may be null. */
        private String currentLink;
        /** {@code name} of the most recent {@code <a>} seen outside a section; may be null. */
        private String currentLinkName;
        private String nextUrl;
        private final List<Section> sections = new ArrayList<>();
        private DocPage docPage;

        public DocHandler(String url) {
            this.url = url;
        }

        /** Returns the parsed page, or {@code null} before the end of the document is reached. */
        public DocPage getDocPage() {
            return docPage;
        }

        @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
//            System.out.println("<<<<< localName = " + localName+" qName = " + qName+" ["+state+"]");
            switch (state) {
                case DEFAULT:
                    if ("div".equals(localName)) {
                        String id = attributes.getValue("id");
                        String classSt = attributes.getValue("class");
                        if ("bookTitle".equals(id)) {
                            state = State.BOOK_TITLE;
                            buf.setLength(0);
                        } else if ("sect1".equals(classSt) || "refsect1".equals(classSt)) {
                            state = State.SECT1;
                            buf.setLength(0);
                            divDepth = 0;
                        }
                    } else if ("h1".equals(localName)) {
                        String classSt = attributes.getValue("class");

                        if ("chapter".equals(classSt)) {
                            state = State.CHAPTER;
                            buf.setLength(0);
                        }
                    } else if ("a".equals(localName)) {
                        currentLink = attributes.getValue("href");
                        currentLinkName = attributes.getValue("name");
                    } else if ("img".equals(localName) && "Next".equals(attributes.getValue("alt"))) {
                        // Without an href on the last link there is nothing to
                        // point at; concatenating would produce ".../null".
                        if (currentLink != null) {
                            nextUrl = url.substring(0, url.lastIndexOf('/') + 1) + currentLink;
                        }
                    }
                    break;
                case SECT1:
                    if ("div".equals(localName)) {
                        divDepth++;
                    } else if ("h1".equals(localName) || "h2".equals(localName)) {
                        state = State.SECT_H1_H2;
                        buf.setLength(0);
                    }
                    break;
                default:
                    break;
            }
        }

        @Override public void endElement(String uri, String localName, String qName) throws SAXException {
//            System.out.println(">>>> localName = " + localName+" qName = " + qName+" ["+state+"]");
            switch (state) {
                case SECT1:
                    if ("div".equals(localName)) {
                        if (divDepth == 0) {
                            // The section's own div is closing: emit the section.
                            final String sectContent = buf.toString().trim();
                            final int hashIndex = url.indexOf('#');
                            final String pageUrl = hashIndex == -1 ? url : url.substring(0, hashIndex);
                            // No named anchor seen: link to the page itself
                            // rather than to a bogus "#null" fragment.
                            final String sectionUrl = currentLinkName == null
                                    ? pageUrl
                                    : pageUrl + "#" + currentLinkName;
                            sections.add(new Section(sectName, sectContent, sectionUrl));
                            state = State.DEFAULT;
                        } else {
                            divDepth--;
                        }
                    }
                    break;
                case SECT_H1_H2:
                    if ("h1".equals(localName) || "h2".equals(localName)) {
                        state = State.SECT1;
                        sectName = buf.toString().trim();
                        buf.setLength(0);
                    }
                    // Must not fall through to BOOK_TITLE: a </div> inside a
                    // heading would overwrite bookTitle and drop the section.
                    break;
                case BOOK_TITLE:
                    if ("div".equals(localName)) {
                        bookTitle = buf.toString().trim();
                        state = State.DEFAULT;
                    }
                    break;
                case CHAPTER:
                    if ("h1".equals(localName)) {
                        chapter = buf.toString().trim();
                        state = State.DEFAULT;
                    }
                    break;
                default:
                    break;
            }
        }

        @Override public void endDocument() throws SAXException {
            docPage = new DocPage(bookTitle, chapter, nextUrl, sections);
        }

        /** Collects text while inside any tracked element; text in the DEFAULT state is dropped. */
        @Override public void characters(char[] ch, int start, int length) throws SAXException {
            switch (state) {
                case BOOK_TITLE:
                case CHAPTER:
                case SECT1:
                case SECT_H1_H2:
                    buf.append(ch, start, length);
                    break;
                default:
                    break;
            }
        }

        /**
         * Resolves every external entity to an empty document, so the parser
         * is handed an empty source instead of the referenced resource
         * (the original note here reads "avoid long waits").
         */
        @Override public InputSource resolveEntity(String publicId, String systemId) throws IOException, SAXException {
            return new InputSource(new StringReader(""));
        }
    }
}