1 /*
   2  * Copyright (c) 2008, 2015, Oracle and/or its affiliates.
   3  * All rights reserved. Use is subject to license terms.
   4  *
   5  * This file is available and licensed under the following license:
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  *  - Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  *  - Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the distribution.
  16  *  - Neither the name of Oracle Corporation nor the names of its
  17  *    contributors may be used to endorse or promote products derived
  18  *    from this software without specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  23  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  24  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  30  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31  */
  32 package ensemble.compiletime.search;
  33 
  34 import java.io.File;
  35 import java.io.IOException;
  36 import java.io.StringReader;
  37 import java.nio.file.Files;
  38 import java.util.ArrayList;
  39 import java.util.List;
  40 import java.util.regex.Pattern;
  41 import javafx.collections.FXCollections;
  42 import org.xml.sax.Attributes;
  43 import org.xml.sax.InputSource;
  44 import org.xml.sax.SAXException;
  45 import org.xml.sax.XMLReader;
  46 import org.xml.sax.helpers.DefaultHandler;
  47 import org.xml.sax.helpers.XMLReaderFactory;
  48 
  49 public class DocumentationIndexer {
  50     static {
  51         System.setProperty("java.net.useSystemProxies", "true");
  52     }
  53     
  54     private enum State {DEFAULT, BOOK_TITLE, CHAPTER, SECT1, SECT_H1_H2};
  55     
  56     public static class Section {
  57         public final String name;
  58         public final String content;
  59         public final String url;
  60 
  61         public Section(String name, String content, String url) {
  62             this.name = name;
  63             this.content = content;
  64             this.url = url;
  65         }
  66 
  67         @Override public String toString() {
  68             return "Section{" + "name=" + name + ", content=" + content + '}';
  69         }
  70     }
  71     
  72     public static class DocPage {
  73         public final String bookTitle;
  74         public final String chapter;
  75         public final String nextUrl;
  76         public final List<Section> sections;
  77 
  78         public DocPage(String bookTitle, String chapter, String nextUrl, List<Section> sections) {
  79             this.bookTitle = bookTitle;
  80             this.chapter = chapter;
  81             this.nextUrl = nextUrl;
  82             this.sections = sections;
  83         }
  84     }
  85     
  86     private static int tmpIndex = 0;
  87     
  88     private static final String[][] REPLACEMENTS = {
  89         
  90         // Remove any comments
  91         { "(?s)<!--.*?-->", "" }, 
  92         
  93         // Remove scripts and styles
  94         { "(?s)<(script|style).*?</\\1>", "" }, 
  95         
  96         // Remove unnecessary tags
  97         { "(?i)</?(?!html\\b|div\\b|h1\\b|h2\\b|a\\b|img\\b)(\\w+\\b)[^>]*>", "" }, 
  98         
  99         // Remove malformed garbage from links
 100         { "(?x) <a (?:\\s+ (?: (href \\s* = \\s* \\\"[^\\\"]*\\\") "
 101             + "| (name \\s* = \\s* \\\"[^\\\"]*\\\") "
 102             + "| \\w+\\s*=\\s*\\\"[^\\\"]*\\\" "
 103             + "| \\w+\\s*=\\s*[^\\s\\\">]+))* \\s* >", "<a $1 $2>" }, 
 104         
 105 //        { "(?i)</?[a-z]+\b(<!div|h1|h2|a|img)[^>]*>", "" }, // Remove unnecessary tags
 106 //        { "</?(?:p|br)[^>]*>", "" }, // Remove unnecessary tags
 107 //        { "<meta [^>]+>", "" },
 108 //        { "[ \\t\\x0B\\f\\r]+", " " },
 109 //        { "[ \\t\\x0B\\f\\r\\n]+", "\\n" },
 110 //        // fix links like <a href=init-window-big.gif> to <a href="init-window-big.gif">
 111 //        { "<a href=([^\">]+)>", "<a href=\"$1\">" },
 112 //        { "type=\"text/css\" media=\"screen\"", "" },
 113 //        { "<input [^>]+>", "" },
 114 //        { "target=_top", "" },
 115 //        { "xWebsiteObjectType <Matches> `Data File`", "" },
 116 //        { "<hr>", "<hr/>" },
 117 //        { "&", "&amp;" },
 118 //        { "<span class=red>", "<span class=\"red\">" },
 119     };
 120     private static final Pattern[] COMPILED_PATTERNS = new Pattern[REPLACEMENTS.length];
 121     
 122     static {
 123         for (int i = 0; i < REPLACEMENTS.length; i++) {
 124             COMPILED_PATTERNS[i] = Pattern.compile(REPLACEMENTS[i][0]);
 125         }
 126     }
 127     
 128     public static DocPage parseDocsPage(final String url, String content) throws Exception {
 129         
 130         for (int i = 0; i < REPLACEMENTS.length; i++) {
 131             content = COMPILED_PATTERNS[i].matcher(content).replaceAll(REPLACEMENTS[i][1]);
 132         }
 133         try {
 134             DocHandler handler = new DocHandler(url);
 135             XMLReader xmlParser = XMLReaderFactory.createXMLReader();
 136             xmlParser.setContentHandler(handler);
 137             xmlParser.setEntityResolver(handler);
 138             xmlParser.parse(new InputSource(new StringReader(content)));
 139             return handler.getDocPage();
 140         } catch (SAXException | IOException e) {
 141             String filename = "tmp" + tmpIndex++ + ".txt";
 142             Files.write(new File(filename).toPath(), FXCollections.observableArrayList(content));
 143             throw new RuntimeException("\"Failed to parse '" + url + "', see content in " + filename + ".", e);
 144         }
 145     }
 146     
 147     public static void main(String[] args) throws Exception {
 148 //        final String url = "https://docs.oracle.com/javafx/2/layout/builtin_layouts.htm#sthref15";
 149         final String url = "https://docs.oracle.com/javafx/2/overview/jfxpub-overview.htm";
 150         parseDocsPage(url, BuildEnsembleSearchIndex.grabWebPage(url).toString());
 151     }
 152     
 153     private static class DocHandler extends DefaultHandler {
 154         private final String url;
 155         private State state = State.DEFAULT;
 156         private int divDepth = 0;
 157         private StringBuilder buf = new StringBuilder();
 158         private String bookTitle;
 159         private String chapter;
 160         private String sectName;
 161         private String sectContent;
 162         private String sectUrl;
 163         private String currentLink;
 164         private String currentLinkName;
 165         private String nextUrl;
 166         private List<Section> sections = new ArrayList<>();
 167         private DocPage docPage;
 168 
 169         public DocHandler(String url) {
 170             this.url = url;
 171         }
 172 
 173         public DocPage getDocPage() {
 174             return docPage;
 175         }
 176 
 177         @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
 178 //                System.out.println("<<<<<  localName = " + localName+"  qName = " + qName+" ["+state+"]");
 179             switch(state) {
 180                 case DEFAULT:
 181                     if ("div".equals(localName)) {
 182                         String id = attributes.getValue("id");
 183                         String classSt = attributes.getValue("class");
 184                         if ("bookTitle".equals(id)) {
 185                             state = State.BOOK_TITLE;
 186                             buf.setLength(0);
 187                         } else if ("sect1".equals(classSt) || "refsect1".equals(classSt)) {
 188                             state = State.SECT1;
 189                             buf.setLength(0);
 190                             divDepth = 0;
 191                         }
 192                     } else if ("h1".equals(localName)) {
 193                         String classSt = attributes.getValue("class");
 194 
 195                         if ("chapter".equals(classSt)) {
 196                             state = State.CHAPTER;
 197                             buf.setLength(0);
 198                         }
 199                     } else if ("a".equals(localName)) {
 200                         currentLink = attributes.getValue("href");
 201                         currentLinkName = attributes.getValue("name");
 202                     } else if ("img".equals(localName) && "Next".equals(attributes.getValue("alt"))) {
 203                         nextUrl = url.substring(0,url.lastIndexOf('/')+1) + currentLink;
 204                     }
 205                     break;
 206                 case SECT1: 
 207                     if ("div".equals(localName)) {
 208                         divDepth ++;
 209                     } else if ("h1".equals(localName) || "h2".equals(localName)) {
 210                         state = State.SECT_H1_H2;
 211                         buf.setLength(0);
 212                     }
 213                     break;
 214             }       
 215         }
 216 
 217         @Override public void endElement(String uri, String localName, String qName) throws SAXException {
 218 //                System.out.println(">>>>  localName = " + localName+"  qName = " + qName+" ["+state+"]");
 219             switch(state) {
 220                 case SECT1:
 221                     if ("div".equals(localName)) {
 222                         if (divDepth == 0) {
 223                             sectContent = buf.toString().trim();
 224                             final int hashIndex = url.indexOf('#');
 225                             final String sectionUrl = (hashIndex == -1 ? url : url.substring(0,hashIndex)) + "#" + currentLinkName;
 226                             sections.add(new Section(sectName, sectContent, sectionUrl));
 227                             state = State.DEFAULT;
 228                         } else {
 229                             divDepth --;
 230                         }
 231                     }
 232                     break;
 233                 case SECT_H1_H2:
 234                     if ("h1".equals(localName) || "h2".equals(localName)) {
 235                         state = State.SECT1;
 236                         sectName = buf.toString().trim();
 237                         buf.setLength(0);
 238                     }
 239                 case BOOK_TITLE:
 240                     if ("div".equals(localName)) {
 241                         bookTitle = buf.toString().trim();
 242                         state = State.DEFAULT;
 243                     }
 244                     break;
 245                 case CHAPTER:
 246                     if ("h1".equals(localName)) {
 247                         chapter = buf.toString().trim();
 248                         state = State.DEFAULT;
 249                     }
 250                     break;
 251             }
 252         }
 253 
 254         @Override public void endDocument() throws SAXException {
 255             docPage = new DocPage(bookTitle, chapter, nextUrl, sections);
 256         }
 257 
 258         @Override public void characters(char[] ch, int start, int length) throws SAXException {
 259             switch(state) {
 260                 case BOOK_TITLE:
 261                 case CHAPTER:
 262                 case SECT1:
 263                 case SECT_H1_H2:
 264                     buf.append(ch,start,length);
 265                     break;
 266             }
 267         }
 268 
 269         // AVOID LONG WAITS FOR 
 270         @Override public InputSource resolveEntity(String publicId, String systemId) throws IOException, SAXException {
 271             return new InputSource(new StringReader(""));
 272         }
 273     }
 274 }