< prev index next >

apps/samples/Ensemble8/src/compiletime/java/ensemble/compiletime/search/BuildEnsembleSearchIndex.java

Print this page
rev 9898 : 8178275: Ensemble: Upgrade version of Lucene to 7.1.0
Reviewed-by: aghaisas, prr
   1 /*
   2  * Copyright (c) 2008, 2015, Oracle and/or its affiliates.
   3  * All rights reserved. Use is subject to license terms.
   4  *
   5  * This file is available and licensed under the following license:
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  *  - Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  *  - Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the distribution.
  16  *  - Neither the name of Oracle Corporation nor the names of its
  17  *    contributors may be used to endorse or promote products derived
  18  *    from this software without specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  23  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  24  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  30  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31  */
  32 package ensemble.compiletime.search;
  33 
  34 import ensemble.compiletime.Sample;
  35 import java.io.*;




  36 import java.net.URL;
  37 import java.util.ArrayList;
  38 import java.util.List;
  39 import java.util.concurrent.Callable;
  40 import java.util.concurrent.ExecutionException;
  41 import java.util.concurrent.Future;
  42 import java.util.concurrent.LinkedBlockingQueue;
  43 import java.util.concurrent.ThreadFactory;
  44 import java.util.concurrent.ThreadPoolExecutor;
  45 import java.util.concurrent.TimeUnit;
  46 import java.util.logging.Level;
  47 import java.util.logging.Logger;
  48 import java.util.regex.Matcher;
  49 import java.util.regex.Pattern;
  50 import org.apache.lucene.analysis.Analyzer;
  51 import org.apache.lucene.analysis.standard.StandardAnalyzer;
  52 import org.apache.lucene.document.Document;
  53 import org.apache.lucene.document.Field;



  54 import org.apache.lucene.index.IndexWriter;
  55 import org.apache.lucene.index.IndexWriterConfig;
  56 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
  57 import org.apache.lucene.store.Directory;
  58 import org.apache.lucene.store.FSDirectory;
  59 import org.apache.lucene.util.Version;
  60 
  61 /**
  62  * Generate the lucene index that Ensemble uses for its search
  63  */
  64 public class BuildEnsembleSearchIndex {
  65 
  66     public static void buildSearchIndex(List<Sample> allSamples, String javaDocBaseUrl, String javafxDocumentationHome, File indexDir){
  67         try {
  68             List<Document> docs = new ArrayList<>();
  69             List<Callable<List<Document>>> tasks = new ArrayList<>();
  70             // create callables to collect data
  71             System.out.println("Creating Documents for Samples...");
  72             docs.addAll(indexSamples(allSamples));
  73             System.out.println("Creating tasks for getting all documentation...");


  74             tasks.addAll(indexJavaDocAllClasses(javaDocBaseUrl));
  75             tasks.addAll(indexAllDocumentation(javafxDocumentationHome));
  76             // execute all the tasks in 32 threads, collecting all the documents to write
  77             System.out.println("Executing tasks getting all documentation...");
  78             try {
  79                 ThreadPoolExecutor executor = new ThreadPoolExecutor(32,32,30, TimeUnit.SECONDS,new LinkedBlockingQueue());
  80                 executor.setThreadFactory(new ThreadFactory() {
  81                     int index = 0;
  82                     @Override public Thread newThread(Runnable r) {
  83                         Thread thread = new Thread(r,"Thread-"+(++index));
  84                         thread.setDaemon(true);
  85                         return thread;
  86                     }
  87                 });
  88                 List<Future<List<Document>>> results = executor.invokeAll(tasks);
  89                 for(Future<List<Document>> future : results) {
  90                     docs.addAll(future.get());
  91                 }
  92             } catch (ExecutionException | InterruptedException ex) {
  93                 Logger.getLogger(BuildEnsembleSearchIndex.class.getName()).log(Level.SEVERE, null, ex);
  94             }
  95             // create index
  96             System.out.println("Indexing to directory '" + indexDir + "'...");
  97             Directory dir = FSDirectory.open(indexDir);
  98             Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
  99             IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer);
 100             iwc.setOpenMode(OpenMode.CREATE);
 101             try (IndexWriter writer = new IndexWriter(dir, iwc)) {
 102                 // write all docs
 103                 System.out.println("Writing ["+docs.size()+"] documents to index....");
 104                 writer.addDocuments(docs);
 105                 // optimize the writen index
 106                 System.out.println("Optimizing search index....");
 107                 writer.optimize();
 108                 System.out.println("NUMBER OF INDEXED DOCUMENTS = ["+writer.numDocs()+"]");
 109             }
 110             // write file listing all the search index files, so we know what
 111             // is in the jar file at runtime
 112             try (FileWriter listAllOut = new FileWriter(new File(indexDir,"listAll.txt"))) {
 113                 for (String fileName: dir.listAll()) {
 114                     if (!"listAll.txt".equals(fileName)) { // don't include the "listAll.txt" file

 115                         Long length = dir.fileLength(fileName);
 116                         listAllOut.write(fileName);
 117                         listAllOut.write(':');
 118                         listAllOut.write(length.toString());
 119                         listAllOut.write('\n');
 120                     }
 121                 }
 122                 listAllOut.flush();
 123             }
 124             System.out.println("Finished writing search index to directory '" + indexDir);
 125         } catch (IOException ex) {
 126             Logger.getLogger(BuildEnsembleSearchIndex.class.getName()).log(Level.SEVERE, null, ex);
 127         }
 128     }
 129 
 130     private static List<Callable<List<Document>>> indexAllDocumentation(String javafxDocumentationHome) throws IOException{
 131         List<Callable<List<Document>>> tasks = new ArrayList<>();
 132         CharSequence content = grabWebPage(javafxDocumentationHome);
 133         String baseUrl = javafxDocumentationHome.substring(0,javafxDocumentationHome.lastIndexOf('/')+1);
 134 //        System.out.println("baseUrl = " + baseUrl);


 150             tasks.add((Callable<List<Document>>) () -> indexDocumentationPage(docPageUrl));
 151         }
 152         System.out.println(" --- end of list ---");
 153         return tasks;
 154     }
 155 
 156     private static List<Document> indexDocumentationPage(String docPageUrl) throws IOException{
 157         List<Document> docs = new ArrayList<>();
 158         try {
 159 //            System.out.println("PROCESSING... ["+docPageUrl+"] on Thread ["+Thread.currentThread().getName()+"]");
 160 //            System.out.println("==================================================================");
 161 //            System.out.println("Parsing docs page ["+docPageUrl+"] ...");
 162             DocumentationIndexer.DocPage docPage = DocumentationIndexer.parseDocsPage(docPageUrl, grabWebPage(docPageUrl).toString());
 163 //            System.out.println("TITLE="+docPage.bookTitle+"   CHAPTER="+docPage.chapter+"    SECTIONS=["+docPage.sections.size()+"]");
 164             for (DocumentationIndexer.Section section: docPage.sections) {
 165                 if (section.name == null) {
 166                     System.out.println("section.name = "+section.name+" docPage.bookTitle="+docPage.bookTitle+"    "+docPageUrl);
 167                 }
 168                 // write documentation section entry to index
 169                 docs.add(createDocument(DocumentType.DOC,
 170                     new Field("bookTitle", docPage.bookTitle, Field.Store.YES, Field.Index.ANALYZED),
 171                     new Field("chapter", docPage.chapter==null? "" : docPage.chapter, Field.Store.YES, Field.Index.ANALYZED),
 172                     new Field("name", section.name, Field.Store.YES, Field.Index.ANALYZED),
 173                     new Field("description", section.content, Field.Store.NO, Field.Index.ANALYZED),
 174                     new Field("ensemblePath", section.url, Field.Store.YES, Field.Index.NOT_ANALYZED)
 175                 ));
 176             }
 177             // handle next page if there is one
 178             if (docPage.nextUrl != null) {
 179                 docs.addAll(indexDocumentationPage(docPage.nextUrl));
 180             }
 181 
 182         } catch (Exception ex) {
 183             System.out.println("FAILED TO PARSE DOCS PAGE SO IGNORED: ["+docPageUrl+"]");
 184             ex.printStackTrace(System.out);
 185         }
 186         return docs;
 187     }
 188 
 189     private static List<Callable<List<Document>>> indexJavaDocAllClasses(final String javaDocBaseUrl) throws IOException{
 190         CharSequence content = grabWebPage(javaDocBaseUrl+"allclasses-noframe.html");
 191         List<Callable<List<Document>>> tasks = new ArrayList<>();
 192         // parse package
 193         Matcher matcher = findClassUrl.matcher(content);
 194         while (matcher.find()) {
 195             final String classUrl = javaDocBaseUrl+matcher.group(1);
 196             tasks.add((Callable<List<Document>>) () -> indexApiDocs(classUrl));
 197         }
 198         return tasks;
 199     }
 200 
 201     /**
 202      * Add all samples to the search index
 203      */
 204     private static List<Document> indexSamples(List<Sample> allSamples) throws IOException {
 205         List<Document> docs = new ArrayList<>();
 206         for (Sample sample: allSamples) {
 207             // write class entry to index
 208             docs.add(createDocument(DocumentType.SAMPLE,
 209                 new Field("name", sample.name, Field.Store.YES, Field.Index.ANALYZED),
 210                 new Field("description", sample.description, Field.Store.NO, Field.Index.ANALYZED),
 211                 new Field("shortDescription", sample.description.substring(0, Math.min(160, sample.description.length())),
 212                         Field.Store.YES, Field.Index.NOT_ANALYZED),
 213                 new Field("ensemblePath", "sample://"+sample.ensemblePath, Field.Store.YES, Field.Index.NOT_ANALYZED)
 214             ));
 215         }
 216         return docs;
 217     }
 218 
 219     /**
 220      * Index a JavaDoc page for a single class, interface or enum
 221      *
 222      * @param url The url to the javadoc html file
 223      * @return The documents created for the class and for its listed properties, methods, fields and enums
 224      * @throws IOException If there was a problem indexing the file
 225      */
 226     private static List<Document> indexApiDocs(String url) throws IOException {
 227 //        System.out.println("PROCESSING... ["+url+"] on Thread ["+Thread.currentThread().getName()+"]");
 228         final List<Document> docs = new ArrayList<>();
 229         CharSequence content = grabWebPage(url);
 230         // extract package and class
 231         Matcher packageAndClassMatcher = PACKAGE_AND_CLASS.matcher(content);
 232         // search and if we fail to find ignore this file
 233         if (!packageAndClassMatcher.find()) {


 240         String packageName = packageAndClassMatcher.group(1);
 241         //System.out.println("        packageName = " + packageName);
 242         String classType = packageAndClassMatcher.group(2).toLowerCase();
 243         //System.out.println("        classType = " + classType);
 244         String className = packageAndClassMatcher.group(3);
 245         //System.out.println("        className = " + className);
 246         // extract document type
 247         DocumentType documentType = DocumentType.CLASS;
 248         if ("enum".equals(classType)) {
 249             documentType = DocumentType.ENUM;
 250         }
 251         // extract javadoc description
 252         Matcher classDescriptionMatcher = CLASS_DESCRIPTION.matcher(content);
 253         String classDescription = "";
 254         if (classDescriptionMatcher.find()) {
 255             classDescription = cleanHTML(classDescriptionMatcher.group(1));
 256         }
 257         ///System.out.println("classDescription = " + classDescription);
 258         // write class entry to index
 259         docs.add(createDocument(documentType,
 260                 new Field("name", className, Field.Store.YES, Field.Index.ANALYZED),
 261                 new Field("description", classDescription, Field.Store.NO, Field.Index.ANALYZED),
 262                 new Field("shortDescription", classDescription.substring(0,Math.min(160,classDescription.length())),
 263                         Field.Store.YES, Field.Index.NOT_ANALYZED),
 264                 new Field("package", packageName, Field.Store.YES, Field.Index.ANALYZED),
 265                 new Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED),
 266                 new Field("ensemblePath", url, Field.Store.YES, Field.Index.NOT_ANALYZED) // TODO what do we need here
 267         ));
 268 
 269         // extract properties
 270         Matcher propertySummaryMatcher = PROPERTY_SUMMARY.matcher(content);
 271         if (propertySummaryMatcher.find()) {
 272             String propertySummaryTable = propertySummaryMatcher.group(1);
 273             Matcher propertyMatcher = PROPERTY.matcher(propertySummaryTable);
 274             while (propertyMatcher.find()) {
 275                 String propUrl = propertyMatcher.group(1);
 276                 String propertyName = propertyMatcher.group(2);
 277                 String description = cleanHTML(propertyMatcher.group(3));
 278                 //System.out.println("            propertyName = " + propertyName);
 279                 //System.out.println("                    description = " + description);
 280                 //System.out.println("                    url = " + url);
 281                 propUrl = url + "#" + propertyName;
 282                 //System.out.println("                    oracle url = " + url);
 283                 // write class entry to index
 284                 docs.add(createDocument(DocumentType.PROPERTY,
 285                         new Field("name", propertyName, Field.Store.YES, Field.Index.ANALYZED),
 286                         new Field("description", description, Field.Store.NO, Field.Index.ANALYZED),
 287                         new Field("shortDescription", description.substring(0,Math.min(160,description.length())),
 288                                 Field.Store.YES, Field.Index.NOT_ANALYZED),
 289                         new Field("url", propUrl, Field.Store.YES, Field.Index.NOT_ANALYZED),
 290                         new Field("className", className, Field.Store.YES, Field.Index.NOT_ANALYZED),
 291                         new Field("package", packageName, Field.Store.YES, Field.Index.NOT_ANALYZED),
 292                         new Field("ensemblePath", url + "#" + propertyName, Field.Store.YES, Field.Index.NOT_ANALYZED) // TODO what do we need here
 293                 ));
 294             }
 295         }
 296         // extract methods
 297         Matcher methodSummaryMatcher = METHOD_SUMMARY.matcher(content);
 298         if (methodSummaryMatcher.find()) {
 299             String methodSummaryTable = methodSummaryMatcher.group(1);
 300             Matcher methodMatcher = PROPERTY.matcher(methodSummaryTable);
 301             while (methodMatcher.find()) {
 302                 String methodUrl = methodMatcher.group(1);
 303                 String methodName = methodMatcher.group(2);
 304                 String description = cleanHTML(methodMatcher.group(3));
 305                 //System.out.println("            methodName = " + methodName);
 306                 //System.out.println("                    description = " + description);
 307                 //System.out.println("                    url = " + url);
 308                 methodUrl = url + "#" + methodName+"()";
 309                 //System.out.println("                    oracle url = " + url);
 310                 // write class entry to index
 311                 docs.add(createDocument(DocumentType.METHOD,
 312                         new Field("name", methodName, Field.Store.YES, Field.Index.ANALYZED),
 313                         new Field("description", description, Field.Store.NO, Field.Index.ANALYZED),
 314                         new Field("shortDescription", description.substring(0,Math.min(160,description.length())),
 315                                 Field.Store.YES, Field.Index.NOT_ANALYZED),
 316                         new Field("url", methodUrl, Field.Store.YES, Field.Index.NOT_ANALYZED),
 317                         new Field("className", className, Field.Store.YES, Field.Index.NOT_ANALYZED),
 318                         new Field("package", packageName, Field.Store.YES, Field.Index.NOT_ANALYZED),
 319                         new Field("ensemblePath", url + "#" + methodName + "()", Field.Store.YES, Field.Index.NOT_ANALYZED) // TODO what do we need here
 320                 ));
 321             }
 322         }
 323         // extract fields
 324         Matcher fieldSummaryMatcher = FIELD_SUMMARY.matcher(content);
 325         if (fieldSummaryMatcher.find()) {
 326             String fieldSummaryTable = fieldSummaryMatcher.group(1);
 327             Matcher fieldMatcher = PROPERTY.matcher(fieldSummaryTable);
 328             while (fieldMatcher.find()) {
 329                 String fieldUrl = fieldMatcher.group(1);
 330                 String fieldName = fieldMatcher.group(2);
 331                 String description = cleanHTML(fieldMatcher.group(3));
 332                 //System.out.println(" #####     fieldName = " + fieldName);
 333                 //System.out.println("                    description = " + description);
 334                 //System.out.println("                    url = " + url);
 335                 fieldUrl = url + "#" + fieldName;
 336                 //System.out.println("                    oracle url = " + url);
 337                 // write class entry to index
 338                 docs.add(createDocument(DocumentType.FIELD,
 339                         new Field("name", fieldName, Field.Store.YES, Field.Index.ANALYZED),
 340                         new Field("description", description, Field.Store.NO, Field.Index.ANALYZED),
 341                         new Field("shortDescription", description.substring(0,Math.min(160,description.length())),
 342                                 Field.Store.YES, Field.Index.NOT_ANALYZED),
 343                         new Field("url", fieldUrl, Field.Store.YES, Field.Index.NOT_ANALYZED),
 344                         new Field("className", className, Field.Store.YES, Field.Index.NOT_ANALYZED),
 345                         new Field("package", packageName, Field.Store.YES, Field.Index.NOT_ANALYZED),
 346                         new Field("ensemblePath", url + "#" + fieldName, Field.Store.YES, Field.Index.NOT_ANALYZED) // TODO what do we need here
 347                 ));
 348             }
 349         }
 350         // extract enums
 351         Matcher enumSummaryMatcher = ENUM_SUMMARY.matcher(content);
 352         if (enumSummaryMatcher.find()) {
 353             String enumSummaryTable = enumSummaryMatcher.group(1);
 354             Matcher enumMatcher = PROPERTY.matcher(enumSummaryTable);
 355             while (enumMatcher.find()) {
 356                 String enumUrl = enumMatcher.group(1);
 357                 String enumName = enumMatcher.group(2);
 358                 String description = cleanHTML(enumMatcher.group(3));
 359                 //System.out.println("            enumName = " + enumName);
 360                 //System.out.println("                    description = " + description);
 361                 //System.out.println("                    url = " + url);
 362                 enumUrl = url + "#" + enumName;
 363                 ///System.out.println("                    oracle url = " + url);
 364                 // write class entry to index
 365                 docs.add(createDocument(DocumentType.ENUM,
 366                         new Field("name", enumName, Field.Store.YES, Field.Index.ANALYZED),
 367                         new Field("description", description, Field.Store.NO, Field.Index.ANALYZED),
 368                         new Field("shortDescription", description.substring(0,Math.min(160,description.length())),
 369                                 Field.Store.YES, Field.Index.NOT_ANALYZED),
 370                         new Field("url", enumUrl, Field.Store.YES, Field.Index.NOT_ANALYZED),
 371                         new Field("className", className, Field.Store.YES, Field.Index.NOT_ANALYZED),
 372                         new Field("package", packageName, Field.Store.YES, Field.Index.NOT_ANALYZED),
 373                         new Field("ensemblePath", url+ "#" + enumName, Field.Store.YES, Field.Index.NOT_ANALYZED) // TODO what do we need here
 374                 ));
 375             }
 376         }
 377         return docs;
 378     }
 379 
 380     /**
 381      * Create a new document
 382      *
 383      * @param documentType The document type to save in the doc
 384      * @param fields       The searchable and data fields to write into doc
 385      * @throws IOException If there was problem writing doc
 386      */
 387     private static Document createDocument(DocumentType documentType, Field... fields) throws IOException {
 388         // make a new, empty document
 389         Document doc = new Document();
 390         // add doc type field
 391         doc.add(new Field("documentType", documentType.toString(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));

 392         // add other fields
 393         if (fields != null) {
 394             for (Field field : fields) {
 395                 doc.add(field);
 396             }
 397         }
 398         return doc;
 399     }
 400 
 401     /**
 402      * Create a new document and write it to the given writer
 403      *
 404      * @param writer       The writer to write out to
 405      * @param documentType The document type to save in the doc
 406      * @param fields       The searchable and data fields to write into doc
 407      * @throws IOException If there was problem writing doc
 408      */
 409     private static void addDocument(IndexWriter writer, DocumentType documentType, Field... fields) throws IOException {
 410         // make a new, empty document
 411         Document doc = new Document();
 412         // add doc type field
 413         doc.add(new Field("documentType", documentType.toString(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));

 414         // add other fields
 415         if (fields != null) {
 416             for (Field field : fields) {
 417                 doc.add(field);
 418             }
 419         }
 420         // write into index, assuming we are recreating every time
 421         writer.addDocument(doc);
 422     }
 423 
 424     /**
 425      * Clean HTML, removing all tags and un-escaping so that we can index it cleanly
 426      *
 427      * @param html The html to clean
 428      * @return cleaned html
 429      */
 430     private static String cleanHTML(String html) {
 431         html = html.replaceAll("(&nbsp;|\\s|[ ])+", " ").trim(); // cleanup whitespace
 432         html = html.replaceAll("<.*?>", " "); // remove html tags
 433         html = html.replaceAll("&lt;", "<"); // un-escape <


   1 /*
   2  * Copyright (c) 2008, 2017, Oracle and/or its affiliates.
   3  * All rights reserved. Use is subject to license terms.
   4  *
   5  * This file is available and licensed under the following license:
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  *  - Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  *  - Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the distribution.
  16  *  - Neither the name of Oracle Corporation nor the names of its
  17  *    contributors may be used to endorse or promote products derived
  18  *    from this software without specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  23  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  24  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  30  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31  */
  32 package ensemble.compiletime.search;
  33 
  34 import ensemble.compiletime.Sample;
  35 import java.io.BufferedReader;
  36 import java.io.File;
  37 import java.io.FileWriter;
  38 import java.io.IOException;
  39 import java.io.InputStreamReader;
  40 import java.net.URL;
  41 import java.util.ArrayList;
  42 import java.util.List;
  43 import java.util.concurrent.Callable;
  44 import java.util.concurrent.ExecutionException;
  45 import java.util.concurrent.Future;
  46 import java.util.concurrent.LinkedBlockingQueue;
  47 import java.util.concurrent.ThreadFactory;
  48 import java.util.concurrent.ThreadPoolExecutor;
  49 import java.util.concurrent.TimeUnit;
  50 import java.util.logging.Level;
  51 import java.util.logging.Logger;
  52 import java.util.regex.Matcher;
  53 import java.util.regex.Pattern;
  54 import org.apache.lucene.analysis.Analyzer;
  55 import org.apache.lucene.analysis.standard.StandardAnalyzer;
  56 import org.apache.lucene.document.Document;
  57 import org.apache.lucene.document.Field;
  58 import org.apache.lucene.document.SortedDocValuesField;
  59 import org.apache.lucene.document.StringField;
  60 import org.apache.lucene.document.TextField;
  61 import org.apache.lucene.index.IndexWriter;
  62 import org.apache.lucene.index.IndexWriterConfig;
  63 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
  64 import org.apache.lucene.store.Directory;
  65 import org.apache.lucene.store.FSDirectory;
  66 import org.apache.lucene.util.BytesRef;
  67 
  68 /**
  69  * Generate the lucene index that Ensemble uses for its search
  70  */
  71 public class BuildEnsembleSearchIndex {
  72 
  73     public static void buildSearchIndex(List<Sample> allSamples, String javaDocBaseUrl, String javafxDocumentationHome, File indexDir){
  74         try {
  75             List<Document> docs = new ArrayList<>();
  76             List<Callable<List<Document>>> tasks = new ArrayList<>();
  77             // create callables to collect data
  78             System.out.println("Creating Documents for Samples...");
  79             docs.addAll(indexSamples(allSamples));
  80             System.out.println("Creating tasks for getting all documentation...");
  81             System.out.println("javaDocBaseUrl = " + javaDocBaseUrl);
  82             System.out.println("javafxDocumentationHome = " + javafxDocumentationHome);
  83             tasks.addAll(indexJavaDocAllClasses(javaDocBaseUrl));
  84             tasks.addAll(indexAllDocumentation(javafxDocumentationHome));
  85             // execute all the tasks in 32 threads, collecting all the documents to write
  86             System.out.println("Executing tasks getting all documentation...");
  87             try {
  88                 ThreadPoolExecutor executor = new ThreadPoolExecutor(32,32,30, TimeUnit.SECONDS,new LinkedBlockingQueue());
  89                 executor.setThreadFactory(new ThreadFactory() {
  90                     int index = 0;
  91                     @Override public Thread newThread(Runnable r) {
  92                         Thread thread = new Thread(r,"Thread-"+(++index));
  93                         thread.setDaemon(true);
  94                         return thread;
  95                     }
  96                 });
  97                 List<Future<List<Document>>> results = executor.invokeAll(tasks);
  98                 for(Future<List<Document>> future : results) {
  99                     docs.addAll(future.get());
 100                 }
 101             } catch (ExecutionException | InterruptedException ex) {
 102                 Logger.getLogger(BuildEnsembleSearchIndex.class.getName()).log(Level.SEVERE, null, ex);
 103             }
 104             // create index
 105             System.out.println("Indexing to directory '" + indexDir + "'...");
 106             Directory dir = FSDirectory.open(indexDir.toPath());
 107             Analyzer analyzer = new StandardAnalyzer();
 108             IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
 109             iwc.setOpenMode(OpenMode.CREATE);
 110             try (IndexWriter writer = new IndexWriter(dir, iwc)) {
 111                 // write all docs
 112                 System.out.println("Writing ["+docs.size()+"] documents to index....");
 113                 writer.addDocuments(docs);



 114                 System.out.println("NUMBER OF INDEXED DOCUMENTS = ["+writer.numDocs()+"]");
 115             }
 116             // write file listing all the search index files, so we know what
 117             // is in the jar file at runtime
 118             try (FileWriter listAllOut = new FileWriter(new File(indexDir,"listAll.txt"))) {
 119                 for (String fileName: dir.listAll()) {
 120                     // don't include the "listAll.txt" file or "write.lock"
 121                     if (!"listAll.txt".equals(fileName) && !"write.lock".equals(fileName)) {
 122                         Long length = dir.fileLength(fileName);
 123                         listAllOut.write(fileName);
 124                         listAllOut.write(':');
 125                         listAllOut.write(length.toString());
 126                         listAllOut.write('\n');
 127                     }
 128                 }
 129                 listAllOut.flush();
 130             }
 131             System.out.println("Finished writing search index to directory '" + indexDir);
 132         } catch (IOException ex) {
 133             Logger.getLogger(BuildEnsembleSearchIndex.class.getName()).log(Level.SEVERE, null, ex);
 134         }
 135     }
 136 
 137     private static List<Callable<List<Document>>> indexAllDocumentation(String javafxDocumentationHome) throws IOException{
 138         List<Callable<List<Document>>> tasks = new ArrayList<>();
 139         CharSequence content = grabWebPage(javafxDocumentationHome);
 140         String baseUrl = javafxDocumentationHome.substring(0,javafxDocumentationHome.lastIndexOf('/')+1);
 141 //        System.out.println("baseUrl = " + baseUrl);


 157             tasks.add((Callable<List<Document>>) () -> indexDocumentationPage(docPageUrl));
 158         }
 159         System.out.println(" --- end of list ---");
 160         return tasks;
 161     }
 162 
 163     private static List<Document> indexDocumentationPage(String docPageUrl) throws IOException{
 164         List<Document> docs = new ArrayList<>();
 165         try {
 166 //            System.out.println("PROCESSING... ["+docPageUrl+"] on Thread ["+Thread.currentThread().getName()+"]");
 167 //            System.out.println("==================================================================");
 168 //            System.out.println("Parsing docs page ["+docPageUrl+"] ...");
 169             DocumentationIndexer.DocPage docPage = DocumentationIndexer.parseDocsPage(docPageUrl, grabWebPage(docPageUrl).toString());
 170 //            System.out.println("TITLE="+docPage.bookTitle+"   CHAPTER="+docPage.chapter+"    SECTIONS=["+docPage.sections.size()+"]");
 171             for (DocumentationIndexer.Section section: docPage.sections) {
 172                 if (section.name == null) {
 173                     System.out.println("section.name = "+section.name+" docPage.bookTitle="+docPage.bookTitle+"    "+docPageUrl);
 174                 }
 175                 // write documentation section entry to index
 176                 docs.add(createDocument(DocumentType.DOC,
 177                     new TextField("bookTitle", docPage.bookTitle, Field.Store.YES),
 178                     new TextField("chapter", docPage.chapter==null? "" : docPage.chapter, Field.Store.YES),
 179                     new TextField("name", section.name, Field.Store.YES),
 180                     new TextField("description", section.content, Field.Store.NO),
 181                     new StringField("ensemblePath", section.url, Field.Store.YES)
 182                 ));
 183             }
 184             // handle next page if there is one
 185             if (docPage.nextUrl != null) {
 186                 docs.addAll(indexDocumentationPage(docPage.nextUrl));
 187             }
 188 
 189         } catch (Exception ex) {
 190             System.out.println("FAILED TO PARSE DOCS PAGE SO IGNORED: ["+docPageUrl+"]");
 191             ex.printStackTrace(System.out);
 192         }
 193         return docs;
 194     }
 195 
 196     private static List<Callable<List<Document>>> indexJavaDocAllClasses(final String javaDocBaseUrl) throws IOException{
 197         CharSequence content = grabWebPage(javaDocBaseUrl+"allclasses-noframe.html");
 198         List<Callable<List<Document>>> tasks = new ArrayList<>();
 199         // parse package
 200         Matcher matcher = findClassUrl.matcher(content);
 201         while (matcher.find()) {
 202             final String classUrl = javaDocBaseUrl+matcher.group(1);
 203             tasks.add((Callable<List<Document>>) () -> indexApiDocs(classUrl));
 204         }
 205         return tasks;
 206     }
 207 
 208     /**
 209      * Add all samples to the search index
 210      */
 211     private static List<Document> indexSamples(List<Sample> allSamples) throws IOException {
 212         List<Document> docs = new ArrayList<>();
 213         for (Sample sample: allSamples) {
 214             // write class entry to index
 215             docs.add(createDocument(DocumentType.SAMPLE,
 216                 new TextField("name", sample.name, Field.Store.YES),
 217                 new TextField("description", sample.description, Field.Store.NO),
 218                 new StringField("shortDescription", sample.description.substring(0, Math.min(160, sample.description.length())),
 219                         Field.Store.YES),
 220                 new StringField("ensemblePath", "sample://"+sample.ensemblePath, Field.Store.YES)
 221             ));
 222         }
 223         return docs;
 224     }
 225 
 226     /**
 227      * Index a JavaDoc page for a single class, interface or enum
 228      *
 229      * @param writer The index writer to add documents to
 230      * @param url The url to the javadoc html file
 231      * @throws IOException If there was a problem indexing the file
 232      */
 233     private static List<Document> indexApiDocs(String url) throws IOException {
 234 //        System.out.println("PROCESSING... ["+url+"] on Thread ["+Thread.currentThread().getName()+"]");
 235         final List<Document> docs = new ArrayList<>();
 236         CharSequence content = grabWebPage(url);
 237         // extract package and class
 238         Matcher packageAndClassMatcher = PACKAGE_AND_CLASS.matcher(content);
 239         // search and if we fail to find ignore this file
 240         if (!packageAndClassMatcher.find()) {


 247         String packageName = packageAndClassMatcher.group(1);
 248         //System.out.println("        packageName = " + packageName);
 249         String classType = packageAndClassMatcher.group(2).toLowerCase();
 250         //System.out.println("        classType = " + classType);
 251         String className = packageAndClassMatcher.group(3);
 252         //System.out.println("        className = " + className);
 253         // extract document type
 254         DocumentType documentType = DocumentType.CLASS;
 255         if ("enum".equals(classType)) {
 256             documentType = DocumentType.ENUM;
 257         }
 258         // extract javadoc description
 259         Matcher classDescriptionMatcher = CLASS_DESCRIPTION.matcher(content);
 260         String classDescription = "";
 261         if (classDescriptionMatcher.find()) {
 262             classDescription = cleanHTML(classDescriptionMatcher.group(1));
 263         }
 264         ///System.out.println("classDescription = " + classDescription);
 265         // write class entry to index
 266         docs.add(createDocument(documentType,
 267                 new TextField("name", className, Field.Store.YES),
 268                 new TextField("description", classDescription, Field.Store.NO),
 269                 new StringField("shortDescription", classDescription.substring(0,Math.min(160,classDescription.length())),
 270                         Field.Store.YES),
 271                 new TextField("package", packageName, Field.Store.YES),
 272                 new StringField("url", url, Field.Store.YES),
 273                 new StringField("ensemblePath", url, Field.Store.YES) // TODO what do we need here
 274         ));
 275 
 276         // extract properties
 277         Matcher propertySummaryMatcher = PROPERTY_SUMMARY.matcher(content);
 278         if (propertySummaryMatcher.find()) {
 279             String propertySummaryTable = propertySummaryMatcher.group(1);
 280             Matcher propertyMatcher = PROPERTY.matcher(propertySummaryTable);
 281             while (propertyMatcher.find()) {
 282                 String propUrl = propertyMatcher.group(1);
 283                 String propertyName = propertyMatcher.group(2);
 284                 String description = cleanHTML(propertyMatcher.group(3));
 285                 //System.out.println("            propertyName = " + propertyName);
 286                 //System.out.println("                    description = " + description);
 287                 //System.out.println("                    url = " + url);
 288                 propUrl = url + "#" + propertyName;
 289                 //System.out.println("                    oracle url = " + url);
 290                 // write class entry to index
 291                 docs.add(createDocument(DocumentType.PROPERTY,
 292                         new TextField("name", propertyName, Field.Store.YES),
 293                         new TextField("description", description, Field.Store.NO),
 294                         new StringField("shortDescription", description.substring(0,Math.min(160,description.length())),
 295                                 Field.Store.YES),
 296                         new StringField("url", propUrl, Field.Store.YES),
 297                         new StringField("className", className, Field.Store.YES),
 298                         new StringField("package", packageName, Field.Store.YES),
 299                         new StringField("ensemblePath", url + "#" + propertyName, Field.Store.YES) // TODO what do we need here
 300                 ));
 301             }
 302         }
 303         // extract methods
 304         Matcher methodSummaryMatcher = METHOD_SUMMARY.matcher(content);
 305         if (methodSummaryMatcher.find()) {
 306             String methodSummaryTable = methodSummaryMatcher.group(1);
 307             Matcher methodMatcher = PROPERTY.matcher(methodSummaryTable);
 308             while (methodMatcher.find()) {
 309                 String methodUrl = methodMatcher.group(1);
 310                 String methodName = methodMatcher.group(2);
 311                 String description = cleanHTML(methodMatcher.group(3));
 312                 //System.out.println("            methodName = " + methodName);
 313                 //System.out.println("                    description = " + description);
 314                 //System.out.println("                    url = " + url);
 315                 methodUrl = url + "#" + methodName+"()";
 316                 //System.out.println("                    oracle url = " + url);
 317                 // write class entry to index
 318                 docs.add(createDocument(DocumentType.METHOD,
 319                         new TextField("name", methodName, Field.Store.YES),
 320                         new TextField("description", description, Field.Store.NO),
 321                         new StringField("shortDescription", description.substring(0,Math.min(160,description.length())),
 322                                 Field.Store.YES),
 323                         new StringField("url", methodUrl, Field.Store.YES),
 324                         new StringField("className", className, Field.Store.YES),
 325                         new StringField("package", packageName, Field.Store.YES),
 326                         new StringField("ensemblePath", url + "#" + methodName + "()", Field.Store.YES) // TODO what do we need here
 327                 ));
 328             }
 329         }
 330         // extract fields
 331         Matcher fieldSummaryMatcher = FIELD_SUMMARY.matcher(content);
 332         if (fieldSummaryMatcher.find()) {
 333             String fieldSummaryTable = fieldSummaryMatcher.group(1);
 334             Matcher fieldMatcher = PROPERTY.matcher(fieldSummaryTable);
 335             while (fieldMatcher.find()) {
 336                 String fieldUrl = fieldMatcher.group(1);
 337                 String fieldName = fieldMatcher.group(2);
 338                 String description = cleanHTML(fieldMatcher.group(3));
 339                 //System.out.println(" #####     fieldName = " + fieldName);
 340                 //System.out.println("                    description = " + description);
 341                 //System.out.println("                    url = " + url);
 342                 fieldUrl = url + "#" + fieldName;
 343                 //System.out.println("                    oracle url = " + url);
 344                 // write class entry to index
 345                 docs.add(createDocument(DocumentType.FIELD,
 346                         new TextField("name", fieldName, Field.Store.YES),
 347                         new TextField("description", description, Field.Store.NO),
 348                         new StringField("shortDescription", description.substring(0,Math.min(160,description.length())),
 349                                 Field.Store.YES),
 350                         new StringField("url", fieldUrl, Field.Store.YES),
 351                         new StringField("className", className, Field.Store.YES),
 352                         new StringField("package", packageName, Field.Store.YES),
 353                         new StringField("ensemblePath", url + "#" + fieldName, Field.Store.YES) // TODO what do we need here
 354                 ));
 355             }
 356         }
 357         // extract enums
 358         Matcher enumSummaryMatcher = ENUM_SUMMARY.matcher(content);
 359         if (enumSummaryMatcher.find()) {
 360             String enumSummaryTable = enumSummaryMatcher.group(1);
 361             Matcher enumMatcher = PROPERTY.matcher(enumSummaryTable);
 362             while (enumMatcher.find()) {
 363                 String enumUrl = enumMatcher.group(1);
 364                 String enumName = enumMatcher.group(2);
 365                 String description = cleanHTML(enumMatcher.group(3));
 366                 //System.out.println("            enumName = " + enumName);
 367                 //System.out.println("                    description = " + description);
 368                 //System.out.println("                    url = " + url);
 369                 enumUrl = url + "#" + enumName;
 370                 ///System.out.println("                    oracle url = " + url);
 371                 // write class entry to index
 372                 docs.add(createDocument(DocumentType.ENUM,
 373                         new TextField("name", enumName, Field.Store.YES),
 374                         new TextField("description", description, Field.Store.NO),
 375                         new StringField("shortDescription", description.substring(0,Math.min(160,description.length())),
 376                                 Field.Store.YES),
 377                         new StringField("url", enumUrl, Field.Store.YES),
 378                         new StringField("className", className, Field.Store.YES),
 379                         new StringField("package", packageName, Field.Store.YES),
 380                         new StringField("ensemblePath", url+ "#" + enumName, Field.Store.YES) // TODO what do we need here
 381                 ));
 382             }
 383         }
 384         return docs;
 385     }
 386 
 387     /**
 388      * Create a new document
 389      *
 390      * @param documentType The document type to save in the doc
 391      * @param fields       The searchable and data fields to write into doc
 392      * @throws IOException If there was problem writing doc
 393      */
 394     private static Document createDocument(DocumentType documentType, Field... fields) throws IOException {
 395         // make a new, empty document
 396         Document doc = new Document();
 397         // add doc type field + sorting field
 398         doc.add(new StringField("documentType", documentType.toString(), Field.Store.YES));
 399         doc.add(new SortedDocValuesField("documentType", new BytesRef(documentType.toString())));
 400         // add other fields
 401         if (fields != null) {
 402             for (Field field : fields) {
 403                 doc.add(field);
 404             }
 405         }
 406         return doc;
 407     }
 408 
 409     /**
 410      * Create a new document and write it to the given writer
 411      *
 412      * @param writer       The writer to write out to
 413      * @param documentType The document type to save in the doc
 414      * @param fields       The searchable and data fields to write into doc
 415      * @throws IOException If there was problem writing doc
 416      */
 417     private static void addDocument(IndexWriter writer, DocumentType documentType, Field... fields) throws IOException {
 418         // make a new, empty document
 419         Document doc = new Document();
 420         // add doc type field + sorting field
 421         doc.add(new StringField("documentType", documentType.toString(), Field.Store.YES));
 422         doc.add(new SortedDocValuesField("documentType", new BytesRef(documentType.toString())));
 423         // add other fields
 424         if (fields != null) {
 425             for (Field field : fields) {
 426                 doc.add(field);
 427             }
 428         }
 429         // write into index, assuming we are recreating every time
 430         writer.addDocument(doc);
 431     }
 432 
 433     /**
 434      * Clean HTML, removing all tags and un-escaping so that we can index it cleanly
 435      *
 436      * @param html The html to clean
 437      * @return cleaned html
 438      */
 439     private static String cleanHTML(String html) {
 440         html = html.replaceAll("(&nbsp;|\\s|[ ])+", " ").trim(); // cleanup whitespace
 441         html = html.replaceAll("<.*?>", " "); // remove html tags
 442         html = html.replaceAll("&lt;", "<"); // un-escape <


< prev index next >