1 /*
2 * Copyright (c) 2008, 2015, Oracle and/or its affiliates.
3 * All rights reserved. Use is subject to license terms.
4 *
5 * This file is available and licensed under the following license:
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * - Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * - Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the distribution.
16 * - Neither the name of Oracle Corporation nor the names of its
17 * contributors may be used to endorse or promote products derived
18 * from this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32 package ensemble.compiletime.search;
33
34 import ensemble.compiletime.Sample;
35 import java.io.*;
36 import java.net.URL;
37 import java.util.ArrayList;
38 import java.util.List;
39 import java.util.concurrent.Callable;
40 import java.util.concurrent.ExecutionException;
41 import java.util.concurrent.Future;
42 import java.util.concurrent.LinkedBlockingQueue;
43 import java.util.concurrent.ThreadFactory;
44 import java.util.concurrent.ThreadPoolExecutor;
45 import java.util.concurrent.TimeUnit;
46 import java.util.logging.Level;
47 import java.util.logging.Logger;
48 import java.util.regex.Matcher;
49 import java.util.regex.Pattern;
50 import org.apache.lucene.analysis.Analyzer;
51 import org.apache.lucene.analysis.standard.StandardAnalyzer;
52 import org.apache.lucene.document.Document;
53 import org.apache.lucene.document.Field;
54 import org.apache.lucene.index.IndexWriter;
55 import org.apache.lucene.index.IndexWriterConfig;
56 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
57 import org.apache.lucene.store.Directory;
58 import org.apache.lucene.store.FSDirectory;
59 import org.apache.lucene.util.Version;
60
61 /**
62 * Generate the lucene index that Ensemble uses for its search
63 */
64 public class BuildEnsembleSearchIndex {
65
66 public static void buildSearchIndex(List<Sample> allSamples, String javaDocBaseUrl, String javafxDocumentationHome, File indexDir){
67 try {
68 List<Document> docs = new ArrayList<>();
69 List<Callable<List<Document>>> tasks = new ArrayList<>();
70 // create callables to collect data
71 System.out.println("Creating Documents for Samples...");
72 docs.addAll(indexSamples(allSamples));
73 System.out.println("Creating tasks for getting all documentation...");
74 tasks.addAll(indexJavaDocAllClasses(javaDocBaseUrl));
75 tasks.addAll(indexAllDocumentation(javafxDocumentationHome));
76 // execute all the tasks in 32 threads, collecting all the documents to write
77 System.out.println("Executing tasks getting all documentation...");
78 try {
79 ThreadPoolExecutor executor = new ThreadPoolExecutor(32,32,30, TimeUnit.SECONDS,new LinkedBlockingQueue());
80 executor.setThreadFactory(new ThreadFactory() {
81 int index = 0;
82 @Override public Thread newThread(Runnable r) {
83 Thread thread = new Thread(r,"Thread-"+(++index));
84 thread.setDaemon(true);
85 return thread;
86 }
87 });
88 List<Future<List<Document>>> results = executor.invokeAll(tasks);
89 for(Future<List<Document>> future : results) {
90 docs.addAll(future.get());
91 }
92 } catch (ExecutionException | InterruptedException ex) {
93 Logger.getLogger(BuildEnsembleSearchIndex.class.getName()).log(Level.SEVERE, null, ex);
94 }
95 // create index
96 System.out.println("Indexing to directory '" + indexDir + "'...");
97 Directory dir = FSDirectory.open(indexDir);
98 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
99 IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer);
100 iwc.setOpenMode(OpenMode.CREATE);
101 try (IndexWriter writer = new IndexWriter(dir, iwc)) {
102 // write all docs
103 System.out.println("Writing ["+docs.size()+"] documents to index....");
104 writer.addDocuments(docs);
105 // optimize the writen index
106 System.out.println("Optimizing search index....");
107 writer.optimize();
108 System.out.println("NUMBER OF INDEXED DOCUMENTS = ["+writer.numDocs()+"]");
109 }
110 // write file listing all the search index files, so we know what
111 // is in the jar file at runtime
112 try (FileWriter listAllOut = new FileWriter(new File(indexDir,"listAll.txt"))) {
113 for (String fileName: dir.listAll()) {
114 if (!"listAll.txt".equals(fileName)) { // don't include the "listAll.txt" file
115 Long length = dir.fileLength(fileName);
116 listAllOut.write(fileName);
117 listAllOut.write(':');
118 listAllOut.write(length.toString());
119 listAllOut.write('\n');
120 }
121 }
122 listAllOut.flush();
123 }
124 System.out.println("Finished writing search index to directory '" + indexDir);
125 } catch (IOException ex) {
126 Logger.getLogger(BuildEnsembleSearchIndex.class.getName()).log(Level.SEVERE, null, ex);
127 }
128 }
129
130 private static List<Callable<List<Document>>> indexAllDocumentation(String javafxDocumentationHome) throws IOException{
131 List<Callable<List<Document>>> tasks = new ArrayList<>();
132 CharSequence content = grabWebPage(javafxDocumentationHome);
133 String baseUrl = javafxDocumentationHome.substring(0,javafxDocumentationHome.lastIndexOf('/')+1);
134 // System.out.println("baseUrl = " + baseUrl);
150 tasks.add((Callable<List<Document>>) () -> indexDocumentationPage(docPageUrl));
151 }
152 System.out.println(" --- end of list ---");
153 return tasks;
154 }
155
156 private static List<Document> indexDocumentationPage(String docPageUrl) throws IOException{
157 List<Document> docs = new ArrayList<>();
158 try {
159 // System.out.println("PROCESSING... ["+docPageUrl+"] on Thread ["+Thread.currentThread().getName()+"]");
160 // System.out.println("==================================================================");
161 // System.out.println("Parsing docs page ["+docPageUrl+"] ...");
162 DocumentationIndexer.DocPage docPage = DocumentationIndexer.parseDocsPage(docPageUrl, grabWebPage(docPageUrl).toString());
163 // System.out.println("TITLE="+docPage.bookTitle+" CHAPTER="+docPage.chapter+" SECTIONS=["+docPage.sections.size()+"]");
164 for (DocumentationIndexer.Section section: docPage.sections) {
165 if (section.name == null) {
166 System.out.println("section.name = "+section.name+" docPage.bookTitle="+docPage.bookTitle+" "+docPageUrl);
167 }
168 // write documentation section entry to index
169 docs.add(createDocument(DocumentType.DOC,
170 new Field("bookTitle", docPage.bookTitle, Field.Store.YES, Field.Index.ANALYZED),
171 new Field("chapter", docPage.chapter==null? "" : docPage.chapter, Field.Store.YES, Field.Index.ANALYZED),
172 new Field("name", section.name, Field.Store.YES, Field.Index.ANALYZED),
173 new Field("description", section.content, Field.Store.NO, Field.Index.ANALYZED),
174 new Field("ensemblePath", section.url, Field.Store.YES, Field.Index.NOT_ANALYZED)
175 ));
176 }
177 // handle next page if there is one
178 if (docPage.nextUrl != null) {
179 docs.addAll(indexDocumentationPage(docPage.nextUrl));
180 }
181
182 } catch (Exception ex) {
183 System.out.println("FAILED TO PARSE DOCS PAGE SO IGNORED: ["+docPageUrl+"]");
184 ex.printStackTrace(System.out);
185 }
186 return docs;
187 }
188
189 private static List<Callable<List<Document>>> indexJavaDocAllClasses(final String javaDocBaseUrl) throws IOException{
190 CharSequence content = grabWebPage(javaDocBaseUrl+"allclasses-noframe.html");
191 List<Callable<List<Document>>> tasks = new ArrayList<>();
192 // parse package
193 Matcher matcher = findClassUrl.matcher(content);
194 while (matcher.find()) {
195 final String classUrl = javaDocBaseUrl+matcher.group(1);
196 tasks.add((Callable<List<Document>>) () -> indexApiDocs(classUrl));
197 }
198 return tasks;
199 }
200
201 /**
202 * Add all samples to the search index
203 */
204 private static List<Document> indexSamples(List<Sample> allSamples) throws IOException {
205 List<Document> docs = new ArrayList<>();
206 for (Sample sample: allSamples) {
207 // write class entry to index
208 docs.add(createDocument(DocumentType.SAMPLE,
209 new Field("name", sample.name, Field.Store.YES, Field.Index.ANALYZED),
210 new Field("description", sample.description, Field.Store.NO, Field.Index.ANALYZED),
211 new Field("shortDescription", sample.description.substring(0, Math.min(160, sample.description.length())),
212 Field.Store.YES, Field.Index.NOT_ANALYZED),
213 new Field("ensemblePath", "sample://"+sample.ensemblePath, Field.Store.YES, Field.Index.NOT_ANALYZED)
214 ));
215 }
216 return docs;
217 }
218
219 /**
220 * Index a JavaDoc page for a single class, interface or enum
221 *
222 * @param writer The index writer to add documents to
223 * @param url The url to the javadoc html file
224 * @throws IOException If there was a problem indexing the file
225 */
226 private static List<Document> indexApiDocs(String url) throws IOException {
227 // System.out.println("PROCESSING... ["+url+"] on Thread ["+Thread.currentThread().getName()+"]");
228 final List<Document> docs = new ArrayList<>();
229 CharSequence content = grabWebPage(url);
230 // extract package and class
231 Matcher packageAndClassMatcher = PACKAGE_AND_CLASS.matcher(content);
232 // search and if we fail to find ignore this file
233 if (!packageAndClassMatcher.find()) {
240 String packageName = packageAndClassMatcher.group(1);
241 //System.out.println(" packageName = " + packageName);
242 String classType = packageAndClassMatcher.group(2).toLowerCase();
243 //System.out.println(" classType = " + classType);
244 String className = packageAndClassMatcher.group(3);
245 //System.out.println(" className = " + className);
246 // extract document type
247 DocumentType documentType = DocumentType.CLASS;
248 if ("enum".equals(classType)) {
249 documentType = DocumentType.ENUM;
250 }
251 // extract javadoc description
252 Matcher classDescriptionMatcher = CLASS_DESCRIPTION.matcher(content);
253 String classDescription = "";
254 if (classDescriptionMatcher.find()) {
255 classDescription = cleanHTML(classDescriptionMatcher.group(1));
256 }
257 ///System.out.println("classDescription = " + classDescription);
258 // write class entry to index
259 docs.add(createDocument(documentType,
260 new Field("name", className, Field.Store.YES, Field.Index.ANALYZED),
261 new Field("description", classDescription, Field.Store.NO, Field.Index.ANALYZED),
262 new Field("shortDescription", classDescription.substring(0,Math.min(160,classDescription.length())),
263 Field.Store.YES, Field.Index.NOT_ANALYZED),
264 new Field("package", packageName, Field.Store.YES, Field.Index.ANALYZED),
265 new Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED),
266 new Field("ensemblePath", url, Field.Store.YES, Field.Index.NOT_ANALYZED) // TODO what do we need here
267 ));
268
269 // extract properties
270 Matcher propertySummaryMatcher = PROPERTY_SUMMARY.matcher(content);
271 if (propertySummaryMatcher.find()) {
272 String propertySummaryTable = propertySummaryMatcher.group(1);
273 Matcher propertyMatcher = PROPERTY.matcher(propertySummaryTable);
274 while (propertyMatcher.find()) {
275 String propUrl = propertyMatcher.group(1);
276 String propertyName = propertyMatcher.group(2);
277 String description = cleanHTML(propertyMatcher.group(3));
278 //System.out.println(" propertyName = " + propertyName);
279 //System.out.println(" description = " + description);
280 //System.out.println(" url = " + url);
281 propUrl = url + "#" + propertyName;
282 //System.out.println(" oracle url = " + url);
283 // write class entry to index
284 docs.add(createDocument(DocumentType.PROPERTY,
285 new Field("name", propertyName, Field.Store.YES, Field.Index.ANALYZED),
286 new Field("description", description, Field.Store.NO, Field.Index.ANALYZED),
287 new Field("shortDescription", description.substring(0,Math.min(160,description.length())),
288 Field.Store.YES, Field.Index.NOT_ANALYZED),
289 new Field("url", propUrl, Field.Store.YES, Field.Index.NOT_ANALYZED),
290 new Field("className", className, Field.Store.YES, Field.Index.NOT_ANALYZED),
291 new Field("package", packageName, Field.Store.YES, Field.Index.NOT_ANALYZED),
292 new Field("ensemblePath", url + "#" + propertyName, Field.Store.YES, Field.Index.NOT_ANALYZED) // TODO what do we need here
293 ));
294 }
295 }
296 // extract methods
297 Matcher methodSummaryMatcher = METHOD_SUMMARY.matcher(content);
298 if (methodSummaryMatcher.find()) {
299 String methodSummaryTable = methodSummaryMatcher.group(1);
300 Matcher methodMatcher = PROPERTY.matcher(methodSummaryTable);
301 while (methodMatcher.find()) {
302 String methodUrl = methodMatcher.group(1);
303 String methodName = methodMatcher.group(2);
304 String description = cleanHTML(methodMatcher.group(3));
305 //System.out.println(" methodName = " + methodName);
306 //System.out.println(" description = " + description);
307 //System.out.println(" url = " + url);
308 methodUrl = url + "#" + methodName+"()";
309 //System.out.println(" oracle url = " + url);
310 // write class entry to index
311 docs.add(createDocument(DocumentType.METHOD,
312 new Field("name", methodName, Field.Store.YES, Field.Index.ANALYZED),
313 new Field("description", description, Field.Store.NO, Field.Index.ANALYZED),
314 new Field("shortDescription", description.substring(0,Math.min(160,description.length())),
315 Field.Store.YES, Field.Index.NOT_ANALYZED),
316 new Field("url", methodUrl, Field.Store.YES, Field.Index.NOT_ANALYZED),
317 new Field("className", className, Field.Store.YES, Field.Index.NOT_ANALYZED),
318 new Field("package", packageName, Field.Store.YES, Field.Index.NOT_ANALYZED),
319 new Field("ensemblePath", url + "#" + methodName + "()", Field.Store.YES, Field.Index.NOT_ANALYZED) // TODO what do we need here
320 ));
321 }
322 }
323 // extract fields
324 Matcher fieldSummaryMatcher = FIELD_SUMMARY.matcher(content);
325 if (fieldSummaryMatcher.find()) {
326 String fieldSummaryTable = fieldSummaryMatcher.group(1);
327 Matcher fieldMatcher = PROPERTY.matcher(fieldSummaryTable);
328 while (fieldMatcher.find()) {
329 String fieldUrl = fieldMatcher.group(1);
330 String fieldName = fieldMatcher.group(2);
331 String description = cleanHTML(fieldMatcher.group(3));
332 //System.out.println(" ##### fieldName = " + fieldName);
333 //System.out.println(" description = " + description);
334 //System.out.println(" url = " + url);
335 fieldUrl = url + "#" + fieldName;
336 //System.out.println(" oracle url = " + url);
337 // write class entry to index
338 docs.add(createDocument(DocumentType.FIELD,
339 new Field("name", fieldName, Field.Store.YES, Field.Index.ANALYZED),
340 new Field("description", description, Field.Store.NO, Field.Index.ANALYZED),
341 new Field("shortDescription", description.substring(0,Math.min(160,description.length())),
342 Field.Store.YES, Field.Index.NOT_ANALYZED),
343 new Field("url", fieldUrl, Field.Store.YES, Field.Index.NOT_ANALYZED),
344 new Field("className", className, Field.Store.YES, Field.Index.NOT_ANALYZED),
345 new Field("package", packageName, Field.Store.YES, Field.Index.NOT_ANALYZED),
346 new Field("ensemblePath", url + "#" + fieldName, Field.Store.YES, Field.Index.NOT_ANALYZED) // TODO what do we need here
347 ));
348 }
349 }
350 // extract enums
351 Matcher enumSummaryMatcher = ENUM_SUMMARY.matcher(content);
352 if (enumSummaryMatcher.find()) {
353 String enumSummaryTable = enumSummaryMatcher.group(1);
354 Matcher enumMatcher = PROPERTY.matcher(enumSummaryTable);
355 while (enumMatcher.find()) {
356 String enumUrl = enumMatcher.group(1);
357 String enumName = enumMatcher.group(2);
358 String description = cleanHTML(enumMatcher.group(3));
359 //System.out.println(" enumName = " + enumName);
360 //System.out.println(" description = " + description);
361 //System.out.println(" url = " + url);
362 enumUrl = url + "#" + enumName;
363 ///System.out.println(" oracle url = " + url);
364 // write class entry to index
365 docs.add(createDocument(DocumentType.ENUM,
366 new Field("name", enumName, Field.Store.YES, Field.Index.ANALYZED),
367 new Field("description", description, Field.Store.NO, Field.Index.ANALYZED),
368 new Field("shortDescription", description.substring(0,Math.min(160,description.length())),
369 Field.Store.YES, Field.Index.NOT_ANALYZED),
370 new Field("url", enumUrl, Field.Store.YES, Field.Index.NOT_ANALYZED),
371 new Field("className", className, Field.Store.YES, Field.Index.NOT_ANALYZED),
372 new Field("package", packageName, Field.Store.YES, Field.Index.NOT_ANALYZED),
373 new Field("ensemblePath", url+ "#" + enumName, Field.Store.YES, Field.Index.NOT_ANALYZED) // TODO what do we need here
374 ));
375 }
376 }
377 return docs;
378 }
379
380 /**
381 * Create a new document
382 *
383 * @param documentType The document type to save in the doc
384 * @param fields The searchable and data fields to write into doc
385 * @throws IOException If there was problem writing doc
386 */
387 private static Document createDocument(DocumentType documentType, Field... fields) throws IOException {
388 // make a new, empty document
389 Document doc = new Document();
390 // add doc type field
391 doc.add(new Field("documentType", documentType.toString(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
392 // add other fields
393 if (fields != null) {
394 for (Field field : fields) {
395 doc.add(field);
396 }
397 }
398 return doc;
399 }
400
401 /**
402 * Create a new document and write it to the given writer
403 *
404 * @param writer The writer to write out to
405 * @param documentType The document type to save in the doc
406 * @param fields The searchable and data fields to write into doc
407 * @throws IOException If there was problem writing doc
408 */
409 private static void addDocument(IndexWriter writer, DocumentType documentType, Field... fields) throws IOException {
410 // make a new, empty document
411 Document doc = new Document();
412 // add doc type field
413 doc.add(new Field("documentType", documentType.toString(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
414 // add other fields
415 if (fields != null) {
416 for (Field field : fields) {
417 doc.add(field);
418 }
419 }
420 // write into index, assuming we are recreating every time
421 writer.addDocument(doc);
422 }
423
424 /**
425 * Clean HTML, removing all tags and un-escaping so that we can index it cleanly
426 *
427 * @param html The html to clean
428 * @return cleaned html
429 */
430 private static String cleanHTML(String html) {
431 html = html.replaceAll("( |\\s|[ ])+", " ").trim(); // cleanup whitespace
432 html = html.replaceAll("<.*?>", " "); // remove html tags
433 html = html.replaceAll("<", "<"); // un-escape <
|
1 /*
2 * Copyright (c) 2008, 2017, Oracle and/or its affiliates.
3 * All rights reserved. Use is subject to license terms.
4 *
5 * This file is available and licensed under the following license:
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * - Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * - Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the distribution.
16 * - Neither the name of Oracle Corporation nor the names of its
17 * contributors may be used to endorse or promote products derived
18 * from this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32 package ensemble.compiletime.search;
33
34 import ensemble.compiletime.Sample;
35 import java.io.BufferedReader;
36 import java.io.File;
37 import java.io.FileWriter;
38 import java.io.IOException;
39 import java.io.InputStreamReader;
40 import java.net.URL;
41 import java.util.ArrayList;
42 import java.util.List;
43 import java.util.concurrent.Callable;
44 import java.util.concurrent.ExecutionException;
45 import java.util.concurrent.Future;
46 import java.util.concurrent.LinkedBlockingQueue;
47 import java.util.concurrent.ThreadFactory;
48 import java.util.concurrent.ThreadPoolExecutor;
49 import java.util.concurrent.TimeUnit;
50 import java.util.logging.Level;
51 import java.util.logging.Logger;
52 import java.util.regex.Matcher;
53 import java.util.regex.Pattern;
54 import org.apache.lucene.analysis.Analyzer;
55 import org.apache.lucene.analysis.standard.StandardAnalyzer;
56 import org.apache.lucene.document.Document;
57 import org.apache.lucene.document.Field;
58 import org.apache.lucene.document.SortedDocValuesField;
59 import org.apache.lucene.document.StringField;
60 import org.apache.lucene.document.TextField;
61 import org.apache.lucene.index.IndexWriter;
62 import org.apache.lucene.index.IndexWriterConfig;
63 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
64 import org.apache.lucene.store.Directory;
65 import org.apache.lucene.store.FSDirectory;
66 import org.apache.lucene.util.BytesRef;
67
68 /**
69 * Generate the lucene index that Ensemble uses for its search
70 */
71 public class BuildEnsembleSearchIndex {
72
73 public static void buildSearchIndex(List<Sample> allSamples, String javaDocBaseUrl, String javafxDocumentationHome, File indexDir){
74 try {
75 List<Document> docs = new ArrayList<>();
76 List<Callable<List<Document>>> tasks = new ArrayList<>();
77 // create callables to collect data
78 System.out.println("Creating Documents for Samples...");
79 docs.addAll(indexSamples(allSamples));
80 System.out.println("Creating tasks for getting all documentation...");
81 System.out.println("javaDocBaseUrl = " + javaDocBaseUrl);
82 System.out.println("javafxDocumentationHome = " + javafxDocumentationHome);
83 tasks.addAll(indexJavaDocAllClasses(javaDocBaseUrl));
84 tasks.addAll(indexAllDocumentation(javafxDocumentationHome));
85 // execute all the tasks in 32 threads, collecting all the documents to write
86 System.out.println("Executing tasks getting all documentation...");
87 try {
88 ThreadPoolExecutor executor = new ThreadPoolExecutor(32,32,30, TimeUnit.SECONDS,new LinkedBlockingQueue());
89 executor.setThreadFactory(new ThreadFactory() {
90 int index = 0;
91 @Override public Thread newThread(Runnable r) {
92 Thread thread = new Thread(r,"Thread-"+(++index));
93 thread.setDaemon(true);
94 return thread;
95 }
96 });
97 List<Future<List<Document>>> results = executor.invokeAll(tasks);
98 for(Future<List<Document>> future : results) {
99 docs.addAll(future.get());
100 }
101 } catch (ExecutionException | InterruptedException ex) {
102 Logger.getLogger(BuildEnsembleSearchIndex.class.getName()).log(Level.SEVERE, null, ex);
103 }
104 // create index
105 System.out.println("Indexing to directory '" + indexDir + "'...");
106 Directory dir = FSDirectory.open(indexDir.toPath());
107 Analyzer analyzer = new StandardAnalyzer();
108 IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
109 iwc.setOpenMode(OpenMode.CREATE);
110 try (IndexWriter writer = new IndexWriter(dir, iwc)) {
111 // write all docs
112 System.out.println("Writing ["+docs.size()+"] documents to index....");
113 writer.addDocuments(docs);
114 System.out.println("NUMBER OF INDEXED DOCUMENTS = ["+writer.numDocs()+"]");
115 }
116 // write file listing all the search index files, so we know what
117 // is in the jar file at runtime
118 try (FileWriter listAllOut = new FileWriter(new File(indexDir,"listAll.txt"))) {
119 for (String fileName: dir.listAll()) {
120 // don't include the "listAll.txt" file or "write.lock"
121 if (!"listAll.txt".equals(fileName) && !"write.lock".equals(fileName)) {
122 Long length = dir.fileLength(fileName);
123 listAllOut.write(fileName);
124 listAllOut.write(':');
125 listAllOut.write(length.toString());
126 listAllOut.write('\n');
127 }
128 }
129 listAllOut.flush();
130 }
131 System.out.println("Finished writing search index to directory '" + indexDir);
132 } catch (IOException ex) {
133 Logger.getLogger(BuildEnsembleSearchIndex.class.getName()).log(Level.SEVERE, null, ex);
134 }
135 }
136
137 private static List<Callable<List<Document>>> indexAllDocumentation(String javafxDocumentationHome) throws IOException{
138 List<Callable<List<Document>>> tasks = new ArrayList<>();
139 CharSequence content = grabWebPage(javafxDocumentationHome);
140 String baseUrl = javafxDocumentationHome.substring(0,javafxDocumentationHome.lastIndexOf('/')+1);
141 // System.out.println("baseUrl = " + baseUrl);
157 tasks.add((Callable<List<Document>>) () -> indexDocumentationPage(docPageUrl));
158 }
159 System.out.println(" --- end of list ---");
160 return tasks;
161 }
162
163 private static List<Document> indexDocumentationPage(String docPageUrl) throws IOException{
164 List<Document> docs = new ArrayList<>();
165 try {
166 // System.out.println("PROCESSING... ["+docPageUrl+"] on Thread ["+Thread.currentThread().getName()+"]");
167 // System.out.println("==================================================================");
168 // System.out.println("Parsing docs page ["+docPageUrl+"] ...");
169 DocumentationIndexer.DocPage docPage = DocumentationIndexer.parseDocsPage(docPageUrl, grabWebPage(docPageUrl).toString());
170 // System.out.println("TITLE="+docPage.bookTitle+" CHAPTER="+docPage.chapter+" SECTIONS=["+docPage.sections.size()+"]");
171 for (DocumentationIndexer.Section section: docPage.sections) {
172 if (section.name == null) {
173 System.out.println("section.name = "+section.name+" docPage.bookTitle="+docPage.bookTitle+" "+docPageUrl);
174 }
175 // write documentation section entry to index
176 docs.add(createDocument(DocumentType.DOC,
177 new TextField("bookTitle", docPage.bookTitle, Field.Store.YES),
178 new TextField("chapter", docPage.chapter==null? "" : docPage.chapter, Field.Store.YES),
179 new TextField("name", section.name, Field.Store.YES),
180 new TextField("description", section.content, Field.Store.NO),
181 new StringField("ensemblePath", section.url, Field.Store.YES)
182 ));
183 }
184 // handle next page if there is one
185 if (docPage.nextUrl != null) {
186 docs.addAll(indexDocumentationPage(docPage.nextUrl));
187 }
188
189 } catch (Exception ex) {
190 System.out.println("FAILED TO PARSE DOCS PAGE SO IGNORED: ["+docPageUrl+"]");
191 ex.printStackTrace(System.out);
192 }
193 return docs;
194 }
195
196 private static List<Callable<List<Document>>> indexJavaDocAllClasses(final String javaDocBaseUrl) throws IOException{
197 CharSequence content = grabWebPage(javaDocBaseUrl+"allclasses-noframe.html");
198 List<Callable<List<Document>>> tasks = new ArrayList<>();
199 // parse package
200 Matcher matcher = findClassUrl.matcher(content);
201 while (matcher.find()) {
202 final String classUrl = javaDocBaseUrl+matcher.group(1);
203 tasks.add((Callable<List<Document>>) () -> indexApiDocs(classUrl));
204 }
205 return tasks;
206 }
207
208 /**
209 * Add all samples to the search index
210 */
211 private static List<Document> indexSamples(List<Sample> allSamples) throws IOException {
212 List<Document> docs = new ArrayList<>();
213 for (Sample sample: allSamples) {
214 // write class entry to index
215 docs.add(createDocument(DocumentType.SAMPLE,
216 new TextField("name", sample.name, Field.Store.YES),
217 new TextField("description", sample.description, Field.Store.NO),
218 new StringField("shortDescription", sample.description.substring(0, Math.min(160, sample.description.length())),
219 Field.Store.YES),
220 new StringField("ensemblePath", "sample://"+sample.ensemblePath, Field.Store.YES)
221 ));
222 }
223 return docs;
224 }
225
226 /**
227 * Index a JavaDoc page for a single class, interface or enum
228 *
229 * @param writer The index writer to add documents to
230 * @param url The url to the javadoc html file
231 * @throws IOException If there was a problem indexing the file
232 */
233 private static List<Document> indexApiDocs(String url) throws IOException {
234 // System.out.println("PROCESSING... ["+url+"] on Thread ["+Thread.currentThread().getName()+"]");
235 final List<Document> docs = new ArrayList<>();
236 CharSequence content = grabWebPage(url);
237 // extract package and class
238 Matcher packageAndClassMatcher = PACKAGE_AND_CLASS.matcher(content);
239 // search and if we fail to find ignore this file
240 if (!packageAndClassMatcher.find()) {
247 String packageName = packageAndClassMatcher.group(1);
248 //System.out.println(" packageName = " + packageName);
249 String classType = packageAndClassMatcher.group(2).toLowerCase();
250 //System.out.println(" classType = " + classType);
251 String className = packageAndClassMatcher.group(3);
252 //System.out.println(" className = " + className);
253 // extract document type
254 DocumentType documentType = DocumentType.CLASS;
255 if ("enum".equals(classType)) {
256 documentType = DocumentType.ENUM;
257 }
258 // extract javadoc description
259 Matcher classDescriptionMatcher = CLASS_DESCRIPTION.matcher(content);
260 String classDescription = "";
261 if (classDescriptionMatcher.find()) {
262 classDescription = cleanHTML(classDescriptionMatcher.group(1));
263 }
264 ///System.out.println("classDescription = " + classDescription);
265 // write class entry to index
266 docs.add(createDocument(documentType,
267 new TextField("name", className, Field.Store.YES),
268 new TextField("description", classDescription, Field.Store.NO),
269 new StringField("shortDescription", classDescription.substring(0,Math.min(160,classDescription.length())),
270 Field.Store.YES),
271 new TextField("package", packageName, Field.Store.YES),
272 new StringField("url", url, Field.Store.YES),
273 new StringField("ensemblePath", url, Field.Store.YES) // TODO what do we need here
274 ));
275
276 // extract properties
277 Matcher propertySummaryMatcher = PROPERTY_SUMMARY.matcher(content);
278 if (propertySummaryMatcher.find()) {
279 String propertySummaryTable = propertySummaryMatcher.group(1);
280 Matcher propertyMatcher = PROPERTY.matcher(propertySummaryTable);
281 while (propertyMatcher.find()) {
282 String propUrl = propertyMatcher.group(1);
283 String propertyName = propertyMatcher.group(2);
284 String description = cleanHTML(propertyMatcher.group(3));
285 //System.out.println(" propertyName = " + propertyName);
286 //System.out.println(" description = " + description);
287 //System.out.println(" url = " + url);
288 propUrl = url + "#" + propertyName;
289 //System.out.println(" oracle url = " + url);
290 // write class entry to index
291 docs.add(createDocument(DocumentType.PROPERTY,
292 new TextField("name", propertyName, Field.Store.YES),
293 new TextField("description", description, Field.Store.NO),
294 new StringField("shortDescription", description.substring(0,Math.min(160,description.length())),
295 Field.Store.YES),
296 new StringField("url", propUrl, Field.Store.YES),
297 new StringField("className", className, Field.Store.YES),
298 new StringField("package", packageName, Field.Store.YES),
299 new StringField("ensemblePath", url + "#" + propertyName, Field.Store.YES) // TODO what do we need here
300 ));
301 }
302 }
303 // extract methods
304 Matcher methodSummaryMatcher = METHOD_SUMMARY.matcher(content);
305 if (methodSummaryMatcher.find()) {
306 String methodSummaryTable = methodSummaryMatcher.group(1);
307 Matcher methodMatcher = PROPERTY.matcher(methodSummaryTable);
308 while (methodMatcher.find()) {
309 String methodUrl = methodMatcher.group(1);
310 String methodName = methodMatcher.group(2);
311 String description = cleanHTML(methodMatcher.group(3));
312 //System.out.println(" methodName = " + methodName);
313 //System.out.println(" description = " + description);
314 //System.out.println(" url = " + url);
315 methodUrl = url + "#" + methodName+"()";
316 //System.out.println(" oracle url = " + url);
317 // write class entry to index
318 docs.add(createDocument(DocumentType.METHOD,
319 new TextField("name", methodName, Field.Store.YES),
320 new TextField("description", description, Field.Store.NO),
321 new StringField("shortDescription", description.substring(0,Math.min(160,description.length())),
322 Field.Store.YES),
323 new StringField("url", methodUrl, Field.Store.YES),
324 new StringField("className", className, Field.Store.YES),
325 new StringField("package", packageName, Field.Store.YES),
326 new StringField("ensemblePath", url + "#" + methodName + "()", Field.Store.YES) // TODO what do we need here
327 ));
328 }
329 }
330 // extract fields
331 Matcher fieldSummaryMatcher = FIELD_SUMMARY.matcher(content);
332 if (fieldSummaryMatcher.find()) {
333 String fieldSummaryTable = fieldSummaryMatcher.group(1);
334 Matcher fieldMatcher = PROPERTY.matcher(fieldSummaryTable);
335 while (fieldMatcher.find()) {
336 String fieldUrl = fieldMatcher.group(1);
337 String fieldName = fieldMatcher.group(2);
338 String description = cleanHTML(fieldMatcher.group(3));
339 //System.out.println(" ##### fieldName = " + fieldName);
340 //System.out.println(" description = " + description);
341 //System.out.println(" url = " + url);
342 fieldUrl = url + "#" + fieldName;
343 //System.out.println(" oracle url = " + url);
344 // write class entry to index
345 docs.add(createDocument(DocumentType.FIELD,
346 new TextField("name", fieldName, Field.Store.YES),
347 new TextField("description", description, Field.Store.NO),
348 new StringField("shortDescription", description.substring(0,Math.min(160,description.length())),
349 Field.Store.YES),
350 new StringField("url", fieldUrl, Field.Store.YES),
351 new StringField("className", className, Field.Store.YES),
352 new StringField("package", packageName, Field.Store.YES),
353 new StringField("ensemblePath", url + "#" + fieldName, Field.Store.YES) // TODO what do we need here
354 ));
355 }
356 }
357 // extract enums
358 Matcher enumSummaryMatcher = ENUM_SUMMARY.matcher(content);
359 if (enumSummaryMatcher.find()) {
360 String enumSummaryTable = enumSummaryMatcher.group(1);
361 Matcher enumMatcher = PROPERTY.matcher(enumSummaryTable);
362 while (enumMatcher.find()) {
363 String enumUrl = enumMatcher.group(1);
364 String enumName = enumMatcher.group(2);
365 String description = cleanHTML(enumMatcher.group(3));
366 //System.out.println(" enumName = " + enumName);
367 //System.out.println(" description = " + description);
368 //System.out.println(" url = " + url);
369 enumUrl = url + "#" + enumName;
370 ///System.out.println(" oracle url = " + url);
371 // write class entry to index
372 docs.add(createDocument(DocumentType.ENUM,
373 new TextField("name", enumName, Field.Store.YES),
374 new TextField("description", description, Field.Store.NO),
375 new StringField("shortDescription", description.substring(0,Math.min(160,description.length())),
376 Field.Store.YES),
377 new StringField("url", enumUrl, Field.Store.YES),
378 new StringField("className", className, Field.Store.YES),
379 new StringField("package", packageName, Field.Store.YES),
380 new StringField("ensemblePath", url+ "#" + enumName, Field.Store.YES) // TODO what do we need here
381 ));
382 }
383 }
384 return docs;
385 }
386
387 /**
388 * Create a new document
389 *
390 * @param documentType The document type to save in the doc
391 * @param fields The searchable and data fields to write into doc
392 * @throws IOException If there was problem writing doc
393 */
394 private static Document createDocument(DocumentType documentType, Field... fields) throws IOException {
395 // make a new, empty document
396 Document doc = new Document();
397 // add doc type field + sorting field
398 doc.add(new StringField("documentType", documentType.toString(), Field.Store.YES));
399 doc.add(new SortedDocValuesField("documentType", new BytesRef(documentType.toString())));
400 // add other fields
401 if (fields != null) {
402 for (Field field : fields) {
403 doc.add(field);
404 }
405 }
406 return doc;
407 }
408
409 /**
410 * Create a new document and write it to the given writer
411 *
412 * @param writer The writer to write out to
413 * @param documentType The document type to save in the doc
414 * @param fields The searchable and data fields to write into doc
415 * @throws IOException If there was problem writing doc
416 */
417 private static void addDocument(IndexWriter writer, DocumentType documentType, Field... fields) throws IOException {
418 // make a new, empty document
419 Document doc = new Document();
420 // add doc type field + sorting field
421 doc.add(new StringField("documentType", documentType.toString(), Field.Store.YES));
422 doc.add(new SortedDocValuesField("documentType", new BytesRef(documentType.toString())));
423 // add other fields
424 if (fields != null) {
425 for (Field field : fields) {
426 doc.add(field);
427 }
428 }
429 // write into index, assuming we are recreating every time
430 writer.addDocument(doc);
431 }
432
433 /**
434 * Clean HTML, removing all tags and un-escaping so that we can index it cleanly
435 *
436 * @param html The html to clean
437 * @return cleaned html
438 */
439 private static String cleanHTML(String html) {
440 html = html.replaceAll("( |\\s|[ ])+", " ").trim(); // cleanup whitespace
441 html = html.replaceAll("<.*?>", " "); // remove html tags
442 html = html.replaceAll("<", "<"); // un-escape <
|