Guest User

Untitled

a guest
May 25th, 2018
96
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.31 KB | None | 0 0
  1. package wbi1112;
  2.  
  3. import java.io.*;
  4. import java.util.List;
  5.  
  6. import net.htmlparser.jericho.*;
  7.  
  8. import org.apache.lucene.analysis.standard.StandardAnalyzer;
  9. import org.apache.lucene.document.Document;
  10. import org.apache.lucene.document.Field;
  11. import org.apache.lucene.index.IndexWriter;
  12. import org.apache.lucene.index.IndexWriterConfig;
  13.  
  14.  
  15. import org.apache.lucene.store.FSDirectory;
  16. import org.apache.lucene.util.Version;
  17.  
  18.  
  19. public class IndexCreator {
  20.  
  21. static IndexWriter contentw;
  22. static IndexWriter imagew;
  23.  
  24. static boolean rmdir(File f){
  25.  
  26. if (f.isDirectory()){
  27. String[] files = f.list();
  28. for(int i=0; i<files.length; i++){
  29.  
  30. rmdir( new File(f, files[i]) );
  31.  
  32. }
  33. }
  34.  
  35. return f.delete();
  36.  
  37. }
  38. /**
  39. * @param args
  40. */
  41. public static void main(String[] args) {
  42.  
  43. // usage: IndexCreator ContentIndexDir ImageIndexDir PageDir
  44. // open specified path or working dir
  45.  
  46. if (args.length != 1) {
  47. System.out.println("usage: java -jar IndexCreator.jar PageDir");
  48. System.out.println("indices are created in .imgidx / .cntidx");
  49. System.exit(1);
  50.  
  51. }
  52.  
  53. // get paths and check
  54. String sContent = ".cntidx";
  55. String sImage = ".imgidx";
  56. String sPath = args[0];
  57.  
  58. final File fPath = new File(sPath);
  59. final File fContent = new File(sContent);
  60. final File fImage = new File(sImage);
  61.  
  62. if (!fPath.exists() || !fPath.canRead()) {
  63. System.out.println("Page Directory couldn't be read.");
  64. System.exit(1);
  65. }
  66.  
  67. rmdir(fContent);
  68. rmdir(fImage);
  69.  
  70. fContent.mkdir();
  71. fImage.mkdir();
  72.  
  73. if (! (fContent.exists() && fImage.exists() && fContent.canWrite() && fImage.canWrite()) ){
  74. System.out.println("One or both of the Index-Directories cannot be written to.");
  75. System.exit(1);
  76. }
  77.  
  78. // build the indices
  79. try {
  80.  
  81. // one for content, one for images
  82. contentw = new IndexWriter(FSDirectory.open(fContent),new IndexWriterConfig(Version.LUCENE_31, new StandardAnalyzer(Version.LUCENE_31)));
  83. imagew = new IndexWriter(FSDirectory.open(fImage), new IndexWriterConfig(Version.LUCENE_31, new StandardAnalyzer(Version.LUCENE_31)));
  84.  
  85. // in one method
  86. index(fPath);
  87.  
  88. contentw.close();
  89. imagew.close();
  90.  
  91.  
  92. } catch (Exception e) {
  93. e.printStackTrace();
  94. System.out.println(e.getMessage());
  95. }
  96.  
  97. System.out.println("done");
  98. }
  99.  
  100. static void index(File f){
  101.  
  102.  
  103.  
  104. // recurse into subdirectories
  105. if( f.isDirectory() ){
  106.  
  107. String[] files = f.list();
  108. for(int i=0; i<files.length; i++){
  109.  
  110. index( new File(f, files[i]) );
  111.  
  112. }
  113.  
  114. }
  115.  
  116. // but only look at .htm files
  117. if(! f.getPath().toLowerCase().endsWith(".htm") )
  118. return;
  119.  
  120.  
  121.  
  122. FileInputStream fis;
  123.  
  124. try {
  125.  
  126. Document doc = new Document();
  127.  
  128. // add path
  129. doc.add( new Field("path", f.getPath(), Field.Store.YES, Field.Index.NO));
  130.  
  131. // read file
  132. fis = new FileInputStream(f);
  133.  
  134. // use jericho html-parser to get contents of html-file
  135. Source src = new Source( new BufferedReader(new InputStreamReader(fis, "UTF-8")));
  136.  
  137.  
  138. // the plain text
  139. doc.add( new Field("content",
  140. src.getTextExtractor().setIncludeAttributes(true).toString(),
  141. Field.Store.NO, Field.Index.ANALYZED )
  142. );
  143.  
  144. // the title
  145. doc.add( new Field("title",
  146. src.getFirstElement(HTMLElementName.TITLE ).getContent().toString(),
  147. Field.Store.YES, Field.Index.NOT_ANALYZED )
  148. );
  149.  
  150. // add to the content index
  151. contentw.addDocument(doc);
  152.  
  153. // are there (specially designated) images in our html-files?
  154. List<Element> elementList=src.getAllElementsByClass("d_image");
  155.  
  156. for (Element element : elementList) {
  157.  
  158. // loop through all of them and add them to the..
  159. Document d = new Document();
  160. d.add( new Field("path", element.getFirstElement(HTMLElementName.IMG).getAttributeValue("src"), Field.Store.YES, Field.Index.NO));
  161. d.add( new Field("caption", element.getTextExtractor().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED ));
  162. // ..image index
  163. imagew.addDocument(d);
  164.  
  165. }
  166.  
  167. fis.close();
  168.  
  169. } catch (Exception e) {
  170. e.printStackTrace();
  171.  
  172. System.out.println(e.getMessage());
  173. }
  174.  
  175.  
  176.  
  177.  
  178. }
  179. }
Add Comment
Please, Sign In to add comment