daily pastebin goal
28%
SHARE
TWEET

Untitled

a guest Feb 19th, 2019 67 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. package com.github.horitaku1124.java_searcher;
  2.  
  3. import java.io.BufferedReader;
  4. import java.io.IOException;
  5. import java.io.InputStream;
  6. import java.io.InputStreamReader;
  7. import java.nio.charset.StandardCharsets;
  8. import java.nio.file.FileVisitResult;
  9. import java.nio.file.Files;
  10. import java.nio.file.Path;
  11. import java.nio.file.Paths;
  12. import java.nio.file.SimpleFileVisitor;
  13. import java.nio.file.attribute.BasicFileAttributes;
  14. import java.util.Date;
  15.  
  16. import org.apache.lucene.analysis.Analyzer;
  17. import org.apache.lucene.analysis.standard.StandardAnalyzer;
  18. import org.apache.lucene.document.LongPoint;
  19. import org.apache.lucene.document.Document;
  20. import org.apache.lucene.document.Field;
  21. import org.apache.lucene.document.StringField;
  22. import org.apache.lucene.document.TextField;
  23. import org.apache.lucene.index.IndexWriter;
  24. import org.apache.lucene.index.IndexWriterConfig.OpenMode;
  25. import org.apache.lucene.index.IndexWriterConfig;
  26. import org.apache.lucene.index.Term;
  27. import org.apache.lucene.store.Directory;
  28. import org.apache.lucene.store.FSDirectory;
  29. import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
  30.  
  31.  
  32. public class IndexFiles {
  33.     private IndexFiles() {}
  34.  
  35.     /** Index all text files under a directory. */
  36.     public static void main(String[] args) {
  37.         String usage = "java org.apache.lucene.demo.IndexFiles"
  38.                 + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
  39.                 + "This indexes the documents in DOCS_PATH, creating a Lucene index"
  40.                 + "in INDEX_PATH that can be searched with SearchFiles";
  41.         String indexPath = "index";
  42.         String docsPath = null;
  43.         boolean create = true;
  44.         for(int i=0;i<args.length;i++) {
  45.             if ("-index".equals(args[i])) {
  46.                 indexPath = args[i+1];
  47.                 i++;
  48.             } else if ("-docs".equals(args[i])) {
  49.                 docsPath = args[i+1];
  50.                 i++;
  51.             } else if ("-update".equals(args[i])) {
  52.                 create = false;
  53.             }
  54.         }
  55.  
  56.         if (docsPath == null) {
  57.             System.err.println("Usage: " + usage);
  58.             System.exit(1);
  59.         }
  60.  
  61.         final Path docDir = Paths.get(docsPath);
  62.         if (!Files.isReadable(docDir)) {
  63.             System.out.println("Document directory '" +docDir.toAbsolutePath()+ "' does not exist or is not readable, please check the path");
  64.             System.exit(1);
  65.         }
  66.  
  67.         Date start = new Date();
  68.         try {
  69.             System.out.println("Indexing to directory '" + indexPath + "'...");
  70.  
  71.             Directory dir = FSDirectory.open(Paths.get(indexPath));
  72.             Analyzer analyzer = new StandardAnalyzer();
  73.             IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
  74.  
  75.             if (create) {
  76.                 // Create a new index in the directory, removing any
  77.                 // previously indexed documents:
  78.                 iwc.setOpenMode(OpenMode.CREATE);
  79.             } else {
  80.                 // Add new documents to an existing index:
  81.                 iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
  82.             }
  83.  
  84.             // Optional: for better indexing performance, if you
  85.             // are indexing many documents, increase the RAM
  86.             // buffer.  But if you do this, increase the max heap
  87.             // size to the JVM (eg add -Xmx512m or -Xmx1g):
  88.             //
  89.             // iwc.setRAMBufferSizeMB(256.0);
  90.  
  91.             IndexWriter writer = new IndexWriter(dir, iwc);
  92.             indexDocs(writer, docDir);
  93.  
  94.             // NOTE: if you want to maximize search performance,
  95.             // you can optionally call forceMerge here.  This can be
  96.             // a terribly costly operation, so generally it's only
  97.             // worth it when your index is relatively static (ie
  98.             // you're done adding documents to it):
  99.             //
  100.             // writer.forceMerge(1);
  101.  
  102.             writer.close();
  103.  
  104.             Date end = new Date();
  105.             System.out.println(end.getTime() - start.getTime() + " total milliseconds");
  106.  
  107.         } catch (IOException e) {
  108.             e.printStackTrace();
  109.         }
  110.     }
  111.  
  112.     /**
  113.      * Indexes the given file using the given writer, or if a directory is given,
  114.      * recurses over files and directories found under the given directory.
  115.      *
  116.      * NOTE: This method indexes one document per input file.  This is slow.  For good
  117.      * throughput, put multiple documents into your input file(s).  An example of this is
  118.      * in the benchmark module, which can create "line doc" files, one document per line,
  119.      * using the
  120.      * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
  121.      * >WriteLineDocTask</a>.
  122.      *
  123.      * @param writer Writer to the index where the given file/dir info will be stored
  124.      * @param path The file to index, or the directory to recurse into to find files to index
  125.      * @throws IOException If there is a low-level I/O error
  126.      */
  127.     static void indexDocs(final IndexWriter writer, Path path) throws IOException {
  128.         if (Files.isDirectory(path)) {
  129.             Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
  130.                 @Override
  131.                 public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
  132.                     try {
  133.                         String fileName = file.toAbsolutePath().toString();
  134.                         if (fileName.endsWith(".xls") || fileName.endsWith(".xlsx")) {
  135.                             indexDoc_xls(writer, file, attrs.lastModifiedTime().toMillis());
  136.                         }
  137.                     } catch (Exception e) {
  138.                         e.printStackTrace();
  139.                         // don't index files that can't be read.
  140.                     }
  141.                     return FileVisitResult.CONTINUE;
  142.                 }
  143.             });
  144.         } else {
  145.             indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis());
  146.         }
  147.     }
  148.  
  149.     static void indexDoc_xls(IndexWriter writer, Path file, long lastModified) throws IOException, InvalidFormatException {
  150.         String content = DumpExcel.readFile(file.toString());
  151.  
  152.         // make a new, empty document
  153.         Document doc = new Document();
  154.  
  155.         // Add the path of the file as a field named "path".  Use a
  156.         // field that is indexed (i.e. searchable), but don't tokenize
  157.         // the field into separate words and don't index term frequency
  158.         // or positional information:
  159.         Field pathField = new StringField("path", file.toString(), Field.Store.YES);
  160.         doc.add(pathField);
  161.  
  162.         // Add the last modified date of the file a field named "modified".
  163.         // Use a LongPoint that is indexed (i.e. efficiently filterable with
  164.         // PointRangeQuery).  This indexes to milli-second resolution, which
  165.         // is often too fine.  You could instead create a number based on
  166.         // year/month/day/hour/minutes/seconds, down the resolution you require.
  167.         // For example the long value 2011021714 would mean
  168.         // February 17, 2011, 2-3 PM.
  169.         doc.add(new LongPoint("modified", lastModified));
  170.  
  171.         // Add the contents of the file to a field named "contents".  Specify a Reader,
  172.         // so that the text of the file is tokenized and indexed, but not stored.
  173.         // Note that FileReader expects the file to be in UTF-8 encoding.
  174.         // If that's not the case searching for special characters will fail.
  175.         doc.add(new TextField("contents", content, Field.Store.YES));
  176.  
  177.         if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
  178.             // New index, so we just add the document (no old document can be there):
  179.             System.out.println("adding " + file);
  180.             writer.addDocument(doc);
  181.         } else {
  182.             // Existing index (an old copy of this document may have been indexed) so
  183.             // we use updateDocument instead to replace the old one matching the exact
  184.             // path, if present:
  185.             System.out.println("updating " + file);
  186.             writer.updateDocument(new Term("path", file.toString()), doc);
  187.         }
  188.     }
  189.     /** Indexes a single document */
  190.     static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
  191.         try (InputStream stream = Files.newInputStream(file)) {
  192.             // make a new, empty document
  193.             Document doc = new Document();
  194.  
  195.             // Add the path of the file as a field named "path".  Use a
  196.             // field that is indexed (i.e. searchable), but don't tokenize
  197.             // the field into separate words and don't index term frequency
  198.             // or positional information:
  199.             Field pathField = new StringField("path", file.toString(), Field.Store.YES);
  200.             doc.add(pathField);
  201.  
  202.             // Add the last modified date of the file a field named "modified".
  203.             // Use a LongPoint that is indexed (i.e. efficiently filterable with
  204.             // PointRangeQuery).  This indexes to milli-second resolution, which
  205.             // is often too fine.  You could instead create a number based on
  206.             // year/month/day/hour/minutes/seconds, down the resolution you require.
  207.             // For example the long value 2011021714 would mean
  208.             // February 17, 2011, 2-3 PM.
  209.             doc.add(new LongPoint("modified", lastModified));
  210.  
  211.             // Add the contents of the file to a field named "contents".  Specify a Reader,
  212.             // so that the text of the file is tokenized and indexed, but not stored.
  213.             // Note that FileReader expects the file to be in UTF-8 encoding.
  214.             // If that's not the case searching for special characters will fail.
  215.             doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));
  216.  
  217.             if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
  218.                 // New index, so we just add the document (no old document can be there):
  219.                 System.out.println("adding " + file);
  220.                 writer.addDocument(doc);
  221.             } else {
  222.                 // Existing index (an old copy of this document may have been indexed) so
  223.                 // we use updateDocument instead to replace the old one matching the exact
  224.                 // path, if present:
  225.                 System.out.println("updating " + file);
  226.                 writer.updateDocument(new Term("path", file.toString()), doc);
  227.             }
  228.         }
  229.     }
  230. }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top