Advertisement
Guest User

Untitled

a guest
Feb 19th, 2019
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 10.27 KB | None | 0 0
  1. package com.github.horitaku1124.java_searcher;
  2.  
  3. import java.io.BufferedReader;
  4. import java.io.IOException;
  5. import java.io.InputStream;
  6. import java.io.InputStreamReader;
  7. import java.nio.charset.StandardCharsets;
  8. import java.nio.file.FileVisitResult;
  9. import java.nio.file.Files;
  10. import java.nio.file.Path;
  11. import java.nio.file.Paths;
  12. import java.nio.file.SimpleFileVisitor;
  13. import java.nio.file.attribute.BasicFileAttributes;
  14. import java.util.Date;
  15.  
  16. import org.apache.lucene.analysis.Analyzer;
  17. import org.apache.lucene.analysis.standard.StandardAnalyzer;
  18. import org.apache.lucene.document.LongPoint;
  19. import org.apache.lucene.document.Document;
  20. import org.apache.lucene.document.Field;
  21. import org.apache.lucene.document.StringField;
  22. import org.apache.lucene.document.TextField;
  23. import org.apache.lucene.index.IndexWriter;
  24. import org.apache.lucene.index.IndexWriterConfig.OpenMode;
  25. import org.apache.lucene.index.IndexWriterConfig;
  26. import org.apache.lucene.index.Term;
  27. import org.apache.lucene.store.Directory;
  28. import org.apache.lucene.store.FSDirectory;
  29. import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
  30.  
  31.  
  32. public class IndexFiles {
  33. private IndexFiles() {}
  34.  
  35. /** Index all text files under a directory. */
  36. public static void main(String[] args) {
  37. String usage = "java org.apache.lucene.demo.IndexFiles"
  38. + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
  39. + "This indexes the documents in DOCS_PATH, creating a Lucene index"
  40. + "in INDEX_PATH that can be searched with SearchFiles";
  41. String indexPath = "index";
  42. String docsPath = null;
  43. boolean create = true;
  44. for(int i=0;i<args.length;i++) {
  45. if ("-index".equals(args[i])) {
  46. indexPath = args[i+1];
  47. i++;
  48. } else if ("-docs".equals(args[i])) {
  49. docsPath = args[i+1];
  50. i++;
  51. } else if ("-update".equals(args[i])) {
  52. create = false;
  53. }
  54. }
  55.  
  56. if (docsPath == null) {
  57. System.err.println("Usage: " + usage);
  58. System.exit(1);
  59. }
  60.  
  61. final Path docDir = Paths.get(docsPath);
  62. if (!Files.isReadable(docDir)) {
  63. System.out.println("Document directory '" +docDir.toAbsolutePath()+ "' does not exist or is not readable, please check the path");
  64. System.exit(1);
  65. }
  66.  
  67. Date start = new Date();
  68. try {
  69. System.out.println("Indexing to directory '" + indexPath + "'...");
  70.  
  71. Directory dir = FSDirectory.open(Paths.get(indexPath));
  72. Analyzer analyzer = new StandardAnalyzer();
  73. IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
  74.  
  75. if (create) {
  76. // Create a new index in the directory, removing any
  77. // previously indexed documents:
  78. iwc.setOpenMode(OpenMode.CREATE);
  79. } else {
  80. // Add new documents to an existing index:
  81. iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
  82. }
  83.  
  84. // Optional: for better indexing performance, if you
  85. // are indexing many documents, increase the RAM
  86. // buffer. But if you do this, increase the max heap
  87. // size to the JVM (eg add -Xmx512m or -Xmx1g):
  88. //
  89. // iwc.setRAMBufferSizeMB(256.0);
  90.  
  91. IndexWriter writer = new IndexWriter(dir, iwc);
  92. indexDocs(writer, docDir);
  93.  
  94. // NOTE: if you want to maximize search performance,
  95. // you can optionally call forceMerge here. This can be
  96. // a terribly costly operation, so generally it's only
  97. // worth it when your index is relatively static (ie
  98. // you're done adding documents to it):
  99. //
  100. // writer.forceMerge(1);
  101.  
  102. writer.close();
  103.  
  104. Date end = new Date();
  105. System.out.println(end.getTime() - start.getTime() + " total milliseconds");
  106.  
  107. } catch (IOException e) {
  108. e.printStackTrace();
  109. }
  110. }
  111.  
  112. /**
  113. * Indexes the given file using the given writer, or if a directory is given,
  114. * recurses over files and directories found under the given directory.
  115. *
  116. * NOTE: This method indexes one document per input file. This is slow. For good
  117. * throughput, put multiple documents into your input file(s). An example of this is
  118. * in the benchmark module, which can create "line doc" files, one document per line,
  119. * using the
  120. * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
  121. * >WriteLineDocTask</a>.
  122. *
  123. * @param writer Writer to the index where the given file/dir info will be stored
  124. * @param path The file to index, or the directory to recurse into to find files to index
  125. * @throws IOException If there is a low-level I/O error
  126. */
  127. static void indexDocs(final IndexWriter writer, Path path) throws IOException {
  128. if (Files.isDirectory(path)) {
  129. Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
  130. @Override
  131. public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
  132. try {
  133. String fileName = file.toAbsolutePath().toString();
  134. if (fileName.endsWith(".xls") || fileName.endsWith(".xlsx")) {
  135. indexDoc_xls(writer, file, attrs.lastModifiedTime().toMillis());
  136. }
  137. } catch (Exception e) {
  138. e.printStackTrace();
  139. // don't index files that can't be read.
  140. }
  141. return FileVisitResult.CONTINUE;
  142. }
  143. });
  144. } else {
  145. indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis());
  146. }
  147. }
  148.  
  149. static void indexDoc_xls(IndexWriter writer, Path file, long lastModified) throws IOException, InvalidFormatException {
  150. String content = DumpExcel.readFile(file.toString());
  151.  
  152. // make a new, empty document
  153. Document doc = new Document();
  154.  
  155. // Add the path of the file as a field named "path". Use a
  156. // field that is indexed (i.e. searchable), but don't tokenize
  157. // the field into separate words and don't index term frequency
  158. // or positional information:
  159. Field pathField = new StringField("path", file.toString(), Field.Store.YES);
  160. doc.add(pathField);
  161.  
  162. // Add the last modified date of the file a field named "modified".
  163. // Use a LongPoint that is indexed (i.e. efficiently filterable with
  164. // PointRangeQuery). This indexes to milli-second resolution, which
  165. // is often too fine. You could instead create a number based on
  166. // year/month/day/hour/minutes/seconds, down the resolution you require.
  167. // For example the long value 2011021714 would mean
  168. // February 17, 2011, 2-3 PM.
  169. doc.add(new LongPoint("modified", lastModified));
  170.  
  171. // Add the contents of the file to a field named "contents". Specify a Reader,
  172. // so that the text of the file is tokenized and indexed, but not stored.
  173. // Note that FileReader expects the file to be in UTF-8 encoding.
  174. // If that's not the case searching for special characters will fail.
  175. doc.add(new TextField("contents", content, Field.Store.YES));
  176.  
  177. if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
  178. // New index, so we just add the document (no old document can be there):
  179. System.out.println("adding " + file);
  180. writer.addDocument(doc);
  181. } else {
  182. // Existing index (an old copy of this document may have been indexed) so
  183. // we use updateDocument instead to replace the old one matching the exact
  184. // path, if present:
  185. System.out.println("updating " + file);
  186. writer.updateDocument(new Term("path", file.toString()), doc);
  187. }
  188. }
  189. /** Indexes a single document */
  190. static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
  191. try (InputStream stream = Files.newInputStream(file)) {
  192. // make a new, empty document
  193. Document doc = new Document();
  194.  
  195. // Add the path of the file as a field named "path". Use a
  196. // field that is indexed (i.e. searchable), but don't tokenize
  197. // the field into separate words and don't index term frequency
  198. // or positional information:
  199. Field pathField = new StringField("path", file.toString(), Field.Store.YES);
  200. doc.add(pathField);
  201.  
  202. // Add the last modified date of the file a field named "modified".
  203. // Use a LongPoint that is indexed (i.e. efficiently filterable with
  204. // PointRangeQuery). This indexes to milli-second resolution, which
  205. // is often too fine. You could instead create a number based on
  206. // year/month/day/hour/minutes/seconds, down the resolution you require.
  207. // For example the long value 2011021714 would mean
  208. // February 17, 2011, 2-3 PM.
  209. doc.add(new LongPoint("modified", lastModified));
  210.  
  211. // Add the contents of the file to a field named "contents". Specify a Reader,
  212. // so that the text of the file is tokenized and indexed, but not stored.
  213. // Note that FileReader expects the file to be in UTF-8 encoding.
  214. // If that's not the case searching for special characters will fail.
  215. doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));
  216.  
  217. if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
  218. // New index, so we just add the document (no old document can be there):
  219. System.out.println("adding " + file);
  220. writer.addDocument(doc);
  221. } else {
  222. // Existing index (an old copy of this document may have been indexed) so
  223. // we use updateDocument instead to replace the old one matching the exact
  224. // path, if present:
  225. System.out.println("updating " + file);
  226. writer.updateDocument(new Term("path", file.toString()), doc);
  227. }
  228. }
  229. }
  230. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement