Advertisement
Guest User

Untitled

a guest
Aug 4th, 2015
154
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.89 KB | None | 0 0
  1. @Grapes(
  2. @Grab('org.apache.lucene:lucene-core:3.4.0')
  3. )
  4.  
  5. import java.util.zip.*
  6.  
  7. import org.apache.lucene.analysis.Analyzer
  8. import org.apache.lucene.analysis.standard.StandardAnalyzer
  9. import org.apache.lucene.analysis.SimpleAnalyzer
  10. import org.apache.lucene.document.Document
  11. import org.apache.lucene.document.Field
  12. import org.apache.lucene.document.Field.Index
  13. import org.apache.lucene.document.Field.Store
  14. import org.apache.lucene.document.NumericField
  15. import org.apache.lucene.index.IndexWriter
  16. import org.apache.lucene.index.IndexWriterConfig.OpenMode
  17. import org.apache.lucene.index.IndexWriterConfig
  18. import org.apache.lucene.index.Term
  19. import org.apache.lucene.store.Directory
  20. import org.apache.lucene.store.FSDirectory
  21. import org.apache.lucene.util.Version
  22.  
  23. import java.io.BufferedReader
  24. import java.io.File
  25. import java.io.FileInputStream
  26. import java.io.FileNotFoundException
  27. import java.io.IOException
  28. import java.io.InputStreamReader
  29. import java.util.Date
  30.  
  31. /**
  32.  * Indexes text files, and the entries of .zip archives, into a Lucene index.
  33.  * <p>
  34.  * Each regular file becomes one document, and each non-directory entry of a
  35.  * .zip file becomes a document of its own. Every document carries three
  36.  * fields: "path", "modified" and "contents".
  37.  * <p>
  38.  * Command-line usage: IndexFiles [indexDir [docPath ...]]. Without arguments
  39.  * it indexes ./tmp/data into ./tmp/index.
  40.  */
  41. public class IndexFiles {
  42.     def indexDir = "./index"
  43.     def docDirs = null
  44.     def indexAction = OpenMode.CREATE
  45.  
  46.     def writer = null
  47.  
  48.     /**
  49.      * Opens an index writer on the given index directory.
  50.      *
  51.      * @param indexDir    directory that holds the Lucene index
  52.      * @param docDirs     paths of the files and/or directories to index
  53.      * @param indexAction open mode for the index; optional, defaults to
  54.      *                    OpenMode.CREATE (also used when null is passed)
  55.      */
  56.     public IndexFiles(String indexDir, List docDirs, OpenMode indexAction = OpenMode.CREATE) {
  57.         this.indexDir = indexDir
  58.         this.docDirs = docDirs
  59.         this.indexAction = indexAction ?: OpenMode.CREATE
  60.  
  61.         prepareIndexWriter()
  62.     }
  63.  
  64.     /** Creates the IndexWriter for indexDir using a StandardAnalyzer. */
  65.     private void prepareIndexWriter() {
  66.         def dir = FSDirectory.open(new File(indexDir))
  67.         def analyzer = new StandardAnalyzer(Version.LUCENE_34)
  68.         def iwc = new IndexWriterConfig(Version.LUCENE_34, analyzer)
  69.         // Use the configured mode; indexDoc() picks add vs. update from it.
  70.         iwc.setOpenMode(indexAction)
  71.  
  72.         writer = new IndexWriter(dir, iwc)
  73.     }
  74.  
  75.     /**
  76.      * Indexes a single file. A file whose name ends in ".zip" is opened as an
  77.      * archive and its entries are indexed one by one; any other file is
  78.      * indexed as a whole. Directories and unreadable files are skipped.
  79.      * An I/O failure is reported and only affects the file at hand.
  80.      */
  81.     private void processFile(File file) {
  82.         // eachFileRecurse() hands over directories too, and opening a stream
  83.         // on a directory throws FileNotFoundException.
  84.         if (!file.isFile() || !file.canRead()) return
  85.  
  86.         try {
  87.             // withInputStream closes the stream even when indexing throws.
  88.             file.withInputStream { fis ->
  89.                 if (file.name.toLowerCase(Locale.ROOT).endsWith(".zip")) {
  90.                     indexZipEntries(file, fis)
  91.                 } else {
  92.                     def fileReader = new BufferedReader(new InputStreamReader(fis, "UTF-8"))
  93.                     indexDoc(file.path, file.lastModified(), fileReader)
  94.                 }
  95.             }
  96.         } catch (IOException e) {
  97.             System.err.println "skipping " + file.path + ": " + e
  98.         }
  99.     }
  100.  
  101.     /**
  102.      * Indexes every non-directory entry of a zip archive as its own document,
  103.      * named "zip:" + archive path + "@" + entry name.
  104.      *
  105.      * @param file the archive, used only to build the document names
  106.      * @param fis  open stream on the archive; it is closed together with the
  107.      *             zip stream wrapped around it
  108.      */
  109.     private void indexZipEntries(File file, InputStream fis) throws IOException {
  110.         def zis = new ZipInputStream(fis)
  111.         try {
  112.             def entry = null
  113.             while ((entry = zis.getNextEntry()) != null) {
  114.                 if (entry.isDirectory()) continue
  115.                 // close() is a no-op so that closing this per-entry reader
  116.                 // cannot close the zip stream shared by the remaining entries.
  117.                 def entryReader = new BufferedReader(
  118.                     new InputStreamReader(zis, "UTF-8") {
  119.                         @Override public void close() { }
  120.                     })
  121.                 indexDoc("zip:" + file.path + "@" + entry.name, entry.time, entryReader)
  122.             }
  123.         } finally {
  124.             zis.close()
  125.         }
  126.     }
  127.  
  128.     /**
  129.      * Indexes every path in docDirs. Directories are walked recursively; any
  130.      * other path is passed to processFile() directly.
  131.      */
  132.     public void doIndex() {
  133.         docDirs.each { fn ->
  134.             def file = new File(fn)
  135.  
  136.             if (file.isDirectory()) {
  137.                 file.eachFileRecurse { processFile(it) }
  138.             } else {
  139.                 processFile(file)
  140.             }
  141.         }
  142.  
  143.         // NOTE: if you want to maximize search performance,
  144.         // you can optionally call optimize here. This can be
  145.         // a costly operation, so generally it's only worth
  146.         // it when your index is relatively static (ie you're
  147.         // done adding documents to it):
  148.         //
  149.         // writer.optimize();
  150.     }
  151.  
  152.     /** Closes the index writer, if one was opened. */
  153.     public void close() { writer?.close() }
  154.  
  155.     /**
  156.      * Command-line entry point.
  157.      *
  158.      * @param args optional; args[0] is the index directory and any further
  159.      *             arguments are the files/directories to index
  160.      */
  161.     public static void main(String[] args) {
  162.         def argList = (args ?: []) as List
  163.         def indexPath = argList.size() > 0 ? argList[0] : "./tmp/index"
  164.         def sources = argList.size() > 1 ? argList.tail() : ["./tmp/data"]
  165.  
  166.         def indexFiles = new IndexFiles(indexPath, sources)
  167.         def start = new Date()
  168.         try {
  169.             indexFiles.doIndex()
  170.         } finally {
  171.             // Release the index writer even if indexing fails part-way.
  172.             indexFiles.close()
  173.         }
  174.         def end = new Date()
  175.         println(end.getTime() - start.getTime() + " total milliseconds")
  176.     }
  177.  
  178.     /**
  179.      * Builds one Lucene document for a single source and writes it to the index.
  180.      * <p>
  181.      * The document gets a "path" field (stored, not analyzed), a stored numeric
  182.      * "modified" field and a "contents" field that is fed from the reader.
  183.      * Any exception raised on the way is reported and swallowed, so one bad
  184.      * document does not stop the run.
  185.      * <p>
  186.      * NOTE: This method indexes one document per input file. This is slow. For good
  187.      * throughput, put multiple documents into your input file(s). An example of this is
  188.      * in the benchmark module, which can create "line doc" files, one document per line,
  189.      * using the
  190.      * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
  191.      * >WriteLineDocTask</a>.
  192.      *
  193.      * @param fileName     value of the "path" field; also the term used to replace
  194.      *                     an older copy when the index is not opened in CREATE mode
  195.      * @param modifiedTime value of the "modified" field (callers pass a
  196.      *                     last-modified time in milliseconds)
  197.      * @param fileReader   source of the text for the "contents" field
  198.      * @throws IOException kept in the signature; failures are currently caught
  199.      *                     and reported inside this method
  200.      */
  201.     void indexDoc(String fileName, long modifiedTime, Reader fileReader) throws IOException {
  202.         try {
  203.             // make a new, empty document
  204.             def doc = new Document()
  205.  
  206.             // Add the path of the file as a field named "path". Use a
  207.             // field that is indexed (i.e. searchable), but don't tokenize
  208.             // the field into separate words:
  209.             doc.add(new Field("path", fileName, Store.YES, Index.NOT_ANALYZED))
  210.  
  211.             // Add the last modified date of the file a field named "modified".
  212.             // Use a NumericField that is indexed (i.e. efficiently filterable with
  213.             // NumericRangeFilter). This indexes to milli-second resolution, which
  214.             // is often too fine. You could instead create a number based on
  215.             // year/month/day/hour/minutes/seconds, down the resolution you require.
  216.             // For example the long value 2011021714 would mean
  217.             // February 17, 2011, 2-3 PM.
  218.             def modifiedField = new NumericField("modified", Store.YES, true)
  219.             modifiedField.setLongValue(modifiedTime)
  220.             doc.add(modifiedField)
  221.  
  222.             // Add the contents of the file to a field named "contents". Specify a Reader,
  223.             // so that the text of the file is tokenized and indexed, but not stored.
  224.             // The callers in this class decode the input as UTF-8; if the data is in
  225.             // another encoding, searching for special characters will fail.
  226.             doc.add(new Field("contents", fileReader))
  227.  
  228.             if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
  229.                 // New index, so we just add the document (no old document can be there):
  230.                 println "adding " + fileName
  231.                 writer.addDocument(doc)
  232.             } else {
  233.                 // Existing index (an old copy of this document may have been indexed) so
  234.                 // we use updateDocument instead to replace the old one matching the exact
  235.                 // path, if present:
  236.                 println "updating " + fileName
  237.                 writer.updateDocument(new Term("path", fileName), doc)
  238.             }
  239.         } catch (Exception e) {
  240.             // Best effort: report the failure and carry on with the next document.
  241.             System.err.println "failed to index " + fileName + ": " + e
  242.         }
  243.     }
  244. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement