Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- @Grapes(
- @Grab('org.apache.lucene:lucene-core:3.4.0')
- )
- import java.util.zip.*
- import org.apache.lucene.analysis.Analyzer
- import org.apache.lucene.analysis.standard.StandardAnalyzer
- import org.apache.lucene.analysis.SimpleAnalyzer
- import org.apache.lucene.document.Document
- import org.apache.lucene.document.Field
- import org.apache.lucene.document.Field.Index
- import org.apache.lucene.document.Field.Store
- import org.apache.lucene.document.NumericField
- import org.apache.lucene.index.IndexWriter
- import org.apache.lucene.index.IndexWriterConfig.OpenMode
- import org.apache.lucene.index.IndexWriterConfig
- import org.apache.lucene.index.Term
- import org.apache.lucene.store.Directory
- import org.apache.lucene.store.FSDirectory
- import org.apache.lucene.util.Version
- import java.io.BufferedReader
- import java.io.File
- import java.io.FileInputStream
- import java.io.FileNotFoundException
- import java.io.IOException
- import java.io.InputStreamReader
- import java.util.Date
- /**
-  * Indexes text files (and the entries of .zip archives) into a Lucene index.
-  * <p>
-  * This is a command-line application demonstrating simple Lucene indexing.
-  * {@link #main} takes no arguments: it indexes everything under ./tmp/data
-  * into ./tmp/index.
-  */
- public class IndexFiles {
-     /** Directory the Lucene index is written to. */
-     def indexDir = "./index"
-     /** Paths of the files and/or directories whose contents get indexed. */
-     def docDirs = null
-     /** Open mode handed to the IndexWriter; CREATE replaces any existing index. */
-     def indexAction = OpenMode.CREATE
-     /** Writer shared by every indexDoc call; opened by the constructor, released by close(). */
-     def writer = null
-     /**
-      * Opens an index writer on the given index directory.
-      *
-      * @param indexDir    directory that holds (or will hold) the index
-      * @param docDirs     paths of the files/directories to index
-      * @param indexAction how to open the index; defaults to OpenMode.CREATE
-      */
-     public IndexFiles(String indexDir, List docDirs, OpenMode indexAction = OpenMode.CREATE) {
-         this.indexDir = indexDir
-         this.docDirs = docDirs
-         this.indexAction = indexAction
-         prepareIndexWriter()
-     }
-     /** Creates the IndexWriter on indexDir using a StandardAnalyzer and the configured open mode. */
-     private void prepareIndexWriter() {
-         def dir = FSDirectory.open(new File(indexDir))
-         def analyzer = new StandardAnalyzer(Version.LUCENE_34)
-         def iwc = new IndexWriterConfig(Version.LUCENE_34, analyzer)
-         // Honour the configured mode instead of hard-coding CREATE, so the
-         // update branch in indexDoc is reachable.
-         iwc.setOpenMode(indexAction)
-         writer = new IndexWriter(dir, iwc)
-     }
-     /**
-      * Indexes a single regular file. A file whose name ends in ".zip" is not
-      * indexed itself; each of its non-directory entries becomes one document.
-      * Directories, missing files and unreadable files are skipped.
-      *
-      * @param file the file to index
-      */
-     private void processFile(File file) {
-         // eachFileRecurse also hands us sub-directories; opening one with
-         // FileInputStream would throw FileNotFoundException and abort the run.
-         if (!file.isFile() || !file.canRead()) return
-         def fis = new FileInputStream(file)
-         try {
-             if (file.name.toLowerCase().endsWith(".zip")) {
-                 def zis = new ZipInputStream(fis)
-                 try {
-                     def entry = null
-                     while ((entry = zis.getNextEntry()) != null) {
-                         if (entry.isDirectory()) continue
-                         // close() is a no-op so that closing the per-entry reader
-                         // does not close the zip stream before the remaining
-                         // entries have been read.
-                         def entryReader = new BufferedReader(
-                             new InputStreamReader(zis, "UTF-8") {
-                                 @Override public void close() { return }
-                             })
-                         indexDoc("zip:" + file.path + "@" + entry.name, entry.time, entryReader)
-                     }
-                 } finally {
-                     zis.close()
-                 }
-             } else {
-                 def fileReader = new BufferedReader(new InputStreamReader(fis, "UTF-8"))
-                 indexDoc(file.path, file.lastModified(), fileReader)
-             }
-         } finally {
-             // Always release the file handle, even when reading the archive fails.
-             fis.close()
-         }
-     }
-     /**
-      * Indexes every entry of docDirs: directories are walked recursively,
-      * anything else is handed to processFile directly.
-      */
-     public void doIndex() {
-         docDirs.each { fn ->
-             def file = new File(fn)
-             if (file.isDirectory()) {
-                 file.eachFileRecurse { processFile(it) }
-             } else {
-                 processFile(file)
-             }
-         }
-         // NOTE: if you want to maximize search performance,
-         // you can optionally call optimize here. This can be
-         // a costly operation, so generally it's only worth
-         // it when your index is relatively static (ie you're
-         // done adding documents to it):
-         //
-         // writer.optimize();
-     }
-     /** Closes the index writer, if one was opened. */
-     public void close() { writer?.close() }
-     /** Indexes ./tmp/data into ./tmp/index and prints the elapsed time. */
-     public static void main(String[] args) {
-         def indexFiles = new IndexFiles("./tmp/index", ["./tmp/data"])
-         def start = new Date()
-         try {
-             indexFiles.doIndex()
-         } finally {
-             // Close the writer even when indexing fails so it is not left open.
-             indexFiles.close()
-         }
-         def end = new Date()
-         println(end.getTime() - start.getTime() + " total milliseconds")
-     }
-     /**
-      * Adds (or, when the writer was not opened in CREATE mode, replaces) one
-      * document built from the given name, timestamp and content reader.
-      * Any exception raised while building or writing the document is printed
-      * and swallowed, so one bad input does not stop the run.
-      *
-      * NOTE: This method indexes one document per input file. This is slow. For good
-      * throughput, put multiple documents into your input file(s). An example of this is
-      * in the benchmark module, which can create "line doc" files, one document per line,
-      * using the
-      * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
-      * >WriteLineDocTask</a>.
-      *
-      * @param fileName     value stored in the "path" field; also the key used to replace an older copy
-      * @param modifiedTime value stored in the numeric "modified" field
-      * @param fileReader   source of the text indexed into the "contents" field
-      * @throws IOException declared for callers, although failures are currently caught and printed here
-      */
-     void indexDoc(String fileName, long modifiedTime, Reader fileReader) throws IOException {
-         try {
-             // make a new, empty document
-             def doc = new Document()
-             // Add the path of the file as a field named "path". Use a
-             // field that is stored and indexed (i.e. searchable), but not
-             // tokenized into separate words:
-             def pathField = new Field("path", fileName, Store.YES, Index.NOT_ANALYZED)
-             doc.add(pathField)
-             // Add the last modified date of the file a field named "modified".
-             // Use a stored, indexed NumericField (i.e. efficiently filterable with
-             // NumericRangeFilter). This indexes to milli-second resolution, which
-             // is often too fine. You could instead create a number based on
-             // year/month/day/hour/minutes/seconds, down the resolution you require.
-             // For example the long value 2011021714 would mean
-             // February 17, 2011, 2-3 PM.
-             def modifiedField = new NumericField("modified", Store.YES, true)
-             modifiedField.setLongValue(modifiedTime)
-             doc.add(modifiedField)
-             // Add the contents of the file to a field named "contents". Specify a Reader,
-             // so that the text of the file is tokenized and indexed, but not stored.
-             // The callers in this class decode the bytes as UTF-8; if the input is in
-             // another encoding, searching for special characters will fail.
-             doc.add(new Field("contents", fileReader))
-             if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
-                 // New index, so we just add the document (no old document can be there):
-                 println "adding " + fileName
-                 writer.addDocument(doc)
-             } else {
-                 // Existing index (an old copy of this document may have been indexed) so
-                 // we use updateDocument instead to replace the old one matching the exact
-                 // path, if present:
-                 println "updating " + fileName
-                 writer.updateDocument(new Term("path", fileName), doc)
-             }
-         } catch (e) {
-             // Best effort: report the failure and carry on with the next input.
-             println e
-         }
-     }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement