// Untitled

@Grapes(
    @Grab('org.apache.lucene:lucene-core:3.4.0')
)

import java.util.zip.*

import org.apache.lucene.analysis.Analyzer
import org.apache.lucene.analysis.standard.StandardAnalyzer
import org.apache.lucene.analysis.SimpleAnalyzer
import org.apache.lucene.document.Document
import org.apache.lucene.document.Field
import org.apache.lucene.document.Field.Index
import org.apache.lucene.document.Field.Store
import org.apache.lucene.document.NumericField
import org.apache.lucene.index.IndexWriter
import org.apache.lucene.index.IndexWriterConfig.OpenMode
import org.apache.lucene.index.IndexWriterConfig
import org.apache.lucene.index.Term
import org.apache.lucene.store.Directory
import org.apache.lucene.store.FSDirectory
import org.apache.lucene.util.Version

import java.io.BufferedReader
import java.io.File
import java.io.FileInputStream
import java.io.FileNotFoundException
import java.io.IOException
import java.io.InputStreamReader
import java.util.Date

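/** Index all text files under a directory.
 * <p>
 * This is a command-line application demonstrating simple Lucene indexing.
 * The index location and the data location are hard-coded in {@code main};
 * command-line arguments are currently ignored. Entries inside {@code .zip}
 * archives are indexed as individual documents.
 */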
public class IndexFiles {
    /** Path of the directory that holds the Lucene index. */
    def indexDir    = "./index"
    /** Files and/or directories whose contents are indexed by {@link #doIndex()}. */
    def docDirs     = null
    /** Open mode handed to the index writer (CREATE unless the caller asks otherwise). */
    def indexAction = OpenMode.CREATE

    /** Writer shared by every document; opened by the constructor, released by {@link #close()}. */
    def writer      = null

    /**
     * Opens an index writer on {@code indexDir} and remembers the paths to index.
     *
     * @param indexDir    directory in which the index is stored
     * @param docDirs     paths (files or directories) to index
     * @param indexAction how the index is opened; defaults to {@code OpenMode.CREATE},
     *                    which is what the two-argument form has always used
     */
    public IndexFiles(String indexDir, List docDirs, OpenMode indexAction = OpenMode.CREATE) {
        this.indexDir = indexDir
        this.docDirs = docDirs
        this.indexAction = indexAction

        prepareIndexWriter()
    }

    /** Builds the IndexWriter for {@code indexDir} using a StandardAnalyzer and the configured open mode. */
    private void prepareIndexWriter() {
        def dir = FSDirectory.open(new File(indexDir))
        def analyzer = new StandardAnalyzer(Version.LUCENE_34)
        def iwc = new IndexWriterConfig(Version.LUCENE_34, analyzer)
        // Honour the configured mode instead of a hard-coded CREATE, so that the
        // "update" branch of indexDoc is reachable when a caller asks for it.
        iwc.setOpenMode(indexAction)

        writer = new IndexWriter(dir, iwc)
    }

    /**
     * Indexes a single file. A {@code .zip} file contributes one document per
     * archive entry; any other file contributes one document.
     * <p>
     * Directories and unreadable files are skipped. An I/O failure on one file
     * is reported on stderr and does not stop the remaining files from being
     * indexed. The underlying stream is always closed.
     *
     * @param file candidate file handed over by {@link #doIndex()}
     */
    private void processFile(File file) {
        // eachFileRecurse also yields directories; opening one as a stream
        // throws, which previously aborted the whole run.
        if (!file.isFile() || !file.canRead()) return

        def fis = null
        try {
            fis = new FileInputStream(file)
            // Locale.ROOT keeps the extension check independent of the default locale
            if (file.name.toLowerCase(Locale.ROOT).endsWith(".zip")) {
                indexZipEntries(file, fis)
            } else {
                def fileReader = new BufferedReader(new InputStreamReader(fis, "UTF-8"))
                indexDoc(file.path, file.lastModified(), fileReader)
            }
        } catch (IOException e) {
            System.err.println("skipping " + file.path + ": " + e)
        } finally {
            if (fis != null) {
                // Nothing useful can be done if closing a read-only stream fails.
                try { fis.close() } catch (IOException ignored) { }
            }
        }
    }

    /**
     * Indexes every non-directory entry of a zip archive as its own document,
     * named {@code zip:<archive path>@<entry name>}.
     *
     * @param file  the archive, used only to build the document names
     * @param input open stream positioned at the start of the archive; it is
     *              closed together with the zip stream when this method returns
     * @throws IOException if the archive cannot be read
     */
    private void indexZipEntries(File file, InputStream input) throws IOException {
        def zis = new ZipInputStream(input)
        try {
            def entry = null
            while ((entry = zis.getNextEntry()) != null) {
                if (entry.isDirectory()) continue
                // One zip stream serves all entries, so the per-entry reader must not
                // close it (the reader is presumably closed by whoever consumes it,
                // i.e. the index writer -- confirm).
                def entryReader = new BufferedReader(
                                  new InputStreamReader(zis, "UTF-8") {
                                      @Override public void close() { return }
                                  })
                indexDoc("zip:" + file.path + "@" + entry.name, entry.time, entryReader)
            }
        } finally {
            zis.close()
        }
    }

    /**
     * Walks every configured path and indexes what it finds: directories are
     * traversed recursively, plain files are indexed directly. Does nothing
     * when no paths were configured.
     */
    public void doIndex() {
        docDirs?.each { fn ->
            def file = new File(fn)

            if (file.isDirectory()) file.eachFileRecurse { processFile(it) }
            else processFile(file)
        }

        // NOTE: if you want to maximize search performance,
        // you can optionally call optimize here.  This can be
        // a costly operation, so generally it's only worth
        // it when your index is relatively static (ie you're
        // done adding documents to it):
        //
        // writer.optimize();
    }

    /** Closes the index writer. */
    public void close() { writer.close() }

    /**
     * Indexes {@code ./tmp/data} into {@code ./tmp/index} and prints the elapsed time.
     * The arguments are not used.
     */
    public static void main(String[] args) {
        def indexFiles = new IndexFiles("./tmp/index", ["./tmp/data"])
        def start = System.currentTimeMillis()
        try {
            indexFiles.doIndex()
        } finally {
            // Close the writer even when indexing fails part-way through.
            indexFiles.close()
        }
        println((System.currentTimeMillis() - start) + " total milliseconds")
    }

    /**
     * Adds one document to the index, built from a name, a timestamp and a
     * reader over the text to index.
     * <p>
     * The document has three fields: {@code path} (stored, not analyzed),
     * {@code modified} (stored numeric field) and {@code contents} (taken from
     * the reader). When the writer was opened with {@code OpenMode.CREATE} the
     * document is simply added; otherwise any existing document with the same
     * {@code path} is replaced.
     * <p>
     * Failures are reported on stderr and swallowed so that one bad document
     * does not stop the run; in practice no exception escapes this method.
     *
     * @param fileName     value of the {@code path} field; also the key used for updates
     * @param modifiedTime value of the {@code modified} field, as passed by the caller
     *                     (file or zip-entry modification time in milliseconds)
     * @param fileReader   source of the text for the {@code contents} field; callers
     *                     in this class decode it as UTF-8
     * @throws IOException declared for compatibility; currently never thrown
     */
    void indexDoc(String fileName, long modifiedTime, Reader fileReader) throws IOException {
        try {
            // make a new, empty document
            def doc = new Document()

            // Add the path as a field named "path": stored and indexed
            // (i.e. searchable) as a single term, not tokenized into words.
            def pathField = new Field("path", fileName, Store.YES, Index.NOT_ANALYZED)
            doc.add(pathField)

            // Add the last modified date as a field named "modified".
            // Use a NumericField that is indexed (i.e. efficiently filterable with
            // NumericRangeFilter).  This indexes to milli-second resolution, which
            // is often too fine.  You could instead create a number based on
            // year/month/day/hour/minutes/seconds, down to the resolution you require.
            // For example the long value 2011021714 would mean
            // February 17, 2011, 2-3 PM.
            def modifiedField = new NumericField("modified", Store.YES, true)
            modifiedField.setLongValue(modifiedTime)
            doc.add(modifiedField)

            // Add the text to a field named "contents".  Passing a Reader means the
            // text is tokenized and indexed, but not stored.  The callers build the
            // reader with UTF-8 decoding; for input in another encoding, searching
            // for special characters will fail.
            doc.add(new Field("contents", fileReader))

            if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                // New index, so we just add the document (no old document can be there):
                println "adding " + fileName
                writer.addDocument(doc)
            } else {
                // Existing index (an old copy of this document may have been indexed) so
                // we use updateDocument instead to replace the old one matching the exact
                // path, if present:
                println "updating " + fileName
                writer.updateDocument(new Term("path", fileName), doc)
            }
        } catch (Exception e) {
            // Deliberately best-effort: report which document failed and carry on.
            System.err.println("failed to index " + fileName + ": " + e)
        }
    }
}