// Source: Pastebin.com

import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similar.MoreLikeThis;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class Main {
    public static void main(String[] args) throws CorruptIndexException,
            LockObtainFailedException, IOException {
        // Setting up the index
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        Directory index = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36,
                analyzer);

        // Populate the index
        IndexWriter w = new IndexWriter(index, config);
        addBook(w, "My first Bratwurst", "'Amazing' - Financial Times");
        addBook(w, "My first Bratwurst", "'Amazing' - Financial Times");// Duplicate
        addBook(w, "Financial Studies - Part 1",
                "Rainer Bratwurst with his first book");
        addBook(w, "Financial Studies - Part 2",
                "Author Rainer Bratwurst does not so well in this one");
        addBook(w, "Financial Studies - Part 3",
                "Rainer Bratwurst my favorite author");
        addBook(w, "Amazing Times",
                "Author Rainer Bratwurst strikes again. First choice for my Kids.");
        addBook(w, "Amazing Times - Back again",
                "This one is not like part 1 from author Rainer Bratwurst");
        addBook(w, "Why Times New Roman is Amazing",
                "An essay I wrote in my first year at college");
        w.close();

        // Setting up MoreLikeThis
        IndexReader ir = IndexReader.open(index);
        MoreLikeThis mlt = new MoreLikeThis(ir);
        mlt.setFieldNames(new String[] { "title", "content" });
        mlt.setMinTermFreq(1);
        mlt.setMinDocFreq(1);

        // Build the MLT-Query for the first document in the index and execute
        // it
        Query q = mlt.like(0);
        IndexSearcher is = new IndexSearcher(ir);
        TopDocs result = is.search(q, 10);
        for (ScoreDoc sd : result.scoreDocs) {
            // Print all titles similar to "My first Bratwurst" - the duplicate
            // is not found
            Document document = ir.document(sd.doc);
            String title = document.get("title");
            System.out.println(title);
            // Result:
            //
            // Amazing Times
            // Why Times New Roman is Amazing
            // Financial Studies - Part 1
            // Financial Studies - Part 3
            // Amazing Times - Back again
            // Financial Studies - Part 2
        }

    }

    private static void addBook(IndexWriter w, String title, String content)
            throws IOException {
        Document doc = new Document();
        doc.add(new Field("title", title, Store.YES, Index.ANALYZED));
        doc.add(new Field("content", content, Store.YES, Index.ANALYZED));
        w.addDocument(doc);
    }

}