Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import java.io.IOException;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.Field.Index;
- import org.apache.lucene.document.Field.Store;
- import org.apache.lucene.index.CorruptIndexException;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.search.similar.MoreLikeThis;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.LockObtainFailedException;
- import org.apache.lucene.store.RAMDirectory;
- import org.apache.lucene.util.Version;
- public class Main {
- public static void main(String[] args) throws CorruptIndexException,
- LockObtainFailedException, IOException {
- // Setting up the index
- StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
- Directory index = new RAMDirectory();
- IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36,
- analyzer);
- // Populate the index
- IndexWriter w = new IndexWriter(index, config);
- addBook(w, "My first Bratwurst", "'Amazing' - Financial Times");
- addBook(w, "My first Bratwurst", "'Amazing' - Financial Times");// Duplicate
- addBook(w, "Financial Studies - Part 1",
- "Rainer Bratwurst with his first book");
- addBook(w, "Financial Studies - Part 2",
- "Author Rainer Bratwurst does not so well in this one");
- addBook(w, "Financial Studies - Part 3",
- "Rainer Bratwurst my favorite author");
- addBook(w, "Amazing Times",
- "Author Rainer Bratwurst strikes again. First choice for my Kids.");
- addBook(w, "Amazing Times - Back again",
- "This one is not like part 1 from author Rainer Bratwurst");
- addBook(w, "Why Times New Roman is Amazing",
- "An essay I wrote in my first year at college");
- w.close();
- // Setting up MoreLikeThis
- IndexReader ir = IndexReader.open(index);
- MoreLikeThis mlt = new MoreLikeThis(ir);
- mlt.setFieldNames(new String[] { "title", "content" });
- mlt.setMinTermFreq(1);
- mlt.setMinDocFreq(1);
- // Build the MLT-Query for the first document in the index and execute
- // it
- Query q = mlt.like(0);
- IndexSearcher is = new IndexSearcher(ir);
- TopDocs result = is.search(q, 10);
- for (ScoreDoc sd : result.scoreDocs) {
- // Print all titles similar to "My first Bratwurst" - the duplicate
- // is not found
- Document document = ir.document(sd.doc);
- String title = document.get("title");
- System.out.println(title);
- // Result:
- //
- // Amazing Times
- // Why Times New Roman is Amazing
- // Financial Studies - Part 1
- // Financial Studies - Part 3
- // Amazing Times - Back again
- // Financial Studies - Part 2
- }
- }
- private static void addBook(IndexWriter w, String title, String content)
- throws IOException {
- Document doc = new Document();
- doc.add(new Field("title", title, Store.YES, Index.ANALYZED));
- doc.add(new Field("content", content, Store.YES, Index.ANALYZED));
- w.addDocument(doc);
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement