Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similar.MoreLikeThis;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
- public class Main {
- public static void main(String[] args) throws CorruptIndexException, LockObtainFailedException, IOException{
- //Setting up the index
- StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
- Directory index = new RAMDirectory();
- IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
- //Populate the index
- IndexWriter w = new IndexWriter(index, config);
- addBook(w, "My first Bratwurst", "'Amazing' - Financial Times");
- addBook(w, "My first Bratwurst", "'Amazing' - Financial Times"); //identical to the first one
- addBook(w, "Financial Studies - Part 1", "Rainer Bratwurst with his first book");
- addBook(w, "Financial Studies - Part 2", "Author Rainer Bratwurst does not so well in this one");
- addBook(w, "Financial Studies - Part 3", "Rainer Bratwurst my favorite author");
- addBook(w, "Amazing Times", "Author Rainer Bratwurst strikes again. First choice for my Kids.");
- addBook(w, "Amazing Times - Back again", "This one is not like part 1 from author Rainer Bratwurst");
- addBook(w, "Why Times New Roman is Amazing", "An essay I wrote in my first year at college");
- w.close();
- //Setting up MoreLikeThis
- IndexReader ir = IndexReader.open(index);
- MoreLikeThis mlt = new MoreLikeThis(ir);
- mlt.setFieldNames(new String[]{"title", "content"});
- mlt.setMinTermFreq(1);
- mlt.setMinDocFreq(1);
- //Build the MLT-Query for the first document in the index and execute it
- Query q = mlt.like(0);
- IndexSearcher is = new IndexSearcher(ir);
- TopDocs result = is.search(q, 10);
- for(ScoreDoc sd : result.scoreDocs){
- //Print all titles similar to "My first Bratwurst" - the duplicate is not found
- Document document = ir.document(sd.doc);
- String title = document.get("title");
- System.out.println(title);
- }
- }
- private static void addBook(IndexWriter w, String title, String content) throws IOException {
- Document doc = new Document();
- doc.add(new Field("title", title, Store.YES, Index.ANALYZED));
- doc.add(new Field("content", content, Store.YES, Index.ANALYZED));
- w.addDocument(doc);
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement