Advertisement
Guest User

MoreLikeThis gone wrong

a guest
Nov 21st, 2012
51
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 2.98 KB | None | 0 0
  1. import java.io.IOException;
  2.  
  3. import org.apache.lucene.analysis.standard.StandardAnalyzer;
  4. import org.apache.lucene.document.Document;
  5. import org.apache.lucene.document.Field;
  6. import org.apache.lucene.document.Field.Index;
  7. import org.apache.lucene.document.Field.Store;
  8. import org.apache.lucene.index.CorruptIndexException;
  9. import org.apache.lucene.index.IndexReader;
  10. import org.apache.lucene.index.IndexWriter;
  11. import org.apache.lucene.index.IndexWriterConfig;
  12. import org.apache.lucene.search.IndexSearcher;
  13. import org.apache.lucene.search.Query;
  14. import org.apache.lucene.search.ScoreDoc;
  15. import org.apache.lucene.search.TopDocs;
  16. import org.apache.lucene.search.similar.MoreLikeThis;
  17. import org.apache.lucene.store.Directory;
  18. import org.apache.lucene.store.LockObtainFailedException;
  19. import org.apache.lucene.store.RAMDirectory;
  20. import org.apache.lucene.util.Version;
  21.  
  22.  
  23. public class Main {
  24.     public static void main(String[] args) throws CorruptIndexException, LockObtainFailedException, IOException{
  25.         //Setting up the index
  26.         StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
  27.         Directory index = new RAMDirectory();
  28.         IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
  29.  
  30.         //Populate the index
  31.         IndexWriter w = new IndexWriter(index, config);
  32.         addBook(w, "My first Bratwurst", "'Amazing' - Financial Times");
  33.         addBook(w, "My first Bratwurst", "'Amazing' - Financial Times"); //identical to the first one
  34.         addBook(w, "Financial Studies - Part 1", "Rainer Bratwurst with his first book");
  35.         addBook(w, "Financial Studies - Part 2", "Author Rainer Bratwurst does not so well in this one");
  36.         addBook(w, "Financial Studies - Part 3", "Rainer Bratwurst my favorite author");
  37.         addBook(w, "Amazing Times", "Author Rainer Bratwurst strikes again. First choice for my Kids.");
  38.         addBook(w, "Amazing Times - Back again", "This one is not like part 1 from author Rainer Bratwurst");
  39.         addBook(w, "Why Times New Roman is Amazing", "An essay I wrote in my first year at college");
  40.         w.close();
  41.        
  42.         //Setting up MoreLikeThis
  43.         IndexReader ir = IndexReader.open(index);
  44.         MoreLikeThis mlt = new MoreLikeThis(ir);
  45.         mlt.setFieldNames(new String[]{"title", "content"});
  46.         mlt.setMinTermFreq(1);
  47.         mlt.setMinDocFreq(1);
  48.        
  49.         //Build the MLT-Query for the first document in the index and execute it
  50.         Query q = mlt.like(0);
  51.         IndexSearcher is = new IndexSearcher(ir);
  52.         TopDocs result = is.search(q, 10);
  53.         for(ScoreDoc sd : result.scoreDocs){
  54.             //Print all titles similar to "My first Bratwurst" - the duplicate is not found
  55.             Document document = ir.document(sd.doc);
  56.             String title = document.get("title");
  57.             System.out.println(title);
  58.         }
  59.        
  60.     }
  61.     private static void addBook(IndexWriter w, String title, String content) throws IOException {
  62.           Document doc = new Document();
  63.           doc.add(new Field("title", title, Store.YES, Index.ANALYZED));
  64.           doc.add(new Field("content", content, Store.YES, Index.ANALYZED));
  65.           w.addDocument(doc);
  66.         }
  67.  
  68. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement