Pastebin launched a little side project called VERYVIRAL.com, check it out ;-) Want more features on Pastebin? Sign Up, it's FREE!
Guest

MoreLikeThis gone wrong

By: a guest on Nov 21st, 2012  |  syntax: Java  |  size: 3.23 KB  |  views: 17  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
This paste has a previous version, view the difference. Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similar.MoreLikeThis;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

  22. public class Main {
  23.         public static void main(String[] args) throws CorruptIndexException,
  24.                         LockObtainFailedException, IOException {
  25.                 // Setting up the index
  26.                 StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
  27.                 Directory index = new RAMDirectory();
  28.                 IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36,
  29.                                 analyzer);
  30.  
  31.                 // Populate the index
  32.                 IndexWriter w = new IndexWriter(index, config);
  33.                 addBook(w, "My first Bratwurst", "'Amazing' - Financial Times");
  34.                 addBook(w, "My first Bratwurst", "'Amazing' - Financial Times");// Duplicate
  35.                 addBook(w, "Financial Studies - Part 1",
  36.                                 "Rainer Bratwurst with his first book");
  37.                 addBook(w, "Financial Studies - Part 2",
  38.                                 "Author Rainer Bratwurst does not so well in this one");
  39.                 addBook(w, "Financial Studies - Part 3",
  40.                                 "Rainer Bratwurst my favorite author");
  41.                 addBook(w, "Amazing Times",
  42.                                 "Author Rainer Bratwurst strikes again. First choice for my Kids.");
  43.                 addBook(w, "Amazing Times - Back again",
  44.                                 "This one is not like part 1 from author Rainer Bratwurst");
  45.                 addBook(w, "Why Times New Roman is Amazing",
  46.                                 "An essay I wrote in my first year at college");
  47.                 w.close();
  48.  
  49.                 // Setting up MoreLikeThis
  50.                 IndexReader ir = IndexReader.open(index);
  51.                 MoreLikeThis mlt = new MoreLikeThis(ir);
  52.                 mlt.setFieldNames(new String[] { "title", "content" });
  53.                 mlt.setMinTermFreq(1);
  54.                 mlt.setMinDocFreq(1);
  55.  
  56.                 // Build the MLT-Query for the first document in the index and execute
  57.                 // it
  58.                 Query q = mlt.like(0);
  59.                 IndexSearcher is = new IndexSearcher(ir);
  60.                 TopDocs result = is.search(q, 10);
  61.                 for (ScoreDoc sd : result.scoreDocs) {
  62.                         // Print all titles similar to "My first Bratwurst" - the duplicate
  63.                         // is not found
  64.                         Document document = ir.document(sd.doc);
  65.                         String title = document.get("title");
  66.                         System.out.println(title);
  67.                         // Result:
  68.                         //               
  69.                         // Amazing Times
  70.                         // Why Times New Roman is Amazing
  71.                         // Financial Studies - Part 1
  72.                         // Financial Studies - Part 3
  73.                         // Amazing Times - Back again
  74.                         // Financial Studies - Part 2
  75.                 }
  76.  
  77.         }
  78.  
  79.         private static void addBook(IndexWriter w, String title, String content)
  80.                         throws IOException {
  81.                 Document doc = new Document();
  82.                 doc.add(new Field("title", title, Store.YES, Index.ANALYZED));
  83.                 doc.add(new Field("content", content, Store.YES, Index.ANALYZED));
  84.                 w.addDocument(doc);
  85.         }
  86.  
  87. }