Advertisement
Guest User

MoreLikeThis gone wrong

a guest
Nov 21st, 2012
186
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import java.io.IOException;
  2.  
  3. import org.apache.lucene.analysis.standard.StandardAnalyzer;
  4. import org.apache.lucene.document.Document;
  5. import org.apache.lucene.document.Field;
  6. import org.apache.lucene.document.Field.Index;
  7. import org.apache.lucene.document.Field.Store;
  8. import org.apache.lucene.index.CorruptIndexException;
  9. import org.apache.lucene.index.IndexReader;
  10. import org.apache.lucene.index.IndexWriter;
  11. import org.apache.lucene.index.IndexWriterConfig;
  12. import org.apache.lucene.search.IndexSearcher;
  13. import org.apache.lucene.search.Query;
  14. import org.apache.lucene.search.ScoreDoc;
  15. import org.apache.lucene.search.TopDocs;
  16. import org.apache.lucene.search.similar.MoreLikeThis;
  17. import org.apache.lucene.store.Directory;
  18. import org.apache.lucene.store.LockObtainFailedException;
  19. import org.apache.lucene.store.RAMDirectory;
  20. import org.apache.lucene.util.Version;
  21.  
  22. public class Main {
  23.     public static void main(String[] args) throws CorruptIndexException,
  24.             LockObtainFailedException, IOException {
  25.         // Setting up the index
  26.         StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
  27.         Directory index = new RAMDirectory();
  28.         IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36,
  29.                 analyzer);
  30.  
  31.         // Populate the index
  32.         IndexWriter w = new IndexWriter(index, config);
  33.         addBook(w, "My first Bratwurst", "'Amazing' - Financial Times");
  34.         addBook(w, "My first Bratwurst", "'Amazing' - Financial Times");// Duplicate
  35.         addBook(w, "Financial Studies - Part 1",
  36.                 "Rainer Bratwurst with his first book");
  37.         addBook(w, "Financial Studies - Part 2",
  38.                 "Author Rainer Bratwurst does not so well in this one");
  39.         addBook(w, "Financial Studies - Part 3",
  40.                 "Rainer Bratwurst my favorite author");
  41.         addBook(w, "Amazing Times",
  42.                 "Author Rainer Bratwurst strikes again. First choice for my Kids.");
  43.         addBook(w, "Amazing Times - Back again",
  44.                 "This one is not like part 1 from author Rainer Bratwurst");
  45.         addBook(w, "Why Times New Roman is Amazing",
  46.                 "An essay I wrote in my first year at college");
  47.         w.close();
  48.  
  49.         // Setting up MoreLikeThis
  50.         IndexReader ir = IndexReader.open(index);
  51.         MoreLikeThis mlt = new MoreLikeThis(ir);
  52.         mlt.setFieldNames(new String[] { "title", "content" });
  53.         mlt.setMinTermFreq(1);
  54.         mlt.setMinDocFreq(1);
  55.  
  56.         // Build the MLT-Query for the first document in the index and execute
  57.         // it
  58.         Query q = mlt.like(0);
  59.         IndexSearcher is = new IndexSearcher(ir);
  60.         TopDocs result = is.search(q, 10);
  61.         for (ScoreDoc sd : result.scoreDocs) {
  62.             // Print all titles similar to "My first Bratwurst" - the duplicate
  63.             // is not found
  64.             Document document = ir.document(sd.doc);
  65.             String title = document.get("title");
  66.             System.out.println(title);
  67.             // Result:
  68.             //       
  69.             // Amazing Times
  70.             // Why Times New Roman is Amazing
  71.             // Financial Studies - Part 1
  72.             // Financial Studies - Part 3
  73.             // Amazing Times - Back again
  74.             // Financial Studies - Part 2
  75.         }
  76.  
  77.     }
  78.  
  79.     private static void addBook(IndexWriter w, String title, String content)
  80.             throws IOException {
  81.         Document doc = new Document();
  82.         doc.add(new Field("title", title, Store.YES, Index.ANALYZED));
  83.         doc.add(new Field("content", content, Store.YES, Index.ANALYZED));
  84.         w.addDocument(doc);
  85.     }
  86.  
  87. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement