SHOW:
|
|
- or go back to the newest paste.
1 | import java.io.IOException; | |
2 | ||
3 | import org.apache.lucene.analysis.standard.StandardAnalyzer; | |
4 | import org.apache.lucene.document.Document; | |
5 | import org.apache.lucene.document.Field; | |
6 | import org.apache.lucene.document.Field.Index; | |
7 | import org.apache.lucene.document.Field.Store; | |
8 | import org.apache.lucene.index.CorruptIndexException; | |
9 | import org.apache.lucene.index.IndexReader; | |
10 | import org.apache.lucene.index.IndexWriter; | |
11 | import org.apache.lucene.index.IndexWriterConfig; | |
12 | import org.apache.lucene.search.IndexSearcher; | |
13 | import org.apache.lucene.search.Query; | |
14 | import org.apache.lucene.search.ScoreDoc; | |
15 | import org.apache.lucene.search.TopDocs; | |
16 | import org.apache.lucene.search.similar.MoreLikeThis; | |
17 | import org.apache.lucene.store.Directory; | |
18 | import org.apache.lucene.store.LockObtainFailedException; | |
19 | import org.apache.lucene.store.RAMDirectory; | |
20 | import org.apache.lucene.util.Version; | |
21 | ||
22 | public class Main { | |
23 | public static void main(String[] args) throws CorruptIndexException, | |
24 | - | public static void main(String[] args) throws CorruptIndexException, LockObtainFailedException, IOException{ |
24 | + | LockObtainFailedException, IOException { |
25 | - | //Setting up the index |
25 | + | // Setting up the index |
26 | StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36); | |
27 | Directory index = new RAMDirectory(); | |
28 | - | IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer); |
28 | + | IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, |
29 | analyzer); | |
30 | - | //Populate the index |
30 | + | |
31 | // Populate the index | |
32 | IndexWriter w = new IndexWriter(index, config); | |
33 | - | addBook(w, "My first Bratwurst", "'Amazing' - Financial Times"); //identical to the first one |
33 | + | |
34 | - | addBook(w, "Financial Studies - Part 1", "Rainer Bratwurst with his first book"); |
34 | + | addBook(w, "My first Bratwurst", "'Amazing' - Financial Times");// Duplicate |
35 | - | addBook(w, "Financial Studies - Part 2", "Author Rainer Bratwurst does not so well in this one"); |
35 | + | addBook(w, "Financial Studies - Part 1", |
36 | - | addBook(w, "Financial Studies - Part 3", "Rainer Bratwurst my favorite author"); |
36 | + | "Rainer Bratwurst with his first book"); |
37 | - | addBook(w, "Amazing Times", "Author Rainer Bratwurst strikes again. First choice for my Kids."); |
37 | + | addBook(w, "Financial Studies - Part 2", |
38 | - | addBook(w, "Amazing Times - Back again", "This one is not like part 1 from author Rainer Bratwurst"); |
38 | + | "Author Rainer Bratwurst does not so well in this one"); |
39 | - | addBook(w, "Why Times New Roman is Amazing", "An essay I wrote in my first year at college"); |
39 | + | addBook(w, "Financial Studies - Part 3", |
40 | "Rainer Bratwurst my favorite author"); | |
41 | - | |
41 | + | addBook(w, "Amazing Times", |
42 | - | //Setting up MoreLikeThis |
42 | + | "Author Rainer Bratwurst strikes again. First choice for my Kids."); |
43 | addBook(w, "Amazing Times - Back again", | |
44 | "This one is not like part 1 from author Rainer Bratwurst"); | |
45 | - | mlt.setFieldNames(new String[]{"title", "content"}); |
45 | + | addBook(w, "Why Times New Roman is Amazing", |
46 | "An essay I wrote in my first year at college"); | |
47 | w.close(); | |
48 | - | |
48 | + | |
49 | - | //Build the MLT-Query for the first document in the index and execute it |
49 | + | // Setting up MoreLikeThis |
50 | IndexReader ir = IndexReader.open(index); | |
51 | MoreLikeThis mlt = new MoreLikeThis(ir); | |
52 | mlt.setFieldNames(new String[] { "title", "content" }); | |
53 | - | for(ScoreDoc sd : result.scoreDocs){ |
53 | + | |
54 | - | //Print all titles similar to "My first Bratwurst" - the duplicate is not found |
54 | + | |
55 | ||
56 | // Build the MLT-Query for the first document in the index and execute | |
57 | // it | |
58 | Query q = mlt.like(0); | |
59 | - | |
59 | + | |
60 | TopDocs result = is.search(q, 10); | |
61 | - | private static void addBook(IndexWriter w, String title, String content) throws IOException { |
61 | + | for (ScoreDoc sd : result.scoreDocs) { |
62 | - | Document doc = new Document(); |
62 | + | // Print all titles similar to "My first Bratwurst" - the duplicate |
63 | - | doc.add(new Field("title", title, Store.YES, Index.ANALYZED)); |
63 | + | // is not found |
64 | - | doc.add(new Field("content", content, Store.YES, Index.ANALYZED)); |
64 | + | |
65 | - | w.addDocument(doc); |
65 | + | |
66 | System.out.println(title); | |
67 | // Result: | |
68 | // | |
69 | // Amazing Times | |
70 | // Why Times New Roman is Amazing | |
71 | // Financial Studies - Part 1 | |
72 | // Financial Studies - Part 3 | |
73 | // Amazing Times - Back again | |
74 | // Financial Studies - Part 2 | |
75 | } | |
76 | ||
77 | } | |
78 | ||
79 | private static void addBook(IndexWriter w, String title, String content) | |
80 | throws IOException { | |
81 | Document doc = new Document(); | |
82 | doc.add(new Field("title", title, Store.YES, Index.ANALYZED)); | |
83 | doc.add(new Field("content", content, Store.YES, Index.ANALYZED)); | |
84 | w.addDocument(doc); | |
85 | } | |
86 | ||
87 | } |