package rex1nlp;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;

import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
public class luceneRex {

    public static void main(String[] args) throws IOException, ParseException, TikaException, SAXException {

        // Load the OpenNLP tokenizer model and instantiate the TokenizerME class.
        // (The tokenizer is set up here but not used further in this example.)
        InputStream inputStreamTokenizer = new FileInputStream(
                "C:\\Users\\RexPC\\Documents\\Programming\\Apache OpenNLP\\Models\\Original OpenNLP Models\\en-token.bin");
        TokenizerModel tokenModel = new TokenizerModel(inputStreamTokenizer);
        TokenizerME tokenizer = new TokenizerME(tokenModel);

        // Extract the plain text of the target document with Tika's AutoDetectParser.
        String target = "C:\\Users\\RexPC\\Documents\\Haily.docx";
        File document = new File(target);
        Parser parser = new AutoDetectParser();

        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        parser.parse(new FileInputStream(document), handler, metadata, new ParseContext());
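
        // Hypothetical follow-up, not part of the original paste: the TokenizerME
        // built above could be applied to the text Tika just extracted. This is a
        // minimal sketch using only the standard OpenNLP tokenize() call; uncomment
        // to try it.
        // String[] tokens = tokenizer.tokenize(handler.toString());
        // System.out.println("OpenNLP produced " + tokens.length + " tokens.");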

        // 0. Specify the analyzer for tokenizing text.
        //    The same analyzer should be used for indexing and searching.
        StandardAnalyzer analyzer = new StandardAnalyzer();

        // 1. Create the index.
        Directory index = new RAMDirectory();

        IndexWriterConfig config = new IndexWriterConfig(analyzer);

        try (IndexWriter w = new IndexWriter(index, config)) {
            addDoc(w, handler.toString(), "193398817");
            // System.out.println(handler.toString());
        }

        // 2. Build the query.
        String querystr = args.length > 0 ? args[0] : "Cigna";

        // The "title" argument specifies the default field to use
        // when no field is explicitly specified in the query.
        Query q = new QueryParser("title", analyzer).parse(querystr);

        // 3. Search the index.
        int hitsPerPage = 10;
        try (IndexReader reader = DirectoryReader.open(index)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs docs = searcher.search(q, hitsPerPage);
            ScoreDoc[] hits = docs.scoreDocs;

            // 4. Display results.
            System.out.println("Found " + hits.length + " hits.");
            for (int i = 0; i < hits.length; ++i) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                System.out.println((i + 1) + ". " + d.get("isbn") + "\t" + d.get("title"));
            }
            // The reader can only be closed when there is
            // no need to access the documents any more.
        }
    }
    private static void addDoc(IndexWriter w, String title, String isbn) throws IOException {
        Document doc = new Document();
        doc.add(new TextField("title", title, Field.Store.YES));

        // Use a StringField for isbn because we don't want it tokenized.
        doc.add(new StringField("isbn", isbn, Field.Store.YES));
        w.addDocument(doc);
    }
}