Not a member of Pastebin yet?
Sign Up —
it unlocks many cool features!
- package rex1nlp;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import opennlp.tools.tokenize.TokenizerME;
- import opennlp.tools.tokenize.TokenizerModel;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.StringField;
- import org.apache.lucene.document.TextField;
- import org.apache.lucene.index.DirectoryReader;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.queryparser.classic.QueryParser;
- import org.apache.lucene.queryparser.flexible.standard.parser.ParseException;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.RAMDirectory;
- import org.apache.tika.exception.TikaException;
- import org.apache.tika.metadata.Metadata;
- import org.apache.tika.parser.AutoDetectParser;
- import org.apache.tika.parser.ParseContext;
- import org.apache.tika.parser.Parser;
- import org.apache.tika.sax.BodyContentHandler;
- import org.xml.sax.ContentHandler;
- import org.xml.sax.SAXException;
- public class luceneRex {
- public static void main(String[] args) throws IOException, ParseException, org.apache.lucene.queryparser.classic.ParseException, TikaException, SAXException {
- InputStream inputStreamTokenizer = new
- FileInputStream("C:\\Users\\RexPC\\Documents\\Programming\\Apache OpenNLP\\Models\\Original OpenNLP Models\\en-token.bin");
- TokenizerModel tokenModel = new TokenizerModel(inputStreamTokenizer);
- //Instantiating the TokenizerME class
- TokenizerME tokenizer = new TokenizerME(tokenModel);
- String target = "C:\\Users\\RexPC\\Documents\\Haily.docx";
- File document = new File(target);
- Parser parser = new AutoDetectParser();
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- parser.parse(new FileInputStream(document), handler, metadata, new ParseContext());
- // 0. Specify the analyzer for tokenizing text.
- // The same analyzer should be used for indexing and searching
- StandardAnalyzer analyzer = new StandardAnalyzer();
- // 1. create the index
- Directory index = new RAMDirectory();
- IndexWriterConfig config = new IndexWriterConfig(analyzer);
- try (IndexWriter w = new IndexWriter(index, config)) {
- addDoc(w, handler.toString(), "193398817");
- // System.out.println(handler.toString());
- }
- // 2. query
- String querystr = args.length > 0 ? args[0] : "Cigna";
- // the "title" arg specifies the default field to use
- // when no field is explicitly specified in the query.
- Query q = new QueryParser("title", analyzer).parse(querystr);
- // 3. search
- int hitsPerPage = 10;
- try (IndexReader reader = DirectoryReader.open(index)) {
- IndexSearcher searcher = new IndexSearcher(reader);
- TopDocs docs = searcher.search(q, hitsPerPage);
- ScoreDoc[] hits = docs.scoreDocs;
- // 4. display results
- System.out.println("Found " + hits.length + " hits.");
- for (int i = 0; i<hits.length; ++i) {
- int docId = hits[i].doc;
- Document d = searcher.doc(docId);
- System.out.println((i + 1) + ". " + d.get("isbn") + "\t" + d.get("title"));
- }
- // reader can only be closed when there
- // is no need to access the documents any more.
- }
- }
- private static void addDoc(IndexWriter w, String title, String isbn) throws IOException {
- Document doc = new Document();
- doc.add(new TextField("title", title, Field.Store.YES));
- // use a string field for isbn because we don't want it tokenized
- doc.add(new StringField("isbn", isbn, Field.Store.YES));
- w.addDocument(doc);
- }
- }
Add Comment
Please, Sign In to add comment