Not a member of Pastebin yet?
Sign Up —
it unlocks many cool features!
- package rex1nlp;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import opennlp.tools.tokenize.TokenizerME;
- import opennlp.tools.tokenize.TokenizerModel;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.StringField;
- import org.apache.lucene.document.TextField;
- import org.apache.lucene.index.DirectoryReader;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.queryparser.classic.QueryParser;
- import org.apache.lucene.queryparser.flexible.standard.parser.ParseException;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.RAMDirectory;
- import org.apache.tika.exception.TikaException;
- import org.apache.tika.metadata.Metadata;
- import org.apache.tika.parser.AutoDetectParser;
- import org.apache.tika.parser.ParseContext;
- import org.apache.tika.parser.Parser;
- import org.apache.tika.sax.BodyContentHandler;
- import org.xml.sax.ContentHandler;
- import org.xml.sax.SAXException;
- public class luceneRex {
- public static void main(String[] args) throws IOException, ParseException, org.apache.lucene.queryparser.classic.ParseException, TikaException, SAXException {
- InputStream inputStreamTokenizer = new
- FileInputStream("C:\\Users\\RexPC\\Documents\\Programming\\Apache OpenNLP\\Models\\Original OpenNLP Models\\en-token.bin");
- TokenizerModel tokenModel = new TokenizerModel(inputStreamTokenizer);
- //Instantiating the TokenizerME class
- TokenizerME tokenizer = new TokenizerME(tokenModel);
- String target = "C:\\Users\\RexPC\\Documents\\Haily.docx";
- File document = new File(target);
- Parser parser = new AutoDetectParser();
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- parser.parse(new FileInputStream(document), handler, metadata, new ParseContext());
- // 0. Specify the analyzer for tokenizing text.
- // The same analyzer should be used for indexing and searching
- StandardAnalyzer analyzer = new StandardAnalyzer();
- // 1. create the index
- Directory index = new RAMDirectory();
- IndexWriterConfig config = new IndexWriterConfig(analyzer);
- try (IndexWriter w = new IndexWriter(index, config)) {
- addDoc(w, handler.toString(), "193398817");
- // System.out.println(handler.toString());
- }
- // 2. query
- String querystr = args.length > 0 ? args[0] : "Cigna";
- // the "title" arg specifies the default field to use
- // when no field is explicitly specified in the query.
- Query q = new QueryParser("title", analyzer).parse(querystr);
- // 3. search
- int hitsPerPage = 10;
- try (IndexReader reader = DirectoryReader.open(index)) {
- IndexSearcher searcher = new IndexSearcher(reader);
- TopDocs docs = searcher.search(q, hitsPerPage);
- ScoreDoc[] hits = docs.scoreDocs;
- // 4. display results
- System.out.println("Found " + hits.length + " hits.");
- for (int i = 0; i<hits.length; ++i) {
- int docId = hits[i].doc;
- Document d = searcher.doc(docId);
- System.out.println((i + 1) + ". " + d.get("isbn") + "\t" + d.get("title"));
- }
- // reader can only be closed when there
- // is no need to access the documents any more.
- }
- }
- private static void addDoc(IndexWriter w, String title, String isbn) throws IOException {
- Document doc = new Document();
- doc.add(new TextField("title", title, Field.Store.YES));
- // use a string field for isbn because we don't want it tokenized
- doc.add(new StringField("isbn", isbn, Field.Store.YES));
- w.addDocument(doc);
- }
- }
Add Comment
Please, Sign In to add comment