Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package wbi1112;
- import java.io.*;
- import java.util.List;
- import net.htmlparser.jericho.*;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.store.FSDirectory;
- import org.apache.lucene.util.Version;
- public class IndexCreator {
- static IndexWriter contentw;
- static IndexWriter imagew;
- static boolean rmdir(File f){
- if (f.isDirectory()){
- String[] files = f.list();
- for(int i=0; i<files.length; i++){
- rmdir( new File(f, files[i]) );
- }
- }
- return f.delete();
- }
- /**
- * @param args
- */
- public static void main(String[] args) {
- // usage: IndexCreator ContentIndexDir ImageIndexDir PageDir
- // open specified path or working dir
- if (args.length != 1) {
- System.out.println("usage: java -jar IndexCreator.jar PageDir");
- System.out.println("indices are created in .imgidx / .cntidx");
- System.exit(1);
- }
- // get paths and check
- String sContent = ".cntidx";
- String sImage = ".imgidx";
- String sPath = args[0];
- final File fPath = new File(sPath);
- final File fContent = new File(sContent);
- final File fImage = new File(sImage);
- if (!fPath.exists() || !fPath.canRead()) {
- System.out.println("Page Directory couldn't be read.");
- System.exit(1);
- }
- rmdir(fContent);
- rmdir(fImage);
- fContent.mkdir();
- fImage.mkdir();
- if (! (fContent.exists() && fImage.exists() && fContent.canWrite() && fImage.canWrite()) ){
- System.out.println("One or both of the Index-Directories cannot be written to.");
- System.exit(1);
- }
- // build the indices
- try {
- // one for content, one for images
- contentw = new IndexWriter(FSDirectory.open(fContent),new IndexWriterConfig(Version.LUCENE_31, new StandardAnalyzer(Version.LUCENE_31)));
- imagew = new IndexWriter(FSDirectory.open(fImage), new IndexWriterConfig(Version.LUCENE_31, new StandardAnalyzer(Version.LUCENE_31)));
- // in one method
- index(fPath);
- contentw.close();
- imagew.close();
- } catch (Exception e) {
- e.printStackTrace();
- System.out.println(e.getMessage());
- }
- System.out.println("done");
- }
- static void index(File f){
- // recurse into subdirectories
- if( f.isDirectory() ){
- String[] files = f.list();
- for(int i=0; i<files.length; i++){
- index( new File(f, files[i]) );
- }
- }
- // but only look at .htm files
- if(! f.getPath().toLowerCase().endsWith(".htm") )
- return;
- FileInputStream fis;
- try {
- Document doc = new Document();
- // add path
- doc.add( new Field("path", f.getPath(), Field.Store.YES, Field.Index.NO));
- // read file
- fis = new FileInputStream(f);
- // use jericho html-parser to get contents of html-file
- Source src = new Source( new BufferedReader(new InputStreamReader(fis, "UTF-8")));
- // the plain text
- doc.add( new Field("content",
- src.getTextExtractor().setIncludeAttributes(true).toString(),
- Field.Store.NO, Field.Index.ANALYZED )
- );
- // the title
- doc.add( new Field("title",
- src.getFirstElement(HTMLElementName.TITLE ).getContent().toString(),
- Field.Store.YES, Field.Index.NOT_ANALYZED )
- );
- // add to the content index
- contentw.addDocument(doc);
- // are there (specially designated) images in our html-files?
- List<Element> elementList=src.getAllElementsByClass("d_image");
- for (Element element : elementList) {
- // loop through all of them and add them to the..
- Document d = new Document();
- d.add( new Field("path", element.getFirstElement(HTMLElementName.IMG).getAttributeValue("src"), Field.Store.YES, Field.Index.NO));
- d.add( new Field("caption", element.getTextExtractor().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED ));
- // ..image index
- imagew.addDocument(d);
- }
- fis.close();
- } catch (Exception e) {
- e.printStackTrace();
- System.out.println(e.getMessage());
- }
- }
- }
Add Comment
Please, Sign In to add comment