Untitled

package wbi1112;

import java.io.*;
import java.util.List;

import net.htmlparser.jericho.*;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;


import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;


/**
 * Command-line tool that builds two Lucene indices from a directory tree of
 * HTML pages.
 *
 * <ul>
 *   <li><code>.cntidx</code> - one document per <code>.htm</code> file with the
 *       fields <code>path</code> (stored), <code>content</code> (analyzed,
 *       not stored) and <code>title</code> (stored, not analyzed).</li>
 *   <li><code>.imgidx</code> - one document per element of CSS class
 *       <code>d_image</code> with the fields <code>path</code> (the
 *       <code>src</code> of its first <code>img</code>) and
 *       <code>caption</code> (the element's extracted text).</li>
 * </ul>
 *
 * Both index directories are created relative to the working directory and
 * are deleted and rebuilt on every run.
 */
public class IndexCreator {

	/** Writer for the page-content index; opened and closed by {@link #main(String[])}. */
	static IndexWriter contentw;

	/** Writer for the image index; opened and closed by {@link #main(String[])}. */
	static IndexWriter imagew;

	/**
	 * Recursively deletes a file or directory.
	 *
	 * @param f the file or directory to remove
	 * @return the result of deleting <code>f</code> itself; <code>false</code>
	 *         if it did not exist or could not be deleted (for example because
	 *         one of its children could not be removed)
	 */
	static boolean rmdir(File f){

		if (f.isDirectory()){
			// listFiles() returns null on an I/O error or when the directory
			// is not readable; in that case just attempt the delete below.
			File[] children = f.listFiles();
			if (children != null) {
				for (File child : children) {
					rmdir(child);
				}
			}
		}

		return f.delete();

	}

	/**
	 * Entry point. Rebuilds both indices from the given page directory.
	 * Exits with status 1 on a usage error, when a directory is not
	 * accessible, or when opening, filling or closing an index fails.
	 *
	 * @param args exactly one element: the directory containing the
	 *             <code>.htm</code> pages to index
	 */
	public static void main(String[] args) {

		// usage: IndexCreator PageDir
		if (args.length != 1) {
			System.out.println("usage: java -jar IndexCreator.jar PageDir");
			System.out.println("indices are created in .imgidx / .cntidx");
			System.exit(1);

		}

		// get paths and check
		final File fPath = new File(args[0]);
		final File fContent = new File(".cntidx");
		final File fImage = new File(".imgidx");

		if (!fPath.exists() || !fPath.canRead()) {
			System.out.println("Page Directory couldn't be read.");
			System.exit(1);
		}

		// always start from empty index directories
		rmdir(fContent);
		rmdir(fImage);

		fContent.mkdir();
		fImage.mkdir();

		if (! (fContent.exists() && fImage.exists() && fContent.canWrite() && fImage.canWrite()) ){
			System.out.println("One or both of the Index-Directories cannot be written to.");
			System.exit(1);
		}

		// build the indices
		boolean ok = false;
		try {

			// one for content, one for images
			contentw = new IndexWriter(FSDirectory.open(fContent), new IndexWriterConfig(Version.LUCENE_31, new StandardAnalyzer(Version.LUCENE_31)));
			imagew = new IndexWriter(FSDirectory.open(fImage), new IndexWriterConfig(Version.LUCENE_31, new StandardAnalyzer(Version.LUCENE_31)));

			// in one method
			index(fPath);
			ok = true;

		} catch (Exception e) {
			e.printStackTrace();
			System.out.println(e.getMessage());
		} finally {
			// Close both writers even after a failure so the write locks are
			// released; '&=' does not short-circuit, so both closes always run.
			ok &= closeWriter(contentw);
			ok &= closeWriter(imagew);
		}

		// System.exit() is called outside the try block because it would
		// otherwise skip the finally clause above.
		if (!ok) {
			System.exit(1);
		}

		System.out.println("done");
	}

	/**
	 * Indexes a single <code>.htm</code> file, or recurses into a directory
	 * and indexes every <code>.htm</code> file below it. Files with any other
	 * extension are ignored. A failure on one file is reported and does not
	 * stop the indexing of the remaining files.
	 *
	 * @param f file or directory to index
	 */
	static void index(File f){

		// recurse into subdirectories; a directory itself is never parsed,
		// even when its name happens to end in ".htm"
		if (f.isDirectory()) {

			File[] children = f.listFiles();
			if (children == null) {
				System.out.println("Could not list directory: " + f.getPath());
				return;
			}
			for (File child : children) {
				index(child);
			}
			return;

		}

		// but only look at .htm files
		if (!f.getPath().toLowerCase().endsWith(".htm")) {
			return;
		}

		try {
			indexPage(f);
		} catch (Exception e) {
			e.printStackTrace();

			System.out.println(e.getMessage());
		}

	}

	/** Parses one HTML file and adds it (and its images) to the two indices. */
	private static void indexPage(File f) throws IOException {

		FileInputStream fis = new FileInputStream(f);
		try {

			// use jericho html-parser to get contents of html-file
			Source src = new Source(new BufferedReader(new InputStreamReader(fis, "UTF-8")));

			Document doc = new Document();

			// add path
			doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.NO));

			// the plain text
			doc.add(new Field("content",
					src.getTextExtractor().setIncludeAttributes(true).toString(),
					Field.Store.NO, Field.Index.ANALYZED));

			// the title; pages without a <title> element get an empty title
			// instead of being dropped from the index
			Element titleElement = src.getFirstElement(HTMLElementName.TITLE);
			String title = (titleElement == null) ? "" : titleElement.getContent().toString();
			doc.add(new Field("title", title, Field.Store.YES, Field.Index.NOT_ANALYZED));

			// add to the content index
			contentw.addDocument(doc);

			indexImages(src, f);

		} finally {
			// release the file handle on every path, including parse errors
			fis.close();
		}

	}

	/** Adds one image-index document per usable <code>d_image</code> element of the page. */
	private static void indexImages(Source src, File page) throws IOException {

		// are there (specially designated) images in our html-files?
		List<Element> containers = src.getAllElementsByClass("d_image");

		for (Element container : containers) {

			Element img = container.getFirstElement(HTMLElementName.IMG);
			String imgSrc = (img == null) ? null : img.getAttributeValue("src");
			if (imgSrc == null) {
				// one malformed container must not abort the remaining images
				System.out.println("Skipping d_image element without <img src> in " + page.getPath());
				continue;
			}

			Document d = new Document();
			d.add(new Field("path", imgSrc, Field.Store.YES, Field.Index.NO));
			d.add(new Field("caption", container.getTextExtractor().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
			imagew.addDocument(d);

		}

	}

	/**
	 * Closes an index writer and reports any failure.
	 *
	 * @return <code>false</code> if closing threw; <code>true</code> otherwise,
	 *         including when <code>writer</code> is <code>null</code>
	 */
	private static boolean closeWriter(IndexWriter writer) {

		if (writer == null) {
			return true;
		}
		try {
			writer.close();
			return true;
		} catch (IOException e) {
			e.printStackTrace();
			System.out.println(e.getMessage());
			return false;
		}

	}
}