Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /**
- * This program reads a file containing documents
- * (one per line), builds a keyword index and searches
- * the index for given search term.
- * Usage: java SearchIndex DocumentFile SearchTerm
- * @author: ingo.bax at fh-muenster.de
- * @version: SoSe 2011
- */
- import java.io.*;
- import java.util.*;
- public class SearchIndex {
- private static final String DELIMITERS = " \t\n\f!@#$%^&*()_+{}[]:\";'<>,.?/";
- public static void main(String[] args) {
- // check arguments
- if (args.length<2) {
- System.err.println("Usage: java SearchIndex DocumentFile SearchTerm");
- return;
- }
- String documentsFileName = args[0];
- String searchTerm = args[1];
- // for benchmarks
- long time;
- // read documents from file
- System.out.print("Reading documents -> ");
- time = System.currentTimeMillis();
- ArrayList<String> documents = readDocuments(documentsFileName);
- System.out.println("done. " + documents.size() + " documents read in " + (System.currentTimeMillis() - time) + " ms.");
- // build index from documents
- System.out.print("Building index -> ");
- time = System.currentTimeMillis();
- ArrayList<IndexEntry> index = buildIndex(documents);
- System.out.println("done. " + index.size() + " index entries created in " + (System.currentTimeMillis() - time) + " ms.");
- /*
- * // find search term using binary search
- */
- /*
- System.out.print("Searching index with binary search -> ");
- time = System.currentTimeMillis();
- int documentID = lookupBinary(index, searchTerm);
- System.out.println("done in " + (System.currentTimeMillis() - time) + " ms.");
- // print result
- if (documentID == -1) {
- System.out.println("No Document found.");
- }
- else {
- System.out.println("Match found in document " + documentID + ":");
- System.out.println(documents.get(documentID));
- }
- */
- System.out.print("Searching index with binary search -> ");
- time = System.currentTimeMillis();
- List<Integer> documentIDList = lookupBinaryWithList(index, searchTerm);
- if(documentIDList.size() > 0) {
- for(Integer documentID : documentIDList)
- {
- System.out.println("Match found in document " + documentID + ":");
- System.out.println(documents.get(documentID));
- }
- }
- else {
- System.out.println("No Document found.");
- }
- System.out.println("done in " + (System.currentTimeMillis() - time) + " ms.");
- }
- /**
- * Reads all documents from a specified file
- * @param filename String containing the documents one per line
- * @return ArrayList containing all documents as Strings
- */
- private static ArrayList<String> readDocuments(String filename) {
- ArrayList<String> documents = new ArrayList<String>();
- try {
- BufferedReader br = new BufferedReader(new FileReader(filename));
- while (true) {
- String line = br.readLine();
- if (line == null) break;
- documents.add(line);
- }
- br.close();
- }
- catch (IOException e) {
- throw new RuntimeException("Could not read documents from " + filename + ": " + e);
- }
- return documents;
- }
- /**
- * Reads all entries from the dictionary file
- * @param filename String containing the name of the word list file
- * @return ArrayList containing all dictionary entries as Strings
- */
- private static ArrayList<IndexEntry> buildIndex(ArrayList<String> documents) {
- // initialize index
- ArrayList<IndexEntry> index = new ArrayList<IndexEntry>();
- // create index elements from documents
- for (int documentID=0; documentID<documents.size(); documentID++) {
- StringTokenizer t = new StringTokenizer(documents.get(documentID), DELIMITERS);
- while (t.hasMoreTokens()) {
- index.add(new IndexEntry(t.nextToken(),documentID));
- }
- }
- // sort index
- Collections.sort(index);
- return index;
- }
- /**
- * Seaches a specified index for a specified key
- * @param index ArrayList of IndexEntries
- * @param searchTerm String search term that should be found
- * @return documentID of the first document found that contains the specified search term.
- * -1 if no document was found.
- */
- @SuppressWarnings("unused")
- private static int lookupBinary(ArrayList<IndexEntry> index, String searchTerm) {
- int i = Collections.binarySearch(index,new IndexEntry(searchTerm,0));
- if (i==-1) {
- return -1;
- }
- else {
- return index.get(i).documentID;
- }
- }
- /**
- * Searches a specified index for a specified key
- * @param index ArrayList of IndexEntries
- * @param searchTerm String search term that should be found
- * @return documentID Array of the documents found that contains the specified search term.
- * empty if no document was found.
- */
- @SuppressWarnings("unchecked")
- private static ArrayList<Integer> lookupBinaryWithList(ArrayList<IndexEntry> index, String searchTerm) {
- ArrayList<IndexEntry> indexSearched = new ArrayList<IndexEntry>();
- ArrayList<Integer> indexMatchList = new ArrayList<Integer>();
- // Copy index
- indexSearched = (ArrayList<IndexEntry>) index.clone();
- int i = 1;
- while(i > 0)
- {
- i = Collections.binarySearch(indexSearched,new IndexEntry(searchTerm,0));
- if (i > 0) {
- int documentID = index.get(i).documentID;
- // Check whether the documentId already exist
- if(!indexMatchList.contains(documentID)) {
- indexMatchList.add(index.get(i).documentID);
- }
- // Remove indexEntry from temporary index
- indexSearched.remove(i);
- }
- }
- return indexMatchList;
- }
- /**
- * Representation of an entry in the search index containing an
- * uppercase key and a documentID.
- * (This class implements the interface Comparable<IndexEntry>, so that
- * an ArrayList<IndexEntry> can be easily sorted using Collections.sort
- * and subsequently searched using Collections.binarySearch)
- */
- private static class IndexEntry implements Comparable<IndexEntry> {
- public String key;
- public int documentID;
- /**
- * Creates an IndexEntry object
- * @param key String containing the key
- * @param documentID the id of the document the key was found in
- */
- public IndexEntry(String key, int documentID) {
- this.key = key.toUpperCase();
- this.documentID = documentID;
- }
- /**
- * Compares the key of this IndexEntry to the key of a specified IndexEntry
- * @param e an IndexEntry
- * @return 0 in case the keys are equal,
- * -1 if the key of this IndexEntry appears before the key of specifid IndexEntry
- * 1 if the key of this IndexEntry appears after the key of specifid IndexEntry
- */
- public int compareTo(IndexEntry e) {
- return this.key.compareTo(e.key);
- }
- }
- }
Add Comment
Please, Sign In to add comment