LSE.java

package lse;

import java.io.*;
import java.util.*;

/**
 * This class builds an index of keywords. Each keyword maps to a set of pages in
 * which it occurs, with frequency of occurrence in each page.
 *
 */
public class LittleSearchEngine {

    /**
     * This is a hash table of all keywords. The key is the actual keyword, and the associated value is
     * an array list of all occurrences of the keyword in documents. The array list is maintained in
     * DESCENDING order of frequencies.
     */
    HashMap<String,ArrayList<Occurrence>> keywordsIndex;

    /**
     * The hash set of all noise words.
     */
    HashSet<String> noiseWords;

    /**
     * Creates the keyWordsIndex and noiseWords hash tables.
     */
    public LittleSearchEngine() {
        keywordsIndex = new HashMap<String,ArrayList<Occurrence>>(1000,2.0f);
        noiseWords = new HashSet<String>(100,2.0f);
    }

    /**
     * Scans a document, and loads all keywords found into a hash table of keyword occurrences
     * in the document. Uses the getKeyWord method to separate keywords from other words.
     *
     * @param docFile Name of the document file to be scanned and loaded
     * @return Hash table of keywords in the given document, each associated with an Occurrence object
     * @throws FileNotFoundException If the document file is not found on disk
     */
    public HashMap<String,Occurrence> loadKeywordsFromDocument(String docFile)
    throws FileNotFoundException {
        /** COMPLETE THIS METHOD **/
        HashMap<String, Occurrence> words = new HashMap<String, Occurrence>();
        try {
            File f = new File(docFile);
            Scanner sc = new Scanner(f);
            String word;
            while(sc.hasNext()) {
                word = getKeyword(sc.next());
                if(word == null)continue;
                if(!words.containsKey(word))words.put(word, new Occurrence(docFile, 1));
                else words.get(word).frequency++;
            }
            sc.close();
        }
        catch (FileNotFoundException ex){
            System.out.println("File could not be found");
            return null;
        }
        return words;
    }

    /**
     * Merges the keywords for a single document into the master keywordsIndex
     * hash table. For each keyword, its Occurrence in the current document
     * must be inserted in the correct place (according to descending order of
     * frequency) in the same keyword's Occurrence list in the master hash table.
     * This is done by calling the insertLastOccurrence method.
     *
     * @param kws Keywords hash table for a document
     */
    public void mergeKeywords(HashMap<String,Occurrence> kws) {
        for(String key: kws.keySet()) {
            ArrayList<Occurrence>list;
            if(!keywordsIndex.containsKey(key)) {
                list = new ArrayList<Occurrence>();
                keywordsIndex.put(key, list);
            }
            else list = keywordsIndex.get(key);
            list.add(kws.get(key));
            insertLastOccurrence(list);
        }
    }

    /**
     * Given a word, returns it as a keyword if it passes the keyword test,
     * otherwise returns null. A keyword is any word that, after being stripped of any
     * trailing punctuation, consists only of alphabetic letters, and is not
     * a noise word. All words are treated in a case-INsensitive manner.
     *
     * Punctuation characters are the following: '.', ',', '?', ':', ';' and '!'
     *
     * @param word Candidate word
     * @return Keyword (word without trailing punctuation, LOWER CASE)
     */
    public String getKeyword(String word) {
        /** COMPLETE THIS METHOD **/
        if(word.length() == 0) return null;
        word = word.toLowerCase();
        boolean hasFoundPunctuation = false;
        int endIndex = word.length();
        char c;
        for(int i = 0; i < word.length(); i++) {
            c = word.charAt(i);
            if(c >= 'a' && c <= 'z') {
                if(hasFoundPunctuation) return null;
            }
            else if(c == '.' || c == ',' || c == '?' || c == ':' || c == ';' || c == '!') {
                if(!hasFoundPunctuation)endIndex = i;
                hasFoundPunctuation = true;
            }
            else return null;
        }
        word = word.substring(0,endIndex);
        if(noiseWords.contains(word))return null;
        return word;
    }

    /**
     * Inserts the last occurrence in the parameter list in the correct position in the
     * list, based on ordering occurrences on descending frequencies. The elements
     * 0..n-2 in the list are already in the correct order. Insertion is done by
     * first finding the correct spot using binary search, then inserting at that spot.
     *
     * @param occs List of Occurrences
     * @return Sequence of mid point indexes in the input list checked by the binary search process,
     *         null if the size of the input list is 1. This returned array list is only used to test
     *         your code - it is not used elsewhere in the program.
     */
    public ArrayList<Integer> insertLastOccurrence(ArrayList<Occurrence> occs) {
        /** COMPLETE THIS METHOD **/
        if(occs.size() == 1) return null;
        Occurrence oc = occs.remove(occs.size() - 1);
        ArrayList<Integer> list = binarySearch(occs, oc.frequency, 0, occs.size()-1, new ArrayList<Integer>());
        int index = list.get(list.size() - 1);
        occs.add(index, oc);
        return list;
    }
    private ArrayList<Integer> binarySearch(ArrayList<Occurrence>occs, int num, int min, int max, ArrayList<Integer>ret) {
        int mid = (min + max) / 2;
        if(max <= min) {
            if(num > occs.get(mid).frequency) ret.add(mid);
            else ret.add(mid+1);
            return ret;
        }
        ret.add(mid);
        if(num < occs.get(mid).frequency) return binarySearch(occs, num, mid + 1, max, ret);
        else if(num > occs.get(mid).frequency) return binarySearch(occs, num, min, mid - 1, ret);
        return ret;
    }

    /**
     * This method indexes all keywords found in all the input documents. When this
     * method is done, the keywordsIndex hash table will be filled with all keywords,
     * each of which is associated with an array list of Occurrence objects, arranged
     * in decreasing frequencies of occurrence.
     *
     * @param docsFile Name of file that has a list of all the document file names, one name per line
     * @param noiseWordsFile Name of file that has a list of noise words, one noise word per line
     * @throws FileNotFoundException If there is a problem locating any of the input files on disk
     */
    public void makeIndex(String docsFile, String noiseWordsFile)
    throws FileNotFoundException {
        // load noise words to hash table
        Scanner sc = new Scanner(new File(noiseWordsFile));
        while (sc.hasNext()) {
            String word = sc.next();
            noiseWords.add(word);
        }

        // index all keywords
        sc = new Scanner(new File(docsFile));
        while (sc.hasNext()) {
            String docFile = sc.next();
            HashMap<String,Occurrence> kws = loadKeywordsFromDocument(docFile);
            mergeKeywords(kws);
        }
        sc.close();
    }

    /**
     * Search result for "kw1 or kw2". A document is in the result set if kw1 or kw2 occurs in that
     * document. Result set is arranged in descending order of document frequencies. (Note that a
     * matching document will only appear once in the result.) Ties in frequency values are broken
     * in favor of the first keyword. (That is, if kw1 is in doc1 with frequency f1, and kw2 is in doc2
     * also with the same frequency f1, then doc1 will take precedence over doc2 in the result.
     * The result set is limited to 5 entries. If there are no matches at all, result is null.
     *
     * @param kw1 First keyword
     * @param kw1 Second keyword
     * @return List of documents in which either kw1 or kw2 occurs, arranged in descending order of
     *         frequencies. The result size is limited to 5 documents. If there are no matches, returns null.
     */
    public ArrayList<String> top5search(String kw1, String kw2) {
        /** COMPLETE THIS METHOD **/
        ArrayList<String>ret = new ArrayList<String>();
        ArrayList<Occurrence>oList1 = keywordsIndex.get(kw1), oList2 = keywordsIndex.get(kw2);
        if(oList1 == null && oList2 == null) return ret;
        else if(oList1 == null) oList1 = oList2;
        else if(oList2 != null && oList1 != oList2){
            for(Occurrence o: oList2) {
                insert(oList1, o);
            }
        }
        for(Occurrence t: oList1) {
            if(!ret.contains(t.document))ret.add(t.document);
            if(ret.size() == 5)break;
        }
        return ret;
    }
    private void insert(ArrayList<Occurrence>oList1, Occurrence o) {
        for(int i = 0; i < oList1.size(); i++) {
            if(o.frequency > oList1.get(i).frequency) {
                oList1.add(i, o);
                return;
            }
        }
        oList1.add(o);
    }
}