Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package lse;
- import java.io.*;
- import java.util.*;
- /**
- * This class builds an index of keywords. Each keyword maps to a set of pages in
- * which it occurs, with frequency of occurrence in each page.
- *
- */
- public class LittleSearchEngine {
- /**
- * This is a hash table of all keywords. The key is the actual keyword, and the associated value is
- * an array list of all occurrences of the keyword in documents. The array list is maintained in
- * DESCENDING order of frequencies.
- */
- HashMap<String,ArrayList<Occurrence>> keywordsIndex;
- /**
- * The hash set of all noise words.
- */
- HashSet<String> noiseWords;
- /**
- * Creates the keyWordsIndex and noiseWords hash tables.
- */
- public LittleSearchEngine() {
- keywordsIndex = new HashMap<String,ArrayList<Occurrence>>(1000,2.0f);
- noiseWords = new HashSet<String>(100,2.0f);
- }
- /**
- * Scans a document, and loads all keywords found into a hash table of keyword occurrences
- * in the document. Uses the getKeyWord method to separate keywords from other words.
- *
- * @param docFile Name of the document file to be scanned and loaded
- * @return Hash table of keywords in the given document, each associated with an Occurrence object
- * @throws FileNotFoundException If the document file is not found on disk
- */
- public HashMap<String,Occurrence> loadKeywordsFromDocument(String docFile)
- throws FileNotFoundException {
- /** COMPLETE THIS METHOD **/
- HashMap<String, Occurrence> words = new HashMap<String, Occurrence>();
- try {
- File f = new File(docFile);
- Scanner sc = new Scanner(f);
- String word;
- while(sc.hasNext()) {
- word = getKeyword(sc.next());
- if(word == null)continue;
- if(!words.containsKey(word))words.put(word, new Occurrence(docFile, 1));
- else words.get(word).frequency++;
- }
- sc.close();
- }
- catch (FileNotFoundException ex){
- System.out.println("File could not be found");
- return null;
- }
- return words;
- }
- /**
- * Merges the keywords for a single document into the master keywordsIndex
- * hash table. For each keyword, its Occurrence in the current document
- * must be inserted in the correct place (according to descending order of
- * frequency) in the same keyword's Occurrence list in the master hash table.
- * This is done by calling the insertLastOccurrence method.
- *
- * @param kws Keywords hash table for a document
- */
- public void mergeKeywords(HashMap<String,Occurrence> kws) {
- for(String key: kws.keySet()) {
- ArrayList<Occurrence>list;
- if(!keywordsIndex.containsKey(key)) {
- list = new ArrayList<Occurrence>();
- keywordsIndex.put(key, list);
- }
- else list = keywordsIndex.get(key);
- list.add(kws.get(key));
- insertLastOccurrence(list);
- }
- }
- /**
- * Given a word, returns it as a keyword if it passes the keyword test,
- * otherwise returns null. A keyword is any word that, after being stripped of any
- * trailing punctuation, consists only of alphabetic letters, and is not
- * a noise word. All words are treated in a case-INsensitive manner.
- *
- * Punctuation characters are the following: '.', ',', '?', ':', ';' and '!'
- *
- * @param word Candidate word
- * @return Keyword (word without trailing punctuation, LOWER CASE)
- */
- public String getKeyword(String word) {
- /** COMPLETE THIS METHOD **/
- if(word.length() == 0) return null;
- word = word.toLowerCase();
- boolean hasFoundPunctuation = false;
- int endIndex = word.length();
- char c;
- for(int i = 0; i < word.length(); i++) {
- c = word.charAt(i);
- if(c >= 'a' && c <= 'z') {
- if(hasFoundPunctuation) return null;
- }
- else if(c == '.' || c == ',' || c == '?' || c == ':' || c == ';' || c == '!') {
- if(!hasFoundPunctuation)endIndex = i;
- hasFoundPunctuation = true;
- }
- else return null;
- }
- word = word.substring(0,endIndex);
- if(noiseWords.contains(word))return null;
- return word;
- }
- /**
- * Inserts the last occurrence in the parameter list in the correct position in the
- * list, based on ordering occurrences on descending frequencies. The elements
- * 0..n-2 in the list are already in the correct order. Insertion is done by
- * first finding the correct spot using binary search, then inserting at that spot.
- *
- * @param occs List of Occurrences
- * @return Sequence of mid point indexes in the input list checked by the binary search process,
- * null if the size of the input list is 1. This returned array list is only used to test
- * your code - it is not used elsewhere in the program.
- */
- public ArrayList<Integer> insertLastOccurrence(ArrayList<Occurrence> occs) {
- /** COMPLETE THIS METHOD **/
- if(occs.size() == 1) return null;
- Occurrence oc = occs.remove(occs.size() - 1);
- ArrayList<Integer> list = binarySearch(occs, oc.frequency, 0, occs.size()-1, new ArrayList<Integer>());
- int index = list.get(list.size() - 1);
- occs.add(index, oc);
- return list;
- }
- private ArrayList<Integer> binarySearch(ArrayList<Occurrence>occs, int num, int min, int max, ArrayList<Integer>ret) {
- int mid = (min + max) / 2;
- if(max <= min) {
- if(num > occs.get(mid).frequency) ret.add(mid);
- else ret.add(mid+1);
- return ret;
- }
- ret.add(mid);
- if(num < occs.get(mid).frequency) return binarySearch(occs, num, mid + 1, max, ret);
- else if(num > occs.get(mid).frequency) return binarySearch(occs, num, min, mid - 1, ret);
- return ret;
- }
- /**
- * This method indexes all keywords found in all the input documents. When this
- * method is done, the keywordsIndex hash table will be filled with all keywords,
- * each of which is associated with an array list of Occurrence objects, arranged
- * in decreasing frequencies of occurrence.
- *
- * @param docsFile Name of file that has a list of all the document file names, one name per line
- * @param noiseWordsFile Name of file that has a list of noise words, one noise word per line
- * @throws FileNotFoundException If there is a problem locating any of the input files on disk
- */
- public void makeIndex(String docsFile, String noiseWordsFile)
- throws FileNotFoundException {
- // load noise words to hash table
- Scanner sc = new Scanner(new File(noiseWordsFile));
- while (sc.hasNext()) {
- String word = sc.next();
- noiseWords.add(word);
- }
- // index all keywords
- sc = new Scanner(new File(docsFile));
- while (sc.hasNext()) {
- String docFile = sc.next();
- HashMap<String,Occurrence> kws = loadKeywordsFromDocument(docFile);
- mergeKeywords(kws);
- }
- sc.close();
- }
- /**
- * Search result for "kw1 or kw2". A document is in the result set if kw1 or kw2 occurs in that
- * document. Result set is arranged in descending order of document frequencies. (Note that a
- * matching document will only appear once in the result.) Ties in frequency values are broken
- * in favor of the first keyword. (That is, if kw1 is in doc1 with frequency f1, and kw2 is in doc2
- * also with the same frequency f1, then doc1 will take precedence over doc2 in the result.
- * The result set is limited to 5 entries. If there are no matches at all, result is null.
- *
- * @param kw1 First keyword
- * @param kw1 Second keyword
- * @return List of documents in which either kw1 or kw2 occurs, arranged in descending order of
- * frequencies. The result size is limited to 5 documents. If there are no matches, returns null.
- */
- public ArrayList<String> top5search(String kw1, String kw2) {
- /** COMPLETE THIS METHOD **/
- ArrayList<String>ret = new ArrayList<String>();
- ArrayList<Occurrence>oList1 = keywordsIndex.get(kw1), oList2 = keywordsIndex.get(kw2);
- if(oList1 == null && oList2 == null) return ret;
- else if(oList1 == null) oList1 = oList2;
- else if(oList2 != null && oList1 != oList2){
- for(Occurrence o: oList2) {
- insert(oList1, o);
- }
- }
- for(Occurrence t: oList1) {
- if(!ret.contains(t.document))ret.add(t.document);
- if(ret.size() == 5)break;
- }
- return ret;
- }
- private void insert(ArrayList<Occurrence>oList1, Occurrence o) {
- for(int i = 0; i < oList1.size(); i++) {
- if(o.frequency > oList1.get(i).frequency) {
- oList1.add(i, o);
- return;
- }
- }
- oList1.add(o);
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment