Guest User

LSE.java

a guest
Sep 25th, 2019
202
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.22 KB | None | 0 0
  1. package lse;
  2.  
  3. import java.io.*;
  4. import java.util.*;
  5.  
  6. /**
  7. * This class builds an index of keywords. Each keyword maps to a set of pages in
  8. * which it occurs, with frequency of occurrence in each page.
  9. *
  10. */
  11. public class LittleSearchEngine {
  12.  
  13. /**
  14. * This is a hash table of all keywords. The key is the actual keyword, and the associated value is
  15. * an array list of all occurrences of the keyword in documents. The array list is maintained in
  16. * DESCENDING order of frequencies.
  17. */
  18. HashMap<String,ArrayList<Occurrence>> keywordsIndex;
  19.  
  20. /**
  21. * The hash set of all noise words.
  22. */
  23. HashSet<String> noiseWords;
  24.  
  25. /**
  26. * Creates the keyWordsIndex and noiseWords hash tables.
  27. */
  28. public LittleSearchEngine() {
  29. keywordsIndex = new HashMap<String,ArrayList<Occurrence>>(1000,2.0f);
  30. noiseWords = new HashSet<String>(100,2.0f);
  31. }
  32.  
  33. /**
  34. * Scans a document, and loads all keywords found into a hash table of keyword occurrences
  35. * in the document. Uses the getKeyWord method to separate keywords from other words.
  36. *
  37. * @param docFile Name of the document file to be scanned and loaded
  38. * @return Hash table of keywords in the given document, each associated with an Occurrence object
  39. * @throws FileNotFoundException If the document file is not found on disk
  40. */
  41. public HashMap<String,Occurrence> loadKeywordsFromDocument(String docFile)
  42. throws FileNotFoundException {
  43. /** COMPLETE THIS METHOD **/
  44. HashMap<String, Occurrence> words = new HashMap<String, Occurrence>();
  45. try {
  46. File f = new File(docFile);
  47. Scanner sc = new Scanner(f);
  48. String word;
  49. while(sc.hasNext()) {
  50. word = getKeyword(sc.next());
  51. if(word == null)continue;
  52. if(!words.containsKey(word))words.put(word, new Occurrence(docFile, 1));
  53. else words.get(word).frequency++;
  54. }
  55. sc.close();
  56. }
  57. catch (FileNotFoundException ex){
  58. System.out.println("File could not be found");
  59. return null;
  60. }
  61. return words;
  62. }
  63.  
  64. /**
  65. * Merges the keywords for a single document into the master keywordsIndex
  66. * hash table. For each keyword, its Occurrence in the current document
  67. * must be inserted in the correct place (according to descending order of
  68. * frequency) in the same keyword's Occurrence list in the master hash table.
  69. * This is done by calling the insertLastOccurrence method.
  70. *
  71. * @param kws Keywords hash table for a document
  72. */
  73. public void mergeKeywords(HashMap<String,Occurrence> kws) {
  74. for(String key: kws.keySet()) {
  75. ArrayList<Occurrence>list;
  76. if(!keywordsIndex.containsKey(key)) {
  77. list = new ArrayList<Occurrence>();
  78. keywordsIndex.put(key, list);
  79. }
  80. else list = keywordsIndex.get(key);
  81. list.add(kws.get(key));
  82. insertLastOccurrence(list);
  83. }
  84. }
  85.  
  86. /**
  87. * Given a word, returns it as a keyword if it passes the keyword test,
  88. * otherwise returns null. A keyword is any word that, after being stripped of any
  89. * trailing punctuation, consists only of alphabetic letters, and is not
  90. * a noise word. All words are treated in a case-INsensitive manner.
  91. *
  92. * Punctuation characters are the following: '.', ',', '?', ':', ';' and '!'
  93. *
  94. * @param word Candidate word
  95. * @return Keyword (word without trailing punctuation, LOWER CASE)
  96. */
  97. public String getKeyword(String word) {
  98. /** COMPLETE THIS METHOD **/
  99. if(word.length() == 0) return null;
  100. word = word.toLowerCase();
  101. boolean hasFoundPunctuation = false;
  102. int endIndex = word.length();
  103. char c;
  104. for(int i = 0; i < word.length(); i++) {
  105. c = word.charAt(i);
  106. if(c >= 'a' && c <= 'z') {
  107. if(hasFoundPunctuation) return null;
  108. }
  109. else if(c == '.' || c == ',' || c == '?' || c == ':' || c == ';' || c == '!') {
  110. if(!hasFoundPunctuation)endIndex = i;
  111. hasFoundPunctuation = true;
  112. }
  113. else return null;
  114. }
  115. word = word.substring(0,endIndex);
  116. if(noiseWords.contains(word))return null;
  117. return word;
  118. }
  119.  
  120. /**
  121. * Inserts the last occurrence in the parameter list in the correct position in the
  122. * list, based on ordering occurrences on descending frequencies. The elements
  123. * 0..n-2 in the list are already in the correct order. Insertion is done by
  124. * first finding the correct spot using binary search, then inserting at that spot.
  125. *
  126. * @param occs List of Occurrences
  127. * @return Sequence of mid point indexes in the input list checked by the binary search process,
  128. * null if the size of the input list is 1. This returned array list is only used to test
  129. * your code - it is not used elsewhere in the program.
  130. */
  131. public ArrayList<Integer> insertLastOccurrence(ArrayList<Occurrence> occs) {
  132. /** COMPLETE THIS METHOD **/
  133. if(occs.size() == 1) return null;
  134. Occurrence oc = occs.remove(occs.size() - 1);
  135. ArrayList<Integer> list = binarySearch(occs, oc.frequency, 0, occs.size()-1, new ArrayList<Integer>());
  136. int index = list.get(list.size() - 1);
  137. occs.add(index, oc);
  138. return list;
  139. }
  140. private ArrayList<Integer> binarySearch(ArrayList<Occurrence>occs, int num, int min, int max, ArrayList<Integer>ret) {
  141. int mid = (min + max) / 2;
  142. if(max <= min) {
  143. if(num > occs.get(mid).frequency) ret.add(mid);
  144. else ret.add(mid+1);
  145. return ret;
  146. }
  147. ret.add(mid);
  148. if(num < occs.get(mid).frequency) return binarySearch(occs, num, mid + 1, max, ret);
  149. else if(num > occs.get(mid).frequency) return binarySearch(occs, num, min, mid - 1, ret);
  150. return ret;
  151. }
  152.  
  153. /**
  154. * This method indexes all keywords found in all the input documents. When this
  155. * method is done, the keywordsIndex hash table will be filled with all keywords,
  156. * each of which is associated with an array list of Occurrence objects, arranged
  157. * in decreasing frequencies of occurrence.
  158. *
  159. * @param docsFile Name of file that has a list of all the document file names, one name per line
  160. * @param noiseWordsFile Name of file that has a list of noise words, one noise word per line
  161. * @throws FileNotFoundException If there is a problem locating any of the input files on disk
  162. */
  163. public void makeIndex(String docsFile, String noiseWordsFile)
  164. throws FileNotFoundException {
  165. // load noise words to hash table
  166. Scanner sc = new Scanner(new File(noiseWordsFile));
  167. while (sc.hasNext()) {
  168. String word = sc.next();
  169. noiseWords.add(word);
  170. }
  171.  
  172. // index all keywords
  173. sc = new Scanner(new File(docsFile));
  174. while (sc.hasNext()) {
  175. String docFile = sc.next();
  176. HashMap<String,Occurrence> kws = loadKeywordsFromDocument(docFile);
  177. mergeKeywords(kws);
  178. }
  179. sc.close();
  180. }
  181.  
  182. /**
  183. * Search result for "kw1 or kw2". A document is in the result set if kw1 or kw2 occurs in that
  184. * document. Result set is arranged in descending order of document frequencies. (Note that a
  185. * matching document will only appear once in the result.) Ties in frequency values are broken
  186. * in favor of the first keyword. (That is, if kw1 is in doc1 with frequency f1, and kw2 is in doc2
  187. * also with the same frequency f1, then doc1 will take precedence over doc2 in the result.
  188. * The result set is limited to 5 entries. If there are no matches at all, result is null.
  189. *
  190. * @param kw1 First keyword
  191. * @param kw1 Second keyword
  192. * @return List of documents in which either kw1 or kw2 occurs, arranged in descending order of
  193. * frequencies. The result size is limited to 5 documents. If there are no matches, returns null.
  194. */
  195. public ArrayList<String> top5search(String kw1, String kw2) {
  196. /** COMPLETE THIS METHOD **/
  197. ArrayList<String>ret = new ArrayList<String>();
  198. ArrayList<Occurrence>oList1 = keywordsIndex.get(kw1), oList2 = keywordsIndex.get(kw2);
  199. if(oList1 == null && oList2 == null) return ret;
  200. else if(oList1 == null) oList1 = oList2;
  201. else if(oList2 != null && oList1 != oList2){
  202. for(Occurrence o: oList2) {
  203. insert(oList1, o);
  204. }
  205. }
  206. for(Occurrence t: oList1) {
  207. if(!ret.contains(t.document))ret.add(t.document);
  208. if(ret.size() == 5)break;
  209. }
  210. return ret;
  211. }
  212. private void insert(ArrayList<Occurrence>oList1, Occurrence o) {
  213. for(int i = 0; i < oList1.size(); i++) {
  214. if(o.frequency > oList1.get(i).frequency) {
  215. oList1.add(i, o);
  216. return;
  217. }
  218. }
  219. oList1.add(o);
  220. }
  221. }
Advertisement
Add Comment
Please, Sign In to add comment