package search.analyzers;

import datastructures.interfaces.IDictionary;
import datastructures.interfaces.IList;
import datastructures.interfaces.ISet;
import datastructures.concrete.dictionaries.ChainedHashDictionary;
import datastructures.concrete.KVPair;
import datastructures.concrete.ChainedHashSet;
import search.models.Webpage;

import java.net.URI;

/**
 * This class is responsible for computing how "relevant" any given document is
 * to a given search query.
 *
 * See the spec for more details.
 */
public class TfIdfAnalyzer {
    // This field must contain the IDF score for every single word in all
    // the documents.
    private IDictionary<String, Double> idfScores;

    // This field must contain the TF-IDF vector for each webpage you were given
    // in the constructor.
    //
    // We will use each webpage's page URI as a unique key.
    private IDictionary<URI, IDictionary<String, Double>> documentTfIdfVectors;
    private IDictionary<URI, Double> docNorms;

    // Feel free to add extra fields and helper methods.

    public TfIdfAnalyzer(ISet<Webpage> webpages) {
        this.idfScores = this.computeIdfScores(webpages);
        this.documentTfIdfVectors = this.computeAllDocumentTfIdfVectors(webpages);
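        // Precompute each document's vector norm once here so that
        // computeRelevance(...) does not recompute it for every query.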
        this.docNorms = new ChainedHashDictionary<URI, Double>();
        for (KVPair<URI, IDictionary<String, Double>> pair : this.documentTfIdfVectors) {
            docNorms.put(pair.getKey(), norm(pair.getValue()));
        }
    }

    // Note: this method, strictly speaking, doesn't need to exist. However,
    // we've included it so we can add some unit tests to help verify that your
    // constructor correctly initializes your fields.
    public IDictionary<URI, IDictionary<String, Double>> getDocumentTfIdfVectors() {
        return this.documentTfIdfVectors;
    }

    // Note: these private methods are suggestions or hints on how to structure your
    // code. However, since they're private, you're not obligated to implement exactly
    // these methods: feel free to change or modify these methods however you want. The
    // important thing is that your 'computeRelevance' method ultimately returns the
    // correct answer in an efficient manner.

    /**
     * Returns a dictionary mapping every unique word found across all the
     * documents to its IDF score.
     */
    private IDictionary<String, Double> computeIdfScores(ISet<Webpage> pages) {
        IDictionary<String, Double> idfs = new ChainedHashDictionary<>();
        for (Webpage page : pages) {
            IList<String> words = page.getWords();
            ISet<String> uniqueWords = new ChainedHashSet<>();
            for (String word : words) {
                uniqueWords.add(word);
            }
            for (String word : uniqueWords) {
                if (!idfs.containsKey(word)) {
                    idfs.put(word, 1.0);
                } else {
                    idfs.put(word, idfs.get(word) + 1.0);
                }
            }
        }
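        // At this point 'idfs' holds document frequencies df(w): the number of
        // documents that contain each word. Convert them into IDF scores via
        // idf(w) = ln(N / df(w)), where N = pages.size(). For example, a word
        // appearing in 3 of 30 documents gets idf = ln(10) ≈ 2.30, while a
        // word appearing in every document gets ln(1) = 0.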
        IDictionary<String, Double> cleanIdfVector = new ChainedHashDictionary<>();
        for (KVPair<String, Double> pair : idfs) {
            cleanIdfVector.put(pair.getKey(), Math.log(pages.size() / pair.getValue()));
        }
        return cleanIdfVector;
    }

    /**
     * Returns a dictionary mapping every unique word found in the given list
     * to its term frequency (TF) score.
     *
     * The input list represents the words contained within a single document.
     */
    private IDictionary<String, Double> computeTfScores(IList<String> words) {
        IDictionary<String, Double> tfVector = new ChainedHashDictionary<>();
        for (String word : words) {
            if (!tfVector.containsKey(word)) {
                tfVector.put(word, 1.0);
            } else {
                tfVector.put(word, tfVector.get(word) + 1.0);
            }
        }
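        // Normalize the raw counts into term frequencies:
        // tf(w) = count(w) / (total number of words in the document).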
        IDictionary<String, Double> cleanTfVector = new ChainedHashDictionary<>();
        for (KVPair<String, Double> pair : tfVector) {
            cleanTfVector.put(pair.getKey(), pair.getValue() / words.size());
        }
        return cleanTfVector;
    }

    /**
     * See spec for more details on what this method should do.
     */
    private IDictionary<URI, IDictionary<String, Double>> computeAllDocumentTfIdfVectors(ISet<Webpage> pages) {
        // Hint: this method should use the idfScores field and
        // call the computeTfScores(...) method.
        IDictionary<URI, IDictionary<String, Double>> pagesTfScores = new ChainedHashDictionary<>();
        for (Webpage page : pages) {
            pagesTfScores.put(page.getUri(), computeOneDocumentTfIdfVector(page.getWords()));
        }
        return pagesTfScores;
    }

    private IDictionary<String, Double> computeOneDocumentTfIdfVector(IList<String> words) {
        IDictionary<String, Double> tfIdfScores = new ChainedHashDictionary<>();
        IDictionary<String, Double> tfScores = computeTfScores(words);
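        // Each component of the vector is tf(w) * idf(w). Words with no IDF
        // entry (possible when this method is reused to build the query vector
        // in computeRelevance) fall back to an IDF of 0.0.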
        for (KVPair<String, Double> pair : tfScores) {
            tfIdfScores.put(pair.getKey(), pair.getValue() * idfScores.getOrDefault(pair.getKey(), 0.0));
        }
        return tfIdfScores;
    }

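    // Returns the Euclidean (L2) norm of the given vector: the square root of
    // the sum of the squares of its components.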
    private double norm(IDictionary<String, Double> vector) {
        double output = 0.0;
        for (KVPair<String, Double> pair : vector) {
            double score = pair.getValue();
            output += score * score;
        }
        return Math.sqrt(output);
    }

    /**
     * Returns the cosine similarity between the TF-IDF vector for the given query and the
     * URI's document.
     *
     * Precondition: the given uri must have been one of the uris within the list of
     * webpages given to the constructor.
     */
    public Double computeRelevance(IList<String> query, URI pageUri) {
        // Note: The pseudocode we gave you is not very efficient. When implementing
        // this method, you should:
        //
        // 1. Figure out what information can be precomputed in your constructor.
        //    Add a third field containing that information.
        //
        // 2. See if you can combine or merge one or more loops.

        IDictionary<String, Double> documentVector = documentTfIdfVectors.get(pageUri);
        IDictionary<String, Double> queryVector = computeOneDocumentTfIdfVector(query);

        double numerator = 0.0;
        for (KVPair<String, Double> pair : queryVector) {
            double docWordScore = documentVector.getOrDefault(pair.getKey(), 0.0);
            double queryWordScore = pair.getValue();
            numerator += docWordScore * queryWordScore;
        }
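        // Cosine similarity = dot(query, document) / (||document|| * ||query||).
        // The document norm was precomputed in the constructor; a zero
        // denominator (e.g. an empty query) yields a relevance of 0.0.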
        double denominator = docNorms.get(pageUri) * norm(queryVector);
        if (denominator != 0) {
            return numerator / denominator;
        } else {
            return 0.0;
        }
    }
}
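
// A minimal usage sketch (hypothetical; 'pages', 'query', and 'somePage' are
// assumed to be built elsewhere in the search engine):
//
//     ISet<Webpage> pages = ...;                        // loaded webpages
//     TfIdfAnalyzer analyzer = new TfIdfAnalyzer(pages);
//     IList<String> query = ...;                        // tokenized search terms
//     Double relevance = analyzer.computeRelevance(query, somePage.getUri());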