package search.analyzers;

import datastructures.concrete.ChainedHashSet;
import datastructures.concrete.KVPair;
import datastructures.concrete.dictionaries.ChainedHashDictionary;
import datastructures.interfaces.IDictionary;
import datastructures.interfaces.IList;
import datastructures.interfaces.ISet;
import search.models.Webpage;

import java.net.URI;
/**
 * This class is responsible for computing how "relevant" any given document is
 * to a given search query.
 *
 * See the spec for more details.
 */
public class TfIdfAnalyzer {
    // This field must contain the IDF score for every single word in all
    // the documents.
    private IDictionary<String, Double> idfScores;

    // This field must contain the TF-IDF vector for each webpage you were given
    // in the constructor.
    //
    // We will use each webpage's page URI as a unique key.
    private IDictionary<URI, IDictionary<String, Double>> documentTfIdfVectors;

    // The Euclidean norm of each document's TF-IDF vector, keyed by page URI.
    // Precomputed here so computeRelevance doesn't recompute it on every call.
    private IDictionary<URI, Double> docNorms;

    // Feel free to add extra fields and helper methods.
    public TfIdfAnalyzer(ISet<Webpage> webpages) {
        this.idfScores = this.computeIdfScores(webpages);
        this.documentTfIdfVectors = this.computeAllDocumentTfIdfVectors(webpages);

        // Precompute each document's vector norm once, up front; after this,
        // computeRelevance only needs to compute the norm of the query vector.
        this.docNorms = new ChainedHashDictionary<>();
        for (KVPair<URI, IDictionary<String, Double>> pair : this.documentTfIdfVectors) {
            this.docNorms.put(pair.getKey(), norm(pair.getValue()));
        }
    }
    // Note: this method, strictly speaking, doesn't need to exist. However,
    // we've included it so we can add some unit tests to help verify that your
    // constructor correctly initializes your fields.
    public IDictionary<URI, IDictionary<String, Double>> getDocumentTfIdfVectors() {
        return this.documentTfIdfVectors;
    }

    // Note: these private methods are suggestions or hints on how to structure your
    // code. However, since they're private, you're not obligated to implement exactly
    // these methods: feel free to change or modify these methods however you want. The
    // important thing is that your 'computeRelevance' method ultimately returns the
    // correct answer in an efficient manner.
    /**
     * Returns a dictionary mapping every unique word found across all the
     * documents to its IDF score, where IDF(word) is the natural log of the
     * total number of documents divided by the number of documents
     * containing that word.
     */
    private IDictionary<String, Double> computeIdfScores(ISet<Webpage> pages) {
        // First pass: count how many documents each word appears in. Building
        // a set of each page's words ensures a page adds at most 1 per word.
        IDictionary<String, Double> docCounts = new ChainedHashDictionary<>();
        for (Webpage page : pages) {
            ISet<String> uniqueWords = new ChainedHashSet<>();
            for (String word : page.getWords()) {
                uniqueWords.add(word);
            }
            for (String word : uniqueWords) {
                if (!docCounts.containsKey(word)) {
                    docCounts.put(word, 1.0);
                } else {
                    docCounts.put(word, docCounts.get(word) + 1.0);
                }
            }
        }

        // Second pass: convert each document count into an IDF score.
        IDictionary<String, Double> idfVector = new ChainedHashDictionary<>();
        for (KVPair<String, Double> pair : docCounts) {
            idfVector.put(pair.getKey(), Math.log(pages.size() / pair.getValue()));
        }
        return idfVector;
    }
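
    // Illustrative example (values hypothetical): with 3 documents, a word
    // like "the" that appears in all 3 gets IDF = ln(3 / 3.0) = 0.0, while a
    // rare word appearing in only 1 gets IDF = ln(3 / 1.0) ≈ 1.099. Ubiquitous
    // words are thus weighted toward zero and rare words are weighted up.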
    /**
     * Returns a dictionary mapping every unique word found in the given list
     * to its term frequency (TF) score: the number of times the word occurs
     * divided by the total number of words in the list.
     *
     * The input list represents the words contained within a single document.
     */
    private IDictionary<String, Double> computeTfScores(IList<String> words) {
        // First pass: count the raw occurrences of each word.
        IDictionary<String, Double> wordCounts = new ChainedHashDictionary<>();
        for (String word : words) {
            if (!wordCounts.containsKey(word)) {
                wordCounts.put(word, 1.0);
            } else {
                wordCounts.put(word, wordCounts.get(word) + 1.0);
            }
        }

        // Second pass: divide each count by the document length.
        IDictionary<String, Double> tfVector = new ChainedHashDictionary<>();
        for (KVPair<String, Double> pair : wordCounts) {
            tfVector.put(pair.getKey(), pair.getValue() / words.size());
        }
        return tfVector;
    }
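
    // Illustrative example (values hypothetical): for the word list
    // ["dog", "dog", "cat"], TF("dog") = 2 / 3.0 ≈ 0.667 and
    // TF("cat") = 1 / 3.0 ≈ 0.333. The TF scores of a non-empty document
    // always sum to 1.0.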
    /**
     * See spec for more details on what this method should do.
     */
    private IDictionary<URI, IDictionary<String, Double>> computeAllDocumentTfIdfVectors(ISet<Webpage> pages) {
        // Hint: this method should use the idfScores field and
        // call the computeTfScores(...) method.
        IDictionary<URI, IDictionary<String, Double>> pagesTfIdfVectors = new ChainedHashDictionary<>();
        for (Webpage page : pages) {
            pagesTfIdfVectors.put(page.getUri(), computeOneDocumentTfIdfVector(page.getWords()));
        }
        return pagesTfIdfVectors;
    }
    // Builds the TF-IDF vector for a single word list: each word's TF score
    // multiplied by its corpus-wide IDF score. Words missing from idfScores
    // (possible when the list is a user query) default to an IDF of 0.0.
    private IDictionary<String, Double> computeOneDocumentTfIdfVector(IList<String> words) {
        IDictionary<String, Double> tfIdfScores = new ChainedHashDictionary<>();
        IDictionary<String, Double> tfScores = computeTfScores(words);
        for (KVPair<String, Double> pair : tfScores) {
            tfIdfScores.put(pair.getKey(), pair.getValue() * idfScores.getOrDefault(pair.getKey(), 0.0));
        }
        return tfIdfScores;
    }
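
    // Illustrative example (values hypothetical): continuing the examples
    // above, if TF("cat") ≈ 0.333 in this document and IDF("cat") ≈ 1.099
    // across the corpus, then TF-IDF("cat") ≈ 0.333 * 1.099 ≈ 0.366.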
    // Returns the Euclidean (L2) norm of the given vector: the square root
    // of the sum of the squares of its components.
    private double norm(IDictionary<String, Double> vector) {
        double output = 0.0;
        for (KVPair<String, Double> pair : vector) {
            double score = pair.getValue();
            output += score * score;
        }
        return Math.sqrt(output);
    }
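
    // Illustrative example (values hypothetical): the norm of a vector with
    // components {3.0, 4.0} is sqrt(3.0 * 3.0 + 4.0 * 4.0) = sqrt(25.0) = 5.0.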
    /**
     * Returns the cosine similarity between the TF-IDF vector for the given
     * query and the URI's document: the dot product of the two vectors
     * divided by the product of their norms.
     *
     * Precondition: the given uri must have been one of the uris within the list of
     * webpages given to the constructor.
     */
    public Double computeRelevance(IList<String> query, URI pageUri) {
        // Efficiency note: the document vectors and their norms were both
        // precomputed in the constructor, so each call to this method only
        // has to build and normalize the (typically short) query vector.
        IDictionary<String, Double> documentVector = documentTfIdfVectors.get(pageUri);
        IDictionary<String, Double> queryVector = computeOneDocumentTfIdfVector(query);

        // Dot product: iterate over the query vector and look up each word's
        // matching component in the document vector, defaulting to 0.0.
        double numerator = 0.0;
        for (KVPair<String, Double> pair : queryVector) {
            double docWordScore = documentVector.getOrDefault(pair.getKey(), 0.0);
            double queryWordScore = pair.getValue();
            numerator += docWordScore * queryWordScore;
        }

        double denominator = docNorms.get(pageUri) * norm(queryVector);

        // Guard against dividing by zero: if either vector is all zeros, the
        // query and document share no scored words, so relevance is 0.0.
        if (denominator != 0) {
            return numerator / denominator;
        } else {
            return 0.0;
        }
    }
}
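
/*
 * A minimal usage sketch (hypothetical: exactly how the Webpage set and the
 * query list are built depends on the rest of the search engine):
 *
 *     ISet<Webpage> pages = ...;          // pages loaded by the engine
 *     TfIdfAnalyzer analyzer = new TfIdfAnalyzer(pages);
 *     IList<String> query = ...;          // e.g. tokenized user input
 *     Double score = analyzer.computeRelevance(query, somePage.getUri());
 *
 * Scores closer to 1.0 mean the query's TF-IDF vector points in nearly the
 * same direction as the document's, i.e. the page is more relevant.
 */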