package search.analyzers;

import datastructures.concrete.ChainedHashSet;
import datastructures.concrete.KVPair;
import datastructures.concrete.dictionaries.ChainedHashDictionary;
import datastructures.interfaces.IDictionary;
import datastructures.interfaces.IList;
import datastructures.interfaces.ISet;
import search.models.Webpage;

import java.net.URI;
/**
 * This class is responsible for computing how "relevant" any given document is
 * to a given search query.
 *
 * See the spec for more details.
 */
public class TfIdfAnalyzer {
    // This field must contain the IDF score for every single word in all
    // the documents.
    private IDictionary<String, Double> idfScores;

    // This field must contain the TF-IDF vector for each webpage you were given
    // in the constructor.
    //
    // We will use each webpage's page URI as a unique key.
    private IDictionary<URI, IDictionary<String, Double>> documentTfIdfVectors;

    // The Euclidean norm of each document's TF-IDF vector, keyed by page URI.
    // Precomputed here so computeRelevance doesn't recompute it on every call.
    private IDictionary<URI, Double> docNorms;

    // Feel free to add extra fields and helper methods.
    public TfIdfAnalyzer(ISet<Webpage> webpages) {
        this.idfScores = this.computeIdfScores(webpages);
        this.documentTfIdfVectors = this.computeAllDocumentTfIdfVectors(webpages);

        // Precompute each document's vector norm once, up front; after this,
        // computeRelevance only needs to compute the norm of the query vector.
        this.docNorms = new ChainedHashDictionary<>();
        for (KVPair<URI, IDictionary<String, Double>> pair : this.documentTfIdfVectors) {
            this.docNorms.put(pair.getKey(), norm(pair.getValue()));
        }
    }
    // Note: this method, strictly speaking, doesn't need to exist. However,
    // we've included it so we can add some unit tests to help verify that your
    // constructor correctly initializes your fields.
    public IDictionary<URI, IDictionary<String, Double>> getDocumentTfIdfVectors() {
        return this.documentTfIdfVectors;
    }

    // Note: these private methods are suggestions or hints on how to structure your
    // code. However, since they're private, you're not obligated to implement exactly
    // these methods: feel free to change or modify these methods however you want. The
    // important thing is that your 'computeRelevance' method ultimately returns the
    // correct answer in an efficient manner.
    /**
     * Returns a dictionary mapping every unique word found across all the
     * documents to its IDF score, where IDF(word) is the natural log of the
     * total number of documents divided by the number of documents
     * containing that word.
     */
    private IDictionary<String, Double> computeIdfScores(ISet<Webpage> pages) {
        // First pass: count how many documents each word appears in. Building
        // a set of each page's words ensures a page adds at most 1 per word.
        IDictionary<String, Double> docCounts = new ChainedHashDictionary<>();
        for (Webpage page : pages) {
            ISet<String> uniqueWords = new ChainedHashSet<>();
            for (String word : page.getWords()) {
                uniqueWords.add(word);
            }
            for (String word : uniqueWords) {
                if (!docCounts.containsKey(word)) {
                    docCounts.put(word, 1.0);
                } else {
                    docCounts.put(word, docCounts.get(word) + 1.0);
                }
            }
        }

        // Second pass: convert each document count into an IDF score.
        IDictionary<String, Double> idfVector = new ChainedHashDictionary<>();
        for (KVPair<String, Double> pair : docCounts) {
            idfVector.put(pair.getKey(), Math.log(pages.size() / pair.getValue()));
        }
        return idfVector;
    }
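
    // Illustrative example (values hypothetical): with 3 documents, a word
    // like "the" that appears in all 3 gets IDF = ln(3 / 3.0) = 0.0, while a
    // rare word appearing in only 1 gets IDF = ln(3 / 1.0) ≈ 1.099. Ubiquitous
    // words are thus weighted toward zero and rare words are weighted up.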
    /**
     * Returns a dictionary mapping every unique word found in the given list
     * to its term frequency (TF) score: the number of times the word occurs
     * divided by the total number of words in the list.
     *
     * The input list represents the words contained within a single document.
     */
    private IDictionary<String, Double> computeTfScores(IList<String> words) {
        // First pass: count the raw occurrences of each word.
        IDictionary<String, Double> wordCounts = new ChainedHashDictionary<>();
        for (String word : words) {
            if (!wordCounts.containsKey(word)) {
                wordCounts.put(word, 1.0);
            } else {
                wordCounts.put(word, wordCounts.get(word) + 1.0);
            }
        }

        // Second pass: divide each count by the document length.
        IDictionary<String, Double> tfVector = new ChainedHashDictionary<>();
        for (KVPair<String, Double> pair : wordCounts) {
            tfVector.put(pair.getKey(), pair.getValue() / words.size());
        }
        return tfVector;
    }
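
    // Illustrative example (values hypothetical): for the word list
    // ["dog", "dog", "cat"], TF("dog") = 2 / 3.0 ≈ 0.667 and
    // TF("cat") = 1 / 3.0 ≈ 0.333. The TF scores of a non-empty document
    // always sum to 1.0.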
    /**
     * See spec for more details on what this method should do.
     */
    private IDictionary<URI, IDictionary<String, Double>> computeAllDocumentTfIdfVectors(ISet<Webpage> pages) {
        // Hint: this method should use the idfScores field and
        // call the computeTfScores(...) method.
        IDictionary<URI, IDictionary<String, Double>> pagesTfIdfVectors = new ChainedHashDictionary<>();
        for (Webpage page : pages) {
            pagesTfIdfVectors.put(page.getUri(), computeOneDocumentTfIdfVector(page.getWords()));
        }
        return pagesTfIdfVectors;
    }
    // Builds the TF-IDF vector for a single word list: each word's TF score
    // multiplied by its corpus-wide IDF score. Words missing from idfScores
    // (possible when the list is a user query) default to an IDF of 0.0.
    private IDictionary<String, Double> computeOneDocumentTfIdfVector(IList<String> words) {
        IDictionary<String, Double> tfIdfScores = new ChainedHashDictionary<>();
        IDictionary<String, Double> tfScores = computeTfScores(words);
        for (KVPair<String, Double> pair : tfScores) {
            tfIdfScores.put(pair.getKey(), pair.getValue() * idfScores.getOrDefault(pair.getKey(), 0.0));
        }
        return tfIdfScores;
    }
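
    // Illustrative example (values hypothetical): continuing the examples
    // above, if TF("cat") ≈ 0.333 in this document and IDF("cat") ≈ 1.099
    // across the corpus, then TF-IDF("cat") ≈ 0.333 * 1.099 ≈ 0.366.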
    // Returns the Euclidean (L2) norm of the given vector: the square root
    // of the sum of the squares of its components.
    private double norm(IDictionary<String, Double> vector) {
        double output = 0.0;
        for (KVPair<String, Double> pair : vector) {
            double score = pair.getValue();
            output += score * score;
        }
        return Math.sqrt(output);
    }
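
    // Illustrative example (values hypothetical): the norm of a vector with
    // components {3.0, 4.0} is sqrt(3.0 * 3.0 + 4.0 * 4.0) = sqrt(25.0) = 5.0.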
    /**
     * Returns the cosine similarity between the TF-IDF vector for the given
     * query and the URI's document: the dot product of the two vectors
     * divided by the product of their norms.
     *
     * Precondition: the given uri must have been one of the uris within the list of
     * webpages given to the constructor.
     */
    public Double computeRelevance(IList<String> query, URI pageUri) {
        // Efficiency note: the document vectors and their norms were both
        // precomputed in the constructor, so each call to this method only
        // has to build and normalize the (typically short) query vector.
        IDictionary<String, Double> documentVector = documentTfIdfVectors.get(pageUri);
        IDictionary<String, Double> queryVector = computeOneDocumentTfIdfVector(query);

        // Dot product: iterate over the query vector and look up each word's
        // matching component in the document vector, defaulting to 0.0.
        double numerator = 0.0;
        for (KVPair<String, Double> pair : queryVector) {
            double docWordScore = documentVector.getOrDefault(pair.getKey(), 0.0);
            double queryWordScore = pair.getValue();
            numerator += docWordScore * queryWordScore;
        }

        double denominator = docNorms.get(pageUri) * norm(queryVector);

        // Guard against dividing by zero: if either vector is all zeros, the
        // query and document share no scored words, so relevance is 0.0.
        if (denominator != 0) {
            return numerator / denominator;
        } else {
            return 0.0;
        }
    }
}
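
/*
 * A minimal usage sketch (hypothetical: exactly how the Webpage set and the
 * query list are built depends on the rest of the search engine):
 *
 *     ISet<Webpage> pages = ...;          // pages loaded by the engine
 *     TfIdfAnalyzer analyzer = new TfIdfAnalyzer(pages);
 *     IList<String> query = ...;          // e.g. tokenized user input
 *     Double score = analyzer.computeRelevance(query, somePage.getUri());
 *
 * Scores closer to 1.0 mean the query's TF-IDF vector points in nearly the
 * same direction as the document's, i.e. the page is more relevant.
 */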