Untitled

class Tfidf(object):
    r"""
    Convert a collection of raw documents to a matrix of TF-IDF features.

    Thin wrapper around scikit-learn's ``TfidfVectorizer``:
    https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

    The vectorizer is fitted with :meth:`compute_tfidf`; afterwards new
    documents can be weighted with :meth:`apply_tfidf_to_documents` and the
    highest scoring words retrieved with :meth:`get_top_tfidf` /
    :meth:`get_top_tfidf_per_doc`.

    Parameters
    ----------
    **kwargs
        Forwarded unchanged to the underlying vectorizer. Keyword arguments
        and defaults as listed by the scikit-learn documentation when this
        wrapper was written (verify against the installed version):

        input='content', encoding='utf-8', decode_error='strict', strip_accents=None,
        lowercase=True, preprocessor=None, tokenizer=None, analyzer='word',
        stop_words=None, token_pattern='(?u)\b\w\w+\b', ngram_range=(1, 1),
        max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False,
        dtype=<class 'numpy.float64'>, norm='l2', use_idf=True, smooth_idf=True,
        sublinear_tf=False

    Attributes
    ----------
    tfidf_vectorizer
        The wrapped vectorizer instance.
    feature_names
        Feature index -> feature name mapping; set by ``compute_tfidf``.
    word_count_vector
        Matrix returned by the last ``compute_tfidf`` call.
    """

    def __init__(self, **kwargs):
        """Create the underlying vectorizer, forwarding every keyword argument."""
        # NOTE(review): `_TfidfVectorizer` is defined outside this class;
        # presumably an alias of sklearn's TfidfVectorizer -- confirm at the
        # import site.
        self.tfidf_vectorizer = _TfidfVectorizer(**kwargs)

    def _get_features_name(self):
        """
        Refresh and return the mapping from feature indices to feature names.

        Returns
        -------
        list
            Feature names, positioned by feature index. Also stored on
            ``self.feature_names``.
        """
        vectorizer = self.tfidf_vectorizer
        # `get_feature_names` was deprecated in scikit-learn 1.0 and removed
        # in 1.2; prefer its replacement and fall back for old releases.
        getter = getattr(vectorizer, "get_feature_names_out", None)
        if getter is None:
            getter = vectorizer.get_feature_names
        # The new API returns an ndarray; keep exposing a plain list as the
        # legacy API did.
        self.feature_names = list(getter())
        return self.feature_names

    def compute_tfidf(self, raw_documents):
        """
        Learn vocabulary and idf, return the document-term matrix.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.

        Returns
        -------
        X : sparse matrix, [n_samples, n_features]
            Tf-idf-weighted document-term matrix. Also stored on
            ``self.word_count_vector``; ``self.feature_names`` is refreshed
            as a side effect.
        """
        self.word_count_vector = self.tfidf_vectorizer.fit_transform(raw_documents)
        self._get_features_name()
        return self.word_count_vector

    def apply_tfidf_to_documents(self, raw_document: list):
        """
        Apply the learned tf-idf weights to documents.

        Transforms documents to a document-term matrix using the vocabulary
        and document frequencies (df) learned by a previous fit
        (see ``compute_tfidf``).

        Parameters
        ----------
        raw_document : iterable or str
            An iterable which yields either str, unicode or file objects.
            A single ``str`` is accepted too and is treated as one document.

        Returns
        -------
        X : sparse matrix, [n_samples, n_features]
            Tf-idf-weighted document-term matrix.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(raw_document, str):
            raw_document = [raw_document]
        return self.tfidf_vectorizer.transform(raw_document)

    def _sort_coo(self, coo_matrix):
        """
        Sort the entries of a COO matrix by descending score.

        Parameters
        ----------
        coo_matrix
            Object exposing parallel ``col`` and ``data`` sequences.

        Returns
        -------
        list of (column index, score) tuples
            Ordered by score, ties broken by column index, both descending.
        """
        tuples = zip(coo_matrix.col, coo_matrix.data)
        return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

    def _extract_topn_from_vector(self, feature_names, sorted_items, topn=10):
        """
        Get the feature names and tf-idf scores of the top n items.

        Parameters
        ----------
        feature_names : sequence
            Feature index -> feature name mapping.
        sorted_items : iterable of (index, score)
            Entries already sorted by descending score (see ``_sort_coo``).
        topn : int or None
            Maximum number of distinct features to return; ``None`` means
            no limit and a non-positive value yields an empty dict.

        Returns
        -------
        dict
            ``{feature name: score rounded to 3 decimals}`` in descending
            score order.
        """
        results = {}
        for idx, score in sorted_items:
            if topn is not None and len(results) >= topn:
                break
            name = feature_names[idx]
            # A feature may appear several times when the matrix holds more
            # than one document. Items arrive in descending order, so the
            # first occurrence carries the highest score and must not be
            # overwritten by a later, lower one.
            if name not in results:
                results[name] = round(score, 3)
        return results

    def get_top_tfidf_per_doc(self, text, n=10):
        """
        Compute TF-IDF for a given document and return its top N weighted words.

        Requires ``compute_tfidf`` to have been called first, since it reads
        ``self.feature_names``.

        Parameters
        ----------
        text : str
            The document to score.
        n : int
            Maximum number of words to return.

        Returns
        -------
        list
            Words ordered by descending tf-idf score.
        """
        tf_idf_vector = self.apply_tfidf_to_documents([text])
        sorted_items = self._sort_coo(tf_idf_vector.tocoo())
        top_words = self._extract_topn_from_vector(self.feature_names, sorted_items, n)
        return list(top_words)

    def get_top_tfidf(self, tfidf_matrix=None, n=10):
        """
        Return a dict of the top N weighted words, with their weight.

        Parameters
        ----------
        tfidf_matrix : sparse matrix, optional
            Matrix to inspect; defaults to the one stored by the last
            ``compute_tfidf`` call.
        n : int
            Maximum number of words to return.

        Returns
        -------
        dict
            ``{word: score}`` ordered by descending score. When a word occurs
            in several rows, its highest score is reported.
        """
        if tfidf_matrix is None:
            tfidf_matrix = self.word_count_vector
        sorted_items = self._sort_coo(tfidf_matrix.tocoo())
        return self._extract_topn_from_vector(self.feature_names, sorted_items, topn=n)