Advertisement
Guest User

Untitled

a guest
May 23rd, 2019
72
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.30 KB | None | 0 0
  1. class Tfidf(object):
  2.     """
  3.    Convert a collection of raw documents to a matrix of TF-IDF features.
  4.  
  5.    Inputs a list of string.
  6.    and the list of feature name.
  7.    Wrapper of https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
  8.  
  9.    Parameters
  10.    ----------
  11.    input=’content’, encoding=’utf-8’, decode_error=’strict’, strip_accents=None,
  12.    lowercase=True, preprocessor=None, tokenizer=None, analyzer=’word’,
  13.    stop_words=None, token_pattern=’(?u)\b\w\w+\b’, ngram_range=(1, 1),
  14.    max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False,
  15.    dtype=<class ‘numpy.float64’>, norm=’l2’, use_idf=True, smooth_idf=True, sublinear_tf=False
  16.  
  17.    Returns
  18.    -------
  19.    list
  20.        Outputs a tuple with the wordcount vector matrix
  21.    """
  22.    
  23.     def __init__(self, **kwargs):
  24.         self.tfidf_vectorizer = _TfidfVectorizer(**kwargs)
  25.  
  26.  
  27.     def _get_features_name(self):
  28.         '''
  29.        Array mapping from feature integer indices to feature name
  30.        '''
  31.         self.feature_names = self.tfidf_vectorizer.get_feature_names()
  32.         return self.feature_names
  33.  
  34.  
  35.     def compute_tfidf(self, raw_documents):
  36.         '''
  37.        Learn vocabulary and idf, return term-document matrix.
  38.  
  39.        Input a list of documents (string)
  40.        Output the wordcount vector matrix.
  41.  
  42.        params
  43.        ------
  44.        raw_documents : iterable
  45.            an iterable which yields either str, unicode or file objects
  46.  
  47.        returns
  48.        -------
  49.        X : sparse matrix, [n_samples, n_features]
  50.            Tf-idf-weighted document-term matrix.
  51.        '''
  52.         self.word_count_vector = self.tfidf_vectorizer.fit_transform(raw_documents)
  53.         self._get_features_name()
  54.         return self.word_count_vector    
  55.  
  56.  
  57.     def apply_tfidf_to_documents(self, raw_document:list):
  58.         '''
  59.        Apply the tf-idf weights to documents.
  60.        
  61.        Transform documents to document-term matrix.
  62.        Uses the vocabulary and document frequencies (df) learned by fit
  63.        (or fit_transform).
  64.  
  65.        parameters
  66.        ---------
  67.        raw_documents : iterable
  68.            an iterable which yields either str, unicode or file objects
  69.  
  70.        Returns
  71.        -------
  72.        X : sparse matrix, [n_samples, n_features]
  73.            Tf-idf-weighted document-term matrix.
  74.        '''
  75.         if type(raw_document) == str:
  76.             raw_document = [raw_document]
  77.         return self.tfidf_vectorizer.transform(raw_document)
  78.    
  79.    
  80.     def _sort_coo(self, coo_matrix):
  81.         '''sort the tf-idf vectors by descending order of scores'''
  82.         tuples = zip(coo_matrix.col, coo_matrix.data)
  83.         return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
  84.    
  85.    
  86.     def _extract_topn_from_vector(self, feature_names, sorted_items, topn=10):
  87.         """get the feature names and tf-idf score of top n items"""
  88.  
  89.         #use only topn items from vector
  90.         sorted_items = sorted_items[:topn]
  91.  
  92.         score_vals = []
  93.         feature_vals = []
  94.  
  95.         # word index and corresponding tf-idf score
  96.         for idx, score in sorted_items:
  97.  
  98.             #keep track of feature name and its corresponding score
  99.             score_vals.append(round(score, 3))
  100.             feature_vals.append(feature_names[idx])
  101.  
  102.         #create a tuples of feature,score
  103.         #results = zip(feature_vals,score_vals)
  104.         results= {}
  105.         for idx in range(len(feature_vals)):
  106.             results[feature_vals[idx]]=score_vals[idx]
  107.  
  108.         return results
  109.  
  110.  
  111.     def get_top_tfidf_per_doc(self, text, n=10):
  112.         '''compute TF-IDF for a given doc, and returns a list of the top N weighted words'''
  113.         tf_idf_vector= self.apply_tfidf_to_documents([text])
  114.         sorted_items=self._sort_coo(tf_idf_vector.tocoo())
  115.         return list(self._extract_topn_from_vector(self.feature_names, sorted_items, n).keys())
  116.    
  117.  
  118.     def get_top_tfidf(self, tfidf_matrix=None, n=10):
  119.         '''
  120.        returns a dict of the top N weighted words, with their weight
  121.        '''
  122.         if tfidf_matrix is None:
  123.             tfidf_matrix = self.word_count_vector
  124.         return self._extract_topn_from_vector(self.feature_names, self._sort_coo(tfidf_matrix.tocoo()), topn=n)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement