Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
class Tfidf(object):
    r"""
    Convert a collection of raw documents to a matrix of TF-IDF features.

    Thin wrapper around scikit-learn's ``TfidfVectorizer``:
    https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

    NOTE(review): ``_TfidfVectorizer`` is not defined in this block; it is
    presumably ``sklearn.feature_extraction.text.TfidfVectorizer`` imported
    under that alias elsewhere in the module -- confirm.

    Parameters
    ----------
    **kwargs
        Forwarded unchanged to the vectorizer constructor. Keywords and
        defaults as listed by the upstream documentation linked above (they
        may differ between scikit-learn versions):

        input='content', encoding='utf-8', decode_error='strict',
        strip_accents=None, lowercase=True, preprocessor=None,
        tokenizer=None, analyzer='word', stop_words=None,
        token_pattern='(?u)\b\w\w+\b', ngram_range=(1, 1), max_df=1.0,
        min_df=1, max_features=None, vocabulary=None, binary=False,
        dtype=<class 'numpy.float64'>, norm='l2', use_idf=True,
        smooth_idf=True, sublinear_tf=False

    Attributes
    ----------
    tfidf_vectorizer
        The wrapped vectorizer instance.
    feature_names : list
        Feature names, indexed by feature (column) id. Only set once
        ``compute_tfidf`` (or ``_get_features_name``) has run.
    word_count_vector
        The matrix returned by the last ``compute_tfidf`` call. Only set
        once ``compute_tfidf`` has run.
    """

    def __init__(self, **kwargs):
        self.tfidf_vectorizer = _TfidfVectorizer(**kwargs)

    def _get_features_name(self):
        """
        Refresh and return the mapping from feature index to feature name.

        Returns
        -------
        list
            Feature names; position ``i`` is the name of feature ``i``.
            The same list is stored on ``self.feature_names``.
        """
        vectorizer = self.tfidf_vectorizer
        # get_feature_names() was deprecated in scikit-learn 1.0 and removed
        # in 1.2; use its replacement when the installed version has it.
        getter = getattr(vectorizer, "get_feature_names_out", None)
        if getter is None:
            getter = vectorizer.get_feature_names
        # The replacement returns an array; keep handing callers a list.
        self.feature_names = list(getter())
        return self.feature_names

    def compute_tfidf(self, raw_documents):
        """
        Learn vocabulary and idf, return the term-document matrix.

        Also refreshes ``self.feature_names`` and stores the result on
        ``self.word_count_vector``.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.

        Returns
        -------
        X : sparse matrix, [n_samples, n_features]
            Tf-idf-weighted document-term matrix.
        """
        self.word_count_vector = self.tfidf_vectorizer.fit_transform(raw_documents)
        self._get_features_name()
        return self.word_count_vector

    def apply_tfidf_to_documents(self, raw_document: list):
        """
        Transform documents to a document-term matrix.

        Uses the vocabulary and document frequencies (df) learned by
        ``compute_tfidf``.

        Parameters
        ----------
        raw_document : iterable or str
            An iterable which yields either str, unicode or file objects.
            A single string is accepted too and is treated as one document.

        Returns
        -------
        X : sparse matrix, [n_samples, n_features]
            Tf-idf-weighted document-term matrix.
        """
        # A bare string would otherwise be handed over as if it were the
        # collection of documents, so wrap it in a one-element list.
        if isinstance(raw_document, str):
            raw_document = [raw_document]
        return self.tfidf_vectorizer.transform(raw_document)

    def _sort_coo(self, coo_matrix):
        """
        Sort the entries of a COO matrix by descending score.

        Parameters
        ----------
        coo_matrix
            Object exposing parallel ``col`` and ``data`` sequences.

        Returns
        -------
        list of (int, float)
            ``(column index, score)`` pairs, highest score first; ties are
            broken by descending column index. Row information is dropped,
            so a column can appear once per row in which it is stored.
        """
        pairs = zip(coo_matrix.col, coo_matrix.data)
        return sorted(pairs, key=lambda pair: (pair[1], pair[0]), reverse=True)

    def _extract_topn_from_vector(self, feature_names, sorted_items, topn=10):
        """
        Get the feature names and tf-idf scores of the top n items.

        Parameters
        ----------
        feature_names : sequence
            Feature names indexed by feature id.
        sorted_items : iterable of (int, float)
            ``(feature index, score)`` pairs, best first (see ``_sort_coo``).
        topn : int or None
            Maximum number of distinct features to return. ``None`` means no
            limit; a value of zero or less yields an empty dict.

        Returns
        -------
        dict
            Feature name -> score rounded to 3 decimals, in descending score
            order.
        """
        results = {}
        for idx, score in sorted_items:
            if topn is not None and len(results) >= topn:
                break
            name = feature_names[idx]
            # The same feature can show up once per document. The items are
            # sorted best-first, so the first occurrence carries the highest
            # score: keep it, and do not let repeats use up a top-n slot.
            if name not in results:
                results[name] = round(score, 3)
        return results

    def get_top_tfidf_per_doc(self, text, n=10):
        """
        Compute TF-IDF for a given doc and return its top N weighted words.

        Parameters
        ----------
        text : str
            The document to score.
        n : int
            Maximum number of words to return.

        Returns
        -------
        list
            Feature names, highest score first.
        """
        tf_idf_vector = self.apply_tfidf_to_documents([text])
        sorted_items = self._sort_coo(tf_idf_vector.tocoo())
        top_scores = self._extract_topn_from_vector(self.feature_names, sorted_items, n)
        return list(top_scores)

    def get_top_tfidf(self, tfidf_matrix=None, n=10):
        """
        Return a dict of the top N weighted words, with their weight.

        Parameters
        ----------
        tfidf_matrix : sparse matrix, optional
            Matrix to inspect; defaults to the one stored by the last
            ``compute_tfidf`` call.
        n : int
            Maximum number of words to return.

        Returns
        -------
        dict
            Feature name -> highest score found for that feature anywhere in
            the matrix (rounded to 3 decimals), highest score first.
        """
        if tfidf_matrix is None:
            tfidf_matrix = self.word_count_vector
        sorted_items = self._sort_coo(tfidf_matrix.tocoo())
        return self._extract_topn_from_vector(self.feature_names, sorted_items, topn=n)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement