Advertisement
Guest User

Untitled

a guest
Jan 17th, 2019
113
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.52 KB | None | 0 0
  1. from nltk import word_tokenize          
  2. from nltk.stem import WordNetLemmatizer
  3.  
  4. class LemmaTokenizer(object):
  5.   def __init__(self):
  6.     self.wnl = WordNetLemmatizer()
  7.     def __call__(self, doc):
  8.       return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
  9.  
  10. from sklearn.feature_extraction.text import TfidfVectorizer
  11.  
  12. vectorizer = TfidfVectorizer( tokenizer = LemmaTokenizer(), analyzer = 'word', max_df = 0.7, min_df = 50, stop_words = 'english' )
  13. vectorizer.fit(corpus)
  14. corpus_tf_idf = vectorizer.transform(corpus)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement