Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from nltk import word_tokenize
- from nltk.stem import WordNetLemmatizer
class LemmaTokenizer(object):
    """Callable that splits a document into WordNet-lemmatized tokens.

    Intended to be passed as the ``tokenizer=`` argument of a scikit-learn
    text vectorizer (e.g. ``TfidfVectorizer``).
    """

    def __init__(self):
        # A single lemmatizer instance is reused for every document.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized word tokens of *doc*.

        NOTE(review): ``lemmatize`` is called without a POS tag, so every
        token is lemmatized as a noun (WordNet's default) — confirm this
        is intended.
        """
        lemmatize = self.wnl.lemmatize
        return [lemmatize(token) for token in word_tokenize(doc)]
- from sklearn.feature_extraction.text import TfidfVectorizer
# Build the TF-IDF representation of the corpus.
#  - tokenizer: the lemmatizing tokenizer defined above.
#  - token_pattern=None: the default token regex is ignored whenever a custom
#    tokenizer is supplied; passing None documents that and silences sklearn's
#    warning about it (no behavior change).
#  - max_df=0.7: drop terms that appear in more than 70% of the documents.
#  - min_df=50: drop terms that appear in fewer than 50 documents.
# NOTE(review): the 'english' stop-word list is matched against *lemmatized*
# tokens, which sklearn may flag as inconsistent preprocessing — confirm the
# intended behavior.
vectorizer = TfidfVectorizer(
    tokenizer=LemmaTokenizer(),
    analyzer='word',
    token_pattern=None,
    max_df=0.7,
    min_df=50,
    stop_words='english',
)
# fit_transform() is equivalent to fit() followed by transform(), but learns
# the vocabulary/IDF weights and builds the matrix in a single pass.
corpus_tf_idf = vectorizer.fit_transform(corpus)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement