Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import nltk, string
- from sklearn.feature_extraction.text import TfidfVectorizer
# Shared Porter stemmer instance; stem_tokens() calls its .stem() method.
stemmer = nltk.stem.porter.PorterStemmer()

# Translation table for str.translate(): maps the code point of every
# character in string.punctuation to None, which deletes that character.
remove_punctuation_map = str.maketrans('', '', string.punctuation)
def stem_tokens(tokens):
    """Return a list with the stemmed form of each token in *tokens*.

    Each item is passed through the module-level ``stemmer``; order is
    preserved.
    """
    return list(map(stemmer.stem, tokens))
def normalize(text):
    """Lower-case *text*, remove punctuation, tokenize it and stem the tokens.

    Punctuation is removed with ``remove_punctuation_map`` before the text
    is handed to ``nltk.word_tokenize``; the resulting tokens are stemmed by
    ``stem_tokens`` and returned as a list.
    """
    lowered = text.lower()
    without_punctuation = lowered.translate(remove_punctuation_map)
    words = nltk.word_tokenize(without_punctuation)
    return stem_tokens(words)
# TF-IDF vectorizer that delegates tokenization to normalize().
# token_pattern=None: the default pattern is unused once a custom tokenizer
# is supplied, and leaving it set makes scikit-learn emit a UserWarning.
vectorizer = TfidfVectorizer(tokenizer=normalize, token_pattern=None)


def cosine_sim(text1, text2):
    """Return the cosine similarity between *text1* and *text2*.

    The shared ``vectorizer`` is re-fitted on just these two texts on every
    call, so the vocabulary and IDF weights come only from this pair.
    TfidfVectorizer L2-normalizes each row by default, so the dot product of
    the two rows is their cosine similarity.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        The similarity score: entry [0, 1] of the 2x2 pairwise matrix.
    """
    tfidf = vectorizer.fit_transform([text1, text2])
    # `@` is a matrix product for both sparse matrices and sparse arrays
    # (`*` is element-wise on arrays). `.toarray()` replaces the `.A`
    # shorthand, which newer SciPy releases no longer provide.
    pairwise = (tfidf @ tfidf.T).toarray()
    return pairwise[0, 1]
# Demo: print the similarity score for a few sample text pairs.
sample_pairs = (
    ('eu tenho um celular da y', 'eu tenho um celular da y'),
    ('um celular', 'um celular da marca x é meu'),
    ('um celular', 'uma maquina de lavar'),
)
for first_text, second_text in sample_pairs:
    print(cosine_sim(first_text, second_text))
Add Comment
Please, Sign In to add comment