Guest User

Untitled

a guest
Feb 17th, 2019
77
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.77 KB | None | 0 0
  1. import nltk, string
  2. from sklearn.feature_extraction.text import TfidfVectorizer
  3.  
  4. stemmer = nltk.stem.porter.PorterStemmer()
  5. remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
  6.  
  7. def stem_tokens(tokens):
  8. return [stemmer.stem(item) for item in tokens]
  9.  
  10. '''removendo pontuação'''
  11. def normalize(text):
  12. return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))
  13.  
  14. vectorizer = TfidfVectorizer(tokenizer=normalize)
  15.  
  16. def cosine_sim(text1, text2):
  17. tfidf = vectorizer.fit_transform([text1, text2])
  18. return ((tfidf * tfidf.T).A)[0,1]
  19.  
  20.  
  21. print (cosine_sim('eu tenho um celular da y', 'eu tenho um celular da y'))
  22. print (cosine_sim('um celular', 'um celular da marca x é meu'))
  23. print (cosine_sim('um celular', 'uma maquina de lavar'))
Add Comment
Please, Sign In to add comment