Advertisement
Guest User

Untitled

a guest
Jun 19th, 2018
78
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.83 KB | None | 0 0
  # Stemmer shared by every call to probability_score (English Snowball rules).
  stemmer = snowball.SnowballStemmer('english')

  # English stop words plus every single ASCII punctuation character, kept as
  # one list so a single membership test filters out both kinds of token.
  stopwords = corpus.stopwords.words('english')
  stopwords += list(string.punctuation)
  4.  
  def tokenize(text):
      """Split *text* into tokens by delegating to ``nltk.word_tokenize``."""
      tokens = nltk.word_tokenize(text)
      return tokens
  7.  
  def probability_score(shopee_item_name, lazada_item_name):
      """Return the Jaccard similarity of the two item names' stemmed tokens.

      Each name is tokenized, lower-cased, stripped of leading/trailing
      punctuation, filtered against the module-level ``stopwords`` and stemmed
      with the module-level ``stemmer``.  The score is the size of the
      intersection of the two stem sets divided by the size of their union.

      Args:
          shopee_item_name: First item name to compare.
          lazada_item_name: Second item name to compare.

      Returns:
          A float in ``[0.0, 1.0]``.  ``0.0`` is returned when neither name
          yields any usable token, instead of dividing by zero.
      """
      stems_a = _normalized_stems(shopee_item_name)
      stems_b = _normalized_stems(lazada_item_name)

      union = stems_a | stems_b
      if not union:
          # Both names were empty or consisted only of stop words/punctuation;
          # there is nothing to compare, so report no similarity.
          return 0.0

      # float() keeps true division even if this runs under Python 2.
      return len(stems_a & stems_b) / float(len(union))


  def _normalized_stems(item_name):
      """Return the set of stems of the meaningful tokens in *item_name*."""
      stems = set()
      for token in tokenize(item_name):
          word = token.lower().strip(string.punctuation)
          # Multi-character punctuation tokens ("...", "--") strip down to "",
          # which is not in the stop-word list; skip them explicitly so the
          # empty string never counts as a shared stem.
          if word and word not in stopwords:
              stems.add(stemmer.stem(word))
      return stems
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement