Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# English stopword list used to filter tokens before similarity scoring.
# NOTE(review): requires the NLTK stopwords corpus to be downloaded
# (nltk.download('stopwords')) — confirm the deployment environment has it.
stopwords = corpus.stopwords.words('english')
# Treat individual punctuation characters as stopwords too.
# NOTE(review): this mutates the list returned by the corpus reader in
# place; if NLTK caches that list, the punctuation entries leak into any
# other caller of corpus.stopwords.words('english') — verify before reuse.
stopwords.extend(string.punctuation)
# Snowball (Porter2) stemmer shared by all scoring calls below.
stemmer = snowball.SnowballStemmer('english')
def tokenize(text):
    """Split *text* into word tokens via NLTK's default word tokenizer."""
    tokens = nltk.word_tokenize(text)
    return tokens
def probability_score(shopee_item_name, lazada_item_name):
    """Return the Jaccard similarity of the two item names.

    Each name is tokenized, lowercased, stripped of surrounding
    punctuation, filtered against the module-level ``stopwords`` list,
    and stemmed with the shared Snowball stemmer. The score is
    ``|A ∩ B| / |A ∪ B|`` over the resulting stem sets.

    Args:
        shopee_item_name: Product title from Shopee.
        lazada_item_name: Product title from Lazada.

    Returns:
        A float in [0.0, 1.0]; 0.0 when both names normalize to an
        empty stem set (the original code raised ZeroDivisionError here).
    """
    def _stem_set(text):
        # One shared normalization pipeline for both inputs.
        normalized = (token.lower().strip(string.punctuation)
                      for token in tokenize(text))
        # "if token" drops punctuation-only tokens, which strip down to
        # "" and would otherwise pollute both sets and inflate the score.
        return {stemmer.stem(token)
                for token in normalized
                if token and token not in stopwords}

    stems_a = _stem_set(shopee_item_name)
    stems_b = _stem_set(lazada_item_name)

    union = stems_a | stems_b
    if not union:
        # Both names were empty or all stopwords/punctuation.
        return 0.0
    return len(stems_a & stems_b) / float(len(union))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement