Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# English stopword list used to filter tokens before similarity scoring.
# NOTE(review): requires the NLTK stopwords corpus to be downloaded
# (nltk.download('stopwords')) — confirm the deployment environment has it.
stopwords = corpus.stopwords.words('english')
# Treat individual punctuation characters as stopwords too.
# NOTE(review): this mutates the list returned by the corpus reader in
# place; if NLTK caches that list, the punctuation entries leak into any
# other caller of corpus.stopwords.words('english') — verify before reuse.
stopwords.extend(string.punctuation)
# Snowball (Porter2) stemmer shared by all scoring calls below.
stemmer = snowball.SnowballStemmer('english')
def tokenize(text):
    """Split *text* into word tokens via NLTK's default word tokenizer."""
    tokens = nltk.word_tokenize(text)
    return tokens
def probability_score(shopee_item_name, lazada_item_name):
    """Return the Jaccard similarity of the two item names.

    Each name is tokenized, lowercased, stripped of surrounding
    punctuation, filtered against the module-level ``stopwords`` list,
    and stemmed with the shared Snowball stemmer. The score is
    ``|A ∩ B| / |A ∪ B|`` over the resulting stem sets.

    Args:
        shopee_item_name: Product title from Shopee.
        lazada_item_name: Product title from Lazada.

    Returns:
        A float in [0.0, 1.0]; 0.0 when both names normalize to an
        empty stem set (the original code raised ZeroDivisionError here).
    """
    def _stem_set(text):
        # One shared normalization pipeline for both inputs.
        normalized = (token.lower().strip(string.punctuation)
                      for token in tokenize(text))
        # "if token" drops punctuation-only tokens, which strip down to
        # "" and would otherwise pollute both sets and inflate the score.
        return {stemmer.stem(token)
                for token in normalized
                if token and token not in stopwords}

    stems_a = _stem_set(shopee_item_name)
    stems_b = _stem_set(lazada_item_name)

    union = stems_a | stems_b
    if not union:
        # Both names were empty or all stopwords/punctuation.
        return 0.0
    return len(stems_a & stems_b) / float(len(union))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement