Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Blank out words in keyword phrases that fuzzy-match a known company name.
#
# Each phrase in ``words_data_set.keywords`` is split on whitespace and every
# word is scored against ``company_name_list`` with fuzzywuzzy.  The result,
# ``fuzzed_data_final``, has one row per word: ``search_words`` holds the word
# (or '' when it matched a company name) and ``keywords`` holds the phrase the
# word came from.
import time

import pandas
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Minimum fuzzywuzzy score (0-100) at which a word is treated as a company
# name and blanked out.
MATCH_THRESHOLD = 90

##### Words data set #####
words_data_set = pandas.DataFrame({
    'keywords': [
        'wlmart womens book set',
        'microsoft fish sauce',
        'books from walmat store',
        'mens login for facebook fools',
        'mens login for facbook fools',
        'login for twetter boy',
        'apples from cook',
    ],
})

##### Company name list #####
company_name_list = ['walmart', 'microsoft', 'facebook', 'twitter', 'amazon', 'apple']


##### Check for fuzzy match #####
def blank_company_words(keywords, company_names, threshold=MATCH_THRESHOLD):
    """Return one row per word, blanking words that look like a company name.

    Args:
        keywords: Iterable of phrases (strings); each is split on whitespace.
        company_names: Sequence of company names to fuzzy-match against.
        threshold: Minimum fuzzywuzzy score for a word to count as a match.

    Returns:
        A DataFrame with columns ``search_words`` (the word, or ``''`` if its
        best score against ``company_names`` is >= ``threshold``) and
        ``keywords`` (the source phrase).  The index is the word's position
        within its phrase, so it restarts at 0 for every phrase.
    """
    # The same word often recurs across phrases ("login", "for", ...), so each
    # distinct word is scored against the company list only once.
    is_company = {}
    rows = []
    positions = []
    for phrase in keywords:
        for position, word in enumerate(phrase.split()):
            if word not in is_company:
                # extractOne returns None when no choice reaches score_cutoff.
                best = process.extractOne(
                    word, company_names, score_cutoff=threshold
                )
                is_company[word] = best is not None
            rows.append(('' if is_company[word] else word, phrase))
            positions.append(position)
    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    return pandas.DataFrame(
        rows, columns=['search_words', 'keywords'], index=positions
    )


print(len(words_data_set), '....rows')
start_time = time.time()
fuzzed_data_final = blank_company_words(
    words_data_set.keywords.tolist(), company_name_list
)
print("--- %s seconds ---" % (time.time() - start_time))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement