Advertisement
Guest User

Untitled

a guest
Jun 17th, 2019
159
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.56 KB | None | 0 0
  1. #####words Data Set####
  2. words_data_set = pandas.DataFrame({'keywords':['wlmart womens book set','microsoft fish sauce','books from walmat store','mens login for facebook fools','mens login for facbook fools','login for twetter boy','apples from cook']})
  3.  
  4. #####Company Name List#####
  5. company_name_list = ['walmart','microsoft','facebook','twitter','amazon','apple']
  6.  
  7. ######Check for fuzzy match######
  8. import pandas
  9. from fuzzywuzzy import fuzz
  10. from fuzzywuzzy import process
  11. import time
  12. print(len(words_data_set),'....rows')
  13. start_time = time.time()
  14.  
  15.  
  16. fuzzed_data_final = pandas.DataFrame()
  17. for s in words_data_set.keywords.tolist():
  18.  
  19. #Flatten each word within the string
  20. step1 = words_data_set[words_data_set.keywords == s]
  21. step1['keywords2'] = step1.keywords.str.split()
  22. step2 = step1.keywords2.values.tolist()
  23. step3 = [item for sublist in step2 for item in sublist]
  24. step3 = pandas.DataFrame(step3)
  25. step3.columns = ['search_words']
  26. step3['keywords'] = s
  27.  
  28. #For each word within the string, do a fuzzy match, where if the max score for the fuzzy match is >= 95 then replace the word with blank, else retain the word
  29.  
  30. fuzzed_data = pandas.DataFrame()
  31. for w in step3.search_words.tolist():
  32. step4 = step3[step3.search_words == w]
  33. step5 = pandas.DataFrame(process.extract(w,keyword_list))
  34. step5.columns = ['w','score']
  35. if step5.score.max() >= 90:
  36. w = ''
  37. else:
  38. w
  39.  
  40. step4['search_words'] = w
  41. fuzzed_data = fuzzed_data.append(step4)
  42. fuzzed_data_final = fuzzed_data_final.append(fuzzed_data)
  43.  
  44. print("--- %s seconds ---" % (time.time() - start_time))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement