Advertisement
Guest User

Untitled

a guest
Jun 16th, 2019
123
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.96 KB | None | 0 0
  1. dat_A="""file_name
  2. 3M CO
  3. ABBOTT LABORATORIES
  4. ABC INC
  5. ALTRIA GROUP INC
  6. AMERICAN ELECTRIC POWER CO"""
  7.  
  8. dat_B="""crsp_name
  9. A & E PLASTIK PAK INC
  10. A & M FOOD SERVICES INC
  11. A A I CORP
  12. A A IMPORTING INC
  13. A A R CORP
  14. ABBOTT
  15. ABBOTT LABS
  16. ALTRIA
  17. ALTRIA GROUP"""
  18.  
  19. df_A = pd.read_csv(pd.compat.StringIO(dat_A))
  20. df_B = pd.read_csv(pd.compat.StringIO(dat_B))
  21.  
  22. crsp = df_B.crsp_name
  23.  
  24. ratio_threshold = 50
  25.  
  26. def fn(t1):
  27. ratios = crsp.apply(lambda t2: fuzz.token_set_ratio(t1, t2))
  28. iMax = ratios.idxmax()
  29. rMax = ratios[iMax]
  30. return crsp.loc[iMax] if rMax > ratio_threshold else f'{iMax}_{rMax}'
  31.  
  32. df_A['crsp_name'] = df_A.file_name.apply(fn)
  33.  
  34. file_name crsp_name
  35. 0 3M CO 2_46
  36. 1 ABBOTT LABORATORIES ABBOTT
  37. 2 ABC INC A & E PLASTIK PAK INC
  38. 3 ALTRIA GROUP INC ALTRIA
  39. 4 AMERICAN ELECTRIC POWER CO 1_43
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement