Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import io

# Inline CSV text: the first line is the column header ("file_name"), every
# following line is one name that needs to be matched.
dat_A = """file_name
3M CO
ABBOTT LABORATORIES
ABC INC
ALTRIA GROUP INC
AMERICAN ELECTRIC POWER CO"""

# Inline CSV text with the candidate names (column "crsp_name") that the
# file_name values are compared against.
dat_B = """crsp_name
A & E PLASTIK PAK INC
A & M FOOD SERVICES INC
A A I CORP
A A IMPORTING INC
A A R CORP
ABBOTT
ABBOTT LABS
ALTRIA
ALTRIA GROUP"""

# pd.compat.StringIO no longer exists in current pandas releases; io.StringIO
# from the standard library is the supported way to hand an in-memory string
# to read_csv.
df_A = pd.read_csv(io.StringIO(dat_A))
df_B = pd.read_csv(io.StringIO(dat_B))

# Series of candidate names that each file_name is scored against.
crsp = df_B.crsp_name

# A best score must be strictly greater than this value to count as a match.
ratio_threshold = 50
def fn(t1):
    """Return the best-scoring entry of ``crsp`` for the name ``t1``.

    Every entry of the module-level ``crsp`` Series is scored against ``t1``
    with ``fuzz.token_set_ratio`` (NOTE(review): ``fuzz`` is not imported in
    the visible code -- presumably fuzzywuzzy/thefuzz; confirm).

    Returns the ``crsp`` entry with the highest score when that score is
    strictly greater than ``ratio_threshold``. Otherwise returns the string
    ``'<index>_<score>'`` built from the best candidate's index label and its
    score, so rejected matches can still be inspected.
    """
    def score_against(candidate):
        return fuzz.token_set_ratio(t1, candidate)

    scores = crsp.apply(score_against)
    best_label = scores.idxmax()
    best_score = scores[best_label]

    # Below-or-equal-to-threshold scores are reported, not matched.
    if not best_score > ratio_threshold:
        return f'{best_label}_{best_score}'
    return crsp.loc[best_label]
# Run fn over every file_name and keep the result in a new crsp_name column:
# either the matched candidate name or an '<index>_<score>' marker when the
# best score did not exceed the threshold.
df_A['crsp_name'] = df_A['file_name'].apply(fn)
- file_name crsp_name
- 0 3M CO 2_46
- 1 ABBOTT LABORATORIES ABBOTT
- 2 ABC INC A & E PLASTIK PAK INC
- 3 ALTRIA GROUP INC ALTRIA
- 4 AMERICAN ELECTRIC POWER CO 1_43
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement