Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import csv
from operator import itemgetter

import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
# Fuzzy-match every name in the Beall list against five journal lists and
# write, per journal list, the best match and its score for each Beall name.

BEALL_FILE = 'Beall_list.txt'

# (input journal list, output matches file) pairs, processed in this order.
JOURNAL_FILES = (
    ('J1.csv', 'j1_matches.csv'),
    ('J2.csv', 'j2_matches.csv'),
    ('J3.csv', 'j3_matches.csv'),
    ('J4.csv', 'j4_matches.csv'),
    ('J5.csv', 'j5_matches.csv'),
)

# The original script seeds every result list with this row, so each output
# file starts with "xxxxx,0". Kept so the output is unchanged.
# NOTE(review): the purpose of this row is not evident from the script --
# confirm whether anything downstream relies on it.
PLACEHOLDER_MATCH = ('xxxxx', 0)


def normalize_names(names):
    """Return *names* with missing values dropped, spaces removed, lowercased.

    *names* is a pandas Series of strings. Missing values would otherwise
    pass through the ``.str`` accessors as NaN and reach the matcher as
    non-strings, so they are removed first.
    """
    return names.dropna().str.replace(' ', '').str.lower()


def load_names(path, sep=','):
    """Read the single-column, header-less file *path* and normalize it."""
    frame = pd.read_csv(path, header=None, sep=sep, names=['names'])
    return normalize_names(frame.names)


def best_matches(queries, choices):
    """Return the placeholder row followed by the best match per query.

    Each entry is whatever ``process.extractOne(query, choices)`` returns;
    the writer below reads element 0 as the matched name and element 1 as
    the score.
    """
    matches = [PLACEHOLDER_MATCH]
    matches.extend(process.extractOne(query, choices) for query in queries)
    return matches


def write_matches(path, matches):
    """Write one ``name,score`` row per truthy entry of *matches* to *path*.

    Rows are emitted through ``csv.writer`` so that a name containing a comma
    or a quote is quoted instead of silently producing extra columns; names
    without such characters are written exactly as ``name,score``. The file
    is UTF-8 so non-ASCII names do not depend on the platform locale.
    """
    # newline='' lets the csv writer control line endings; '\n' matches the
    # line terminator the script has always written.
    with open(path, 'w', newline='', encoding='utf-8') as fh:
        writer = csv.writer(fh, lineterminator='\n')
        for match in matches:
            # Falsy entries (no match found) are skipped, as before.
            if not match:
                continue
            writer.writerow((match[0], match[1]))


def main():
    """Match the Beall list against J1..J5 and write j1..j5_matches.csv.

    All inputs are read before any matching starts, and all matching is
    finished before any output file is opened, so a missing input file
    aborts the run without leaving partial output behind.

    To view the results ordered by score, run:

    $ sort --field-separator=',' -k2 -n j1_matches.csv
    $ sort --field-separator=',' -k2 -n j2_matches.csv
    $ sort --field-separator=',' -k2 -n j3_matches.csv
    $ sort --field-separator=',' -k2 -n j4_matches.csv
    $ sort --field-separator=',' -k2 -n j5_matches.csv

    Note that ``sort`` splits on every comma, so rows whose name contains a
    comma (written quoted) are not ordered by score by these commands.
    """
    beall_names = load_names(BEALL_FILE, sep='\t')
    journals = [
        (load_names(source).tolist(), target)
        for source, target in JOURNAL_FILES
    ]
    results = [
        (target, best_matches(beall_names, choices))
        for choices, target in journals
    ]
    for target, matches in results:
        write_matches(target, matches)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement