Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from string import Template
- import pandas as pd
- from fuzzywuzzy import fuzz
- from pandas import DataFrame, ExcelWriter
- # pip install openpyxl
- # pip install xlsxwriter
- # pip install Pandas
- # define global variables
- folder = ''
- safetyfiles = folder + "safetyfile.xlsx"
- edcfile = folder + "edc.xlsx"
- safety_df = DataFrame
- edc_df = DataFrame
- sheetlen = 0
- query_temp = "Sponsor DM: per EDC $column is $val However, per Safety database $column is $value"
- query_temp = Template(query_temp)
- # configurations ***you can tweak these settings to adjust accuracy***
- similarity_threshold = 40
- # field Mapping Object
- # TODO: complete the mappings
- table = {
- "Patient_ID": {
- 'edc': 2,
- 'safety': 1,
- },
- "Protocol #": {
- 'edc': 1,
- 'safety': 3,
- },
- "SAE Term": {
- 'edc': 6,
- 'safety': 4,
- },
- # "Preferred Term": {
- # 'edc': 0,
- # 'safety': 0,
- #
- # },
- # "Date site became aware of event": {
- # 'edc': 0,
- # 'safety': 0,
- #
- # },
- # "Stop Date": {
- # 'edc': 0,
- # 'safety': 0,
- #
- # },
- # "Toxicity Grade": {
- # 'edc': 0,
- # 'safety': 0,
- #
- # },
- # "Event Outcome": {
- # 'edc': 0,
- # 'safety': 0,
- #
- # },
- # "Relationship to study treatment": {
- # 'edc': 0,
- # 'safety': 0,
- #
- # },
- # "Action taken with study": {
- # 'edc': 0,
- # 'safety': 0,
- #
- # },
- # "SAE criteria": {
- # 'edc': 0,
- # 'safety': 0,
- #
- # },
- # "Date of death": {
- # 'edc': 0,
- # 'safety': 0,
- #
- # },
- # "Date of Hospitalization": {
- # 'edc': 0,
- # 'safety': 0,
- #
- # },
- }
- # funtion definations
- def main():
- global sheetlen
- data_array = []
- for k in table:
- edc_col = table[k]['edc']
- safety_col = table[k]['safety']
- query_template = table[k]['query']
- # ------------------------------------------------------
- edc_df = pd.read_excel(edcfile, index_col=[edc_col])
- sheetlen = len(edc_df.index)
- safety_df = pd.read_excel(safetyfiles, index_col=[safety_col])
- print("**************Creating .xlsx for****************")
- print(k)
- print("*****************Done!*************************")
- for row in range(sheetlen):
- res = fuzzySmiles(
- edc_df.index[row], safety_df.index[row], similarity_threshold, k)
- data_array.append(res)
- # write new sheet to output file
- create_or_append_xlsx("output.xlsx", k, data_array)
- data_array.clear()
- # using fuzzywuzzy for similarity check
- def fuzzySmiles(str1, str2, threshold, col):
- query = ''
- val = fuzz.token_sort_ratio(str1, str2)
- str1_len = len(str(str1))
- str2_len = len(str(str2))
- r_val = False
- if val >= threshold:
- r_val = True
- else:
- r_val = False
- query = query_temp.substitute(
- {'column': col, "val": str1, "value": str2})
- if val == 0:
- l = str1_len > str2_len
- if l:
- query = query_temp.substitute(
- {'column': col, "val": str1, "value": "Missing"})
- return [str1, str2, r_val, val, query]
- def create_or_append_xlsx(filepath, sheetname, data):
- df = pd.DataFrame(
- data, columns=["Edc_field", "Safety_field", "similarity", "score", "query"])
- with ExcelWriter(sheetname + "_gen_.xlsx", engine="xlsxwriter", mode='w') as writer:
- df.to_excel(writer, sheet_name=sheetname)
- writer.close()
- # python run as main script
- if __name__ == "__main__":
- main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement