emma4lil

Re-upload pyEdc

Sep 23rd, 2020
1,119
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from string import Template
  2.  
  3. import pandas as pd
  4. from fuzzywuzzy import fuzz
  5. from pandas import DataFrame, ExcelWriter
  6.  
  7. # pip install openpyxl
  8. # pip install xlsxwriter
  9. # pip install Pandas
  10.  
  11. # define global variables
  12. folder = ''
  13. safetyfiles = folder + "safetyfile1.xlsx"
  14. edcfile = folder + "edc5.xlsx"
  15. safety_df = DataFrame
  16. edc_df = DataFrame
  17. sheetlen = 0
  18. query_temp = "Sponsor DM: per EDC $column is $val However, per Safety database $column is $value please clarify"
  19. query_temp = Template(query_temp)
  20.  
  21.  
  22. # configurations ***you can tweak these settings to adjust accuracy***
  23. similarity_threshold = 40
  24.  
  25. # field Mapping Object
  26. # TODO: complete the mappings
  27. table = {
  28.     "Patient_ID": {
  29.         'edc': 2,
  30.         'safety': 1,
  31.  
  32.     },
  33.     "Protocol #": {
  34.         'edc': 1,
  35.         'safety': 3,
  36.  
  37.     },
  38.     "SAE Term": {
  39.         'edc': 6,
  40.         'safety': 4,
  41.  
  42.     },
  43.     # "Preferred Term": {
  44.     #     'edc': 0,
  45.     #     'safety': 0,
  46.     #
  47.     # },
  48.     # "Date site became aware of event": {
  49.     #     'edc': 0,
  50.     #     'safety': 0,
  51.     #
  52.     # },
  53.     # "Stop Date": {
  54.     #     'edc': 0,
  55.     #     'safety': 0,
  56.     #
  57.     # },
  58.     # "Toxicity Grade": {
  59.     #     'edc': 0,
  60.     #     'safety': 0,
  61.     #
  62.     # },
  63.     # "Event Outcome": {
  64.     #     'edc': 0,
  65.     #     'safety': 0,
  66.     #
  67.     # },
  68.     # "Relationship to study treatment": {
  69.     #     'edc': 0,
  70.     #     'safety': 0,
  71.     #
  72.     # },
  73.     # "Action taken with study": {
  74.     #     'edc': 0,
  75.     #     'safety': 0,
  76.     #
  77.     # },
  78.     # "SAE criteria": {
  79.     #     'edc': 0,
  80.     #     'safety': 0,
  81.     #
  82.     # },
  83.     # "Date of death": {
  84.     #     'edc': 0,
  85.     #     'safety': 0,
  86.     #
  87.     # },
  88.     # "Date of Hospitalization": {
  89.     #     'edc': 0,
  90.     #     'safety': 0,
  91.     #
  92.     # },
  93.  
  94. }
  95.  
  96.  
  97. # funtion definations
  98. def main():
  99.     global sheetlen
  100.     data_array = []
  101.     patient_id = edc_df = pd.read_excel(edcfile, index_col=[2])
  102.     for k in table:
  103.         edc_col = table[k]['edc']
  104.         safety_col = table[k]['safety']
  105.         # query_template = table[k]['query']
  106.         # ------------------------------------------------------
  107.         edc_df = pd.read_excel(edcfile, index_col=[edc_col])
  108.         sheetlen = len(edc_df.index)
  109.         safety_df = pd.read_excel(safetyfiles, index_col=[safety_col])
  110.         print("**************generating .xlsx for ************")
  111.         print(k)
  112.         print("*****************Done!*************************")
  113.         for row in range(sheetlen):
  114.             res = fuzzySmiles(patient_id.index[row],
  115.                               edc_df.index[row], safety_df.index[row], similarity_threshold, k)
  116.             data_array.append(res)
  117.         # write new sheet to output file
  118.         create_or_append_xlsx("output.xlsx", k, data_array)
  119.         data_array.clear()
  120.  
  121.  
  122. # using fuzzywuzzy for similarity check
  123. def fuzzySmiles(patient_id, str1, str2, threshold, col):
  124.     query = ''
  125.     val = fuzz.token_sort_ratio(str1, str2)
  126.     str1_len = len(str(str1))
  127.     str2_len = len(str(str2))
  128.  
  129.     r_val = False
  130.     if val >= threshold:
  131.         r_val = True
  132.     else:
  133.         r_val = False
  134.         query = query_temp.substitute(
  135.             {'column': col, "val": str1, "value": str2})
  136.         if val == 0:
  137.             l = str1_len > str2_len
  138.             if l:
  139.                 query = query_temp.substitute(
  140.                     {'column': col, "val": str1, "value": "Missing"})
  141.     return [patient_id, str1, str2, r_val, val, query, ]
  142.  
  143.  
  144. def create_or_append_xlsx(filepath, sheetname, data):
  145.  
  146.     df = pd.DataFrame(
  147.         data, columns=["Patient_ID", "Edc_field", "Safety_field", "Similarity", "Score", "Query"])
  148.     with ExcelWriter(sheetname + "_gen_.xlsx", engine="xlsxwriter", mode='w') as writer:
  149.         df.to_excel(writer, sheet_name=sheetname)
  150.         writer.close()
  151.  
  152.  
  153. # python run as main script
  154. if __name__ == "__main__":
  155.     main()
  156.  
RAW Paste Data