Advertisement
emma4lil

Re-upload pyEdc

Sep 23rd, 2020
1,582
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.89 KB | None | 0 0
  1. from string import Template
  2.  
  3. import pandas as pd
  4. from fuzzywuzzy import fuzz
  5. from pandas import DataFrame, ExcelWriter
  6.  
  7. # pip install openpyxl
  8. # pip install xlsxwriter
  9. # pip install Pandas
  10.  
  11. # define global variables
  12. folder = ''
  13. safetyfiles = folder + "safetyfile1.xlsx"
  14. edcfile = folder + "edc5.xlsx"
  15. safety_df = DataFrame
  16. edc_df = DataFrame
  17. sheetlen = 0
  18. query_temp = "Sponsor DM: per EDC $column is $val However, per Safety database $column is $value please clarify"
  19. query_temp = Template(query_temp)
  20.  
  21.  
  22. # configurations ***you can tweak these settings to adjust accuracy***
  23. similarity_threshold = 40
  24.  
  25. # field Mapping Object
  26. # TODO: complete the mappings
  27. table = {
  28.     "Patient_ID": {
  29.         'edc': 2,
  30.         'safety': 1,
  31.  
  32.     },
  33.     "Protocol #": {
  34.         'edc': 1,
  35.         'safety': 3,
  36.  
  37.     },
  38.     "SAE Term": {
  39.         'edc': 6,
  40.         'safety': 4,
  41.  
  42.     },
  43.     # "Preferred Term": {
  44.     #     'edc': 0,
  45.     #     'safety': 0,
  46.     #
  47.     # },
  48.     # "Date site became aware of event": {
  49.     #     'edc': 0,
  50.     #     'safety': 0,
  51.     #
  52.     # },
  53.     # "Stop Date": {
  54.     #     'edc': 0,
  55.     #     'safety': 0,
  56.     #
  57.     # },
  58.     # "Toxicity Grade": {
  59.     #     'edc': 0,
  60.     #     'safety': 0,
  61.     #
  62.     # },
  63.     # "Event Outcome": {
  64.     #     'edc': 0,
  65.     #     'safety': 0,
  66.     #
  67.     # },
  68.     # "Relationship to study treatment": {
  69.     #     'edc': 0,
  70.     #     'safety': 0,
  71.     #
  72.     # },
  73.     # "Action taken with study": {
  74.     #     'edc': 0,
  75.     #     'safety': 0,
  76.     #
  77.     # },
  78.     # "SAE criteria": {
  79.     #     'edc': 0,
  80.     #     'safety': 0,
  81.     #
  82.     # },
  83.     # "Date of death": {
  84.     #     'edc': 0,
  85.     #     'safety': 0,
  86.     #
  87.     # },
  88.     # "Date of Hospitalization": {
  89.     #     'edc': 0,
  90.     #     'safety': 0,
  91.     #
  92.     # },
  93.  
  94. }
  95.  
  96.  
  97. # funtion definations
  98. def main():
  99.     global sheetlen
  100.     data_array = []
  101.     patient_id = edc_df = pd.read_excel(edcfile, index_col=[2])
  102.     for k in table:
  103.         edc_col = table[k]['edc']
  104.         safety_col = table[k]['safety']
  105.         # query_template = table[k]['query']
  106.         # ------------------------------------------------------
  107.         edc_df = pd.read_excel(edcfile, index_col=[edc_col])
  108.         sheetlen = len(edc_df.index)
  109.         safety_df = pd.read_excel(safetyfiles, index_col=[safety_col])
  110.         print("**************generating .xlsx for ************")
  111.         print(k)
  112.         print("*****************Done!*************************")
  113.         for row in range(sheetlen):
  114.             res = fuzzySmiles(patient_id.index[row],
  115.                               edc_df.index[row], safety_df.index[row], similarity_threshold, k)
  116.             data_array.append(res)
  117.         # write new sheet to output file
  118.         create_or_append_xlsx("output.xlsx", k, data_array)
  119.         data_array.clear()
  120.  
  121.  
  122. # using fuzzywuzzy for similarity check
  123. def fuzzySmiles(patient_id, str1, str2, threshold, col):
  124.     query = ''
  125.     val = fuzz.token_sort_ratio(str1, str2)
  126.     str1_len = len(str(str1))
  127.     str2_len = len(str(str2))
  128.  
  129.     r_val = False
  130.     if val >= threshold:
  131.         r_val = True
  132.     else:
  133.         r_val = False
  134.         query = query_temp.substitute(
  135.             {'column': col, "val": str1, "value": str2})
  136.         if val == 0:
  137.             l = str1_len > str2_len
  138.             if l:
  139.                 query = query_temp.substitute(
  140.                     {'column': col, "val": str1, "value": "Missing"})
  141.     return [patient_id, str1, str2, r_val, val, query, ]
  142.  
  143.  
  144. def create_or_append_xlsx(filepath, sheetname, data):
  145.  
  146.     df = pd.DataFrame(
  147.         data, columns=["Patient_ID", "Edc_field", "Safety_field", "Similarity", "Score", "Query"])
  148.     with ExcelWriter(sheetname + "_gen_.xlsx", engine="xlsxwriter", mode='w') as writer:
  149.         df.to_excel(writer, sheet_name=sheetname)
  150.         writer.close()
  151.  
  152.  
  153. # python run as main script
  154. if __name__ == "__main__":
  155.     main()
  156.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement