Advertisement
emma4lil

Untitled

Sep 1st, 2020
987
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.73 KB | None | 0 0
  1. from string import Template
  2.  
  3. import pandas as pd
  4. from fuzzywuzzy import fuzz
  5. from pandas import DataFrame, ExcelWriter
  6.  
  7. # pip install openpyxl
  8. # pip install xlsxwriter
  9. # pip install Pandas
  10.  
  11. # define global variables
  12. folder = ''
  13. safetyfiles = folder + "safetyfile.xlsx"
  14. edcfile = folder + "edc.xlsx"
  15. safety_df = DataFrame
  16. edc_df = DataFrame
  17. sheetlen = 0
  18. query_temp = "Sponsor DM: per EDC $column is $val However, per Safety database $column is $value"
  19. query_temp = Template(query_temp)
  20.  
  21.  
  22. # configurations ***you can tweak these settings to adjust accuracy***
  23. similarity_threshold = 40
  24.  
  25. # field Mapping Object
  26. # TODO: complete the mappings
  27. table = {
  28.     "Patient_ID": {
  29.         'edc': 2,
  30.         'safety': 1,
  31.  
  32.     },
  33.     "Protocol #": {
  34.         'edc': 1,
  35.         'safety': 3,
  36.  
  37.     },
  38.     "SAE Term": {
  39.         'edc': 6,
  40.         'safety': 4,
  41.  
  42.     },
  43.     # "Preferred Term": {
  44.     #     'edc': 0,
  45.     #     'safety': 0,
  46.     #
  47.     # },
  48.     # "Date site became aware of event": {
  49.     #     'edc': 0,
  50.     #     'safety': 0,
  51.     #
  52.     # },
  53.     # "Stop Date": {
  54.     #     'edc': 0,
  55.     #     'safety': 0,
  56.     #
  57.     # },
  58.     # "Toxicity Grade": {
  59.     #     'edc': 0,
  60.     #     'safety': 0,
  61.     #
  62.     # },
  63.     # "Event Outcome": {
  64.     #     'edc': 0,
  65.     #     'safety': 0,
  66.     #
  67.     # },
  68.     # "Relationship to study treatment": {
  69.     #     'edc': 0,
  70.     #     'safety': 0,
  71.     #
  72.     # },
  73.     # "Action taken with study": {
  74.     #     'edc': 0,
  75.     #     'safety': 0,
  76.     #
  77.     # },
  78.     # "SAE criteria": {
  79.     #     'edc': 0,
  80.     #     'safety': 0,
  81.     #
  82.     # },
  83.     # "Date of death": {
  84.     #     'edc': 0,
  85.     #     'safety': 0,
  86.     #
  87.     # },
  88.     # "Date of Hospitalization": {
  89.     #     'edc': 0,
  90.     #     'safety': 0,
  91.     #
  92.     # },
  93.  
  94. }
  95.  
  96.  
  97. # funtion definations
  98. def main():
  99.     global sheetlen
  100.     data_array = []
  101.     for k in table:
  102.         edc_col = table[k]['edc']
  103.         safety_col = table[k]['safety']
  104.         query_template = table[k]['query']
  105.         # ------------------------------------------------------
  106.         edc_df = pd.read_excel(edcfile, index_col=[edc_col])
  107.         sheetlen = len(edc_df.index)
  108.         safety_df = pd.read_excel(safetyfiles, index_col=[safety_col])
  109.         print("**************Creating .xlsx for****************")
  110.         print(k)
  111.         print("*****************Done!*************************")
  112.         for row in range(sheetlen):
  113.             res = fuzzySmiles(
  114.                 edc_df.index[row], safety_df.index[row], similarity_threshold, k)
  115.             data_array.append(res)
  116.         # write new sheet to output file
  117.         create_or_append_xlsx("output.xlsx", k, data_array)
  118.         data_array.clear()
  119.  
  120.  
  121. # using fuzzywuzzy for similarity check
  122. def fuzzySmiles(str1, str2, threshold, col):
  123.     query = ''
  124.     val = fuzz.token_sort_ratio(str1, str2)
  125.     str1_len = len(str(str1))
  126.     str2_len = len(str(str2))
  127.  
  128.     r_val = False
  129.     if val >= threshold:
  130.         r_val = True
  131.     else:
  132.         r_val = False
  133.         query = query_temp.substitute(
  134.             {'column': col, "val": str1, "value": str2})
  135.         if val == 0:
  136.             l = str1_len > str2_len
  137.             if l:
  138.                 query = query_temp.substitute(
  139.                     {'column': col, "val": str1, "value": "Missing"})
  140.     return [str1, str2, r_val, val, query]
  141.  
  142.  
  143. def create_or_append_xlsx(filepath, sheetname, data):
  144.  
  145.     df = pd.DataFrame(
  146.         data, columns=["Edc_field", "Safety_field", "similarity", "score", "query"])
  147.     with ExcelWriter(sheetname + "_gen_.xlsx", engine="xlsxwriter", mode='w') as writer:
  148.         df.to_excel(writer, sheet_name=sheetname)
  149.         writer.close()
  150.  
  151.  
  152. # python run as main script
  153. if __name__ == "__main__":
  154.     main()
  155.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement