Advertisement
Guest User

After Merge, one hot encode data

a guest
Dec 7th, 2019
106
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.45 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Sat Dec  7 03:53:43 2019
  4.  
  5. @author: Tejas
  6.  
  7. The dataset merged on trialPrimaryDrugs is one-hot encoded and its missing values are then imputed with KNN (missingpy, then fancyimpute).
  8. """
  9.  
  10. import pandas as pd
  11. import numpy as np
  12. import ast
  13.  
  14. ### Read and filter unwanted data
  15. df_p3app=pd.read_csv('C:/Users/Tejas/Downloads/David JnJ/DataBank/MergedonPrimaryDrugs.csv')
  16.  
  17. df_p3app=df_p3app.reset_index(drop=True).drop([
  18.    'Unnamed: 0_x','Unnamed: 0_y','trialId','drugId','trialSponsors',
  19.     'trialTherapeuticAreas', 'trialPhase', 'trialPrimaryDrugsTested', 'trialIds'],1)
  20.  
  21. #df_p3app['globalStatus']=((df_p3app['globalStatus']=='Launched')|(df_p3app['globalStatus']=='Registered'))*1
  22.  
  23.  
  24.  
  25.  
  26. ### Now lets drop GlobalStatus, Approved and Active columns because they are highly correlated with response variable
  27. df_p3app = df_p3app.drop(['Active', 'globalStatus', 'Approved'], 1)
  28.  
  29.  
  30.  
  31. df_p3app.isnull().sum().reset_index()
  32.  
  33.  
  34.  
  35. ######## One hot encode the whole set
  36. from sklearn.preprocessing import MultiLabelBinarizer
  37.  
  38. categorical_list = [46, 48, 49, 50, 1, 2, 5, 8, 10]
  39. categorical_nonlist = [47, 3, 4]
  40.  
  41. df_p3app_new=pd.DataFrame()
  42. for i in range(df_p3app.shape[1]):
  43.    
  44.     if i in categorical_list:
  45.         temp=df_p3app[df_p3app.columns[i]].apply(lambda x:ast.literal_eval(x) if x==x and x!='[]' else [df_p3app.columns[i]+'_others']
  46.                                                  if x=='[]' else ["Missing"])
  47.         mlb=MultiLabelBinarizer()
  48.         temp=pd.DataFrame(mlb.fit_transform(temp),columns=mlb.classes_)
  49.         if "Missing" in temp.columns:
  50.             temp.loc[temp['Missing']==1,:]=np.nan
  51.             temp=temp.drop('Missing',1)
  52.         df_p3app_new=pd.concat([df_p3app_new,temp],1)
  53.    
  54.    
  55.     if i in categorical_nonlist:
  56.        
  57.         temp = pd.get_dummies(df_p3app[df_p3app.columns[i]])
  58.         temp = temp.rename(columns = {'others': 'others'+ str(i)})
  59.         for i1 in range(0, temp.shape[0]):
  60.             if(sum(temp.iloc[i1,:]) == 0):
  61.                 temp.iloc[i1,:] = np.nan
  62.         df_p3app_new=pd.concat([df_p3app_new,temp],1)
  63.    
  64.     else:
  65.         df_p3app_new[df_p3app.columns[i]]=df_p3app[df_p3app.columns[i]]
  66.  
  67. categorical_listnames =  ['deliveryRoutes', 'deliveryMediums', 'targets1', 'pharmaTarget_families', 'trialStudyKeywords',
  68.                           'sponsorType', 'trialOutcomes', 'trialCountries', 'trialTags']
  69.  
  70. ### remove categorical list name columns from new df.
  71. df_p3app_new.drop(categorical_listnames, axis = 1, inplace = True)
  72. df_p3app_new.reset_index(drop = True, inplace = True)
  73.  
  74. df_p3app_new['drugApprovalStatus'] = ((df_p3app_new['drugApprovalStatus']=='Approved'))*1
  75.  
  76.  
  77. df_p3app_new.to_csv('MergedPrimaryOnehot.csv', index = False)
  78. ##################### We could remove others if thats not so imp.
  79.  
  80. ################## Missingpy Imputation (takes 1 hour)
  81. df_p3app_new = pd.read_csv('MergedPrimaryOnehot.csv')
  82.  
  83. col = list(df_p3app_new.columns)
  84.  
  85. #### so missingpy has that error, but fancy impute works.
  86. from missingpy import KNNImputer
  87.  
  88. imputer1 = KNNImputer(n_neighbors = 5, metric='gower', n_jobs=1)
  89. imp = imputer1.fit(df_p3app_new)
  90.  
  91.  
  92.  
  93.  
  94. from time import time
  95. t = time()
  96. df_p3app_imp = imp.transform(df_p3app_new)
  97. print(time()-t)
  98.  
  99.  
  100.  
  101.  
  102.  
  103. from fancyimpute import KNN
  104.  
  105. imputer = KNN(k = 5)
  106. df_p3app_imp = imputer.fit_transform(df_p3app_new)
  107.  
  108. ### Resultant Array is a matrix, so make it back into a df.
  109. df_p3app_imp = pd.DataFrame(df_p3app_imp)
  110. df_p3app_imp.columns = col
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement