Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Created on Sat Dec 7 03:53:43 2019
- @author: Tejas
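- Takes the merged dataset built on trialPrimaryDrugs (not yet one-hot encoded), one-hot encodes its categorical columns, and imputes the missing values with KNN imputation (missingpy / fancyimpute).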
- """
- import pandas as pd
- import numpy as np
- import ast
### Read and filter unwanted data
# NOTE: hard-coded, machine-specific location of the merged input file.
MERGED_CSV_PATH = 'C:/Users/Tejas/Downloads/David JnJ/DataBank/MergedonPrimaryDrugs.csv'

# Columns removed before encoding (not used by the rest of this script).
UNWANTED_COLUMNS = [
    'Unnamed: 0_x', 'Unnamed: 0_y', 'trialId', 'drugId', 'trialSponsors',
    'trialTherapeuticAreas', 'trialPhase', 'trialPrimaryDrugsTested', 'trialIds',
]

# GlobalStatus, Approved and Active are dropped because they are highly
# correlated with the response variable.
RESPONSE_CORRELATED_COLUMNS = ['Active', 'globalStatus', 'Approved']

df_p3app = pd.read_csv(MERGED_CSV_PATH)

# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally
# in DataFrame.drop.
df_p3app = df_p3app.reset_index(drop=True).drop(UNWANTED_COLUMNS, axis=1)

# Abandoned alternative (kept for reference): binarise globalStatus instead of dropping it.
#df_p3app['globalStatus']=((df_p3app['globalStatus']=='Launched')|(df_p3app['globalStatus']=='Registered'))*1
df_p3app = df_p3app.drop(RESPONSE_CORRELATED_COLUMNS, axis=1)

# Per-column count of missing values. The result is not stored, so it is only
# visible when this line is evaluated interactively.
df_p3app.isnull().sum().reset_index()
######## One hot encode the whole set
from sklearn.preprocessing import MultiLabelBinarizer

# Positional indices (into df_p3app.columns) of the columns whose cells hold a
# stringified Python list of labels (parsed below with ast.literal_eval).
categorical_list = [46, 48, 49, 50, 1, 2, 5, 8, 10]
# Positional indices of the columns that are one-hot encoded with pd.get_dummies.
categorical_nonlist = [47, 3, 4]
# Names of raw list-valued columns that must not survive in the encoded frame.
categorical_listnames = ['deliveryRoutes', 'deliveryMediums', 'targets1', 'pharmaTarget_families', 'trialStudyKeywords',
                         'sponsorType', 'trialOutcomes', 'trialCountries', 'trialTags']


def _parse_label_cell(cell, column_name):
    """Turn one raw cell of a list-valued column into a list of labels.

    * NaN cell            -> ["Missing"] (sentinel, converted to NaN later)
    * the string '[]'     -> ["<column_name>_others"]
    * anything else       -> ast.literal_eval(cell)
    """
    if cell != cell:  # NaN is the only value that is not equal to itself
        return ["Missing"]
    if cell == '[]':
        return [column_name + '_others']
    return ast.literal_eval(cell)


def _one_hot_label_lists(column, column_name):
    """One-hot encode a column of stringified label lists.

    Returns a DataFrame with one indicator column per distinct label. Rows
    whose source cell was NaN are NaN in every indicator column, so they can
    be imputed later instead of being read as "no label".
    """
    labels = [_parse_label_cell(cell, column_name) for cell in column]
    mlb = MultiLabelBinarizer()
    encoded = pd.DataFrame(mlb.fit_transform(labels), columns=mlb.classes_, index=column.index)
    if "Missing" in encoded.columns:
        missing_rows = (encoded["Missing"] == 1).values
        # Cast to float first so that writing NaN does not rely on an implicit
        # int -> float upcast during assignment.
        encoded = encoded.drop("Missing", axis=1).astype(float)
        encoded.loc[missing_rows, :] = np.nan
    return encoded


def _one_hot_single_value(column, position):
    """One-hot encode a single-valued categorical column with pd.get_dummies.

    A dummy column named 'others' is renamed to 'others<position>'. Rows with
    no active dummy (get_dummies emits all zeros for NaN cells) are set to NaN
    in every dummy column.
    """
    dummies = pd.get_dummies(column)
    dummies = dummies.rename(columns={'others': 'others' + str(position)})
    unassigned = (dummies.sum(axis=1) == 0).values
    if unassigned.any():
        dummies = dummies.astype(float)
        dummies.loc[unassigned, :] = np.nan
    return dummies


# Encode column by column, collect the pieces and concatenate once at the end
# instead of growing the frame inside the loop.
encoded_parts = []
for i, column_name in enumerate(df_p3app.columns):
    column = df_p3app.iloc[:, i]
    if i in categorical_list:
        encoded_parts.append(_one_hot_label_lists(column, column_name))
    elif i in categorical_nonlist:
        encoded_parts.append(_one_hot_single_value(column, i))
    else:
        # Every other column is carried over unchanged.
        encoded_parts.append(column)
df_p3app_new = pd.concat(encoded_parts, axis=1)

### remove categorical list name columns from new df.
# The raw list-valued columns are no longer copied into the new frame (the
# encoding branches above are mutually exclusive), so this is only a safety
# net; errors='ignore' keeps it from failing when the names are absent.
df_p3app_new = df_p3app_new.drop(categorical_listnames, axis=1, errors='ignore')
df_p3app_new = df_p3app_new.reset_index(drop=True)

# Response variable: 1 where drugApprovalStatus is 'Approved', otherwise 0.
df_p3app_new['drugApprovalStatus'] = (df_p3app_new['drugApprovalStatus'] == 'Approved').astype(int)
df_p3app_new.to_csv('MergedPrimaryOnehot.csv', index=False)
##################### We could remove others if thats not so imp.
################## KNN imputation of the one-hot encoded data
from time import perf_counter

# Which KNN implementation fills the missing values:
#   'fancyimpute' - the backend whose result this script has always kept.
#   'missingpy'   - original note: takes about 1 hour and "has that error".
# Previously the missingpy imputation always ran first and its result was then
# overwritten by the fancyimpute one; now only the selected backend runs.
IMPUTATION_BACKEND = 'fancyimpute'

df_p3app_new = pd.read_csv('MergedPrimaryOnehot.csv')
col = list(df_p3app_new.columns)

if IMPUTATION_BACKEND == 'missingpy':
    from missingpy import KNNImputer
    # NOTE(review): arguments kept exactly as in the original call; confirm
    # that the installed missingpy KNNImputer accepts metric='gower' and
    # n_jobs - this may be the error the original comment refers to.
    imputer1 = KNNImputer(n_neighbors = 5, metric='gower', n_jobs=1)
    imp = imputer1.fit(df_p3app_new)
    # Only the transform step is timed, as before.
    t = perf_counter()
    df_p3app_imp = imp.transform(df_p3app_new)
    print(perf_counter() - t)
elif IMPUTATION_BACKEND == 'fancyimpute':
    from fancyimpute import KNN
    imputer = KNN(k = 5)
    df_p3app_imp = imputer.fit_transform(df_p3app_new)
else:
    raise ValueError('Unknown IMPUTATION_BACKEND: ' + repr(IMPUTATION_BACKEND))

### Resultant Array is a matrix, so make it back into a df with the original column names.
df_p3app_imp = pd.DataFrame(df_p3app_imp, columns=col)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement