# After the merge: one-hot encode the data

# -*- coding: utf-8 -*-
"""
Created on Sat Dec  7 03:53:43 2019

@author: Tejas

The merged datasets based on trialPrimaryDrugs (not yet one-hot encoded) are one-hot encoded here and then imputed using the missingpy technique.
"""

import pandas as pd
import numpy as np
import ast

### Read the merged dataset and filter out unwanted columns
df_p3app = pd.read_csv('C:/Users/Tejas/Downloads/David JnJ/DataBank/MergedonPrimaryDrugs.csv')

# Start from a fresh 0..n-1 index, then remove the columns that are not wanted downstream.
# `axis` is passed by keyword: the positional form `drop(labels, 1)` was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df_p3app = df_p3app.reset_index(drop=True).drop([
    'Unnamed: 0_x', 'Unnamed: 0_y', 'trialId', 'drugId', 'trialSponsors',
    'trialTherapeuticAreas', 'trialPhase', 'trialPrimaryDrugsTested', 'trialIds'], axis=1)

#df_p3app['globalStatus']=((df_p3app['globalStatus']=='Launched')|(df_p3app['globalStatus']=='Registered'))*1


### Now lets drop GlobalStatus, Approved and Active columns because they are highly correlated with response variable
df_p3app = df_p3app.drop(['Active', 'globalStatus', 'Approved'], axis=1)


# Per-column count of missing values. The result is not stored, so this line
# only shows something when run in an interactive console.
df_p3app.isnull().sum().reset_index()


######## One hot encode the whole set
from sklearn.preprocessing import MultiLabelBinarizer

# Positional indices (into df_p3app.columns) of the columns whose cells are
# parsed with ast.literal_eval, i.e. cells holding the text of a Python list.
categorical_list = [46, 48, 49, 50, 1, 2, 5, 8, 10]
# Positional indices of the columns that are encoded with pd.get_dummies.
categorical_nonlist = [47, 3, 4]


def _parse_list_cell(cell, column_name):
    """Convert one raw cell of a list-valued column into a list of labels.

    NaN             -> ["Missing"]  (marker, turned into an all-NaN row later)
    the text '[]'   -> [column_name + '_others']
    anything else   -> ast.literal_eval(cell)
    """
    if cell != cell:  # NaN is the only value that is not equal to itself
        return ["Missing"]
    if cell == '[]':
        return [column_name + '_others']
    return ast.literal_eval(cell)


df_p3app_new = pd.DataFrame()
for i, column_name in enumerate(df_p3app.columns):
    column = df_p3app[column_name]

    if i in categorical_list:
        labels = column.apply(lambda cell: _parse_list_cell(cell, column_name))
        mlb = MultiLabelBinarizer()
        # Reuse the source index so the concat below aligns row by row.
        temp = pd.DataFrame(mlb.fit_transform(labels), columns=mlb.classes_,
                            index=df_p3app.index)
        if "Missing" in temp.columns:
            # Rows that were NaN in the source become NaN in every indicator
            # column. Cast to float first: NaN cannot live in an integer column.
            temp = temp.astype(float)
            temp.loc[temp['Missing'] == 1, :] = np.nan
            temp = temp.drop('Missing', axis=1)
        df_p3app_new = pd.concat([df_p3app_new, temp], axis=1)

    # `elif` (not a second `if`): previously the trailing `else` was attached
    # only to the non-list check, so every list-valued column was additionally
    # copied in its raw form and had to be removed again afterwards.
    elif i in categorical_nonlist:
        # Pin the dtype so the indicators are 0/1 integers on every pandas
        # version (pandas >= 2.0 would otherwise produce booleans).
        temp = pd.get_dummies(column, dtype=np.uint8)
        temp = temp.rename(columns={'others': 'others' + str(i)})
        # get_dummies leaves an all-zero row for a missing value; mark those
        # rows as missing in every indicator column.
        no_category = temp.sum(axis=1) == 0
        if no_category.any():
            temp = temp.astype(float)
            temp.loc[no_category, :] = np.nan
        df_p3app_new = pd.concat([df_p3app_new, temp], axis=1)

    else:
        # Any other column is carried over unchanged.
        df_p3app_new[column_name] = column

# NOTE(review): presumably the names of the columns at the `categorical_list`
# positions above -- confirm against the input file.
categorical_listnames = ['deliveryRoutes', 'deliveryMediums', 'targets1', 'pharmaTarget_families', 'trialStudyKeywords',
                         'sponsorType', 'trialOutcomes', 'trialCountries', 'trialTags']

### remove categorical list name columns from new df.
# The raw list columns are no longer copied by the loop, so only drop the
# names that are actually present (e.g. an indicator column that happens to
# carry one of these names).
leftover_columns = [name for name in categorical_listnames if name in df_p3app_new.columns]
df_p3app_new.drop(leftover_columns, axis=1, inplace=True)
df_p3app_new.reset_index(drop=True, inplace=True)

# Binary response: 1 where drugApprovalStatus equals 'Approved', 0 everywhere else.
df_p3app_new['drugApprovalStatus'] = (
    df_p3app_new['drugApprovalStatus'].eq('Approved').mul(1)
)

# Write the encoded frame to disk, leaving the index out of the file.
df_p3app_new.to_csv('MergedPrimaryOnehot.csv', index=False)
##################### We could remove the 'others' columns if they are not that important.

################## Missingpy Imputation (takes 1 hour)
from time import time

# Reload the one-hot encoded table from disk and keep its column names in `col`.
df_p3app_new = pd.read_csv('MergedPrimaryOnehot.csv')
col = df_p3app_new.columns.tolist()

#### Author's note: missingpy raises an error here (not specified which),
#### whereas fancyimpute works.
from missingpy import KNNImputer

imputer1 = KNNImputer(
    n_neighbors=5,
    metric='gower',
    n_jobs=1,
)
imp = imputer1.fit(df_p3app_new)

# Time the transform step and print the elapsed seconds.
# NOTE(review): df_p3app_imp is assigned again by the fancyimpute step that
# follows in this file -- confirm which of the two results is meant to be kept.
t = time()
df_p3app_imp = imp.transform(df_p3app_new)
print(time() - t)


from fancyimpute import KNN

# fancyimpute's KNN imputer, configured with k = 5.
imputer = KNN(k=5)

### fit_transform hands back a bare matrix, so wrap it in a DataFrame
### and put the original column names (saved in `col`) back on it.
df_p3app_imp = pd.DataFrame(imputer.fit_transform(df_p3app_new))
df_p3app_imp.columns = col