Not a member of Pastebin yet?
                        Sign Up,
                        it unlocks many cool features!                    
                - import pandas as pd
 - import numpy as np
 - from sklearn.preprocessing import MultiLabelBinarizer
 - url_train='https://raw.githubusercontent.com/sak1b0/proteiNN/master/train_formatted.csv'
 - url_test='https://raw.githubusercontent.com/sak1b0/proteiNN/master/test_formatted.csv'
 - df_train=pd.read_csv(url_train,header=None)
 - df_test=pd.read_csv(url_test,header=None)
 - one_hot = MultiLabelBinarizer() #finally the ultimate solution
 - df_train=np.asarray(df_train)
 - df_test=np.asarray(df_test)
 - X_train = df_train[:, 0]
 - y_train = df_train[:, 1]
 - X_test = df_test[:, 0]
 - y_test = df_test[:, 1]
 - def debug_me():
 - print('train dataframe: ',df_train.shape)
 - print('X train: ',X_train.shape)
 - print('y train: ',y_train.shape)
 - print('test dataframe: ',df_test.shape)
 - print('X test: ',X_test.shape)
 - print('y test: ',y_test.shape)
 - debug_me()
 - max_length_train=0
 - for item in range (len(X_train)):
 - max_length_train=max(max_length_train,len(X_train[item])) #finding maximum length of string in training
 - print('Maximum length of train: ',max_length_train)
 - max_length_test=0
 - for item in range (len(X_test)):
 - max_length_test=max(max_length_test,len(X_test[item])) #finding maximum length of string in testing set
 - print('Maximum length of test: ',max_length_test)
 - #max_length_train=max(max_length_train,max_length_test)
 - for item in range (len(X_train)):
 - X_train[item] = X_train[item]+'Z'*(max_length_train-len(X_train[item])) # padding with 'Z' to make all of them the same length
 - X_train[item] = one_hot.fit_transform(X_train[item])
 - for item in range (len(y_train)):
 - y_train[item] = y_train[item]+'Z'*(max_length_train-len(y_train[item])) # padding with 'Z' to make all of them the same length
 - y_train[item] = one_hot.fit_transform(y_train[item])
 - for item in range (len(X_test)):
 - X_test[item] = X_test[item]+'Z'*(max_length_train-len(X_test[item])) # padding with 'Z' to make all of them the same length
 - X_test[item] =one_hot.fit_transform(X_test[item])
 - for item in range (len(y_test)):
 - y_test[item] = y_test[item]+'Z'*(max_length_train-len(y_test[item])) # padding with 'Z' to make all of them the same length
 - y_test[item] = one_hot.fit_transform(y_test[item])
 - #print(X_train[0])
 - #print(y_train[0])
 - print('\nafter encoding: \n')
 - debug_me()
 - print(X_train[0].shape)
 - print(y_train[0].shape)
 
Advertisement
 
                    Add Comment                
                
                        Please, Sign In to add comment