Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- from sklearn.preprocessing import MultiLabelBinarizer
# Remote CSV files holding the formatted train / test splits.
url_train = 'https://raw.githubusercontent.com/sak1b0/proteiNN/master/train_formatted.csv'
url_test = 'https://raw.githubusercontent.com/sak1b0/proteiNN/master/test_formatted.csv'

# The files are read without a header row, then converted to plain NumPy
# arrays. NOTE: the names df_train / df_test are kept even though they end
# up bound to ndarrays, because later code refers to them by these names.
df_train = np.asarray(pd.read_csv(url_train, header=None))
df_test = np.asarray(pd.read_csv(url_test, header=None))

# Encoder used further down to one-hot encode the sequences.
one_hot = MultiLabelBinarizer()  # finally the ultimate solution

# Column 0 is used as the model input, column 1 as the target.
# These are slices of df_train / df_test (basic indexing), so in-place
# element assignments later on also show up in the parent arrays.
X_train, y_train = df_train[:, 0], df_train[:, 1]
X_test, y_test = df_test[:, 0], df_test[:, 1]
def debug_me():
    """Print the shapes of the train/test arrays and their X/y columns.

    Reads the module-level names df_train, X_train, y_train, df_test,
    X_test and y_test; returns nothing.
    """
    labelled_arrays = (
        ('train dataframe: ', df_train),
        ('X train: ', X_train),
        ('y train: ', y_train),
        ('test dataframe: ', df_test),
        ('X test: ', X_test),
        ('y test: ', y_test),
    )
    for label, array in labelled_arrays:
        print(label, array.shape)
# Show the array shapes before any padding / encoding takes place.
debug_me()

# Longest string in the training inputs (0 when there are none).
max_length_train = max(map(len, X_train), default=0)
print('Maximum length of train: ', max_length_train)

# Longest string in the testing inputs (0 when there are none).
max_length_test = max(map(len, X_test), default=0)
print('Maximum length of test: ', max_length_test)
# Pad to the longest input sequence found in EITHER split. Using only the
# training maximum would leave any longer test sequence unpadded (a negative
# repeat count yields an empty pad), so the splits would end up with
# different lengths.
pad_length = max(max_length_train, max_length_test)

# Right-pad every string with 'Z' so that all of them share the same length.
# NOTE(review): the targets are padded to the width derived from the inputs,
# which assumes each target string is no longer than its input - confirm.
for split in (X_train, y_train, X_test, y_test):
    for index, sequence in enumerate(split):
        split[index] = sequence.ljust(pad_length, 'Z')

# Fit each encoder exactly ONCE, on the training split only, so that every
# sample is encoded with the same columns in the same order. Re-fitting per
# sequence (fit_transform inside the loop) gives each sample its own
# vocabulary, and therefore a different width and column meaning.
# Inputs and targets get separate encoders because their alphabets may
# differ. 'Z' is added explicitly so the padding symbol always has a column,
# even if no training string needed padding.
label_one_hot = MultiLabelBinarizer()
one_hot.fit([*X_train, 'Z'])
label_one_hot.fit([*y_train, 'Z'])

# Replace every padded string by its (pad_length, n_classes) indicator
# matrix: each character is treated as a one-element label set.
# Symbols that never occur in the training split are ignored by transform()
# (scikit-learn emits a warning) and produce an all-zero row.
encoding_plan = (
    (one_hot, X_train),
    (label_one_hot, y_train),
    (one_hot, X_test),
    (label_one_hot, y_test),
)
for encoder, split in encoding_plan:
    for index, sequence in enumerate(split):
        split[index] = encoder.transform(sequence)

print('\nafter encoding: \n')
debug_me()
print(X_train[0].shape)
print(y_train[0].shape)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement