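# pro  -- stray token from the original file header; kept as a comment so the script does not raise NameError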

import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

# Locations of the pre-formatted train / test CSV files (raw GitHub content).
url_train = 'https://raw.githubusercontent.com/sak1b0/proteiNN/master/train_formatted.csv'
url_test = 'https://raw.githubusercontent.com/sak1b0/proteiNN/master/test_formatted.csv'

# Download both files (train first, then test). header=None makes pandas
# treat the first row as data, and np.asarray turns each frame into a plain
# 2-D NumPy array.
df_train = np.asarray(pd.read_csv(url_train, header=None))
df_test = np.asarray(pd.read_csv(url_test, header=None))

# Binarizer instance used by the encoding steps further down the script.
one_hot = MultiLabelBinarizer()

# Column 0 holds the input strings and column 1 the label strings.
# These are basic slices, i.e. views: writing into X_*/y_* element-wise
# also writes into the corresponding df_* array.
X_train, y_train = df_train[:, 0], df_train[:, 1]
X_test, y_test = df_test[:, 0], df_test[:, 1]

def debug_me():
  """Print the shapes of the module-level train/test arrays.

  Reads the globals df_train, X_train, y_train, df_test, X_test and y_test
  and prints one "<label> <shape>" line for each, train first, then test.
  """
  labelled_arrays = (
    ('train dataframe: ', df_train),
    ('X train: ', X_train),
    ('y train: ', y_train),
    ('test dataframe: ', df_test),
    ('X test: ', X_test),
    ('y test: ', y_test),
  )
  for label, array in labelled_arrays:
    print(label, array.shape)


# Show the raw array shapes before any padding / encoding happens.
debug_me()

# Length of the longest input string in the training set; the encoding steps
# later in the script pad to this width. default=0 keeps the original result
# for an empty set (the hand-written running maximum started at 0).
max_length_train = max((len(sequence) for sequence in X_train), default=0)
print('Maximum length of train: ', max_length_train)

# Same measurement for the test set.
max_length_test = max((len(sequence) for sequence in X_test), default=0)
print('Maximum length of test: ', max_length_test)

#max_length_train=max(max_length_train,max_length_test)


_PAD_CHAR = 'Z'  # filler appended to short sequences so they share one length


def _pad(sequence, length):
  """Return `sequence` right-padded with _PAD_CHAR to `length` characters.

  A sequence that is already longer than `length` is returned unchanged,
  because multiplying a string by a negative count yields ''.
  """
  return sequence + _PAD_CHAR * (length - len(sequence))


def _encode_in_place(column, encoder, length):
  """Overwrite every string in `column` with its padded one-hot matrix.

  `encoder` must already be fitted. Each character of the padded string is
  treated as one sample, so an entry becomes a 2-D array with one row per
  character and one column per class known to `encoder`.
  """
  for idx, sequence in enumerate(column):
    column[idx] = encoder.transform(_pad(sequence, length))


# Fit each encoder exactly ONCE, on the training alphabet plus the pad
# character. Calling fit_transform separately on every sequence (as this
# script used to) re-learns the class set per sample, so the number of
# columns and the meaning of each column changed from one sequence to the
# next and between train and test.
# Inputs and labels use different alphabets, hence two encoders.
one_hot.fit(list(X_train) + [_PAD_CHAR])
label_one_hot = MultiLabelBinarizer().fit(list(y_train) + [_PAD_CHAR])

# NOTE(review): everything is padded to max_length_train. A test sequence
# longer than that is left at its own length -- confirm whether the pad width
# should be max(max_length_train, max_length_test) instead.
# NOTE(review): characters that occur only in the test set are unknown to the
# fitted encoders; scikit-learn is expected to warn and emit an all-zero row
# for them -- confirm this is acceptable.
for column, encoder in (
    (X_train, one_hot),
    (y_train, label_one_hot),
    (X_test, one_hot),
    (y_test, label_one_hot),
):
  _encode_in_place(column, encoder, max_length_train)


#print(X_train[0])
#print(y_train[0])


# Report the array shapes again now that padding / encoding has run, then
# show the shape of the first encoded input and the first encoded label.
print('\nafter encoding: \n')
debug_me()

for first_encoded in (X_train[0], y_train[0]):
  print(first_encoded.shape)