Advertisement
sak1b

pro

Apr 12th, 2019
190
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.36 KB | None | 0 0
  1. import pandas as pd
  2. import numpy as np
  3. from sklearn.preprocessing import MultiLabelBinarizer
  4.  
  5. url_train='https://raw.githubusercontent.com/sak1b0/proteiNN/master/train_formatted.csv'
  6. url_test='https://raw.githubusercontent.com/sak1b0/proteiNN/master/test_formatted.csv'
  7.  
  8. df_train=pd.read_csv(url_train,header=None)
  9. df_test=pd.read_csv(url_test,header=None)
  10.  
  11.  
  12. one_hot = MultiLabelBinarizer() #finally the ultimate solution
  13.  
  14. df_train=np.asarray(df_train)
  15. df_test=np.asarray(df_test)
  16.  
  17. X_train = df_train[:, 0]
  18. y_train = df_train[:, 1]
  19.  
  20. X_test = df_test[:, 0]
  21. y_test = df_test[:, 1]
  22.  
  23. def debug_me():
  24. print('train dataframe: ',df_train.shape)
  25. print('X train: ',X_train.shape)
  26. print('y train: ',y_train.shape)
  27.  
  28. print('test dataframe: ',df_test.shape)
  29. print('X test: ',X_test.shape)
  30. print('y test: ',y_test.shape)
  31.  
  32.  
  33. debug_me()
  34.  
  35. max_length_train=0
  36. for item in range (len(X_train)):
  37. max_length_train=max(max_length_train,len(X_train[item])) #finding maximum length of string in training
  38.  
  39.  
  40. print('Maximum length of train: ',max_length_train)
  41.  
  42. max_length_test=0
  43. for item in range (len(X_test)):
  44. max_length_test=max(max_length_test,len(X_test[item])) #finding maximum length of string in testing set
  45.  
  46. print('Maximum length of test: ',max_length_test)
  47.  
  48. #max_length_train=max(max_length_train,max_length_test)
  49.  
  50.  
  51.  
  52. for item in range (len(X_train)):
  53. X_train[item] = X_train[item]+'Z'*(max_length_train-len(X_train[item])) # padding with 'Z' to make all of them the same length
  54. X_train[item] = one_hot.fit_transform(X_train[item])
  55.  
  56.  
  57. for item in range (len(y_train)):
  58. y_train[item] = y_train[item]+'Z'*(max_length_train-len(y_train[item])) # padding with 'Z' to make all of them the same length
  59. y_train[item] = one_hot.fit_transform(y_train[item])
  60.  
  61.  
  62. for item in range (len(X_test)):
  63. X_test[item] = X_test[item]+'Z'*(max_length_train-len(X_test[item])) # padding with 'Z' to make all of them the same length
  64. X_test[item] =one_hot.fit_transform(X_test[item])
  65.  
  66.  
  67. for item in range (len(y_test)):
  68. y_test[item] = y_test[item]+'Z'*(max_length_train-len(y_test[item])) # padding with 'Z' to make all of them the same length
  69. y_test[item] = one_hot.fit_transform(y_test[item])
  70.  
  71.  
  72. #print(X_train[0])
  73. #print(y_train[0])
  74.  
  75.  
  76. print('\nafter encoding: \n')
  77.  
  78. debug_me()
  79. print(X_train[0].shape)
  80. print(y_train[0].shape)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement