Advertisement
sak1b

damn_son

May 5th, 2019
182
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.07 KB | None | 0 0
  1. from keras.models import Sequential
  2. from keras.layers import Dense
  3. from keras.layers import LSTM
  4. from keras import losses
  5. import pandas as pd
  6. import numpy as np
  7. from numpy import argmax
  8. import matplotlib.pyplot as plt
  9. import time
  10.  
  11. prop = {1:[1.8,-0.17,0.11,0,0.38,-0.21,-1.6,0.42,-0.27,1.12,0.61],
  12. 18:[-4.5,-0.81,2.58,3.71,-2.57,2.11,12.3,-1.56,1.87,-2.55,0.6],
  13. 14:[-3.5,-0.42,2.05,3.47,-1.62,0.96,4.8,-1.03,0.81,-0.83,0.06],
  14. 4:[-3.5,-1.23,3.49,2.95,-3.27,1.36,9.2,-0.51,0.81,-0.83,0.46],
  15. 3:[2.5,0.24,-0.13,0.49,-0.3,-6.04,-2,0.84,-1.05,0.59,1.07],
  16. 17:[-3.5,-0.58,2.36,3.01,-1.84,1.52,4.1,-0.96,1.1,-0.78,0],
  17. 5:[-3.5,-2.02,2.68,1.64,-2.9,2.3,8.2,-0.37,1.17,-0.92,0.47],
  18. 7:[-0.4,-0.01,0.74,1.72,-0.19,0,-1,0,-0.16,1.2,0.07],
  19. 8:[-3.2,-0.96,2.06,4.76,-1.44,-1.23,3,-2.28,0.28,-0.93,0.61],
  20. 9:[4.5,0.31,-0.6,-1.56,1.97,-4.81,-3.1,1.81,-0.77,1.16,2.22],
  21. 12:[3.8,0.56,-0.55,-1.81,1.82,-4.68,-2.8,1.8,-1.1,1.18,1.53],
  22. 11:[-3.9,-0.99,2.71,5.39,-3.46,3.88,8.8,-2.03,1.7,-0.8,1.15],
  23. 13:[1.9,0.23,-0.1,-0.76,1.4,-3.66,-3.4,1.18,-0.73,0.55,1.18],
  24. 6:[2.8,1.13,-0.32,-2.2,1.98,-4.65,-3.7,1.74,-1.43,0.67,2.02],
  25. 16:[-1.6,-0.45,2.23,-1.52,-1.44,0.75,0.2,0.86,-0.75,0.54,1.95],
  26. 19:[-0.8,-0.13,0.84,1.83,-0.53,1.74,-0.6,-0.64,0.42,-0.05,0.05],
  27. 20:[-0.7,-0.14,0.52,1.78,-0.32,0.78,-1.2,-0.26,0.63,-0.02,0.05],
  28. 23:[-0.9,1.85,0.3,-0.38,1.53,-3.32,-1.9,1.46,-1.57,-0.19,2.65],
  29. 25:[-1.3,0.94,0.68,-1.09,0.49,-1.01,0.7,0.51,-0.56,-0.23,1.88],
  30. 22:[4.2,-0.07,-0.31,-0.78,1.46,-3.5,-2.6,1.34,-0.4,1.13,1.32],
  31. 26:[0,0,0,0,0,0,0,0,0,0,0]}
  32.  
  33. df_train=np.asarray(pd.read_csv('https://raw.githubusercontent.com/sak1b0/proteiNN/master/train_formatted.csv',header=None))
  34. df_test=np.asarray(pd.read_csv('https://raw.githubusercontent.com/sak1b0/proteiNN/master/test_formatted.csv',header=None))
  35.  
  36. x_train = df_train[:,0]
  37. y_train = df_train[:,1]
  38.  
  39. x_test = df_test[:,0]
  40. y_test = df_test[:,1]
  41.  
  42. def debug_me():
  43. #print('train dataframe: ',df_train.shape)
  44. print('x_train shape: ',x_train.shape)
  45. print('y_train shape: ',y_train.shape)
  46.  
  47. #print('test dataframe: ',df_test.shape)
  48. print('x_test shape: ',x_test.shape)
  49. print('y_test shape: ',y_test.shape)
  50.  
  51. max_len=400
  52.  
  53. #================== x_train ===============
  54. n = x_train
  55. j=-1
  56.  
  57. for i in x_train:
  58. j=j+1
  59. if(len(i)>max_len):
  60. n = np.delete(n, j)
  61. j=j-1
  62.  
  63. for item in range (len(n)):
  64. n[item] = n[item]+'Z'*(max_len-len(n[item]))
  65.  
  66. x_train = n
  67.  
  68. #================= y_train =================
  69. n = y_train
  70. j=-1
  71.  
  72. for i in y_train:
  73. j=j+1
  74. if(len(i)>max_len):
  75. n = np.delete(n, j)
  76. j=j-1
  77.  
  78. for item in range (len(n)):
  79. n[item] = n[item]+'Z'*(max_len-len(n[item]))
  80.  
  81. y_train = n
  82. #================= x_test ==================
  83. n = x_test
  84. j=-1
  85.  
  86. for i in x_test:
  87. j=j+1
  88. if(len(i)>max_len):
  89. n = np.delete(n, j)
  90. j=j-1
  91.  
  92. for item in range (len(n)):
  93. n[item] = n[item]+'Z'*(max_len-len(n[item]))
  94.  
  95. x_test = n
  96. #================= y_test ==================
  97. n = y_test
  98. j=-1
  99.  
  100. for i in y_test:
  101. j=j+1
  102. if(len(i)>max_len):
  103. n = np.delete(n, j)
  104. j=j-1
  105.  
  106. for item in range (len(n)):
  107. n[item] = n[item]+'Z'*(max_len-len(n[item]))
  108.  
  109. y_test = n
  110.  
#============= selected data within range ===========
  112.  
  113.  
  114. max_len = max([len(i) for i in x_train])
  115. #print(max_len)
  116.  
  117. max_len = max([len(i) for i in y_test])
  118. #print(max_len)
  119.  
  120. print('starting the preprocessing\n')
  121. start_time = time.time()
  122.  
  123. #============== Properties Encoded start ============================
  124.  
  125. # ==========x_train conversion start====
  126. s = list(x_train)
  127.  
  128. k = []
  129.  
  130. for i in range(len(s)):
  131. t=[]
  132. for item in range(len(s[i])):
  133. t.append(prop[ord(s[i][item])-64])
  134. k.append(t)
  135.  
  136.  
  137. x_train = np.array(k)
  138.  
  139. #=========== x_train conversion end ====
  140.  
  141. #=========== x_test conversion start====
  142. s = list(x_test)
  143.  
  144. k = []
  145.  
  146. for i in range(len(s)):
  147. t=[]
  148. for item in range(len(s[i])):
  149. t.append(prop[ord(s[i][item])-64])
  150. k.append(t)
  151.  
  152.  
  153. x_test = np.array(k)
  154.  
  155. #============= x_test conversion end====
  156.  
  157.  
  158. #============== Properties Encoded end ============================
  159.  
  160.  
  161.  
  162. #============== ONE_HOT ===================================================
  163.  
  164. #======= y_train start========
  165. #y_train = y_train[0:3]
  166.  
  167. alphabet = 'CEHXZ'
  168.  
  169. char_to_int = dict((c, i) for i, c in enumerate(alphabet))
  170. int_to_char = dict((i, c) for i, c in enumerate(alphabet))
  171.  
  172. k = []
  173.  
  174. for i in range(len(y_train)):
  175. integer_encoded = [char_to_int[char] for char in y_train[i]]
  176.  
  177. onehot_encoded=list()
  178. for value in integer_encoded:
  179. letter = [0 for _ in range(len(alphabet))]
  180. letter[value] = 1
  181. onehot_encoded.append(letter)
  182.  
  183. k.append(onehot_encoded)
  184.  
  185. y_train = np.array(k)
  186. #display(y_train)
  187.  
  188. #======= y_train end========
  189.  
  190. #======= y_test start========
  191. #y_train = y_train[0:3]
  192.  
  193. alphabet = 'CEHXZ'
  194.  
  195. char_to_int = dict((c, i) for i, c in enumerate(alphabet))
  196. int_to_char = dict((i, c) for i, c in enumerate(alphabet))
  197.  
  198. k = []
  199.  
  200. for i in range(len(y_test)):
  201. integer_encoded = [char_to_int[char] for char in y_test[i]]
  202.  
  203. onehot_encoded=list()
  204. for value in integer_encoded:
  205. letter = [0 for _ in range(len(alphabet))]
  206. letter[value] = 1
  207. onehot_encoded.append(letter)
  208.  
  209. k.append(onehot_encoded)
  210.  
  211. y_test = np.array(k)
  212. #display(y_train)
  213.  
  214. #======= y_test end========
  215.  
  216. #============== ONE_HOT finish ============================
  217.  
  218. print('ending the preprocessing\n')
  219. finish_time=time.time()
  220. print ('Time taken to pre-process: ',round(finish_time - start_time,2),' seconds')
  221.  
  222. #============== ONE_HOT_INVERSION =========================================
  223.  
  224. #for i in range(len(y_train[0])):
  225. # inverted = int_to_char[argmax(y_train[0][i])]
  226. # print(inverted)
  227.  
  228. #================ it's time to learn============================
  229. debug_me()
  230.  
  231.  
  232. model=Sequential()
  233.  
  234. model.add(LSTM((5),batch_input_shape=(None,400,11),return_sequences=True,activation='softmax'))
  235.  
  236. model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
  237.  
  238. model.summary()
  239.  
  240. print(model.input_shape)
  241. print(model.output_shape)
  242.  
  243. history=model.fit(x_train,y_train,epochs=5,validation_data=(x_test,y_test))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement