Advertisement
Guest User

Untitled

a guest
Feb 16th, 2019
93
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.67 KB | None | 0 0
  1. import numpy as np
  2. from numpy import array
  3. import pandas as pd
  4.  
  5. import os
  6. import glob
  7.  
  8. import tensorflow as tf
  9. from keras.models import Sequential
  10. from keras.layers import Dense, Dropout, Conv2D, Activation, MaxPooling2D, Flatten
  11.  
  12.  
  13. from sklearn.model_selection import train_test_split
  14. from sklearn.preprocessing import LabelEncoder
  15. from sklearn.preprocessing import OneHotEncoder
  16.  
  17.  
  def create_all_data(dataset_dir="HMP_Dataset"):
      """Load every recording under *dataset_dir* as zero-padded lists of readings.

      Each entry of ``dataset_dir`` is treated as one label and every ``*.txt``
      file inside it as one sample whose lines hold whitespace-separated
      integers.  After all files are read, shorter samples are padded with
      ``[0, 0, 0]`` readings until they are as long as the longest sample.

      Args:
          dataset_dir: Root folder of the dataset.  Defaults to ``"HMP_Dataset"``,
              resolved against the current working directory.

      Returns:
          A list with one entry per file; each entry is a list of integer
          readings, one per non-blank line of the file (plus padding).
      """
      data = []
      # Labels are visited in os.listdir() order, the same order create_labels()
      # uses, so that sample i and label i keep describing the same file.
      for label in os.listdir(dataset_dir):
          pattern = os.path.join(dataset_dir, label, "*.txt")
          # glob returns files in arbitrary order; sorting makes the sample order
          # inside one label reproducible without affecting label alignment.
          for path in sorted(glob.glob(pattern)):
              with open(path) as handle:
                  # Blank lines would otherwise become empty readings and make
                  # the padded data ragged.
                  sample = [
                      [int(token) for token in line.split()]
                      for line in handle
                      if line.strip()
                  ]
              data.append(sample)

      max_len = max(map(len, data), default=0)
      for sample in data:
          # A fresh list per padded reading, so rows never share padding objects.
          sample.extend([0, 0, 0] for _ in range(max_len - len(sample)))
      return data
  43.  
  44.  
  def create_labels(dataset_dir="HMP_Dataset"):
      """Return one label string per ``*.txt`` file found under *dataset_dir*.

      Every entry of ``dataset_dir`` is used as a label name and is repeated once
      for each ``*.txt`` file it contains.  Entries without such files (plain
      files, empty folders) contribute nothing.

      Args:
          dataset_dir: Root folder of the dataset.  Defaults to ``"HMP_Dataset"``,
              resolved against the current working directory.

      Returns:
          A list of label names, grouped by label in ``os.listdir`` order.
      """
      labels = []
      for label in os.listdir(dataset_dir):
          pattern = os.path.join(dataset_dir, label, "*.txt")
          labels.extend([label] * len(glob.glob(pattern)))
      return labels
  53.  
  54.  
  def create_onehot_labels(labels):
      """One-hot encode a sequence of label names.

      The distinct labels are sorted and numbered ``0 .. n_classes - 1``; row *i*
      of the result has a single ``1.0`` in the column that belongs to
      ``labels[i]`` and ``0.0`` everywhere else.

      Args:
          labels: Sequence of label values (e.g. strings), one per sample.

      Returns:
          A dense ``float64`` array of shape ``(len(labels), n_classes)``.
      """
      values = np.asarray(labels).ravel()
      # Done with NumPy rather than scikit-learn's OneHotEncoder(sparse=False):
      # that keyword is rejected by recent scikit-learn releases, while
      # np.unique yields the same sorted class numbering on every version.
      classes, codes = np.unique(values, return_inverse=True)
      # Picking rows of the identity matrix by class index is the one-hot matrix.
      return np.eye(len(classes))[codes.ravel()]
  69.  
  70.  
  def create_np_labels():
      """Return the one-hot label matrix built from ``create_labels()``."""
      return create_onehot_labels(create_labels())
  74.  
  75.  
  def create_np_data_forreal():
      """Return the padded samples from ``create_all_data()`` as a NumPy array."""
      padded_samples = create_all_data()
      return array(padded_samples)
  79.  
  def create_np_data(one_d, two_d, three_d):
      """Return the leading part of the dataset as a flat 2-D float array.

      Args:
          one_d: Number of samples to keep.
          two_d: Number of readings to keep per sample.
          three_d: Number of values to keep per reading.

      Returns:
          A ``float64`` array of shape ``(one_d, two_d * three_d)`` in which each
          row is one sample with its kept readings laid end to end.

      Raises:
          IndexError: If more samples, readings or values are requested than the
              data returned by ``create_all_data()`` contains.
      """
      np_data = np.zeros((one_d, two_d, three_d))
      if np_data.size:
          loaded = np.asarray(create_all_data())
          too_large = loaded.ndim != 3 or any(
              want > have for want, have in zip(np_data.shape, loaded.shape)
          )
          if too_large:
              raise IndexError(
                  "requested shape %r exceeds loaded data of shape %r"
                  % (np_data.shape, loaded.shape)
              )
          # One slice assignment replaces an element-by-element Python loop over
          # every value, which is very slow at full dataset size.
          np_data[...] = loaded[:one_d, :two_d, :three_d]
      return np.reshape(np_data, (one_d, two_d * three_d))
  89.  
  90.  
  def create_np_csv(two_d):
      """Split the dataset into train/validation/test parts and save them as text.

      20% of the samples are held out as the test set; 20% of the remainder then
      becomes the validation set.  Both splits are shuffled and stratified on the
      labels.  The six resulting arrays are written to comma-separated files
      named ``x_train``, ``x_test``, ``y_train``, ``y_test``, ``x_val`` and
      ``y_val`` in the current working directory.

      Args:
          two_d: Number of readings kept per sample (the original author notes
              the maximum is 9318).
      """
      features = create_np_data(850, two_d, 3)
      targets = create_np_labels()

      x_train, x_test, y_train, y_test = train_test_split(
          features, targets, test_size=0.2, shuffle=True, stratify=targets)
      x_train, x_val, y_train, y_val = train_test_split(
          x_train, y_train, test_size=0.2, shuffle=True, stratify=y_train)

      outputs = (
          ('x_train', x_train),
          ('x_test', x_test),
          ('y_train', y_train),
          ('y_test', y_test),
          ('x_val', x_val),
          ('y_val', y_val),
      )
      for filename, matrix in outputs:
          np.savetxt(filename, matrix, delimiter=',', fmt='%0.0f')
  103.  
  104.  
  def create_cnn(size, num_cnn_layers):
      """Build and compile a small 2-D convolutional classifier.

      The network stacks ``num_cnn_layers`` convolution layers (3x3 kernels,
      32 * layer-number filters, ReLU, same padding), flattens the result and
      finishes with dense layers of 64 and 32 units (dropout 0.25 in between) and
      a 14-way softmax.  It is compiled with Adam and categorical cross-entropy.

      Original author's note: a windowing scheme and a better understanding of
      CNNs are still needed before this is put to use.

      Args:
          size: Input shape passed to the first convolution layer.
          num_cnn_layers: Number of convolution layers to stack.

      Returns:
          The compiled, untrained Keras model.
      """
      base_filters = 32
      kernel_shape = (3, 3)
      dense_width = 64

      model = Sequential()
      for depth in range(1, num_cnn_layers + 1):
          conv_options = {'activation': 'relu', 'padding': 'same'}
          if depth == 1:
              # Only the first layer has to be told the input shape.
              conv_options['input_shape'] = size
          model.add(Conv2D(base_filters * depth, kernel_shape, **conv_options))
          # NOTE(review): the source lost its indentation; pooling is assumed to
          # follow every convolution layer -- confirm against the author's intent.
          model.add(MaxPooling2D(pool_size=(2, 2)))

      model.add(Flatten())
      model.add(Dense(int(dense_width), activation='relu'))
      model.add(Dropout(0.25))
      model.add(Dense(int(dense_width / 2), activation='relu'))
      model.add(Dense(14, activation='softmax'))
      model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
      return model
  124.  
  125.  
  def create_fnn(x_train, y_train, input_dim, epochs):
      """Train a single-layer softmax classifier and return the fitted model.

      The model is one dense softmax layer compiled with Adam and categorical
      cross-entropy, fitted on the given data for ``epochs`` epochs.

      Args:
          x_train: Training features, one row per sample.
          y_train: One-hot training labels, one row per sample.
          input_dim: Currently unused; kept so existing callers keep working.
              No input shape is given to the layer, so Keras is left to pick the
              input width up from ``x_train`` when fitting.
          epochs: Number of training epochs.

      Returns:
          The fitted Keras model.
      """
      # Size the output layer from the one-hot labels instead of hard-coding 14,
      # so the same code works for any number of classes.  Labels that are not a
      # 2-D matrix fall back to the previous fixed width.
      label_shape = np.shape(y_train)
      n_classes = int(label_shape[1]) if len(label_shape) == 2 else 14

      model = Sequential()
      # A hidden stack (Dense 14 with input_dim, Dense 64, Dropout 0.5) was tried
      # by the original author and is currently disabled.
      model.add(Dense(units=n_classes, activation='softmax'))
      model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
      model.fit(x_train, y_train, epochs=epochs)
      return model
  136.  
  137.  
  def cross_val_fnn(x_train, y_train, n_folds=10, epochs=200):
      """Train on repeated random 90/10 splits and keep the best model.

      Each round draws a fresh random split (10% held out), trains a model with
      ``create_fnn`` and evaluates it on the held-out part.  The rounds are
      independent random splits, not disjoint k-fold partitions.

      Args:
          x_train: Feature matrix, one row per sample.
          y_train: One-hot label matrix, one row per sample.
          n_folds: Number of train/evaluate rounds.  Defaults to 10.
          epochs: Training epochs per round.  Defaults to 200.

      Returns:
          A tuple ``(best_model, highest_accuracy, average_accuracy)`` where
          ``best_model`` is the first model that reached the highest validation
          accuracy.

      Raises:
          ValueError: If ``n_folds`` is smaller than 1.
      """
      if n_folds < 1:
          raise ValueError("n_folds must be at least 1")

      # Take the feature width from the data instead of a fixed 27954.
      input_dim = np.shape(x_train)[-1]
      accuracy = []
      highest_accuracy = 0
      best_model = None

      print("======================================================================================")
      for fold in range(n_folds):
          print("Training on Fold: ", fold + 1)
          seed = np.random.randint(1, 1000, 1)[0]
          x_t, x_val, y_t, y_val = train_test_split(
              x_train, y_train, test_size=0.1, random_state=seed)
          model = create_fnn(x_t, y_t, input_dim, epochs)
          score, accuracy_t = model.evaluate(x_val, y_val)
          print("Validation Score: " + str(score))
          print("Validation Accuracy: " + str(accuracy_t))
          print()
          accuracy.append(accuracy_t)
          # Keep only the best model seen so far rather than every trained model;
          # the strict comparison preserves "first best wins" on ties.
          if best_model is None or accuracy_t > highest_accuracy:
              best_model = model
              highest_accuracy = accuracy_t

      average_accuracy = sum(accuracy) / len(accuracy)
      return best_model, highest_accuracy, average_accuracy
  164.  
  def stratify(x_train, y_train):
      """Shuffle the data and split it 80/20 with a randomly drawn seed.

      NOTE: despite the name, no ``stratify=`` argument is passed to
      ``train_test_split``, so class proportions are not enforced.

      Args:
          x_train: Feature matrix, one row per sample.
          y_train: Labels, one row per sample.

      Returns:
          A tuple ``(x_t, x_test, y_t, y_test)`` holding the 80% and 20% parts of
          the features and labels.
      """
      seed = np.random.randint(1, 1000, 1)[0]
      # The keyword is lowercase `shuffle`; the former `Shuffle=True` made every
      # call fail with a TypeError.
      x_t, x_test, y_t, y_test = train_test_split(
          x_train, y_train, test_size=0.2, shuffle=True, random_state=seed)
      return x_t, x_test, y_t, y_test
  168.  
  169.  
  # Script entry: load the dataset, run the repeated-split evaluation and report
  # the best and average validation accuracy.
  x_train = create_np_data_forreal()
  # Flatten every sample into one feature row.  The shape is derived from the
  # data; the script previously hard-coded it as (850, 27954).
  x_train = x_train.reshape(len(x_train), -1)
  y_train = create_np_labels()

  sess = tf.Session()
  try:
      best_mod, high_acc, avg_acc = cross_val_fnn(x_train, y_train)
      print(best_mod)
      print("Highest Validation Accuracy: "+str(high_acc))
      print("Average Validation Accuracy: "+str(avg_acc))
  finally:
      # Release the session even when training fails part-way through.
      sess.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement