lstm

import numpy as np
import os
import sys
import time
import pandas as pd
from tqdm._tqdm_notebook import tqdm_notebook
import pickle
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger
from keras import optimizers
# from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import logging
# import talos as ta

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
logging.getLogger("tensorflow").setLevel(logging.ERROR)
os.environ['TZ'] = 'Asia/Kolkata'  # set timezone; needed when running on cloud
time.tzset()
params = {
    "batch_size": 20,  # 20<16<10, 25 was a bust
    "epochs": 300,
    "lr": 0.0001,
    "time_steps": 60
}

iter_changes = "complete"

INPUT_PATH = "/content/drive/My Drive/VIP"
OUTPUT_PATH = "/content/drive/My Drive/output" + iter_changes
TIME_STEPS = params["time_steps"]
BATCH_SIZE = params["batch_size"]
stime = time.time()

# check if directory already exists
if not os.path.exists(OUTPUT_PATH):
    os.makedirs(OUTPUT_PATH)
    print("Directory created", OUTPUT_PATH)
else:
    raise Exception("Directory already exists. Don't override.")


def print_time(text, stime):
    seconds = (time.time() - stime)
    print(text, seconds // 60, "minutes : ", np.round(seconds % 60), "seconds")


def trim_dataset(mat, batch_size):
    """
    trims dataset to a size that's divisible by BATCH_SIZE
    """
    no_of_rows_drop = mat.shape[0] % batch_size
    if no_of_rows_drop > 0:
        return mat[:-no_of_rows_drop]
    else:
        return mat
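# Added sanity check (not in the original paste): trim_dataset keeps only
# full batches, which the stateful LSTM built below requires. With
# batch_size=20, a 105-row toy matrix loses its last 5 rows:
assert trim_dataset(np.zeros((105, 5)), 20).shape == (100, 5)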


def build_timeseries(mat, y_col_index):
    """
    Converts an ndarray into supervised time-series format: the first TIME_STEPS
    rows form one input window and the value right after the window (in column
    y_col_index) is its output, and so on for each successive window.
    :param mat: ndarray which holds the dataset
    :param y_col_index: index of the column which acts as the output
    :return: two ndarrays -- input and output -- in a format suitable to feed to an LSTM
    """
    # total number of time-series samples would be len(mat) - TIME_STEPS
    dim_0 = mat.shape[0] - TIME_STEPS
    dim_1 = mat.shape[1]
    x = np.zeros((dim_0, TIME_STEPS, dim_1))
    y = np.zeros((dim_0,))
    print("dim_0", dim_0)
    for i in tqdm_notebook(range(dim_0)):
        x[i] = mat[i:TIME_STEPS + i]
        y[i] = mat[TIME_STEPS + i, y_col_index]
        # if i < 10:
        #     print(i, "-->", x[i, -1, :], y[i])
    print("length of time-series i/o", x.shape, y.shape)
    return x, y
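# Added note: with TIME_STEPS = 60, x[0] holds rows 0..59 of `mat` and
# y[0] = mat[60, y_col_index], i.e. each label is the value one step after
# its input window, and len(x) == len(mat) - TIME_STEPS.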


stime = time.time()
print(os.listdir(INPUT_PATH))
df_ge = pd.read_csv(os.path.join(INPUT_PATH, "GE.csv"), engine='python')
print(df_ge.shape)
print(df_ge.columns)
display(df_ge.head(5))
tqdm_notebook.pandas(desc='Processing...')
# df_ge = process_dataframe(df_ge)
print(df_ge.dtypes)
train_cols = ["Open", "High", "Low", "Close", "Volume"]
df_train, df_test = train_test_split(df_ge, train_size=0.8, test_size=0.2, shuffle=False)
print("Train--Test size", len(df_train), len(df_test))

# scale the features with MinMaxScaler and build the arrays
x = df_train.loc[:, train_cols].values
min_max_scaler = MinMaxScaler()
x_train = min_max_scaler.fit_transform(x)
x_test = min_max_scaler.transform(df_test.loc[:, train_cols])
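# Added demo (not in the original paste; _demo_* names are illustrative):
# MinMaxScaler maps each column to [0, 1] using the *training* min/max,
# x_scaled = (x - data_min_) / data_range_, so the test split is scaled
# without lookahead into its own statistics.
_demo_col = np.array([[10.0], [20.0], [30.0]])
_demo_sc = MinMaxScaler().fit(_demo_col)
assert np.allclose(_demo_sc.transform(_demo_col), [[0.0], [0.5], [1.0]])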

print("Deleting unused dataframes of total size (KB)",
      (sys.getsizeof(df_ge) + sys.getsizeof(df_train) + sys.getsizeof(df_test)) // 1024)

del df_ge
del df_test
del df_train
del x
print("Are any NaNs present in train/test matrices?", np.isnan(x_train).any(), np.isnan(x_test).any())
x_t, y_t = build_timeseries(x_train, 3)
x_t = trim_dataset(x_t, BATCH_SIZE)
y_t = trim_dataset(y_t, BATCH_SIZE)
print("Batch trimmed size", x_t.shape, y_t.shape)

def create_model():
    lstm_model = Sequential()
    # input shape: (batch_size, timesteps, data_dim)
    lstm_model.add(LSTM(100, batch_input_shape=(BATCH_SIZE, TIME_STEPS, x_t.shape[2]),
                        dropout=0.0, recurrent_dropout=0.0, stateful=True,
                        return_sequences=True, kernel_initializer='random_uniform'))
    lstm_model.add(Dropout(0.4))
    lstm_model.add(LSTM(60, dropout=0.0))
    lstm_model.add(Dropout(0.4))
    lstm_model.add(Dense(20, activation='relu'))
    lstm_model.add(Dense(1, activation='sigmoid'))
    optimizer = optimizers.RMSprop(lr=params["lr"])
    # optimizer = optimizers.SGD(lr=0.000001, decay=1e-6, momentum=0.9, nesterov=True)
    lstm_model.compile(loss='mean_squared_error', optimizer=optimizer)
    return lstm_model
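# Added note: the first LSTM layer is stateful with a fixed batch_input_shape
# of (BATCH_SIZE, TIME_STEPS, features), so every array passed to fit() or
# predict() must have a row count divisible by BATCH_SIZE -- exactly what
# trim_dataset() guarantees above.
# create_model().summary()  # uncomment to inspect: first LSTM outputs (20, 60, 100)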


model = None
try:
    model = pickle.load(open("lstm_model", 'rb'))
    print("Loaded saved model...")
except FileNotFoundError:
    print("Model not found")


x_temp, y_temp = build_timeseries(x_test, 3)
x_val, x_test_t = np.split(trim_dataset(x_temp, BATCH_SIZE), 2)
y_val, y_test_t = np.split(trim_dataset(y_temp, BATCH_SIZE), 2)
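# Added note: the scaled test windows are split into two equal halves --
# x_val/y_val serve as the validation set during fit(), while x_test_t/y_test_t
# stay held out for the final evaluation below.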

print("Test size", x_test_t.shape, y_test_t.shape, x_val.shape, y_val.shape)

is_update_model = True
if model is None or is_update_model:
    from keras import backend as K
    print("Building model...")
    print("checking if GPU available", K.tensorflow_backend._get_available_gpus())
    model = create_model()

    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,
                       patience=40, min_delta=0.0001)

    mcp = ModelCheckpoint(os.path.join(OUTPUT_PATH, "best_model.h5"),
                          monitor='val_loss', verbose=1, save_best_only=True,
                          save_weights_only=False, mode='min', period=1)

    # Not used here, but leaving it as a reminder for the future
    r_lr_plat = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=30,
                                  verbose=0, mode='auto', min_delta=0.0001,
                                  cooldown=0, min_lr=0)

    csv_logger = CSVLogger(os.path.join(OUTPUT_PATH, 'training_log_' + time.ctime().replace(" ", "_") + '.log'),
                           append=True)

    history = model.fit(x_t, y_t, epochs=params["epochs"], verbose=2,
                        batch_size=BATCH_SIZE, shuffle=False,
                        validation_data=(trim_dataset(x_val, BATCH_SIZE),
                                         trim_dataset(y_val, BATCH_SIZE)),
                        callbacks=[es, mcp, csv_logger])

    print("saving model...")
    pickle.dump(model, open("lstm_model", "wb"))

# model.evaluate(x_test_t, y_test_t, batch_size=BATCH_SIZE)
y_pred = model.predict(trim_dataset(x_test_t, BATCH_SIZE), batch_size=BATCH_SIZE)
y_pred = y_pred.flatten()
y_test_t = trim_dataset(y_test_t, BATCH_SIZE)
error = mean_squared_error(y_test_t, y_pred)
print("Error is", error, y_pred.shape, y_test_t.shape)
print(y_pred[0:15])
print(y_test_t[0:15])

# convert the predicted values back to the range of the real data
y_pred_org = (y_pred * min_max_scaler.data_range_[3]) + min_max_scaler.data_min_[3]
# min_max_scaler.inverse_transform(y_pred)
y_test_t_org = (y_test_t * min_max_scaler.data_range_[3]) + min_max_scaler.data_min_[3]
# min_max_scaler.inverse_transform(y_test_t)
print(y_pred_org[0:15])
print(y_test_t_org[0:15])
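# Added check (not in the original paste; _pad is an illustrative name):
# index 3 is the "Close" column, so the two expressions above invert the
# MinMax scaling for that single column. The scaler's own inverse_transform
# expects a full 5-column array, hence the manual version; both agree:
_pad = np.zeros((len(y_pred), len(train_cols)))
_pad[:, 3] = y_pred
assert np.allclose(min_max_scaler.inverse_transform(_pad)[:, 3], y_pred_org)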

# Visualize the training loss
from matplotlib import pyplot as plt
plt.figure()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
# plt.show()
plt.savefig(os.path.join(OUTPUT_PATH, 'train_vis_BS_' + str(BATCH_SIZE) + "_" + time.ctime() + '.png'))

# load the saved best model from above
saved_model = load_model(os.path.join(OUTPUT_PATH, 'best_model.h5'))  # , "lstm_best_7-3-19_12AM",
print(saved_model)

y_pred = saved_model.predict(trim_dataset(x_test_t, BATCH_SIZE), batch_size=BATCH_SIZE)
y_pred = y_pred.flatten()
y_test_t = trim_dataset(y_test_t, BATCH_SIZE)
error = mean_squared_error(y_test_t, y_pred)
print("Error is", error, y_pred.shape, y_test_t.shape)
print(y_pred[0:15])
print(y_test_t[0:15])
y_pred_org = (y_pred * min_max_scaler.data_range_[3]) + min_max_scaler.data_min_[3]  # min_max_scaler.inverse_transform(y_pred)
y_test_t_org = (y_test_t * min_max_scaler.data_range_[3]) + min_max_scaler.data_min_[3]  # min_max_scaler.inverse_transform(y_test_t)
print(y_pred_org[0:15])
print(y_test_t_org[0:15])

# Visualize the prediction
from matplotlib import pyplot as plt
plt.figure()
plt.plot(y_pred_org)
plt.plot(y_test_t_org)
plt.title('Prediction vs Real Stock Price')
plt.ylabel('Price')
plt.xlabel('Days')
plt.legend(['Prediction', 'Real'], loc='upper left')
# plt.show()
plt.savefig(os.path.join(OUTPUT_PATH, 'pred_vs_real_BS' + str(BATCH_SIZE) + "_" + time.ctime() + '.png'))
print_time("program completed ", stime)


def mean_absolute_percentage_error(y_test_t_org, y_pred_org):
    y_test_t_org, y_pred_org = np.array(y_test_t_org), np.array(y_pred_org)
    return np.mean(np.abs((y_test_t_org - y_pred_org) / y_test_t_org)) * 100

print(mean_absolute_percentage_error(y_test_t_org, y_pred_org))