import random
import time
import warnings
warnings.filterwarnings("ignore")
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import numpy as np
import gym
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import TensorBoard
from collections import deque

env = gym.make('Pong-v0')
EPISODES = 300
MODEL_NAME = 'my_first_ddqn'

class TensorBoard_for_DQN(TensorBoard):
    # Overriding __init__ to set the initial step and writer
    # (we want one log file for all .fit() calls)
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.step = 1  # custom step counter; the stock TensorBoard callback restarts from 0 on every .fit()
        self.writer = tf.summary.create_file_writer(self.log_dir)
        self._log_write_dir = os.path.join(self.log_dir, MODEL_NAME)

    # Overridden to stop Keras from creating its default log writer
    def set_model(self, model):
        pass

    # Overridden to save logs with our own step number
    # (otherwise every .fit() would start writing from step 0)
    def on_epoch_end(self, epoch, logs=None):
        self.update_stats(**logs)

    # Overridden: we train on one batch only, so there is nothing to save per batch
    def on_batch_end(self, batch, logs=None):
        pass

    # Overridden so the writer is not closed between .fit() calls
    def on_train_end(self, _):
        pass

    def on_train_batch_end(self, batch, logs=None):
        pass

    # Custom method for saving our own metrics:
    # writes the given scalars at the current step
    def update_stats(self, **stats):
        self._write_logs(stats, self.step)

    def _write_logs(self, logs, index):
        with self.writer.as_default():
            for name, value in logs.items():
                tf.summary.scalar(name, value, step=index)
            self.step += 1
            self.writer.flush()

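# Note added for clarity: to inspect the metrics written by update_stats(),
# point TensorBoard at the log directory used above, e.g.:
#   tensorboard --logdir logs
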
class model(Model):
    def __init__(self):
        super(model, self).__init__()
        self.lr = 0.01
        self.conv1 = Conv2D(filters=32, input_shape=(210, 160, 1), kernel_size=(3, 3), strides=1, padding='same', activation='elu')

        self.conv2 = Conv2D(filters=32, kernel_size=(3, 3), strides=1, padding='same', activation='elu')
        self.mp2 = MaxPool2D(pool_size=(3, 3), strides=1, padding='same')

        self.conv3 = Conv2D(filters=64, kernel_size=(3, 3), strides=1, padding='same', activation='elu')
        self.mp3 = MaxPool2D(pool_size=(3, 3), strides=1, padding='same')

        self.conv4 = Conv2D(filters=64, kernel_size=(3, 3), strides=1, padding='same', activation='elu')
        self.mp4 = MaxPool2D(pool_size=(3, 3), strides=1, padding='same')

        self.flat = Flatten()

        self.value = Dense(1, activation=None)  # V(s): how good a particular state is
        self.advantage = Dense(env.action_space.n, activation=None)  # A(s, a): how good each action is
        self.compile(optimizer=Adam(lr=self.lr), loss='mse', metrics=['accuracy'])

    def _features(self, state):
        # Shared convolutional trunk. Accepts a single RGB frame (210, 160, 3) or a batch of frames.
        # tf.image.rgb_to_grayscale uses the same BT.601 weights as cv2's RGB2GRAY,
        # but also works on batched input.
        x = tf.image.rgb_to_grayscale(tf.cast(state, tf.float32))
        if len(x.shape) == 3:  # single frame -> add a batch dimension
            x = tf.expand_dims(x, axis=0)

        x = self.conv1(x)

        x = self.conv2(x)
        x = self.mp2(x)

        x = self.conv3(x)
        x = self.mp3(x)

        x = self.conv4(x)
        x = self.mp4(x)

        return self.flat(x)

    # call() is required for .fit() on a subclassed Model; it returns the dueling Q-values
    def call(self, state):
        x = self._features(state)
        value = self.value(x)
        advantage = self.advantage(x)
        # Dueling head: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))
        q = value + (advantage - tf.math.reduce_mean(advantage, axis=1, keepdims=True))
        return q

    def predict_q(self, state):
        return self.call(state)

    def predict_advantage(self, state):
        # Advantage stream only, used for greedy action selection
        return self.advantage(self._features(state))

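# Quick shape sanity check (not part of the original paste; the names below are
# only illustrative): a single random Pong-sized RGB frame should come back as
# one row of Q-values, one per action.
#   q_net = model()
#   dummy_frame = np.random.randint(0, 255, size=(210, 160, 3), dtype=np.uint8)
#   print(q_net.predict_q(dummy_frame).shape)   # expected: (1, env.action_space.n)
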
class agent():
    def __init__(self):
        # our parameters for learning
        self.discount = 0.9
        self.epsilon = 0.99
        self.epsilon_decay = 0.99
        self.min_epsilon = 0.1
        self.MINIBATCH_SIZE = 128
        self.lr = 0.01

        # our memory
        self.REPLAY_MEMORY_SIZE = 5000
        self.MIN_REPLAY_MEMORY_SIZE = 1000
        self.replay_memory = deque(maxlen=self.REPLAY_MEMORY_SIZE)
        self.model_name = 'my_first_ddqn.h5'

        # our models - online network and target network
        self.model = model()
        self.target_model = model()

        # (both models already compile themselves in model.__init__)
        # self.model.compile(optimizer=Adam(lr=self.lr), loss='mse', metrics=['accuracy'])
        # self.target_model.compile(optimizer=Adam(lr=self.lr), loss='mse', metrics=['accuracy'])

        # when to update our target model
        self.UPDATE_TARGET_EVERY = 1000
        self.UPDATE_TARGET_COUNTER = 0

        # visualisations - one log directory for the whole run
        self.tensorboard = TensorBoard_for_DQN(log_dir=f"logs/{MODEL_NAME}-{int(time.time())}")

    def update_replay_memory(self, transition):
        self.replay_memory.append(transition)

    def choose_action(self, state):
        if np.random.random() < self.epsilon:  # we explore
            action = np.random.choice(env.action_space.n)
        else:  # we exploit
            # scale the frame the same way as in train() before querying the network
            advantages = self.model.predict_advantage(np.array(state) / 255)
            action = int(np.argmax(advantages))
        return action

    def decay_epsilon(self):
        # Decay epsilon, but never below the minimum
        if self.epsilon > self.min_epsilon:
            self.epsilon *= self.epsilon_decay
            self.epsilon = max(self.min_epsilon, self.epsilon)

    def train(self, terminal_state):
        # Start training only once enough samples are in replay memory
        if len(self.replay_memory) < self.MIN_REPLAY_MEMORY_SIZE:
            return

        # Let's make a minibatch -
        minibatch = random.sample(self.replay_memory, k=self.MINIBATCH_SIZE)

        # Get current states from minibatch, then query the online model for Q values
        current_states = np.array([transition[0] for transition in minibatch]) / 255
        current_qs_list = self.model.predict_q(current_states).numpy()

        # Get next states from minibatch, then query the target model for Q values
        new_current_states = np.array([transition[3] for transition in minibatch]) / 255
        future_qs_list = self.target_model.predict_q(new_current_states).numpy()

        x = []
        y = []

        for idx, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):
            if not done:
                max_future_q = np.max(future_qs_list[idx])
                new_q = reward + self.discount * max_future_q
            else:
                new_q = reward

            # Update Q value for the action actually taken in this state
            current_qs = current_qs_list[idx]
            current_qs[action] = new_q

            x.append(current_state)
            y.append(current_qs)

        # Fit on the whole minibatch as a single batch; log to TensorBoard only at episode end
        self.model.fit(np.array(x) / 255, np.array(y), batch_size=self.MINIBATCH_SIZE,
                       verbose=0, shuffle=False,
                       callbacks=[self.tensorboard] if terminal_state else None)

        # Count episode ends and periodically copy the online weights into the target network
        if terminal_state:
            self.UPDATE_TARGET_COUNTER += 1

        if self.UPDATE_TARGET_COUNTER == self.UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.UPDATE_TARGET_COUNTER = 0

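# Note (added for clarity, not in the original paste): every transition pushed into
# replay memory by the training loop below is the 5-tuple
#   (current_state, action, reward, new_state, done)
# which is exactly what the indexing (transition[0], transition[3]) in agent.train() relies on.
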
episodes = 3000  # unused; the loop below runs for EPISODES defined at the top
agent = agent()
ep_rewards = []
SHOW_PREVIEW = False
AGGREGATE_STATS_EVERY = 1000
MIN_REWARD = 10

# Make sure the directory for saved models exists
if not os.path.isdir('models'):
    os.makedirs('models')

for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):
    agent.tensorboard.step = episode  # Update tensorboard step every episode
    episode_reward = 0  # We reset episode reward for each episode
    current_state = env.reset()  # Reset environment and get initial state
    done = False  # To exit the loop when the episode is over

    while not done:
        # predict what our best action is -
        action = agent.choose_action(current_state)
        new_state, reward, done, info = env.step(action)

        # Count reward
        episode_reward += reward

        if SHOW_PREVIEW and not episode % AGGREGATE_STATS_EVERY:
            env.render()

        # Every step we update replay memory and train the main network
        agent.update_replay_memory((current_state, action, reward, new_state, done))
        agent.train(done)

        current_state = new_state

    # Append episode reward to a list and log stats (every given number of episodes)
    ep_rewards.append(episode_reward)
    if not episode % AGGREGATE_STATS_EVERY or episode == 1:
        average_reward = sum(ep_rewards[-AGGREGATE_STATS_EVERY:]) / len(ep_rewards[-AGGREGATE_STATS_EVERY:])
        min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:])
        max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:])
        agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward,
                                       reward_max=max_reward, epsilon=agent.epsilon)

        # Save model, but only when min reward is greater than or equal to a set value
        if min_reward >= MIN_REWARD:
            agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

    agent.decay_epsilon()