# Minimal DQN on CartPole-v0, written against the TensorFlow 1.x graph API
# (tf.placeholder / Session), with tf.keras used only to build the networks.
import numpy as np
import random
import copy
from collections import deque, namedtuple
import matplotlib.pyplot as plt
import tensorflow as tf
import gym

# One stored transition: s_t, a_t, r_t, s_{t+1}, and whether the episode ended there.
Experience = namedtuple('Experience', 'state0, action, reward, state1, terminal')

def sample_batch_indexes(low, high, size):
    r = range(low, high)
    batch_idxs = random.sample(r, size)

    return batch_idxs

class EpsGreedyQPolicy:
    def __init__(self, eps=.1, eps_decay_rate=.99, min_eps=.1):
        self.eps = eps
        self.eps_decay_rate = eps_decay_rate
        self.min_eps = min_eps

    def select_action(self, q_values, is_training=True):
        nb_actions = q_values.shape[0]

        if is_training and np.random.uniform() < self.eps:
            # Explore: uniform random action with probability eps.
            action = np.random.randint(0, nb_actions)
        else:
            # Exploit: greedy action with respect to the current Q-estimates.
            action = np.argmax(q_values)

        return action

    def decay_eps_rate(self):
        # Multiplicative decay, clipped from below at min_eps.
        self.eps = max(self.eps * self.eps_decay_rate, self.min_eps)

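# Quick illustration of the policy (illustrative only, not executed by the script):
#
#   policy = EpsGreedyQPolicy(eps=0.0)
#   policy.select_action(np.array([0.1, 0.9]))   # -> 1, the greedy argmax
#   policy = EpsGreedyQPolicy(eps=1.0)
#   policy.select_action(np.array([0.1, 0.9]))   # -> 0 or 1, uniformly at random
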
def build_model(input_shape, nb_output):
    # TF 1.x graph mode: a placeholder feeds a small fully connected Q-network
    # with three hidden layers of 16 ReLU units and one output per action.
    # (Keras input_shape excludes the batch dimension.)
    inputs = tf.placeholder(dtype=tf.float32, shape=[None] + input_shape, name="input")
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(16, activation="relu", input_shape=input_shape))
    model.add(tf.keras.layers.Dense(16, activation="relu"))
    model.add(tf.keras.layers.Dense(16, activation="relu"))
    model.add(tf.keras.layers.Dense(nb_output))
    outputs = model(inputs)

    return inputs, outputs, model

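# For CartPole-v0 (used below) input_shape is [4] and nb_output is 2, so:
#
#   inputs, outputs, model = build_model([4], 2)
#   # inputs:  float32 placeholder of shape (None, 4)
#   # outputs: tensor of shape (None, 2), one Q-value per action
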
class Memory:
    def __init__(self, limit, maxlen):
        self.actions = deque(maxlen=limit)
        self.rewards = deque(maxlen=limit)
        self.terminals = deque(maxlen=limit)
        self.observations = deque(maxlen=limit)
        self.maxlen = maxlen
        self.recent_observations = deque(maxlen=maxlen)

    def sample(self, batch_size):
        batch_idxs = sample_batch_indexes(0, len(self.observations) - 1, size=batch_size)
        # Re-draw any index whose preceding transition was terminal so the
        # (state0, state1) pair never straddles an episode boundary.
        for i, idx in enumerate(batch_idxs):
            terminal = self.terminals[idx - 1]
            while terminal:
                idx = sample_batch_indexes(0, len(self.observations) - 1, size=1)[0]
                terminal = self.terminals[idx - 1]
            batch_idxs[i] = idx

        experiences = []
        for idx in batch_idxs:
            state0 = self.observations[idx]
            action = self.actions[idx]
            reward = self.rewards[idx]
            terminal = self.terminals[idx]
            state1 = self.observations[idx + 1]
            experiences.append(Experience(state0=state0, action=action, reward=reward,
                                          state1=state1, terminal=terminal))

        return experiences

    def append(self, observation, action, reward, terminal=False):
        self.observations.append(observation)
        self.actions.append(action)
        self.rewards.append(reward)
        self.terminals.append(terminal)
        self.recent_observations.append(observation)

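# Rough usage sketch (s0, s1 stand in for observations; illustrative only):
#
#   memory = Memory(limit=50000, maxlen=1)
#   memory.append(observation=s0, action=0, reward=1.0, terminal=False)
#   memory.append(observation=s1, action=1, reward=1.0, terminal=True)
#   ...
#   batch = memory.sample(32)   # 32 Experience tuples; needs more than 32 stored steps
#   batch[0].state0, batch[0].action, batch[0].reward, batch[0].state1, batch[0].terminal
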
class DQNAgent:
    def __init__(self, training=True, policy=None, gamma=.99, actions=None, memory=None,
                 memory_interval=1, train_interval=1, batch_size=32, nb_steps_warmup=300,
                 observation=None, input_shape=None):
        self.training = training
        self.policy = policy
        self.actions = actions
        self.gamma = gamma
        self.recent_observation = observation
        self.previous_observation = observation
        self.memory = memory
        self.memory_interval = memory_interval
        self.batch_size = batch_size
        self.recent_action_id = 0
        self.nb_steps_warmup = nb_steps_warmup

        # Online network (trained) and target network (used for the bootstrap target).
        self.sess = tf.InteractiveSession()
        self.model_inputs, self.model_outputs, self.model = build_model(input_shape, len(self.actions))
        self.target_model_inputs, self.target_model_outputs, self.target_model = build_model(input_shape, len(self.actions))
        target_model_weights = self.target_model.trainable_weights
        model_weights = self.model.trainable_weights
        self.update_target_model = [
            target_model_weights[i].assign(.999 * target_model_weights[i] + .001 * model_weights[i])
            for i in range(len(target_model_weights))
        ]
        self.train_interval = train_interval
        self.step = 0

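# update_target_model implements a soft (Polyak-style) target update with
# tau = 0.001: for every trainable weight,
#
#   theta_target <- 0.999 * theta_target + 0.001 * theta_online
#
# so the target network trails the online network slowly rather than being
# copied wholesale at fixed intervals.
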
    def compile(self):
        self.targets = tf.placeholder(dtype=tf.float32, shape=[None, len(self.actions)], name="target_q")
        self.inputs = tf.placeholder(dtype=tf.int32, shape=[None], name="action")
        actions_one_hot = tf.one_hot(indices=self.inputs, depth=len(self.actions),
                                     on_value=1.0, off_value=0.0, name="action_one_hot")

        # Mask the predicted Q-values so only the action that was taken is penalized.
        pred_q = tf.multiply(self.model_outputs, actions_one_hot)

        error = self.targets - pred_q
        square_error = .5 * tf.square(error)
        loss = tf.reduce_mean(square_error, axis=0, name="loss")

        optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
        self.train = optimizer.minimize(loss)
        self.sess.run(tf.global_variables_initializer())

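# The one-hot mask above zeroes every column of the prediction except the action
# actually taken, and target_batch (built in experience_replay) is zero in the
# other columns too, so per sample the objective reduces to a masked squared error
#
#   L = 0.5 * (target_q(s, a) - Q(s, a))^2
#
# averaged over the batch and minimized with Adam at learning rate 1e-3.
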
    def act(self):
        action_id = self.forward()
        action = self.actions[action_id]
        return action

    def forward(self):
        q_values = self.compute_q_values(self.recent_observation)
        action_id = self.policy.select_action(q_values=q_values, is_training=self.training)
        self.recent_action_id = action_id

        return action_id

    def observe(self, observation, reward=None, is_terminal=None):
        self.previous_observation = copy.deepcopy(self.recent_observation)
        self.recent_observation = observation

        # Calls made right after env.reset() pass no reward and only prime
        # recent_observation; everything else is stored and triggers learning.
        if self.training and reward is not None:
            if self.step % self.memory_interval == 0:
                self.memory.append(self.previous_observation, self.recent_action_id,
                                   reward, terminal=is_terminal)
            self.experience_replay()
            self.policy.decay_eps_rate()

        self.step += 1

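# The training loop below drives the agent through this cycle each timestep:
#
#   action = agent.act()                              # eps-greedy over current Q-values
#   observation, reward, done, info = env.step(action)
#   agent.observe(observation, reward, done)          # store transition + replay update
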
    def experience_replay(self):
        if (self.step > self.nb_steps_warmup) and (self.step % self.train_interval == 0):
            experiences = self.memory.sample(self.batch_size)

            state0_batch = []
            reward_batch = []
            action_batch = []
            state1_batch = []
            terminal_batch = []

            for e in experiences:
                state0_batch.append(e.state0)
                state1_batch.append(e.state1)
                reward_batch.append(e.reward)
                action_batch.append(e.action)
                # 1.0 keeps the bootstrap term, 0.0 drops it for terminal steps.
                terminal_batch.append(0. if e.terminal else 1.)

            target_batch = np.zeros((self.batch_size, len(self.actions)))
            reward_batch = np.array(reward_batch)
            terminal_batch = np.array(terminal_batch)
            target_q_values = np.array(self.compute_target_q_value(state1_batch))  # max_a' Q'(s', a')
            discounted_reward_batch = self.gamma * target_q_values
            discounted_reward_batch *= terminal_batch
            targets = reward_batch + discounted_reward_batch  # target = r + gamma * max_a' Q'(s', a')

            for idx, (action, target) in enumerate(zip(action_batch, targets)):
                target_batch[idx][action] = target

            self.train_on_batch(state0_batch, action_batch, target_batch)

        # Soft-update the target network toward the online network every step.
        self.sess.run(self.update_target_model)

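# The targets built above follow the standard one-step Q-learning backup, with
# the bootstrap term dropped on terminal transitions (terminal_batch is 0 there):
#
#   target = r + gamma * max_a' Q_target(s', a')   for non-terminal steps
#   target = r                                     for terminal steps
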
    def train_on_batch(self, state_batch, action_batch, targets):
        self.sess.run(self.train, feed_dict={self.model_inputs: state_batch,
                                             self.inputs: action_batch,
                                             self.targets: targets})

    def compute_target_q_value(self, state1_batch):
        q_values = self.sess.run(self.target_model_outputs,
                                 feed_dict={self.target_model_inputs: state1_batch})
        q_values = np.max(q_values, axis=1)

        return q_values

    def compute_q_values(self, state):
        # Note: action selection also reads the target network's outputs here,
        # not the online network's.
        q_values = self.sess.run(self.target_model_outputs,
                                 feed_dict={self.target_model_inputs: [state]})

        return q_values[0]

    def reset(self):
        self.recent_observation = None
        self.previous_observation = None
        self.recent_action_id = None

if __name__ == '__main__':

    env = gym.make('CartPole-v0')
    np.random.seed(123)
    env.seed(123)

    nb_actions = env.action_space.n
    actions = np.arange(nb_actions)
    obs = env.reset()

    # Start fully exploratory (eps=1.0) and decay by 0.999 per training step.
    policy = EpsGreedyQPolicy(1.0, 0.999)
    memory = Memory(limit=50000, maxlen=1)
    agent = DQNAgent(actions=actions, memory=memory, observation=obs,
                     input_shape=[len(obs)], policy=policy)
    agent.compile()

    nb_episode = 3000
    evaluate_interval = 10

    result = []
    evaluate_episode = []
    timestep_limit = env.spec.max_episode_steps  # 200 for CartPole-v0
    for episode in range(nb_episode):
        agent.reset()
        observation = env.reset()
        observation = copy.deepcopy(observation)
        agent.observe(observation)

        for t in range(timestep_limit):
            # env.render()
            action = agent.act()
            observation, reward, done, info = env.step(action)
            observation = copy.deepcopy(observation)

            agent.observe(observation, reward, done)
            if done:
                break

        # Evaluate greedily (no exploration, no memory writes) every evaluate_interval episodes.
        if episode % evaluate_interval == 0:
            evaluate_episode.append(episode)
            agent.training = False
            agent.reset()
            observation = env.reset()
            agent.observe(observation)
            for t in range(timestep_limit):
                # env.render()
                action = agent.act()
                observation, reward, done, info = env.step(action)
                agent.observe(observation)
                if done:
                    result.append(t)
                    break
            agent.training = True

    plt.ylabel("timesteps")
    plt.xlabel("episode")
    plt.ylim((0, 200))
    plt.plot(evaluate_episode, result)
    plt.savefig("result.png")
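
# Usage note: the script assumes TensorFlow 1.x (graph mode) and a gym release
# that still provides CartPole-v0. It trains for 3000 episodes, runs a greedy
# evaluation episode every 10 training episodes, and saves the evaluation curve
# (episode length, capped at 200 for CartPole-v0) to result.png.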