import numpy as np
import cupy as cp
import copy
import gym
import os
import time
from collections import deque
from chainer import Chain, Variable, optimizers, initializers, serializers, cuda
import chainer.functions as F
import chainer.links as L
from abc import ABCMeta, abstractmethod

# Make the CUDA toolkit visible to CuPy/Chainer.
os.environ['PATH'] += ':/usr/local/cuda-8.0/bin'


class ConvNet(Chain):
    def __init__(self, n_out):
        initializer = initializers.HeNormal()
        super(ConvNet, self).__init__(
            c1=L.Convolution2D(3, 10, 10, stride=(4, 3), initialW=initializer),
            c2=L.Convolution2D(10, 10, 3, pad=1, initialW=initializer),
            l3=L.Linear(None, 160, initialW=initializer),
            l4=L.Linear(None, n_out, initialW=initializer)
        )

    def __call__(self, x):
        h = F.leaky_relu(self.c1(x))
        h = F.max_pooling_2d(h, 6, stride=3)
        h = F.leaky_relu(self.c2(h))
        h = F.max_pooling_2d(h, 7, stride=3)
        h = F.leaky_relu(self.l3(h))
        h = F.leaky_relu(self.l4(h))

        return h

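# Rough shape trace for a 3x210x160 input (assuming Chainer's default
# cover_all=True for max_pooling_2d): c1 -> 10x51x51, pool(6, stride=3) ->
# 10x16x16, c2 (pad=1) -> 10x16x16, pool(7, stride=3) -> 10x4x4 = 160 features,
# which l3 maps to 160 hidden units and l4 to one Q-value per action.
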
class Strategy(metaclass=ABCMeta):
    @abstractmethod
    def get_action(self, v_act, is_train=True):
        pass

    @abstractmethod
    def forward_step(self, step):
        pass

class EpsilonGreedy(Strategy):
    def __init__(self):
        self.epsilon = 1.0    # start fully random
        self.decay = 0.001    # epsilon decrement per decay step
        self.min = 0          # lower bound for epsilon
        self.n_search = 1000  # pure exploration for the first 1000 steps

    def get_action(self, v_act, is_train=True):
        # Explore with probability epsilon during training, otherwise act greedily.
        if cp.random.rand() < self.epsilon and is_train:
            return cp.random.randint(len(v_act))
        else:
            return cp.argmax(v_act)

    def forward_step(self, step):
        if self.epsilon > self.min and self.n_search < step:
            self.epsilon -= self.decay

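# With the settings above, epsilon stays at 1 (fully random) while step <= 1000,
# then shrinks by 0.001 on each forward_step call down to a floor of 0;
# forward_step itself is only invoked from DQN.train below.
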
class Agent(metaclass=ABCMeta):
    @abstractmethod
    def get_action(self, st, is_train=True):
        pass

class DQN(Agent):
    def __init__(self, strategy, n_act, n_mem=1000, seed=0):
        cp.random.seed(seed)

        self.n_act = n_act
        self.n_mem = n_mem            # replay memory capacity
        self.n_batch = 100            # minibatch size
        self.f_train = 50             # run experience replay every f_train steps
        self.f_target_update = 100    # sync the target network every f_target_update steps
        self.q_func = ConvNet(n_act)
        self.target_q_func = copy.deepcopy(self.q_func)
        self.q_func.to_gpu(0)         # for using GPU
        self.target_q_func.to_gpu(0)  # for using GPU
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.q_func)
        self.strategy = strategy
        self.memory = deque(maxlen=self.n_mem)
        self.gamma = 0.99             # discount factor
        self.step = 0

    def stock_experience(self, st, act, r, st_next, is_term):
        self.memory.append((st, act, r, st_next, is_term))

    def experience_replay(self):
        # Transpose the memory into parallel arrays of states, actions, rewards, ...
        memory = list(map(np.array, zip(*self.memory)))
        index = np.random.permutation(len(self.memory))
        # Sweep the whole memory once in shuffled minibatches.
        for mask in np.split(index, self.n_mem // self.n_batch):
            batch = list(map(lambda a: cuda.to_gpu(a[mask]), memory))
            self.q_func.cleargrads()
            loss = self.forward(*batch)
            loss.backward()
            self.optimizer.update()

    def forward(self, st, act, r, st_next, is_term):
        s, s_next = Variable(st), Variable(st_next)
        q_out = self.q_func(s)
        # Start from the current Q-values and overwrite only the taken actions,
        # so the loss is non-zero only for the (state, action) pairs in the batch.
        target = cp.asanyarray(copy.deepcopy(q_out.data), dtype=cp.float32)
        q_max = self.target_q_func(s_next)
        q_max = cp.asanyarray(q_max.data.max(axis=1), dtype=cp.float32)
        index = cp.arange(self.n_batch)
        # Q-learning target: r + gamma * max_a' Q_target(s', a'), zeroed at terminal states.
        target[index, act] = r + self.gamma * cp.where(is_term, 0.0, q_max)
        loss = F.mean_squared_error(q_out, Variable(target))
        return loss

    def get_action(self, st, is_train=True):
        s = Variable(cuda.to_gpu(np.array([st])))
        v_act = self.q_func(s)
        v_act = v_act.data[0]
        act = self.strategy.get_action(v_act, is_train)
        return cuda.to_cpu(act)

    def train(self):
        # Learning only starts once the replay memory is full.
        if len(self.memory) >= self.n_mem:
            if self.step % self.f_train == 0:
                self.experience_replay()
                self.strategy.forward_step(self.step)
            if self.step % self.f_target_update == 0:
                # Periodically copy the online network into the target network.
                self.target_q_func = copy.deepcopy(self.q_func)
                self.target_q_func.to_gpu(0)  # for using GPU
        self.step += 1

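# Training cadence implied by the constants above: nothing is learned until the
# replay memory holds n_mem transitions (500 in the run below); after that,
# experience_replay sweeps the memory every f_train=50 environment steps and the
# target network is re-synced every f_target_update=100 steps.
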
env = gym.make('MsPacman-v0')
n_act = env.action_space.n

agent = DQN(EpsilonGreedy(), n_act, 500)

print("starting train...")
for episode in range(100):
    observation = env.reset()
    score = 0
    while True:
        # HWC uint8 frame -> CHW float32 array (transpose keeps the colour channels intact).
        state = observation.transpose(2, 0, 1).astype(np.float32)
        action = agent.get_action(state)
        observation, reward, is_term, _ = env.step(action)
        state_next = observation.transpose(2, 0, 1).astype(np.float32)
        agent.stock_experience(state, action, reward, state_next, is_term)
        agent.train()

        score += reward

        if is_term:
            break
    print("episode", episode, "is finished. score is", score, "points.")

name = input('Please input model filename... >>')

print("saving model...")
model = copy.deepcopy(agent.q_func)
model.to_cpu()
serializers.save_npz(name + ".npz", model)

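# To reload the saved weights later, rebuild the network and use Chainer's
# serializers.load_npz (a minimal sketch, not part of the training script;
# "name" is whatever filename was entered above):
#
#     restored = ConvNet(n_act)
#     serializers.load_npz(name + ".npz", restored)
#     restored.to_gpu(0)  # optional: move back onto the GPU before use
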
print("starting test...")
observation = env.reset()
for t in range(1000):
    env.render()
    time.sleep(1.0 / 25)

    state = observation.transpose(2, 0, 1).astype(np.float32)
    action = agent.get_action(state, False)  # greedy action, no exploration
    observation, _, is_term, _ = env.step(action)

    if is_term:
        break