import random, math
import numpy as np
import torch, torch.nn as nn
from collections import deque


class ReplayBuffer(object):
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        return zip(*random.sample(self.buffer, batch_size))

    def __len__(self):
        return len(self.buffer)
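

# Note on sample(): zip(*...) returns the minibatch already transposed, so with
# `buffer` a ReplayBuffer instance,
#   states, actions, rewards, next_states, dones = buffer.sample(batch_size)
# unpacks into five tuples of length batch_size, which is the layout
# compute_td_loss below expects.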


class DQN(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden_size):
        super(DQN, self).__init__()
        self.num_actions = num_actions

        self.layers = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_actions),
        )

    def forward(self, x):
        return self.layers(x)

    def act(self, state, epsilon):
        # epsilon-greedy: exploit the current Q estimates with probability
        # 1 - epsilon, otherwise pick a random action
        if random.random() > epsilon:
            with torch.no_grad():
                q_value_list = self.forward(state)
            _, action = q_value_list.max(0)
            action = action.item()
        else:
            action = random.randrange(self.num_actions)
        return action
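

# Minimal shape check (a sketch, not part of training): Pong-v0 has 6 discrete
# actions, so
#   model = DQN(num_inputs=6400, num_actions=6, hidden_size=128)
#   model(torch.zeros(6400))   # -> tensor of 6 Q-values, one per action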


def compute_td_loss(model, batch, gamma=0.99):
    state, action, reward, nx_state, done = batch

    # stack the sampled transitions into batch tensors
    state = torch.stack(state)            # (batch_size, 6400)
    action = torch.LongTensor(action)     # (batch_size,)
    reward = torch.Tensor(reward)
    nx_state = torch.stack(nx_state)
    done = torch.Tensor(done)

    # Q(s, a) for the actions that were actually taken
    q = model(state).gather(1, action.unsqueeze(1)).squeeze(1)

    # bootstrap from the next state's best Q-value; detach so no gradient flows
    # through the target, and zero it out on terminal transitions
    max_nx_q, _ = model(nx_state).max(1)
    y = reward + gamma * max_nx_q.detach() * (1 - done)

    loss = (y - q).pow(2).mean()
    return loss
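

# The target above is the standard one-step Q-learning backup,
#   y = r + gamma * max_a' Q(s', a')      (just y = r when the episode ended),
# and the loss is the mean squared TD error (y - Q(s, a))^2 over the minibatch.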


def prepro(I):
    # prepro 210x160x3 uint8 frame into a flat 6400-dim float vector,
    # from Karpathy's code
    I = I[35:195]        # crop to the playing field
    I = I[::2, ::2, 0]   # downsample by a factor of 2 and keep one channel
    I[I == 144] = 0      # erase background (type 1)
    I[I == 109] = 0      # erase background (type 2)
    I[I != 0] = 1        # everything else (paddles, ball) set to 1
    I = I.astype(np.float32).ravel()
    return torch.Tensor(I)
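

# Shape check (arithmetic only): the crop keeps rows 35..194 (160 rows) of the
# 160-column frame; taking every second pixel leaves an 80 x 80 image, and
# 80 * 80 = 6400, which matches the num_inputs=6400 used in main() below.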


def main():
    # inits
    import gym
    env = gym.make("Pong-v0")
    model = DQN(6400, env.action_space.n, hidden_size=128)
    optimizer = torch.optim.Adam(model.parameters())
    replay_buffer = ReplayBuffer(capacity=1000)

    # hyperparameters
    num_frames = 10000
    batch_size = 64
    num_iters = 1000

    # epsilon greedy decay
    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 500
    epsilon_by_i = lambda i: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * i / epsilon_decay)
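    # Worked values for the schedule above: epsilon_by_i(0) = 1.0,
    # epsilon_by_i(500) ~= 0.01 + 0.99 * e**-1 ~= 0.37, and
    # epsilon_by_i(2000) ~= 0.03, so exploration decays smoothly toward
    # epsilon_final. Note that i is the step index within an episode, so the
    # schedule restarts from 1.0 at every env.reset().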

    while True:
        state = env.reset()
        for i in range(num_iters):
            # env.render()
            obs = prepro(state)
            action = model.act(obs, epsilon_by_i(i))

            nx_state, reward, done, _ = env.step(action)
            # store preprocessed observations so compute_td_loss can stack them
            replay_buffer.push(obs, action, reward, prepro(nx_state), done)

            if len(replay_buffer) > batch_size:
                # learning stage
                loss = compute_td_loss(model, replay_buffer.sample(batch_size))
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # update
            state = nx_state

            if done: break


if __name__ == "__main__":
    main()
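

# Note: this assumes the old Gym API (env.step returning a 4-tuple and
# env.reset returning just the observation) and an Atari-enabled gym install,
# e.g. `pip install gym[atari]` for gym versions of that era.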