# REINFORCE-style policy gradient on CartPole-v0.
# Note: this script targets the legacy PyTorch (<= 0.3) Variable API and the
# classic gym step()/reset() API; newer releases need small changes
# (e.g. .item() instead of .data indexing, gymnasium's 5-tuple step return).
import argparse
import gym
import numpy as np
from itertools import count

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
parser.add_argument('--gamma', type=float, default=0.9, metavar='G',
                    help='discount factor (default: 0.9)')
parser.add_argument('--seed', type=int, default=1, metavar='N',
                    help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=50, metavar='N',
                    help='interval between training status logs (default: 50)')
args = parser.parse_args()
torch.manual_seed(args.seed)  # make --seed effective (commented out in the original)
class Policy(nn.Module):
    """Two-layer MLP mapping the 4-dim CartPole state to 2 action probabilities."""

    def __init__(self):
        super(Policy, self).__init__()
        self.affine1 = nn.Linear(4, 128)
        self.affine3 = nn.Linear(128, 2)
        # Per-episode buffers, filled during the rollout and cleared after each update.
        self.sampled_probs = []
        self.sampled_actions = []
        self.rewards = []

    def forward(self, x):
        x = F.relu(self.affine1(x))
        # The ReLU on the output layer (kept as in the original) clamps the
        # logits to >= 0 before the softmax, which limits how peaked the
        # policy can become.
        x = F.relu(self.affine3(x))
        return F.softmax(x, dim=1)
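# Illustrative sanity check (not in the original): the policy maps a (1, 4)
# state batch to a (1, 2) row of probabilities that sums to 1, e.g.
#   Policy()(Variable(torch.zeros(1, 4)))  # -> shape (1, 2)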
env = gym.make('CartPole-v0')
model = Policy()
# alpha=1 keeps RMSprop's squared-gradient average at its zero initialization,
# so the denominator reduces to eps; kept as in the original.
optimizer = optim.RMSprop(model.parameters(), lr=1e-2, alpha=1, eps=1e-10)
def select_action(state):
    # Add a batch dimension: (4,) -> (1, 4).
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs = model(Variable(state))
    # Sample one action from the categorical distribution over the 2 actions.
    # (Legacy indexing; on PyTorch >= 0.4 use probs.multinomial(1).item().)
    action = probs.multinomial(1, True).data.squeeze()[0]
    model.sampled_probs.append(probs)
    model.sampled_actions.append(action)
    return action
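# Note on the update below: this is not the textbook REINFORCE log-prob loss.
# Each step instead uses a squared error between the sampled one-hot action
# y_t and the policy output p_t, L_t = ||y_t - p_t||^2 / 2, and the gradient
# of L_t is scaled by the normalized discounted return R_t before the scaled
# gradients are averaged over the episode.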
def finish_episode():
    R = 0
    sampled_actions = model.sampled_actions
    sampled_probs = model.sampled_probs
    rewards = []
    # Accumulate discounted returns back to front: R_t = r_t + gamma * R_{t+1}.
    for r in model.rewards[::-1]:
        R = r + args.gamma * R
        rewards.insert(0, R)
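    # Worked example (illustrative): with gamma = 0.9 and rewards [1, 1, 1],
    # the returns come out as [1 + 0.9 * 1.9, 1 + 0.9 * 1, 1] = [2.71, 1.9, 1].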
    rewards = torch.Tensor(rewards)
    # Normalize returns to zero mean / unit variance to reduce gradient
    # variance; the added eps guards against a zero std on one-step episodes.
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    # One-hot targets: y_t has a 1 at the sampled action.
    ys = [torch.zeros(1, 2) for _ in sampled_probs]
    for i, action in enumerate(sampled_actions):
        ys[i][0][action] = 1
    # Per-step squared-error surrogate loss between target and policy output.
    loss = [((Variable(y) - p) ** 2).sum() / 2 for p, y in zip(sampled_probs, ys)]
    # Backprop each step's loss separately, scale its gradient by that step's
    # normalized return, and accumulate the scaled gradients per parameter.
    grads = {}
    for i, l in enumerate(loss):
        optimizer.zero_grad()
        l.backward()
        for j, group in enumerate(optimizer.param_groups):
            saved_group = grads.get(j, {})
            for k, param in enumerate(group['params']):
                cumsum = saved_group.get(k, torch.zeros(param.grad.size()))
                cumsum += rewards[i] * param.grad.data
                saved_group[k] = cumsum
            grads[j] = saved_group
    # Overwrite each parameter's gradient with the return-weighted average
    # before taking a single optimizer step.
    for j, group in enumerate(optimizer.param_groups):
        for k, param in enumerate(group['params']):
            param.grad.data = grads[j][k] / len(loss)
    optimizer.step()
    # Clear the per-episode buffers for the next rollout.
    del model.rewards[:]
    del model.sampled_actions[:]
    del model.sampled_probs[:]
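# Equivalent single-backward variant (a sketch, not in the original): the
# scaled-gradient accumulation above is linear in the per-step gradients, so
# the same update falls out of one backward pass over the return-weighted sum:
#
#   total = sum(rewards[i] * l for i, l in enumerate(loss)) / len(loss)
#   optimizer.zero_grad()
#   total.backward()
#   optimizer.step()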
running_reward = 10
for i_episode in count(1):
    reward_sum = 0
    state = env.reset()
    for t in count(1):
        action = select_action(state)
        state, reward, done, _ = env.step(action)
        model.rewards.append(reward)
        reward_sum += reward
        if done:
            break
    # Exponential moving average of the episode return, for logging only.
    running_reward = running_reward * 0.99 + reward_sum * 0.01
    finish_episode()
    if i_episode % args.log_interval == 0:
        print('Episode {}\tLast length: {:5.0f}\tAverage length: {:.2f}'.format(
            i_episode, reward_sum, running_reward))
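# Usage (filename is illustrative):
#   python reinforce_cartpole.py --gamma 0.99 --log-interval 10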