import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# Select GPU when available; `device` is referenced when building
# action tensors in policy_loss.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class A2C:
    def __init__(self, policy, optimizer, value_loss_coef=0.25,
                 entropy_coef=0.01, max_grad_norm=0.5):
        self.policy = policy
        self.optimizer = optimizer
        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef
        self.max_grad_norm = max_grad_norm
        self.reward_history = []
        self.log_counter = 0
        self.logger = {
            'entropy': [],
            'value_loss': [],
            'average_reward': [],
            'policy_loss': [],
            'value_targets': [],
            'value_predictions': [],
            'gradient_norm': [],
            'advantages': [],
            'A2C_loss': []
        }
    def _log(self, rewards=False, **kwargs):
        # Append each metric to its list in the logger dict.
        for key, value in kwargs.items():
            self.logger[key].append(value)
        # Every 100 reward-logging calls, record the mean of the rewards
        # accumulated so far and reset the buffer.
        if rewards:
            self.log_counter += 1
            if self.log_counter == 100:
                self.logger['average_reward'].append(np.mean(self.reward_history))
                self.reward_history = []
                self.log_counter = 0
    def policy_loss(self, trajectory):
        # Advantage estimate: value targets minus the critic's predictions.
        advantages = trajectory['value_targets'] - trajectory['values'].squeeze()
        actions = torch.tensor(trajectory['actions'], device=device).unsqueeze(-1)
        # Log-probabilities of the actions that were actually taken.
        log_probs = torch.gather(trajectory['log_probs'], dim=1, index=actions)
        # Detach the advantages so the policy gradient does not
        # backpropagate through the critic.
        loss = torch.mean(advantages.detach() * log_probs.squeeze())
        self._log(advantages=advantages.detach().cpu().numpy().mean(),
                  policy_loss=loss.item())
        return loss
    def value_loss(self, trajectory):
        value_targets = trajectory['value_targets']
        values = trajectory['values'].squeeze()
        # Detach the targets: the critic regresses toward them, but no
        # gradient flows through the target computation.
        loss = F.mse_loss(values, value_targets.detach())
        self._log(value_targets=value_targets.detach().cpu().numpy().mean(),
                  value_predictions=values.detach().cpu().numpy().mean(),
                  value_loss=loss.item())
        return loss
    def loss(self, trajectory):
        # Policy entropy, H = -sum(p * log p), encouraging exploration.
        entropy = -torch.sum(trajectory['log_probs'] * trajectory['probs'], dim=-1)
        entropy = torch.mean(entropy)
        # A2C objective: maximize the advantage-weighted log-probability
        # (hence the minus sign), minimize the value loss, and maximize
        # entropy (hence subtracting the entropy bonus).
        a2c_loss = -self.policy_loss(trajectory) \
            + self.value_loss_coef * self.value_loss(trajectory) \
            - self.entropy_coef * entropy
        self._log(entropy=entropy.item(),
                  A2C_loss=a2c_loss.item())
        return a2c_loss
    def step(self, trajectory):
        loss = self.loss(trajectory)
        self.optimizer.zero_grad()
        loss.backward()
        # Clip after backward() so the gradients exist; clip_grad_norm_
        # returns the total norm before clipping, which we log.
        grad_norm = nn.utils.clip_grad_norm_(self.policy.parameters(),
                                             self.max_grad_norm)
        self.optimizer.step()
        self.reward_history.extend(trajectory['rewards'])
        self._log(rewards=True, gradient_norm=float(grad_norm))
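
# --- Usage sketch (illustrative, not from the original paste) ---
# Everything below is an assumption: the paste does not show the policy
# network or how trajectories are collected. PolicyNet, its layer sizes,
# and the one-step TD targets (r + gamma * V(s')) are hypothetical
# stand-ins; the only requirement is that the trajectory dict carries
# the keys the A2C methods above read ('log_probs', 'probs', 'values',
# 'actions', 'rewards', 'value_targets').

class PolicyNet(nn.Module):
    """Tiny actor-critic net: shared body, logits head, value head."""
    def __init__(self, obs_dim, n_actions, hidden=64):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(obs_dim, hidden), nn.ReLU())
        self.logits_head = nn.Linear(hidden, n_actions)
        self.value_head = nn.Linear(hidden, 1)

    def forward(self, obs):
        h = self.body(obs)
        return self.logits_head(h), self.value_head(h)


if __name__ == '__main__':
    policy = PolicyNet(obs_dim=4, n_actions=2).to(device)
    optimizer = torch.optim.Adam(policy.parameters(), lr=7e-4)
    agent = A2C(policy, optimizer)

    # Random batch standing in for an environment rollout of 8 transitions.
    obs = torch.randn(8, 4, device=device)
    next_obs = torch.randn(8, 4, device=device)
    rewards = torch.randn(8, device=device)
    dones = torch.zeros(8, device=device)
    actions = torch.randint(0, 2, (8,), device=device).tolist()

    logits, values = policy(obs)
    log_probs = F.log_softmax(logits, dim=-1)
    probs = F.softmax(logits, dim=-1)
    with torch.no_grad():
        _, next_values = policy(next_obs)
    # One-step TD targets for the critic; gamma = 0.99.
    value_targets = rewards + 0.99 * (1 - dones) * next_values.squeeze()

    trajectory = {
        'log_probs': log_probs,
        'probs': probs,
        'values': values,
        'actions': actions,
        'rewards': rewards.tolist(),
        'value_targets': value_targets,
    }
    agent.step(trajectory)
    print('policy loss:', agent.logger['policy_loss'][-1])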