# based on
# https://github.com/pytorch/examples/blob/master/reinforcement_learning/actor_critic.py
# http://rail.eecs.berkeley.edu/deeprlcourse-fa17/f17docs/lecture_5_actor_critic_pdf.pdf

import argparse
import gym
import numpy as np
from itertools import count
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import os

os.environ["WANDB_BASE_URL"] = "http://api.wandb.ai"

import wandb

# ------------------------------------------------------------------------------------------------------------~


# Cart Pole

parser = argparse.ArgumentParser(description='PyTorch actor-critic example')
# parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
#                     help='discount factor (default: 0.99)')
parser.add_argument('--seed', type=int, default=543, metavar='N',
                    help='random seed (default: 543)')
parser.add_argument('--render', action='store_true',
                    help='render the environment')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='interval between training status logs (default: 10)')
args = parser.parse_args()


hyperparameter_defaults = dict(
    policy_lr=1e-2,
    n_episodes=2000,
    gamma_lr=1e-1,   # used for the 'Adapt' option
    alpha_lr=1e0,   # used for the 'SigmoidAdapt' option
    gamma_type='Adapt',  # 'Adapt', 'SigmoidAdapt'
    use_paired=True,
    env_name='CartPole-v1',  # 'Acrobot-v1', 'CartPole-v1'
    normalize_r2go=True  # False | True
)

wandb.init(config=hyperparameter_defaults, project="GammaAdapt")
config = wandb.config
# ------------------------------------------------------------------------------------------------------------~
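# Added note (not part of the original script): because the defaults above are passed through
# wandb.init(config=...), a W&B sweep agent can override any of them. A minimal sweep config
# for this script (assuming it is saved as, e.g., gamma_adapt.py; the file name is hypothetical)
# might look like:
#
#   program: gamma_adapt.py
#   method: grid
#   parameters:
#     gamma_type:
#       values: ['Adapt', 'SigmoidAdapt']
#     policy_lr:
#       values: [0.01, 0.001]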

env = gym.make(config.env_name)
n_actions = env.action_space.n
n_inputs = env.observation_space.shape[0]

env.seed(args.seed)
torch.manual_seed(args.seed)

eps = np.finfo(np.float32).eps.item()


# ------------------------------------------------------------------------------------------------------------~


class DiscountFactor:
    def __init__(self):
        self.init_gamma = 0.001 if config.use_paired else 0.99
        self.max_gamma = 0.99
        self.gamma_type = config.gamma_type

    def get_tensor(self):
        # Return the tensor of gamma (possible to backprop through)
        raise NotImplementedError  # implemented by the subclasses below

    def get_item(self):
        # Return the current value of gamma as a simple number (not for backprop)
        gamma = self.get_tensor()
        return gamma.item()  # detach from the computation graph
# ------------------------------------------------------------------------------------------------------------~


class DiscountFactorAdaptSimple(DiscountFactor):
    def __init__(self):
        super().__init__()
        self.gamma = torch.tensor([self.init_gamma], requires_grad=True)
        self.gamma_optimizer = optim.Adam([self.gamma], lr=config.gamma_lr)

    def gd_step(self, loss):
        # Gradient descent step on gamma, followed by a projection back to [0, max_gamma]
        self.gamma_optimizer.zero_grad()
        loss.backward()
        self.gamma_optimizer.step()
        self.gamma.data = torch.clamp(self.gamma.data, 0, self.max_gamma)

    def get_tensor(self):
        # Return the tensor of gamma (possible to backprop through)
        return self.gamma
# ------------------------------------------------------------------------------------------------------------~


class DiscountFactorAdaptSigmoid(DiscountFactor):
    # Parameterize gamma = sigmoid(alpha) = 1 / (1 + exp(-alpha)), so gamma always lies in (0, 1)
    # and never needs to be clipped, which may make the optimization more stable.
    def __init__(self):
        super().__init__()
        init_alpha = np.log(self.init_gamma / (1 - self.init_gamma))  # logit of init_gamma
        self.alpha = torch.tensor([float(init_alpha)], requires_grad=True)  # float() keeps the default float32 dtype
        self.alpha_optimizer = optim.Adam([self.alpha], lr=config.alpha_lr)

    def gd_step(self, loss):
        # Gradient descent step on alpha (gamma follows through the sigmoid)
        self.alpha_optimizer.zero_grad()
        loss.backward()
        self.alpha_optimizer.step()

    def get_tensor(self):
        # Return the tensor of gamma (possible to backprop through)
        gamma = 1 / (1 + torch.exp(-self.alpha))
        return gamma
# ------------------------------------------------------------------------------------------------------------~

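# Illustrative sanity check (added; this helper is not called anywhere in the training code):
# with the sigmoid parameterization, gamma equals init_gamma at initialization, stays strictly
# inside (0, 1) for any alpha, and gradients flow to alpha with d(gamma)/d(alpha) = gamma * (1 - gamma).
def _check_sigmoid_gamma(init_gamma=0.3):
    alpha = torch.tensor([np.log(init_gamma / (1 - init_gamma))], requires_grad=True)  # logit
    gamma = 1 / (1 + torch.exp(-alpha))  # same formula as DiscountFactorAdaptSigmoid.get_tensor
    assert abs(gamma.item() - init_gamma) < 1e-6
    gamma.backward()  # allowed: gamma has a single element
    assert abs(alpha.grad.item() - init_gamma * (1 - init_gamma)) < 1e-6
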

class Policy(nn.Module):
    """
    Implements both the actor and the critic in one model.
    """

    def __init__(self):
        super().__init__()
        self.affine1 = nn.Linear(n_inputs, 128)

        # actor's layer
        self.action_head = nn.Linear(128, n_actions)

        # critic's layer
        self.value_head = nn.Linear(128, 1)

        # buffers (save info from the last trajectory)
        self.ep_states = []  # list of states s_t in the last run trajectory
        self.ep_actions = []  # list of actions a_t in the last run trajectory
        self.ep_rewards = []  # list of rewards r_t in the last run trajectory
        self.ep_log_probs = []  # list of log(Pi(a_t|s_t)) in the last run trajectory
        self.ep_value_est = []  # list of estimated V(s_t) in the last run trajectory
        self.ep_len = None  # length of the last run trajectory

    def forward(self, x):
        """
        Forward pass of both actor and critic.
        """
        x = F.relu(self.affine1(x))

        # actor: chooses the action to take from state s_t
        # by returning the probability of each action
        action_prob = F.softmax(self.action_head(x), dim=-1)

        # critic: evaluates being in the state s_t
        state_values = self.value_head(x)

        # return values for both actor and critic as a tuple of 2 values:
        # 1. a tensor with the probability of each action over the action space
        # 2. the value estimate of state s_t
        return action_prob, state_values

    def run_model(self, state):
        state = torch.from_numpy(state).float()
        probs, state_value = self.forward(state)
        return probs, state_value

    def set_requires_grad(self, flag):
        for param in self.parameters():
            param.requires_grad = flag
# ------------------------------------------------------------------------------------------------------------~

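# Added usage sketch (illustrative only, not executed): the policy maps a single observation to
# a probability vector over the n_actions discrete actions and a scalar value estimate.
# For CartPole-v1 (n_inputs = 4, n_actions = 2):
#
#   pol = Policy()
#   probs, value = pol.run_model(env.reset())  # probs.shape == (2,), value.shape == (1,)
#   probs.sum()                                # ~= 1.0 (softmax output)
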

def select_action(probs):
    # create a categorical distribution over the list of probabilities of actions
    m = Categorical(probs)

    # and sample an action using the distribution
    action = m.sample()

    log_prob = m.log_prob(action)

    return action.item(), log_prob


# ------------------------------------------------------------------------------------------------------------~
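# Worked example (added for clarity, not executed):
#
#   probs = torch.tensor([0.7, 0.3])
#   m = Categorical(probs)
#   a = m.sample()               # returns 0 with probability 0.7, 1 with probability 0.3
#   m.log_prob(torch.tensor(0))  # == log(0.7) ~= -0.3567
#
# The log_prob returned by select_action keeps its gradient link to the policy network, which
# is what the policy-gradient loss below backpropagates through.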


def run_trajectory(pol: Policy):
    episode_length_limit = 10000  # don't infinite-loop while learning

    # clear the memory of the previous episode
    del pol.ep_states[:]
    del pol.ep_actions[:]
    del pol.ep_rewards[:]
    del pol.ep_log_probs[:]
    del pol.ep_value_est[:]

    ep_tot_reward = 0
    t = 0

    # Init environment
    state = env.reset()

    for t in range(1, episode_length_limit):
        probs, value_est = pol.run_model(state)
        action, log_prob = select_action(probs)
        pol.ep_states.append(state)  # store s_t (the state the action was selected in)
        state, reward, done, _ = env.step(action)
        if args.render:
            env.render()
        pol.ep_actions.append(action)
        pol.ep_rewards.append(reward)
        pol.ep_log_probs.append(log_prob)
        pol.ep_value_est.append(value_est)
        ep_tot_reward += reward
        if done:
            break

    ep_len = t
    pol.ep_len = ep_len
    return ep_tot_reward, ep_len


# ------------------------------------------------------------------------------------------------------------~


def get_episode_objective(pol, gamma):
    """
    Training code. Calculates the actor and critic loss for the last run trajectory.
    """
    # calculate the "reward-to-go"
    # (the cumulative discounted return starting at each time step until the end of the episode)

    r2go = 0.0
    r2go_ep = torch.zeros(pol.ep_len)  # the reward-to-go per time step in the episode
    for t in reversed(range(pol.ep_len)):  # go from the end of the episode to the start
        r = pol.ep_rewards[t]
        r2go = r + gamma * r2go
        r2go_ep[t] = r2go

    if config.normalize_r2go:
        # Normalize the returns (for variance reduction)
        # Note: this is an unjustified heuristic
        r2go_ep = (r2go_ep - r2go_ep.mean()) / (r2go_ep.std() + eps)

    # calculate the loss (objective)
    policy_losses = []  # list to save actor (policy) per-step losses
    value_losses = []  # list to save critic (value) per-step losses

    for t in range(pol.ep_len):
        r2go = r2go_ep[t]
        value_est = pol.ep_value_est[t]
        log_prob = pol.ep_log_probs[t]

        advantage = r2go - value_est.item()
        # note: value_est.item() detaches the critic, so we don't backprop through the advantage
        # into the value network (when gamma is a tensor, gradients w.r.t. gamma still flow via r2go)

        # calculate the actor (policy) loss
        policy_losses.append(-log_prob * advantage)

        # calculate the critic (value) loss using the smooth L1 loss
        value_losses.append(F.smooth_l1_loss(value_est.squeeze(), r2go))

    # sum up all the values of policy_losses and value_losses
    actor_loss = torch.stack(policy_losses).sum()
    critic_loss = torch.stack(value_losses).sum()

    loss = actor_loss + critic_loss
    return loss


# ------------------------------------------------------------------------------------------------------------~
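# Worked example (added; this helper is illustrative and never called by the training loop):
# with rewards [1, 1, 1] and gamma = 0.5, the backward recursion r2go_t = r_t + gamma * r2go_{t+1}
# used above gives reward-to-go values [1.75, 1.5, 1.0].
def _reward_to_go_example(rewards=(1.0, 1.0, 1.0), gamma=0.5):
    r2go = 0.0
    r2go_list = [0.0] * len(rewards)
    for t in reversed(range(len(rewards))):  # from the last step back to the first
        r2go = rewards[t] + gamma * r2go
        r2go_list[t] = r2go
    return r2go_list  # [1.75, 1.5, 1.0] for the defaults above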


def main():
    all_rewards = []
    gammas = []
    running_reward = 0

    # Initialize gamma (the discount factor)
    if config.gamma_type == 'Adapt':
        discount = DiscountFactorAdaptSimple()
    elif config.gamma_type == 'SigmoidAdapt':
        discount = DiscountFactorAdaptSigmoid()
    else:
        raise ValueError(f'Unknown gamma_type: {config.gamma_type}')

    # Initialize policies:
    polP = Policy()  # Protagonist policy
    polA = Policy()  # Antagonist policy

    polP_optimizer = optim.Adam(polP.parameters(), lr=config.policy_lr)
    polA_optimizer = optim.Adam(polA.parameters(), lr=config.policy_lr)

    # run n_episodes episodes
    for i_episode in range(config.n_episodes):

        polA.set_requires_grad(True)
        polP.set_requires_grad(True)

        # get the Protagonist loss and update the Protagonist policy
        ep_tot_rewardP, ep_lenP = run_trajectory(polP)
        lossP = get_episode_objective(polP, discount.get_item())

        polP_optimizer.zero_grad()
        lossP.backward()  # perform backprop
        polP_optimizer.step()

        if config.use_paired:
            # get the Antagonist loss and update the Antagonist policy
            ep_tot_rewardA, ep_lenA = run_trajectory(polA)
            lossA = get_episode_objective(polA, discount.get_item())

            polA_optimizer.zero_grad()
            lossA.backward()  # perform backprop
            polA_optimizer.step()

            # freeze both policies: only gamma is updated in the regret step below
            polA.set_requires_grad(False)
            polP.set_requires_grad(False)

            # Regret step (see the illustrative note after main() below)
            # get the Protagonist loss (with gamma as a differentiable tensor)
            ep_tot_rewardP, ep_lenP = run_trajectory(polP)
            lossP = get_episode_objective(polP, discount.get_tensor())
            # get the Antagonist loss (with gamma as a differentiable tensor)
            ep_tot_rewardA, ep_lenA = run_trajectory(polA)
            lossA = get_episode_objective(polA, discount.get_tensor())
            neg_regret = lossA - lossP  # note: the signs are flipped because this is a minimization objective

            # Take a gradient step on gamma
            discount.gd_step(neg_regret)

        # -------------------------------------------
        # (Alternative variant, left from an earlier version: polP could instead be stepped to
        # minimize +Regret and polA to maximize it, with gamma updated by a separate optimizer
        # and clipped to [0, 0.99]. The active code above uses standard actor-critic updates for
        # both policies and lets the DiscountFactor gd_step handle the gamma update.)

        # update cumulative reward
        all_rewards.append(ep_tot_rewardP)
        gammas.append(discount.get_item())
        running_reward = 0.05 * ep_tot_rewardP + (1 - 0.05) * running_reward

        # log results
        if i_episode % args.log_interval == 0:
            print('Protagonist: Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                i_episode, ep_tot_rewardP, running_reward))
            print(f'gamma = {discount.get_item():.4f}')
            wandb.log({'Protagonist reward': np.mean(all_rewards)}, step=i_episode)
            wandb.log({'Gamma': np.mean(gammas)}, step=i_episode)
            gammas = []
            all_rewards = []

        # check if we have "solved" the cart pole problem
        # if running_reward > 0:  # env.spec.reward_threshold:
        #     print("Solved! Running reward is now {} and "
        #           "the last episode runs to {} time steps!".format(running_reward, ep_lenP))
        #     break


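# Explanatory note on the regret step inside main() (added sketch, not original code):
# with both policies frozen, lossA and lossP depend on gamma only through the discounted
# reward-to-go, and
#
#   neg_regret = lossA - lossP == -(lossP - lossA)
#
# so discount.gd_step(neg_regret) performs gradient *descent* on -(lossP - lossA), i.e. it moves
# gamma so as to *increase* the Protagonist-minus-Antagonist loss gap (a surrogate for the regret
# that the adversary maximizes in PAIRED-style training, which is what config.use_paired toggles).
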
if __name__ == '__main__':
    main()