import tensorflow as tf

# Actor: a categorical policy over act_dim discrete actions (TF1-style graph code)
with tf.variable_scope('pi'):
    # mlp is a helper that builds a plain dense feed-forward network
    # (a sketch of it appears after update() below)
    logits = mlp(obs_ph, hidden_sizes=hidden_sizes + [act_dim], activation=tf.nn.relu, output_activation=None)
    # Sample an action from the categorical distribution defined by the logits
    pi = tf.squeeze(tf.multinomial(logits, num_samples=1), axis=1)
    # Log-prob of the actions actually taken (act_ph) under the current policy
    logp = tf.reduce_sum(tf.one_hot(act_ph, depth=act_dim) * tf.nn.log_softmax(logits), axis=1)
    # Log-prob of the freshly sampled actions (stored at collection time as logp_old)
    logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * tf.nn.log_softmax(logits), axis=1)

# PPO-Clip objective; the tf.where form below is equivalent to
# min(ratio * adv, clip(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv)
ratio = tf.exp(logp - logp_old_ph)
clipped_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)
pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, clipped_adv))
train_pi = tf.train.AdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)

# Critic
with tf.variable_scope('v'):
    v = tf.squeeze(mlp(obs_ph, hidden_sizes=hidden_sizes + [1], activation=tf.tanh, output_activation=None), axis=1)
v_loss = tf.reduce_mean((ret_ph - v) ** 2)
train_v = tf.train.AdamOptimizer(learning_rate=v_lr).minimize(v_loss)

def update(feed_ph, sess, max_kl=0.01, train_pi_iters=80, train_v_iters=80):
    # feed_ph is the list of placeholders matching the order of buf.get()
    # (max_kl is accepted but no KL-based early stopping is done in this snippet)
    inputs = {k: v for k, v in zip(feed_ph, buf.get())}

    # Policy gradient steps
    for _ in range(train_pi_iters):
        sess.run(train_pi, feed_dict=inputs)

    # Value function learning
    for _ in range(train_v_iters):
        sess.run(train_v, feed_dict=inputs)
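
# The paste uses, but does not define, the mlp helper and the input placeholders.
# Minimal sketches of what they are assumed to look like (make_placeholders and
# obs_dim are hypothetical names, not taken from the paste):

def mlp(x, hidden_sizes, activation=tf.nn.relu, output_activation=None):
    # A plain stack of dense layers; the last layer uses output_activation (None = linear)
    for size in hidden_sizes[:-1]:
        x = tf.layers.dense(x, units=size, activation=activation)
    return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation)

def make_placeholders(obs_dim):
    # Shapes assumed from how the placeholders are used above: flat observations,
    # integer actions, and per-timestep scalars for advantages, returns, and log-probs.
    obs_ph = tf.placeholder(tf.float32, shape=(None, obs_dim))   # observations
    act_ph = tf.placeholder(tf.int32, shape=(None,))             # actions taken
    adv_ph = tf.placeholder(tf.float32, shape=(None,))           # advantage estimates
    ret_ph = tf.placeholder(tf.float32, shape=(None,))           # returns-to-go
    logp_old_ph = tf.placeholder(tf.float32, shape=(None,))      # log-probs at collection time
    return obs_ph, act_ph, adv_ph, ret_ph, logp_old_ph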

# PyTorch version of the same actor-critic. These lines appear to be excerpted from
# an agent class (note self.obs_dim etc.): the network and optimizer definitions
# would live in __init__, and update() below is a method of that class.
import torch as t
import torch.nn as nn
from torch.nn.functional import log_softmax, one_hot

# Actor: maps observations to action logits
self.actor = nn.Sequential(
    nn.Linear(self.obs_dim, h_dim),   # input
    nn.ReLU(),
    nn.Linear(h_dim, h_dim),          # hidden 1
    nn.ReLU(),
    nn.Linear(h_dim, h_dim),          # hidden 2
    nn.ReLU(),
    nn.Linear(h_dim, self.act_dim)    # output
)

# Critic: maps observations to a scalar state value
self.critic = nn.Sequential(
    nn.Linear(self.obs_dim, h_dim),   # input
    nn.Tanh(),
    nn.Linear(h_dim, h_dim),          # hidden 1
    nn.Tanh(),
    nn.Linear(h_dim, h_dim),          # hidden 2
    nn.Tanh(),
    nn.Linear(h_dim, 1)               # output
)

self.a_op = t.optim.Adam(self.actor.parameters(), lr=pi_lr)
self.c_op = t.optim.Adam(self.critic.parameters(), lr=v_lr)
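
# The paste has no PyTorch counterpart of the tf.multinomial sampling line above.
# A minimal sketch of such a step (hypothetical helper, assuming a flat numpy
# observation and the discrete actor defined here):
def act(self, obs):
    with t.no_grad():
        logits = self.actor(t.from_numpy(obs).float().unsqueeze(0))
        # Sample one action from the categorical distribution given by the logits
        a = t.multinomial(t.softmax(logits, dim=1), num_samples=1)
    return a.item()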

def update(self):
    # Convert the stored rollout (numpy arrays) into float tensors
    buf = {k: t.from_numpy(v).float()
           for k, v in zip(['obs', 'acts', 'advs', 'rets', 'logps'], self.buf.get())}

    # Policy gradient steps on the PPO-Clip objective (same clipping trick as above)
    for _ in range(self.train_pi_iters):
        logits = self.actor(buf['obs'])
        logps = t.sum(one_hot(buf['acts'].long(), self.act_dim) * log_softmax(logits, dim=1), dim=1)

        ratio = t.exp(logps - buf['logps'])
        clipped_adv = t.where(buf['advs'] > 0,
                              (1 + self.clip_ratio) * buf['advs'],
                              (1 - self.clip_ratio) * buf['advs'])
        pi_loss = -t.mean(t.min(ratio * buf['advs'], clipped_adv))

        self.a_op.zero_grad()
        pi_loss.backward()
        self.a_op.step()

    # Value function learning by regression on returns
    for _ in range(self.train_v_iters):
        v = t.squeeze(self.critic(buf['obs']), dim=1)
        v_loss = t.mean((buf['rets'] - v) ** 2)

        self.c_op.zero_grad()
        v_loss.backward()
        self.c_op.step()
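
# For reference (not in the original paste): the one_hot * log_softmax construction in
# update() is just the categorical log-probability of the taken actions. An equivalent,
# self-contained way to compute it uses torch.distributions.Categorical:
def categorical_logp(logits, acts):
    from torch.distributions import Categorical
    # log pi(a|s); matches t.sum(one_hot(acts, act_dim) * log_softmax(logits, dim=1), dim=1)
    return Categorical(logits=logits).log_prob(acts)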