# --- TensorFlow (v1) implementation ---
import tensorflow as tf

# Actor
with tf.variable_scope('pi'):
    # mlp is a helper that simply builds a dense feed-forward network
    logits = mlp(obs_ph, hidden_sizes=hidden_sizes+[act_dim], activation=tf.nn.relu, output_activation=None)
    # sample an action from the categorical policy
    pi = tf.squeeze(tf.multinomial(logits, num_samples=1), axis=1)
    # log-prob of the actions actually taken (fed in from the buffer)
    logp = tf.reduce_sum(tf.one_hot(act_ph, depth=act_dim) * tf.nn.log_softmax(logits), axis=1)
    # log-prob of the sampled action, recorded at rollout time for the ratio below
    logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * tf.nn.log_softmax(logits), axis=1)

# PPO objectives
ratio = tf.exp(logp - logp_old_ph)
clipped_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)
pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, clipped_adv))
train_pi = tf.train.AdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)

# Critic
with tf.variable_scope('v'):
    v = tf.squeeze(mlp(obs_ph, hidden_sizes=hidden_sizes+[1], activation=tf.tanh, output_activation=None), axis=1)
v_loss = tf.reduce_mean((ret_ph - v) ** 2)
train_v = tf.train.AdamOptimizer(learning_rate=v_lr).minimize(v_loss)
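# Sketch, not in the original paste: the placeholders the graph above assumes. The exact
# shapes and dtypes are my assumption; obs_dim and act_dim would come from the environment.
obs_ph = tf.placeholder(tf.float32, shape=(None, obs_dim))
act_ph = tf.placeholder(tf.int32, shape=(None,))
adv_ph = tf.placeholder(tf.float32, shape=(None,))
ret_ph = tf.placeholder(tf.float32, shape=(None,))
logp_old_ph = tf.placeholder(tf.float32, shape=(None,))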
def update(feed_ph, sess, max_kl=0.01, train_pi_iters=80, train_v_iters=80):
    # feed_ph is the list of placeholders, in the same order as the arrays from buf.get()
    inputs = {k: v for k, v in zip(feed_ph, buf.get())}
    # Policy gradient steps (max_kl is accepted but no KL early stopping is done here)
    for i in range(train_pi_iters):
        _ = sess.run(train_pi, feed_dict=inputs)
    # Value function learning
    for _ in range(train_v_iters):
        _ = sess.run(train_v, feed_dict=inputs)
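# Sketch, not in the original paste: one way the unused max_kl argument could drive early
# stopping, assuming an approx_kl = tf.reduce_mean(logp_old_ph - logp) node is added to the
# graph. The policy-gradient loop above would then read:
#
#     for i in range(train_pi_iters):
#         _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
#         if kl > 1.5 * max_kl:
#             break  # stop policy updates once the new policy drifts too far from the old one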
# --- PyTorch implementation ---
import torch as t
import torch.nn as nn
from torch.nn.functional import log_softmax, one_hot

self.actor = nn.Sequential(
    nn.Linear(self.obs_dim, h_dim),   # input
    nn.ReLU(),
    nn.Linear(h_dim, h_dim),          # hidden 1
    nn.ReLU(),
    nn.Linear(h_dim, h_dim),          # hidden 2
    nn.ReLU(),
    nn.Linear(h_dim, self.act_dim)    # output: action logits
)
self.critic = nn.Sequential(
    nn.Linear(self.obs_dim, h_dim),   # input
    nn.Tanh(),
    nn.Linear(h_dim, h_dim),          # hidden 1
    nn.Tanh(),
    nn.Linear(h_dim, h_dim),          # hidden 2
    nn.Tanh(),
    nn.Linear(h_dim, 1)               # output: state value
)
self.a_op = t.optim.Adam(self.actor.parameters(), lr=pi_lr)
self.c_op = t.optim.Adam(self.critic.parameters(), lr=v_lr)
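# Sketch, not in the original paste: a get_action helper mirroring pi / logp_pi from the
# TF version, using a Categorical distribution over the actor's logits. The method name
# and its exact placement are my assumption.
from torch.distributions import Categorical

def get_action(self, obs):
    logits = self.actor(t.from_numpy(obs).float())
    dist = Categorical(logits=logits)
    a = dist.sample()
    # return the action and its log-prob, to be stored in the buffer as 'logps'
    return a.item(), dist.log_prob(a).item()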
def update(self):
    # pull the whole on-policy batch out of the buffer as float tensors
    buf = {k: t.from_numpy(v).float() for k, v in zip(['obs', 'acts', 'advs', 'rets', 'logps'], self.buf.get())}
    # Policy gradient steps
    for _ in range(self.train_pi_iters):
        logits = self.actor(buf['obs'])
        logps = t.sum(one_hot(buf['acts'].long(), self.act_dim) * log_softmax(logits, dim=1), dim=1)
        ratio = t.exp(logps - buf['logps'])
        clipped_adv = t.where(buf['advs'] > 0, (1 + clip_ratio) * buf['advs'], (1 - clip_ratio) * buf['advs'])
        pi_loss = -t.mean(t.min(ratio * buf['advs'], clipped_adv))
        self.a_op.zero_grad()
        pi_loss.backward()
        self.a_op.step()
    # Value function learning
    for _ in range(self.train_v_iters):
        v = t.squeeze(self.critic(buf['obs']), dim=1)
        v_loss = t.mean((buf['rets'] - v) ** 2)
        self.c_op.zero_grad()
        v_loss.backward()
        self.c_op.step()
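# Sketch, not in the original paste: the buffer interface both update() versions assume.
# get() returns the stored rollout data in the order [obs, acts, advs, rets, logps] as
# numpy arrays; this sketch assumes advantages and returns are handed to store() already
# computed (e.g. by GAE), which is not shown here.
import numpy as np

class Buffer:
    def __init__(self):
        self.data = {k: [] for k in ['obs', 'acts', 'advs', 'rets', 'logps']}

    def store(self, obs, act, adv, ret, logp):
        for k, x in zip(['obs', 'acts', 'advs', 'rets', 'logps'], [obs, act, adv, ret, logp]):
            self.data[k].append(x)

    def get(self):
        out = [np.asarray(self.data[k], dtype=np.float32) for k in ['obs', 'acts', 'advs', 'rets', 'logps']]
        self.data = {k: [] for k in self.data}  # reset for the next epoch of rollouts
        return out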