Advertisement
Guest User

Untitled

a guest
Apr 26th, 2017
69
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.03 KB | None | 0 0
  1. import gym
  2. import numpy as np
  3.  
  def gen_random_policy(n_obs=4):
      """Sample a random linear policy.

      Args:
          n_obs: Length of the weight vector (one weight per observation
              feature). Defaults to 4, the size that was previously
              hard-coded.

      Returns:
          A ``(weights, bias)`` tuple. ``weights`` is a 1-D NumPy array of
          ``n_obs`` values and ``bias`` is a single float; every value is
          drawn uniformly from ``[-1, 1)``.
      """
      # Draw the weights before the bias so the random stream is consumed in
      # the same order as the original one-line tuple expression.
      weights = np.random.uniform(-1, 1, size=n_obs)
      bias = np.random.uniform(-1, 1)
      return (weights, bias)
  6.  
  def policy_to_action(env, policy, obs):
      """Choose a binary action from a linear policy.

      Args:
          env: Unused; kept so the signature stays the same for callers.
          policy: Indexable pair where ``policy[0]`` holds the weights and
              ``policy[1]`` the bias.
          obs: Observation vector, dotted with the weights.

      Returns:
          ``1`` when ``dot(weights, obs) + bias`` is strictly positive,
          otherwise ``0``.
      """
      weights = policy[0]
      bias = policy[1]
      activation = np.dot(weights, obs) + bias
      return 1 if activation > 0 else 0
  12.  
  def run_episode(env, policy, t_max=1000, render=False):
      """Run one episode with ``policy`` and return the accumulated reward.

      Works with both generations of the Gym environment API: the older one
      where ``reset()`` returns the observation and ``step()`` returns
      ``(obs, reward, done, info)``, and the newer one where ``reset()``
      returns ``(obs, info)`` and ``step()`` returns
      ``(obs, reward, terminated, truncated, info)``.

      Args:
          env: Environment exposing ``reset()``, ``step(action)`` and, when
              ``render`` is true, ``render()``.
          policy: Policy passed through to ``policy_to_action``.
          t_max: Maximum number of steps before the episode is cut off.
          render: If true, call ``env.render()`` before every step.

      Returns:
          Sum of the rewards received until the environment signalled the
          end of the episode or ``t_max`` steps were taken.
      """
      reset_result = env.reset()
      # Newer API: reset() -> (obs, info) with info a dict. Anything else is
      # treated as the bare observation of the older API.
      if (isinstance(reset_result, tuple) and len(reset_result) == 2
              and isinstance(reset_result[1], dict)):
          obs = reset_result[0]
      else:
          obs = reset_result

      total_reward = 0
      for _ in range(t_max):
          if render:
              env.render()
          selected_action = policy_to_action(env, policy, obs)
          step_result = env.step(selected_action)
          obs, reward = step_result[0], step_result[1]
          # Everything between the reward and the trailing info dict is an
          # end-of-episode flag: [done] or [terminated, truncated].
          done = any(step_result[2:-1])
          total_reward += reward
          if done:
              break
      return total_reward
  25.  
  # Random search over linear policies: sample a pool of policies, score
  # each one with a single episode, then replay the best scorer with
  # rendering enabled.
  env = gym.make('CartPole-v0')
  try:
      # Generate a pool of random policies.
      policy_list = [gen_random_policy() for _ in range(200)]

      # Evaluate the score of each policy.
      scores_list = [run_episode(env, p) for p in policy_list]

      # Select the best policy.
      print('Best policy score = %f' % max(scores_list))

      best_policy = policy_list[np.argmax(scores_list)]
      print('Running with best policy:\n')
      run_episode(env, best_policy, render=True)
  finally:
      # Release the environment (and any render window) even if an episode
      # raised part-way through.
      env.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement