Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import gym
- import numpy as np
def gen_random_policy(obs_dim=4):
    """Draw a random linear policy.

    Args:
        obs_dim: Length of the weight vector, i.e. the number of observation
            components the weights will be dotted with. Defaults to 4, the
            value that used to be hard-coded.

    Returns:
        A ``(weights, bias)`` tuple: ``weights`` is a 1-D array of
        ``obs_dim`` values and ``bias`` is a single float, each sampled
        with ``np.random.uniform(-1, 1)``.
    """
    # Weights are drawn before the bias so seeded runs reproduce the same
    # sequence as the original one-line tuple expression.
    weights = np.random.uniform(-1, 1, size=obs_dim)
    bias = np.random.uniform(-1, 1)
    return (weights, bias)
def policy_to_action(env, policy, obs):
    """Map an observation to a binary action using a linear policy.

    ``policy`` is indexed as ``(weights, bias)``. The result is 1 when
    ``dot(weights, obs) + bias`` is strictly greater than zero and 0
    otherwise. ``env`` is accepted but not used here.
    """
    weights = policy[0]
    bias = policy[1]
    activation = np.dot(weights, obs) + bias
    return 1 if activation > 0 else 0
def run_episode(env, policy, t_max=1000, render=False):
    """Run one episode with a linear policy and return the summed reward.

    Accepts both Gym calling conventions: the classic one
    (``reset() -> obs`` and ``step() -> (obs, reward, done, info)``) and
    the newer one (``reset() -> (obs, info)`` and
    ``step() -> (obs, reward, terminated, truncated, info)``).

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and
            ``render()``.
        policy: ``(weights, bias)`` pair handed to ``policy_to_action``.
        t_max: Maximum number of steps to take.
        render: When true, ``env.render()`` is called before every step.

    Returns:
        The sum of the rewards collected until the environment reported the
        episode as finished or ``t_max`` steps were taken.
    """
    reset_result = env.reset()
    # Newer API returns (obs, info); the info-dict check avoids mistaking a
    # genuine tuple-valued observation for that pair.
    if (isinstance(reset_result, tuple) and len(reset_result) == 2
            and isinstance(reset_result[1], dict)):
        obs = reset_result[0]
    else:
        obs = reset_result

    total_reward = 0
    for _ in range(t_max):
        if render:
            env.render()
        selected_action = policy_to_action(env, policy, obs)
        step_result = env.step(selected_action)
        if len(step_result) == 5:
            # Newer API splits the end-of-episode flag in two.
            obs, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            obs, reward, done, _ = step_result
        total_reward += reward
        if done:
            break
    return total_reward
# Random search: sample many linear policies, score each with one episode,
# then replay the highest-scoring one with rendering enabled.
env = gym.make('CartPole-v0')
try:
    # Generate a pool of random policies.
    policy_list = [gen_random_policy() for _ in range(200)]

    # Evaluate the score of each policy.
    scores_list = [run_episode(env, p) for p in policy_list]

    # Select the best policy.
    print('Best policy score = %f' % max(scores_list))
    best_policy = policy_list[np.argmax(scores_list)]

    print('Running with best policy:\n')
    run_episode(env, best_policy, render=True)
finally:
    # Release the environment (and any render window) even if a run fails.
    env.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement