Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- Created on Sun May 28 15:21:00 2017
- @author: diegslva
- Here we try use Q-learning to solve CartPole-v0 with quantize states
- That means we going bin each states so that this set of states will be
- discrete and finite
- We update our reward to -300 to make our model to not go to far
- and restrict our area using bins
- """
- from __future__ import print_function, division
- from builtins import range
- import gym
- import os
- import sys
- import numpy as np
- import matplotlib.pyplot as plt
- from gym import wrappers
- from datetime import datetime
- # turn list of integers into an int
- # ex.
- # build_state([1,2,3,4,5]) -> 12345
def build_state(features):
    """Collapse a sequence of small integers into a single integer id.

    Each feature is cast to int and its decimal digits are concatenated
    in order, e.g. build_state([1, 2, 3, 4, 5]) -> 12345.
    """
    digits = [str(int(feature)) for feature in features]
    return int("".join(digits))
def to_bin(value, bins):
    """Return the index of the bin that `value` falls into.

    Thin wrapper over np.digitize: index 0 means `value` is below the
    first edge, len(bins) means it is at or above the last edge.
    The result is cast to a plain int (np.digitize yields np.int64).
    """
    return int(np.digitize(x=[value], bins=bins)[0])
class FeatureTransformer:
    """Quantizes a continuous CartPole observation into one discrete state.

    Each of the four observation components is binned into one of ten
    buckets, and the four bucket indices are concatenated into a single
    integer via build_state.
    """

    def __init__(self):
        # Note: to make this better you could look at how often each bin
        # was actually used while running this script.  It's not clear from
        # the high/low values nor sample() what values we really expect.
        self.cart_position_bins = np.linspace(-2.4, 2.4, 9)
        self.cart_velocity_bins = np.linspace(-2, 2, 9)      # true range (-inf, inf)
        self.pole_angle_bins = np.linspace(-0.4, 0.4, 9)
        self.pole_velocity_bins = np.linspace(-3.5, 3.5, 9)  # true range (-inf, inf)

    def transform(self, observation):
        """Return the integer state id for one (pos, vel, angle, ang_vel) observation."""
        component_bins = (
            self.cart_position_bins,
            self.cart_velocity_bins,
            self.pole_angle_bins,
            self.pole_velocity_bins,
        )
        indices = [to_bin(value, bins) for value, bins in zip(observation, component_bins)]
        return build_state(indices)
class Model:
    """Tabular Q-learning model over the discretized state space."""

    def __init__(self, env, feature_transformer):
        """Allocate the Q-table, randomly initialized in [-1, 1)."""
        self.env = env
        self.feature_transformer = feature_transformer
        # Four observation components, ten bins each -> 10**4 states.
        num_states = 10 ** env.observation_space.shape[0]
        num_actions = env.action_space.n
        self.Q = np.random.uniform(low=-1, high=1, size=(num_states, num_actions))

    def predict(self, s):
        """Return the row of Q-values for state s (one entry per action)."""
        state = self.feature_transformer.transform(s)
        return self.Q[state]

    def update(self, s, a, G):
        """Move Q(s, a) toward the return G by a fixed step size."""
        state = self.feature_transformer.transform(s)
        learning_rate = 1e-2  # same value the original spelled as 10e-3
        self.Q[state, a] = self.Q[state, a] + learning_rate * (G - self.Q[state, a])

    def sample_action(self, s, eps):
        """Epsilon-greedy policy: random action with probability eps, else greedy."""
        if np.random.random() < eps:
            return self.env.action_space.sample()
        return np.argmax(self.predict(s))
def play_one(model, eps, gamma):
    """Play one episode with epsilon-greedy actions, updating the model.

    @param model: Model with .env, .sample_action, .predict and .update
    @param eps: exploration probability for this episode
    @param gamma: discount rate
    @return: totalreward -- undiscounted sum of the raw environment rewards

    Fix: use model.env instead of relying on a module-level global `env`,
    so the function is self-contained.
    """
    env = model.env
    observation = env.reset()
    done = False
    totalreward = 0
    iters = 0
    while not done and iters < 10000:
        action = model.sample_action(observation, eps)
        prev_observation = observation
        observation, reward, done, info = env.step(action)

        # accumulate the raw reward (the penalty below is for learning only)
        totalreward += reward

        # If the pole fell before hitting the 200-step limit, replace the
        # reward with a large penalty so the agent learns to avoid failing.
        if done and iters < 199:
            reward = -300

        # Q-learning target and update
        G = reward + gamma * np.max(model.predict(observation))
        model.update(prev_observation, action, G)
        iters += 1

    return totalreward
def plot_running_avg(totalrewards):
    """Plot the trailing 100-episode running average of episode rewards.

    @param totalrewards: 1-D numpy array of per-episode total rewards
    """
    N = len(totalrewards)
    running_avg = np.empty(N)
    for t in range(N):
        window_start = max(0, t - 100)
        running_avg[t] = totalrewards[window_start:t + 1].mean()
    plt.plot(running_avg)
    plt.title("Running Average")
    plt.show()
if __name__ == '__main__':
    # Whether to record this run with the gym Monitor wrapper
    recording = True

    env = gym.make('CartPole-v0')

    if recording:
        # Wrap BEFORE building the model so every step is taken through the
        # monitor (previously the model held the unwrapped env).
        filename = os.path.basename(__file__).split('.')[0]
        # strftime avoids ':' and spaces, which are invalid in directory
        # names on some filesystems (str(datetime.now()) contains both)
        timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        monitor_dir = './' + filename + '_' + timestamp
        env = wrappers.Monitor(env, monitor_dir)

    ft = FeatureTransformer()
    model = Model(env, ft)
    gamma = 0.9  # discount rate

    N = 10000
    totalrewards = np.empty(N)
    for n in range(N):
        eps = 1.0 / np.sqrt(n + 1)  # decaying exploration rate
        totalreward = play_one(model, eps, gamma)
        totalrewards[n] = totalreward
        if n % 100 == 0:
            print("episode:", n, "total reward:", totalreward, "eps:", eps)

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", totalrewards.sum())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()  # was missing: the Rewards figure was never displayed on its own
    plot_running_avg(totalrewards)

    env.close()

    # Submit to OpenAI Gym -- only meaningful when a recording exists.
    # Fix: previously monitor_dir was referenced unconditionally and the
    # script crashed with NameError when recording was False.
    if recording:
        print("Uploading to gym...")
        gym.scoreboard.api_key = ""  # Put your key here
        print("Results: " + str(gym.upload(monitor_dir)))
- #==============================================================================
- #[2017-05-28 18:47:01,029] Making new env: CartPole-v0
- # episode: 0 total reward: 29.0 eps: 1.0
- # episode: 100 total reward: 30.0 eps: 0.099503719021
- # ...
- # episode: 9800 total reward: 200.0 eps: 0.010101010101
- # episode: 9900 total reward: 200.0 eps: 0.0100498705962
- # avg reward for last 100 episodes: 197.23
- # total steps: 1834026.0
- #==============================================================================
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement