#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun May 28 15:21:00 2017

@author: diegslva

Here we use Q-learning to solve CartPole-v0 with quantized states.
Each continuous state variable is binned, so the resulting set of states
is discrete and finite (see the small illustration after to_bin below).

We change the reward to -300 when the pole falls early, to discourage the
model from drifting too far, and we restrict the observed range of each
variable with the bins.
"""

from __future__ import print_function, division
from builtins import range

import gym
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
from gym import wrappers
from datetime import datetime


# turn a list of integers into a single int
# ex.
# build_state([1,2,3,4,5]) -> 12345
def build_state(features):
    """Concatenate the bin indices of all features into one integer state."""
    return int("".join(map(lambda feature: str(int(feature)), features)))
#end build_state


def to_bin(value, bins):
    """Return the index of the bin that value falls into."""
    return np.digitize(x=[value], bins=bins)[0]
#end to_bin

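# A small illustration of the discretization (hypothetical values, not used by
# the training loop): np.digitize returns the index of the bin a value falls
# into, and build_state concatenates those indices into a single integer, e.g.
#
#   bins = np.linspace(-2.4, 2.4, 9)   # 9 edges -> indices 0..9
#   to_bin(0.0, bins)                  # -> 5
#   build_state([5, 4, 5, 4])          # -> 5454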

class FeatureTransformer:
    def __init__(self):
        # Note: to make this better you could look at how often each bin was
        # actually used while running this script.
        # It's not clear from the high/low values nor from sample() what
        # values we really expect here.
        self.cart_position_bins = np.linspace(-2.4, 2.4, 9)
        self.cart_velocity_bins = np.linspace(-2, 2, 9)      # (-inf, inf)
        self.pole_angle_bins = np.linspace(-0.4, 0.4, 9)
        self.pole_velocity_bins = np.linspace(-3.5, 3.5, 9)  # (-inf, inf)

    def transform(self, observation):
        # return an int
        cart_pos, cart_vel, pole_angle, pole_vel = observation
        return build_state([
            to_bin(cart_pos, self.cart_position_bins),
            to_bin(cart_vel, self.cart_velocity_bins),
            to_bin(pole_angle, self.pole_angle_bins),
            to_bin(pole_vel, self.pole_velocity_bins)
        ])
#end FeatureTransformer

class Model:
    def __init__(self, env, feature_transformer):
        self.env = env
        self.feature_transformer = feature_transformer
        # 10 bins per observation dimension -> 10**4 possible discrete states
        num_states = 10**env.observation_space.shape[0]
        num_actions = env.action_space.n
        self.Q = np.random.uniform(low=-1, high=1, size=(num_states, num_actions))
    #end __init__

    def predict(self, s):
        x = self.feature_transformer.transform(s)
        return self.Q[x]
    #end predict

    def update(self, s, a, G):
        """Update our model: move Q(s, a) toward the target G (learning rate 10e-3)."""
        x = self.feature_transformer.transform(s)
        self.Q[x, a] += 10e-3*(G - self.Q[x, a])
    #end update

    def sample_action(self, s, eps):
        # epsilon-greedy: explore with probability eps, otherwise act greedily
        if np.random.random() < eps:
            return self.env.action_space.sample()
        else:
            p = self.predict(s)
            return np.argmax(p)
    #end sample_action
#end Model
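
# A minimal usage sketch (illustrative only; assumes `env` is a CartPole-v0
# environment created with gym.make):
#   ft = FeatureTransformer()
#   model = Model(env, ft)
#   obs = env.reset()
#   model.predict(obs)             # row of Q-values, shape (num_actions,)
#   model.sample_action(obs, 0.1)  # epsilon-greedy action (0 or 1)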

def play_one(model, env, eps, gamma):
    """Play one episode

    @return: totalreward
    """
    observation = env.reset()
    done = False
    totalreward = 0
    iters = 0
    while not done and iters < 10000:
        action = model.sample_action(observation, eps)
        prev_observation = observation
        observation, reward, done, info = env.step(action)

        # accumulate our rewards
        totalreward += reward

        # if the pole falls down before reaching the 200-step limit
        # we decrease the reward to -300
        if done and iters < 199:
            reward = -300

        # update the model
        G = reward + gamma*np.max(model.predict(observation))
        model.update(prev_observation, action, G)

        iters += 1

    return totalreward
#end play_one
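
# Worked example of the update target (illustrative numbers): on a normal step
# the reward is +1, so with gamma = 0.9 the target is
#     G = 1 + 0.9 * max_a Q(s', a)
# while on an early terminal step the shaped reward is -300, giving
#     G = -300 + 0.9 * max_a Q(s', a)
# and Model.update() then nudges Q(s, a) toward G.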


def plot_running_avg(totalrewards):
    """Plot the running average for a better view"""
    N = len(totalrewards)
    running_avg = np.empty(N)
    for t in range(N):
        running_avg[t] = totalrewards[max(0, t-100):(t+1)].mean()
    plt.plot(running_avg)
    plt.title("Running Average")
    plt.show()
#end plot_running_avg


if __name__ == '__main__':
    # initialize variables
    recording = True

    env = gym.make('CartPole-v0')
    ft = FeatureTransformer()
    model = Model(env, ft)

    # discount rate
    gamma = 0.9

    if recording:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 10000
    totalrewards = np.empty(N)
    for n in range(N):
        # decay exploration as training progresses
        eps = 1.0/np.sqrt(n+1)
        totalreward = play_one(model, env, eps, gamma)
        totalrewards[n] = totalreward
        if n % 100 == 0:
            print("episode:", n, "total reward:", totalreward, "eps:", eps)
    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", totalrewards.sum())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()

    plot_running_avg(totalrewards)


    # Submit to OpenAI Gym (the scoreboard/upload API only exists in 2017-era gym releases)
    env.close()
    print("Uploading to gym...")
    gym.scoreboard.api_key = ""  # Put your key here
    print("Results: " + str(gym.upload(monitor_dir)))


#==============================================================================
#[2017-05-28 18:47:01,029] Making new env: CartPole-v0
# episode: 0 total reward: 29.0 eps: 1.0
# episode: 100 total reward: 30.0 eps: 0.099503719021
# ...
# episode: 9800 total reward: 200.0 eps: 0.010101010101
# episode: 9900 total reward: 200.0 eps: 0.0100498705962
# avg reward for last 100 episodes: 197.23
# total steps: 1834026.0
#==============================================================================