Patyczek

a guest · Mar 24th, 2018 · Python

"""Tabular Q-learning on Gym's CartPole-v1 over a discretised state space."""
import math
import numpy as np

import gym


class QLearner:
    def __init__(self):
        self.environment = gym.make('CartPole-v1')
        self.attempt_no = 1
        # Cart velocity and pole angular velocity are unbounded in the raw
        # observation space, so they are clamped to hand-picked ranges
        # (±0.5 and ±50 degrees) before discretisation.
        self.upper_bounds = [
            self.environment.observation_space.high[0],
            0.5,
            self.environment.observation_space.high[2],
            math.radians(50)
        ]
        self.lower_bounds = [
            self.environment.observation_space.low[0],
            -0.5,
            self.environment.observation_space.low[2],
            -math.radians(50)
        ]
        # Four buckets per state dimension; the Q-table has one entry per
        # discretised state and action.
        self.buckets = (4, 4, 4, 4)
        self.Q = np.zeros(self.buckets + (self.environment.action_space.n,))
        self.epsilon = 0.2  # exploration rate
        self.alpha = 0.2    # learning rate
        self.gamma = 0.9    # discount factor

    def learn(self, max_attempts):
        # Run max_attempts episodes and return the reward earned in each.
        return [self.attempt() for _ in range(max_attempts)]

    def attempt(self):
        # Written against the classic Gym API (gym < 0.26), where reset()
        # returns the observation and step() returns a 4-tuple.
        observation = self.discretise(self.environment.reset())
        done = False
        reward_sum = 0.0
        while not done:
            # self.environment.render()
            action = self.pick_action(observation)
            new_observation, reward, done, info = self.environment.step(action)
            new_observation = self.discretise(new_observation)
            self.update_knowledge(action, observation, new_observation, reward)
            observation = new_observation
            reward_sum += reward
        print(reward_sum)
        self.attempt_no += 1
        return reward_sum

    def discretise(self, observation):
        # Map each continuous component to a bucket index: normalise it to
        # [0, 1] within its bounds, scale to the bucket count, and clamp so
        # out-of-range values land in the edge buckets.
        ratios = [(observation[i] - self.lower_bounds[i]) / (self.upper_bounds[i] - self.lower_bounds[i])
                  for i in range(len(observation))]
        new_obs = [int(round((self.buckets[i] - 1) * ratios[i])) for i in range(len(observation))]
        new_obs = [min(self.buckets[i] - 1, max(0, new_obs[i])) for i in range(len(observation))]
        return tuple(new_obs)

    def pick_action(self, observation):
        # Epsilon-greedy: explore with probability epsilon, otherwise take
        # the action with the highest Q-value in this state.
        if np.random.random() <= self.epsilon:
            return self.environment.action_space.sample()
        return np.argmax(self.Q[observation])

    def update_knowledge(self, action, observation, new_observation, reward):
        # Q-learning update: blend the old estimate with the one-step
        # target reward + gamma * max_a' Q(new_observation, a').
        self.Q[observation][action] = (
            (1 - self.alpha) * self.Q[observation][action]
            + self.alpha * (reward + self.gamma * np.max(self.Q[new_observation]))
        )


def main():
    learner = QLearner()
    learner.learn(1000)


if __name__ == '__main__':
    main()
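
learn() returns the per-episode rewards, so an easy sanity check is to smooth and plot them and watch the curve climb towards CartPole-v1's cap of 500. A minimal sketch, assuming matplotlib is available; the 50-episode window is an arbitrary choice:

import numpy as np
import matplotlib.pyplot as plt

learner = QLearner()
rewards = learner.learn(1000)

# Moving average over a 50-episode window to expose the learning trend.
window = 50
smoothed = np.convolve(rewards, np.ones(window) / window, mode='valid')

plt.plot(smoothed)
plt.xlabel('episode')
plt.ylabel('mean reward over %d episodes' % window)
plt.show()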