Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import math
- import numpy as np
- import gym
class QLearner:
    """Tabular Q-learning agent for the OpenAI Gym CartPole-v1 task.

    The 4-dimensional continuous observation (cart position, cart velocity,
    pole angle, pole angular velocity) is discretised into a small grid of
    buckets so a plain lookup table can hold the action values.
    """

    def __init__(self):
        self.environment = gym.make('CartPole-v1')
        self.attempt_no = 1
        # The two velocity components are unbounded in the env spec, so they
        # are clamped to hand-picked ranges (+/-0.5 and +/-50 degrees) that
        # the pole rarely leaves in practice.
        self.upper_bounds = [
            self.environment.observation_space.high[0],
            0.5,
            self.environment.observation_space.high[2],
            math.radians(50),
        ]
        self.lower_bounds = [
            self.environment.observation_space.low[0],
            -0.5,
            self.environment.observation_space.low[2],
            -math.radians(50),
        ]
        self.buckets = (4, 4, 4, 4)  # buckets per observation dimension
        # Q-table: one cell per (discrete state, action) pair.
        self.Q = np.zeros(self.buckets + (self.environment.action_space.n,))
        self.epsilon = 0.2  # exploration rate
        self.alpha = 0.2    # learning rate
        self.gamma = 0.9    # discount factor

    def learn(self, max_attempts):
        """Run ``max_attempts`` episodes; return the per-episode rewards."""
        return [self.attempt() for _ in range(max_attempts)]

    def attempt(self):
        """Play one episode, updating the Q-table after every step.

        Returns the total (undiscounted) reward collected in the episode.
        """
        observation = self.discretise(self.environment.reset())
        done = False
        reward_sum = 0.0
        while not done:
            # self.environment.render()
            action = self.pick_action(observation)
            new_observation, reward, done, info = self.environment.step(action)
            new_observation = self.discretise(new_observation)
            self.update_knowledge(action, observation, new_observation, reward)
            observation = new_observation
            reward_sum += reward
        print(reward_sum)
        self.attempt_no += 1
        return reward_sum

    def discretise(self, observation):
        """Map a continuous observation onto the Q-table's bucket grid.

        Each component is linearly rescaled into [0, buckets-1], rounded,
        and clamped, so out-of-range values land in the edge buckets.
        """
        dims = len(observation)
        # FIX: the original shifted by abs(lower_bound), which only equals
        # -lower_bound for non-positive bounds; subtracting the bound is
        # correct for any sign (identical result for this env's bounds).
        ratios = [
            (observation[i] - self.lower_bounds[i])
            / (self.upper_bounds[i] - self.lower_bounds[i])
            for i in range(dims)
        ]
        new_obs = [int(round((self.buckets[i] - 1) * ratios[i])) for i in range(dims)]
        return tuple(min(self.buckets[i] - 1, max(0, new_obs[i])) for i in range(dims))

    def pick_action(self, observation):
        """Epsilon-greedy action selection from the Q-table."""
        if np.random.random() <= self.epsilon:
            return self.environment.action_space.sample()  # explore
        return np.argmax(self.Q[observation])  # exploit

    def update_knowledge(self, action, observation, new_observation, reward):
        """Standard Q-learning update for one observed transition.

        BUG FIX: the original wrote ``Q += (1 - alpha) * Q + ...``, adding
        the weighted average on top of the old entry, so the value grew as
        ``(2 - alpha) * Q + alpha * target`` and diverged.  The textbook
        rule *assigns* the exponentially-weighted average instead.
        """
        self.Q[observation][action] = (
            (1 - self.alpha) * self.Q[observation][action]
            + self.alpha * (reward + self.gamma * np.max(self.Q[new_observation]))
        )
def main():
    """Train a fresh QLearner for 1000 episodes."""
    QLearner().learn(1000)


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement