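"""Tabular Q-learning on OpenAI Gym's CartPole-v1.

The 4-dimensional observation (cart position, cart velocity, pole angle,
pole angular velocity) is discretised with numpy.digitize, and a Q-table maps
every (state, action) tuple to a value. Assumes the classic gym API
(env.step() returning a 4-tuple, env.reset() returning the observation),
i.e. gym < 0.26.
"""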
from statistics import mean
from operator import add

import numpy
from numpy import random
import gym
import matplotlib.pyplot as plt

from visualizer import plot_learning_result


class QLearner:
    def __init__(self, epsilon, alpha, gamma, epsilon_decay, alpha_k):
        self.environment = gym.make('CartPole-v1')
        self.attempt_no = 1
        # The velocity components of the observation are unbounded in
        # CartPole, so they are clamped to hand-picked limits.
        self.upper_bounds = [
            self.environment.observation_space.high[0],
            3.0,
            self.environment.observation_space.high[2],
            3.5
        ]
        self.lower_bounds = [
            self.environment.observation_space.low[0],
            -3.0,
            self.environment.observation_space.low[2],
            -3.5
        ]
        # Bin edges for numpy.digitize: n edges yield n + 1 discrete buckets.
        self.bins_cartpos = [0.0]  # [-0.05, 0.05]
        self.bins_cartvel = [0.0]  # [-0.42, 0.42]
        self.bins_pole_angle = [-0.2 + i * 0.08 for i in range(7)]
        self.bins_pole_vel = [-.8 + i * 0.27 for i in range(7)]
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon_decay = epsilon_decay
        self.alpha_k = alpha_k
        self.minimal_epsilon = 0.02
        # Per-dimension bucket visit counters, used by feed_stats.
        self.stats = [{} for _ in range(4)]
        self.all_observations = []
        # Q-table: one randomly initialised entry per (state..., action) tuple.
        self.knowledge = {(a, b, c, d, i): random.random()
                          for a in range(3) for b in range(3)
                          for c in range(8) for d in range(8)
                          for i in range(2)}

    def learn(self, max_attempts):
        return [self.attempt() for _ in range(max_attempts)]

    def feed_stats(self, obs):
        # Count how often each discrete bucket is visited, per dimension.
        for i, o in enumerate(obs):
            if str(o) in self.stats[i]:
                self.stats[i][str(o)] += 1
            else:
                self.stats[i][str(o)] = 1

    def attempt(self):
        observation = self.discretise(self.environment.reset())
        done = False
        reward_sum = 0.0
        while not done:
            # self.environment.render()
            action = self.pick_action(observation)
            # Classic gym API: step() returns (obs, reward, done, info).
            new_observation, reward, done, info = self.environment.step(action)
            self.all_observations.append(new_observation)
            new_observation = self.discretise(new_observation)
            # self.feed_stats(new_observation)
            self.update_knowledge(action, observation, new_observation, reward)
            observation = new_observation
            reward_sum += reward
        # Decay the exploration and learning rates after each episode.
        self.epsilon *= self.epsilon_decay
        # if self.epsilon < self.minimal_epsilon:
        #     self.epsilon = self.minimal_epsilon
        #     self.epsilon_decay = 1
        self.alpha *= self.alpha_k
        self.attempt_no += 1
        return reward_sum

    def discretise(self, observation):
        # Map each continuous component to its bucket index.
        obs_a = numpy.digitize(observation[0], self.bins_cartpos)
        obs_b = numpy.digitize(observation[1], self.bins_cartvel)
        obs_c = numpy.digitize(observation[2], self.bins_pole_angle)
        obs_d = numpy.digitize(observation[3], self.bins_pole_vel)
        return [int(obs_a), int(obs_b), int(obs_c), int(obs_d)]

    def pick_action(self, observation):
        # Epsilon-greedy: exploit the Q-table with probability 1 - epsilon,
        # otherwise explore with a random action.
        action_left = self.knowledge[(*observation, 0)]
        action_right = self.knowledge[(*observation, 1)]
        if random.random() > self.epsilon:
            return 0 if action_left > action_right else 1
        else:
            # numpy's randint excludes the upper bound, so this yields 0 or 1.
            return random.randint(0, 2)
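
    # Tabular Q-learning update:
    #     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))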
    def update_knowledge(self, action, observation, new_observation, reward):
        qval = self.knowledge[(*observation, action)]
        maxqval = max(self.knowledge[(*new_observation, 0)],
                      self.knowledge[(*new_observation, 1)])
        new_qval = qval + self.alpha * (reward + self.gamma * maxqval - qval)
        self.knowledge[(*observation, action)] = new_qval

    def plot_histogram(self):
        # One histogram per observation dimension, over all raw observations.
        for i in range(4):
            plt.hist([o[i] for o in self.all_observations], bins=10)
            plt.show()


def main():
    exp_count = 5
    total_score = []
    # starting parameters
    alpha = 0.9
    gamma = 1.0
    epsilon = 1.0
    alpha_k = 0.996
    epsilon_decay = 0.996
    # experiment parameters (grid search over hyper-parameters, kept for reference)
    # alpha_l = [0.1 + 0.1*i for i in range(9)]
    # gamma_l = [0.0 + 0.2*i for i in range(6)]
    # epsilon_l = [0.2 + 0.2*i for i in range(5)]
    # alpha_k_l = [0.999, 0.996]
    # epsilon_decay_l = [0.999, 0.996]
    #
    # best_params = [0.0, 0.0, 0.0, 0.0, 0.0]
    # best_score = 0.0
    # cnt = 0
    # for a in alpha_l:
    #     for b in gamma_l:
    #         for c in epsilon_l:
    #             for d in alpha_k_l:
    #                 for e in epsilon_decay_l:
    #                     total = 0.0
    #                     cnt += 1
    #                     print(cnt)
    #
    #                     for i in range(exp_count):
    #                         learner = QLearner(a, b, c, e, d)
    #                         score = learner.learn(600)
    #                         if len(total_score) == 0:
    #                             total_score = [s for s in score]
    #                         else:
    #                             total_score = list(map(add, score, total_score))
    #                         total += mean(score[-50:])
    #                     if total > best_score:
    #                         best_score = total
    #                         best_params = [a, b, c, d, e]
    #                         print(best_params)
    # print(best_params)
    # Optional hill climbing over (alpha, gamma, epsilon): bump one parameter
    # by 0.1 while the averaged score keeps improving, then move to the next.
    hill = False
    hill_params = [alpha, gamma, epsilon]
    res = 0
    res_float = 0.0
    if hill:
        while res < 3:
            hill_params[res] += 0.1
            print(res)
            print(hill_params[res])
            total_score = []
            for i in range(5):
                print(i)
                # Constructor signature is (epsilon, alpha, gamma, ...),
                # so hill_params = [alpha, gamma, epsilon] is mapped accordingly.
                learner = QLearner(hill_params[2], hill_params[0], hill_params[1],
                                   epsilon_decay, alpha_k)
                score = learner.learn(1000)
                if len(total_score) == 0:
                    total_score = [s for s in score]
                else:
                    total_score = list(map(add, score, total_score))
            res_temp = mean(total_score[-30:])
            if res_temp < res_float or hill_params[res] > 1.0:
                res += 1
                res_float = 0.0
            else:
                res_float = res_temp
            print(res_temp)
            print(hill_params)
    else:
        for i in range(exp_count):
            learner = QLearner(epsilon, alpha, gamma, epsilon_decay, alpha_k)
            score = learner.learn(2000)
            if len(total_score) == 0:
                total_score = [s for s in score]
            else:
                total_score = list(map(add, score, total_score))
            print(mean(score[-20:]))
        # Average the per-episode scores over all experiments.
        total_score = [t / exp_count for t in total_score]
        # learner.plot_histogram()
        # Note: learner.alpha and learner.epsilon are the decayed final values.
        plot_learning_result(total_score, 100,
                             {'alpha': learner.alpha, 'gamma': learner.gamma,
                              'epsilon': learner.epsilon,
                              'epsilon decay': learner.epsilon_decay,
                              'alpha decay': learner.alpha_k})


if __name__ == '__main__':
    main()
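

# A minimal usage sketch (not part of the original script): after training,
# exploration can be disabled to evaluate the learned Q-table greedily.
# The parameter values mirror the starting parameters in main().
#
#     learner = QLearner(epsilon=1.0, alpha=0.9, gamma=1.0,
#                        epsilon_decay=0.996, alpha_k=0.996)
#     learner.learn(2000)
#     learner.epsilon = 0.0     # always exploit
#     print(learner.attempt())  # reward of a single greedy episode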