Lunar lander

import matplotlib.pyplot as pyplot
import math
import numpy as np
import gym
import random
import time


class QLearner:
    def __init__(self):
        self.knowledge = {}  # Q-table: (discretised observation, action) -> estimated value
        self.environment = gym.make('LunarLander-v2')
        self.attempt_no = 1
        self.reward_sum_history = []
        # Bounds used to build the discretisation bins for the six continuous
        # observation components: x, y, x velocity, y velocity, angle, angular velocity.
        self.upper_bounds = [
            1.0,
            1.0,
            2.0,
            2.0,
            2.0,
            math.radians(180)
        ]
        self.lower_bounds = [
            -1.0,
            -1.0,
            -2.0,
            -2.0,
            -2.0,
            -math.radians(180)
        ]

    def learn(self, max_attempts, config):
        self.config = config

        self.max_attempts = max_attempts
        self.alpha = self.config['initial_alpha']
        self.eps = self.config['initial_eps']

        return [self.attempt() for _ in range(max_attempts)]

    def attempt(self):
        # Run a single episode, updating the Q-table after every step.
        observation = self.discretise(self.environment.reset())
        done = False
        reward_sum = 0.0
        while not done:
            # self.environment.render()
            action = self.pick_action(observation)
            new_observation, reward, done, info = self.environment.step(action)
            new_observation = self.discretise(new_observation)
            self.update_knowledge(action, observation, new_observation, reward)
            observation = new_observation
            reward_sum += reward
        # print(reward_sum)
        self.reward_sum_history.append(reward_sum)
        self.exp_update_learning_parameters()
        self.attempt_no += 1
        return reward_sum

    def discretise(self, observation):
        # Map the six continuous observation components to bucket indices;
        # the two leg-contact flags (already 0/1) are passed through unchanged.
        bins = np.linspace(self.lower_bounds[0], self.upper_bounds[0], self.config['x_bucket_size'])
        pos_x = np.digitize([observation[0]], bins)

        bins = np.linspace(self.lower_bounds[1], self.upper_bounds[1], self.config['y_bucket_size'])
        pos_y = np.digitize([observation[1]], bins)

        bins = np.linspace(self.lower_bounds[2], self.upper_bounds[2], self.config['v_x_bucket_size'])
        vel_x = np.digitize([observation[2]], bins)

        bins = np.linspace(self.lower_bounds[3], self.upper_bounds[3], self.config['v_y_bucket_size'])
        vel_y = np.digitize([observation[3]], bins)

        bins = np.linspace(self.lower_bounds[4], self.upper_bounds[4], self.config['angle_bucket_size'])
        angle = np.digitize([observation[4]], bins)

        bins = np.linspace(self.lower_bounds[5], self.upper_bounds[5], self.config['v_angle_bucket_size'])
        ang_v = np.digitize([observation[5]], bins)

        return pos_x[0], pos_y[0], vel_x[0], vel_y[0], angle[0], ang_v[0], observation[6], observation[7]

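    # Worked example of the binning above (an illustration, not used by the code):
    # with x_bucket_size = 2 the x bins are np.linspace(-1.0, 1.0, 2) == [-1.0, 1.0],
    # so np.digitize maps x = 0.0 to bucket 1, anything below -1.0 to bucket 0 and
    # anything at or above 1.0 to bucket 2; a bucket size of n therefore yields n + 1 buckets.
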
    def pick_action(self, observation):
        # Epsilon-greedy: explore with probability eps, otherwise take the
        # action with the highest Q-value (unseen pairs default to 0.0).
        if random.random() < self.eps:
            return random.randint(0, 3)

        action_dict = {key: self.knowledge.get((observation, key), 0.0) for key in range(0, 4)}
        action = max(action_dict.keys(), key=(lambda key: action_dict[key]))

        return action

    def update_knowledge(self, action, observation, new_observation, reward):
        # Standard Q-learning update:
        # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
        # Unseen state-action pairs default to 1, an optimistic initial estimate.
        old_value = self.knowledge.get((observation, action), 1)
        best_next = max(
            self.knowledge.get((new_observation, 0), 1),
            self.knowledge.get((new_observation, 1), 1),
            self.knowledge.get((new_observation, 2), 1),
            self.knowledge.get((new_observation, 3), 1)
        )
        self.knowledge[(observation, action)] = \
            (1.0 - self.alpha) * old_value + self.alpha * (reward + self.config['gamma'] * best_next)

    def exp_update_learning_parameters(self):
        # Exponential decay of the learning rate and exploration rate with the attempt number.
        self.alpha = self.config['initial_alpha'] * math.exp(-self.config['k_alpha'] * self.attempt_no)
        self.eps = self.config['initial_eps'] * math.exp(-self.config['k_eps'] * self.attempt_no)
        # print(self.alpha)
        # print(self.eps)

    def lin_update_learning_parameters(self):
        # Alternative, linear epsilon schedule (not called by attempt()); it expects
        # 'eps_start', 'eps_end' and 'eps_perc' to be present in the config.
        eps_start = self.config['eps_start']
        eps_end = self.config['eps_end']
        eps_perc = self.config['eps_perc']

        delta_eps = -(eps_start - eps_end) / (eps_perc * self.max_attempts)

        self.eps = max(eps_end, self.eps + delta_eps)

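    # If the linear schedule above were used instead of the exponential one, the
    # config would also need entries along these lines (values are only illustrative):
    #
    #     config['eps_start'] = 0.3    # initial exploration rate
    #     config['eps_end'] = 0.01     # floor for the exploration rate
    #     config['eps_perc'] = 0.5     # fraction of max_attempts over which eps decays
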
    def moving_average(self, y, N):
        cumsum = np.cumsum(np.insert(y, 0, 0))
        return (cumsum[N:] - cumsum[:-N]) / float(N)

    def reward_sum_history_stats(self):
        y = self.reward_sum_history

        std_dev = np.std(y)
        mean = np.mean(y)

        return mean, std_dev

    def format_config(self, config_dict):
        # Render the config as 'key:value' pairs, two per line.
        i = 0
        result = ''

        for key, value in config_dict.items():
            result += '{}:{} '.format(key, value)
            i += 1
            if i % 2 == 0:
                result += '\n'

        return result

    def plot_reward_sum_history(self):
        y = self.reward_sum_history
        len_y = len(y)

        x = range(len_y)
        y_mean = [np.mean(y)] * len_y

        moving_avg = self.moving_average(y, 6)
        x_moving_avg = range(len(moving_avg))

        fig, ax = pyplot.subplots()
        # title = ax.set_title()

        fig.tight_layout()
        fig.subplots_adjust(top=0.70)

        data_line = ax.plot(x, y, label='Quality')
        moving_average_line = ax.plot(x_moving_avg, moving_avg, label='Quality Moving Avg')
        # mean_line = ax.plot(x, y_mean, 'r', label='Mean', linestyle='--')

        legend = ax.legend(loc='upper left')

        # The filename embeds the formatted config (including spaces and newlines),
        # which may not be valid on every filesystem.
        filename = self.format_config(self.config) + time.strftime("%Y%m%d-%H%M%S") + '.png'

        pyplot.savefig(filename)
        pyplot.close()


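# Minimal evaluation sketch, assuming a QLearner that has already been trained with
# learn(): it replays the greedy policy (exploration switched off) and returns the
# reward sum of each episode. The helper name and episode count are illustrative.
def run_greedy_episodes(learner, episodes=5):
    learner.eps = 0.0
    reward_sums = []
    for _ in range(episodes):
        observation = learner.discretise(learner.environment.reset())
        done = False
        reward_sum = 0.0
        while not done:
            action = learner.pick_action(observation)
            new_observation, reward, done, info = learner.environment.step(action)
            observation = learner.discretise(new_observation)
            reward_sum += reward
        reward_sums.append(reward_sum)
    return reward_sums

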
def evaluate_learning_process(config):
    learner = QLearner()
    learner.learn(2000, config)
    learner.plot_reward_sum_history()


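# reward_sum_history_stats() is defined but never called above; if a numeric summary
# were wanted after training, something along these lines would work (illustrative):
#
#     mean, std_dev = learner.reward_sum_history_stats()
#     print('mean reward: {:.1f}, std dev: {:.1f}'.format(mean, std_dev))

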
def main():
    config = {}

    # Q-learning parameters: initial learning rate, initial exploration rate,
    # discount factor, and the exponential decay constants for alpha and eps.
    config['initial_alpha'] = 0.9
    config['initial_eps'] = 0.3
    config['gamma'] = 0.7
    config['k_alpha'] = 0.01
    config['k_eps'] = 0.009

    # Number of bins used to discretise each continuous observation component.
    config['x_bucket_size'] = 2
    config['y_bucket_size'] = 2
    config['v_x_bucket_size'] = 6
    config['v_y_bucket_size'] = 10
    config['angle_bucket_size'] = 10
    config['v_angle_bucket_size'] = 8

    evaluate_learning_process(config)


if __name__ == '__main__':
    main()