# Tudor Berariu, 2016
# Razvan Chitu, 2018

# Standard library imports
from argparse import ArgumentParser
from random import choice, random
from time import sleep

# External library imports
from matplotlib import pyplot as plt
import numpy as np

# Local imports
from mini_pacman_new import Game


def epsilon_greedy(Q, state, legal_actions, epsilon):
    # TODO (2) : Epsilon greedy
    # With probability epsilon explore a random legal action,
    # otherwise exploit the current best action.
    if random() < epsilon:
        return choice(legal_actions)
    return best_action(Q, state, legal_actions)
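
# A quick sanity check for the exploration rate (a hypothetical Q table and
# action set, not part of the assignment): with epsilon = 0.1 the greedy
# choice is "left", so only ~half of the 10% random draws pick "right".
#
#   Q = {(0, "left"): 1.0, (0, "right"): 0.0}
#   picks = [epsilon_greedy(Q, 0, ["left", "right"], 0.1) for _ in range(10000)]
#   print(picks.count("right") / 10000)  # ~0.05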


def best_action(Q, state, legal_actions):
    # TODO (3) : Best action
    # Collect actions never tried in this state; among the tried ones,
    # track the action with the highest Q-value.
    unexplored_actions = []
    max_q = float("-inf")
    max_action = None
    for action in legal_actions:
        if (state, action) not in Q:
            unexplored_actions.append(action)
        elif Q[(state, action)] > max_q:
            max_q = Q[(state, action)]
            max_action = action
    # prefer actions that were never explored in this state
    if unexplored_actions:
        return choice(unexplored_actions)
    return max_action
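
# Ties between equally-valued explored actions currently resolve to whichever
# comes first in legal_actions. A hypothetical variant of the final return
# that breaks such ties at random instead:
#
#   best = [a for a in legal_actions
#           if (state, a) in Q and Q[(state, a)] == max_q]
#   return choice(best)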


def q_learning(map_file, learning_rate, discount, epsilon, train_episodes,
               eval_every, eval_episodes, verbose, plot_scores, sleep_interval,
               final_show):
    # Q will use (state, action) tuples as key.
    # Use Q.get(..., 0) for default values.
    Q = {}
    train_scores = []
    eval_scores = []

    # for each episode ...
    for train_ep in range(1, train_episodes + 1):
        game = Game(map_file)

        # display current state and sleep
        if verbose:
            print(game.state)
            sleep(sleep_interval)

        # while current state is not terminal
        while not game.is_over():
            # choose one of the legal actions
            state, actions = game.state, game.legal_actions
            action = epsilon_greedy(Q, state, actions, epsilon)

            # apply action and get the next state and the reward
            reward, msg = game.apply_action(action)
            next_state, next_actions = game.state, game.legal_actions

            # TODO (1) : Q-Learning
            # Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
            # default=0 covers terminal states, where next_actions is empty.
            max_a_prime = max(
                (Q.get((next_state, a_prime), 0) for a_prime in next_actions),
                default=0)
            Q[(state, action)] = Q.get((state, action), 0) + learning_rate * (
                reward + discount * max_a_prime - Q.get((state, action), 0))
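            # Worked example with assumed numbers (not taken from the game):
            # lr = 0.1, gamma = 0.99, Q(s, a) = 0, r = 1, max_a' Q(s', a') = 2
            #   => Q(s, a) becomes 0 + 0.1 * (1 + 0.99 * 2 - 0) = 0.298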

            # display current state and sleep
            if verbose:
                print(msg)
                print(game.state)
                sleep(sleep_interval)

        print("Episode %6d / %6d" % (train_ep, train_episodes))
        train_scores.append(game.score)

        # evaluate the greedy policy
        if train_ep % eval_every == 0:
            avg_score = 0.0
            # TODO (4) : Evaluate
            # Play eval_episodes full games greedily; average the final scores.
            for _ in range(eval_episodes):
                game = Game(map_file)
                while not game.is_over():
                    state, actions = game.state, game.legal_actions
                    action = best_action(Q, state, actions)
                    reward, msg = game.apply_action(action)
                avg_score += game.score
            avg_score /= eval_episodes
            eval_scores.append(avg_score)

    # --------------------------------------------------------------------------
    if final_show:
        game = Game(map_file)
        while not game.is_over():
            state, actions = game.state, game.legal_actions
            action = best_action(Q, state, actions)
            reward, msg = game.apply_action(action)
            print(msg)
            print(game.state)
            sleep(sleep_interval)

    if plot_scores:
        plt.xlabel("Episode")
        plt.ylabel("Average score")
        # smooth the training scores with a 5-episode moving average
        plt.plot(
            np.linspace(1, train_episodes, train_episodes),
            np.convolve(train_scores, [0.2, 0.2, 0.2, 0.2, 0.2], "same"),
            linewidth=1.0, color="blue"
        )
        plt.plot(
            np.linspace(eval_every, train_episodes, len(eval_scores)),
            eval_scores, linewidth=2.0, color="red"
        )
        plt.show()


def main():
    parser = ArgumentParser()
    # Input file
    parser.add_argument("--map_file", type=str, default="mini_map",
                        help="File to read map from.")
    # Meta-parameters
    parser.add_argument("--learning_rate", type=float, default=0.1,
                        help="Learning rate")
    parser.add_argument("--discount", type=float, default=0.99,
                        help="Value for the discount factor")
    parser.add_argument("--epsilon", type=float, default=0.05,
                        help="Probability to choose a random action.")
    # Training and evaluation episodes
    parser.add_argument("--train_episodes", type=int, default=1000,
                        help="Number of episodes")
    parser.add_argument("--eval_every", type=int, default=10,
                        help="Evaluate policy every ... games.")
    parser.add_argument("--eval_episodes", type=int, default=10,
                        help="Number of games to play for evaluation.")
    # Display
    parser.add_argument("--verbose", dest="verbose",
                        action="store_true", help="Print each state")
    parser.add_argument("--plot", dest="plot_scores", action="store_true",
                        help="Plot scores in the end")
    parser.add_argument("--sleep", type=float, default=0.1,
                        help="Seconds to 'sleep' between moves.")
    parser.add_argument("--final_show", dest="final_show",
                        action="store_true",
                        help="Demonstrate final strategy.")
    args = parser.parse_args()

    q_learning(
        args.map_file, args.learning_rate, args.discount, args.epsilon,
        args.train_episodes, args.eval_every, args.eval_episodes, args.verbose,
        args.plot_scores, args.sleep, args.final_show
    )


if __name__ == "__main__":
    main()
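
# Example invocations (a sketch: the script's file name is not given in the
# paste, so "q_learning.py" is assumed; mini_pacman_new.py and a "mini_map"
# file must be importable/readable from the working directory):
#
#   python q_learning.py --train_episodes 1000 --eval_every 10 --plot
#   python q_learning.py --epsilon 0.1 --verbose --sleep 0.2 --final_show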