Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def get_legal_actions(str_state):
    """Return the list of actions Greuceanu ("G") may take from `str_state`.

    An action is legal when applying it leaves "G" on a valid cell of the
    resulting state (as judged by `__is_valid_cell`).

    Parameters:
        str_state: the current map state (string form, per apply_action).

    Returns:
        list: the subset of ACTIONS that are legal from this state.
    """
    # No copy of ACTIONS is needed — it is only iterated, never mutated
    # (the original deep-copied it defensively for no benefit).
    legal = []
    for action in ACTIONS:
        next_state, _, _ = apply_action(str_state, action)
        gx, gy = __get_position(next_state, "G")
        if __is_valid_cell(next_state, gx, gy):
            legal.append(action)
    return legal
def epsilon_greedy(Q, state, legal_actions, epsilon):
    """Epsilon-greedy action selection with an exploration bonus for
    never-tried (state, action) pairs.

    Order of preference:
      1. a uniformly random action among those with no Q entry yet;
      2. with probability `epsilon`, a uniformly random legal action;
      3. otherwise the greedy action (highest Q-value).

    Parameters:
        Q: dict mapping (state, action) -> Q-value.
        state: current (hashable) state.
        legal_actions: non-empty list of actions valid in `state`.
        epsilon: exploration probability in [0, 1].

    Returns:
        The selected action.
    """
    # Prefer actions never tried in this state.
    not_explored = [a for a in legal_actions if (state, a) not in Q]
    if not_explored:
        return choice(not_explored)

    # Explore first: the original computed the argmax unconditionally and
    # then threw it away on the epsilon branch.
    if random() <= epsilon:
        return choice(legal_actions)

    # Exploit. Using max() also fixes the original's sentinel bug: with
    # `max_action = -9999` it returned the invalid action "" whenever every
    # Q-value was <= -9999. All keys exist here (not_explored was empty).
    return max(legal_actions, key=lambda a: Q[(state, a)])
def best_action(Q, state, legal_actions):
    """Return the greedy (highest-Q) action for `state`.

    Side effect: any missing (state, action) entry is inserted into Q with
    value 0 — the training loop relies on this default, so it is kept.

    Parameters:
        Q: dict mapping (state, action) -> Q-value (mutated in place).
        state: current (hashable) state.
        legal_actions: non-empty list of actions valid in `state`.

    Returns:
        The action with the largest Q-value (first one wins ties).
    """
    # float("-inf") instead of the original -9999 sentinel: that sentinel
    # made the function return the invalid action "" whenever every
    # Q-value was <= -9999.
    best_q = float("-inf")
    best = ""
    for action in legal_actions:
        if (state, action) not in Q:
            Q[(state, action)] = 0
        if Q[(state, action)] > best_q:
            best_q = Q[(state, action)]
            best = action
    return best
def q_learning():
    """Train a tabular Q-learning agent on MAP_NAME, periodically evaluate
    the greedy policy, optionally replay the final policy and plot scores.

    Uses the module-level configuration constants (TRAIN_EPISODES, EPSILON,
    LEARNING_RATE, DISCOUNT_FACTOR, EVAL_EVERY, EVAL_EPISODES, VERBOSE,
    FINAL_SHOW, PLOT_SCORE, SLEEP_TIME) and the environment helpers
    (get_initial_state, apply_action, is_final_state, display_state).
    NOTE(review): the exact semantics of those helpers are defined elsewhere
    in the project — assumed here from how the original code used them.
    """
    Q = {}
    train_scores = []
    eval_scores = []
    initial_state = get_initial_state(MAP_NAME)

    for train_ep in range(1, TRAIN_EPISODES + 1):
        clear_output(wait=True)
        score = 0
        state = deepcopy(initial_state)

        if VERBOSE:
            display_state(state); sleep(SLEEP_TIME)
            clear_output(wait=True)

        while not is_final_state(state, score):
            actions = get_legal_actions(state)
            action = epsilon_greedy(Q, state, actions, EPSILON)
            new_state, reward, msg = apply_action(state, action)
            score += reward

            # TD target: best Q over the successor's legal actions; unseen
            # entries count as 0 (matching best_action's default). default=0
            # replaces the original -9999 sentinel, which leaked into the
            # update whenever a successor state had no legal actions.
            max_q = max(
                (Q.get((new_state, a), 0) for a in get_legal_actions(new_state)),
                default=0,
            )
            # BUG FIX: the original also inserted Q[(new_state, max_action)],
            # keying Q by a numeric Q-VALUE instead of an action — dead junk
            # entries, removed.
            if (state, action) not in Q:
                Q[(state, action)] = 0
            Q[(state, action)] += LEARNING_RATE * (
                reward + DISCOUNT_FACTOR * max_q - Q[(state, action)]
            )
            state = new_state

            if VERBOSE:
                print(msg); display_state(state); sleep(SLEEP_TIME)
                clear_output(wait=True)

        print(f"Episode {train_ep} / {TRAIN_EPISODES}")
        train_scores.append(score)

        # evaluate the greedy policy
        if train_ep % EVAL_EVERY == 0:
            avg_score = .0
            for _ in range(EVAL_EPISODES):
                state = deepcopy(initial_state)
                n_score = 0
                # BUG FIX: terminate on this eval episode's own n_score;
                # the original tested the stale training `score`, so eval
                # episodes could end at the wrong time.
                while not is_final_state(state, n_score):
                    action = best_action(Q, state, get_legal_actions(state))
                    state, reward, _ = apply_action(state, action)
                    n_score += reward
                # BUG FIX: no Q-updates here — evaluation measures the
                # greedy policy; the original kept learning during eval,
                # biasing the reported scores.
                avg_score += n_score
            eval_scores += [avg_score / EVAL_EPISODES]

    # --------------------------------------------------------------------------
    if FINAL_SHOW:
        state = deepcopy(initial_state)
        show_score = 0  # BUG FIX: fresh score, not the last training score
        while not is_final_state(state, show_score):
            action = best_action(Q, state, get_legal_actions(state))
            state, reward, msg = apply_action(state, action)
            show_score += reward
            print(msg); display_state(state); sleep(SLEEP_TIME)
            clear_output(wait=True)

    if PLOT_SCORE:
        from matplotlib import pyplot as plt
        import numpy as np
        plt.xlabel("Episode")
        plt.ylabel("Average score")
        # 5-point moving average of the training scores.
        plt.plot(
            np.linspace(1, TRAIN_EPISODES, TRAIN_EPISODES),
            np.convolve(train_scores, [0.2, 0.2, 0.2, 0.2, 0.2], "same"),
            linewidth=1.0, color="blue"
        )
        plt.plot(
            np.linspace(EVAL_EVERY, TRAIN_EPISODES, len(eval_scores)),
            eval_scores, linewidth=2.0, color="red"
        )
        plt.show()
- q_learning()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement