Advertisement
Guest User

Untitled

a guest
Apr 6th, 2020
218
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.16 KB | None | 0 0
  1. def get_legal_actions(str_state):
  2.     #TODO (1) : Get the actions Greuceanu can do
  3.     local_actions = deepcopy(ACTIONS)
  4.     return_actions = []
  5.    
  6.     for a in local_actions:
  7.         state, r, m = apply_action(str_state, a)
  8.         posx, posy = __get_position(state, "G")
  9.         if __is_valid_cell(state, posx, posy):
  10.             return_actions += [a]
  11.    
  12.     return return_actions
  13.  
  14. def epsilon_greedy(Q, state, legal_actions, epsilon):
  15.     # TODO (2) : Epsilon greedy
  16.     not_explored = []
  17.    
  18.     for a in legal_actions:
  19.         if (state, a) not in Q:
  20.             not_explored += [a]
  21.     if not_explored != []:
  22.         return choice(not_explored)
  23.    
  24.     max_action = -9999
  25.     max_a = ""
  26.    
  27.     for a in legal_actions:
  28.         if Q[(state, a)] > max_action:
  29.             max_action = Q[(state, a)]
  30.             max_a = a
  31.    
  32.     if random() <= epsilon:
  33.         return choice(legal_actions)
  34.     return max_a
  35.  
  36. def best_action(Q, state, legal_actions):
  37.     # TODO (3) : Best action
  38.     max_action = -9999
  39.     max_a = ""
  40.    
  41.     for a in legal_actions:
  42.         if (state, a) not in Q:
  43.             Q[(state, a)] = 0
  44.         if Q[(state, a)] > max_action:
  45.             max_action = Q[(state, a)]
  46.             max_a = a
  47.     return max_a
  48.  
def q_learning():
    """Train a tabular Q-learning agent, periodically evaluate the greedy
    policy, then optionally replay one episode and plot score curves.

    Relies on module-level helpers/constants defined elsewhere in the file
    (get_initial_state, apply_action, is_final_state, display_state,
    clear_output, MAP_NAME, TRAIN_EPISODES, EPSILON, LEARNING_RATE,
    DISCOUNT_FACTOR, EVAL_EVERY, EVAL_EPISODES, VERBOSE, FINAL_SHOW,
    PLOT_SCORE, SLEEP_TIME).
    """
    Q = {}                 # Q-table: (state, action) -> estimated value
    train_scores = []      # total reward of each training episode
    eval_scores = []       # avg greedy-policy score, one entry per EVAL_EVERY episodes
    initial_state = get_initial_state(MAP_NAME)

    for train_ep in range(1, TRAIN_EPISODES+1):
        clear_output(wait=True)
        score = 0
        state = deepcopy(initial_state)

        if VERBOSE:
            display_state(state); sleep(SLEEP_TIME)
            clear_output(wait=True)

        while not is_final_state(state, score):

            actions = get_legal_actions(state)
            action = epsilon_greedy(Q, state, actions, EPSILON)

            new_state, reward, msg = apply_action(state, action)
            score += reward

            # Compute max_a' Q(new_state, a'), treating unseen pairs as 0.
            # NOTE(review): despite the name, max_action holds a Q *value*,
            # not an action.
            max_action = -9999
            new_actions = get_legal_actions(new_state)

            for a in new_actions:
                if (new_state, a) in Q:
                    max_action = max(max_action, Q[(new_state, a)])
                else:
                    max_action = max(max_action, 0)
            # NOTE(review): the TD update only fires when (new_state, action)
            # is already in Q, so most transitions are never learned from —
            # textbook Q-learning updates unconditionally.  Confirm intent.
            if (new_state, action) in Q:
                # NOTE(review): this keys Q with a Q-value in the action slot
                # ((new_state, max_action)) — looks like a bug; the entry it
                # creates is never a real (state, action) pair.
                if (new_state, max_action) not in Q:
                    Q[(new_state, max_action)] = 0

                if (state, action) not in Q:
                    Q[(state, action)] = 0
                # Standard TD update: Q += lr * (r + gamma * max_a' Q' - Q)
                Q[(state, action)] += LEARNING_RATE * (reward + DISCOUNT_FACTOR * max_action - Q[(state, action)])

            state = new_state
            # TODO (1) : Q-Learning
            if VERBOSE:
                print(msg); display_state(state); sleep(SLEEP_TIME)
                clear_output(wait=True)


        print(f"Episode {train_ep} / {TRAIN_EPISODES}")
        train_scores.append(score)

        # evaluate the greedy policy
        if train_ep % EVAL_EVERY == 0:
            avg_score = .0

#             TODO (4) : Evaluate
#             eval_scores.append(avg_score)

            for i in range(1, EVAL_EPISODES + 1):
                state = deepcopy(initial_state)
                n_score = 0
                # NOTE(review): termination checks `score` (the last training
                # episode's total), not `n_score` — probably a bug; confirm.
                while not is_final_state(state, score):
                    action = best_action(Q, state, get_legal_actions(state))
                    new_state, reward, msg = apply_action(state, action)
                    n_score += reward

                    if (state, action) not in Q:
                        Q[(state, action)] = 0
                    new_actions = get_legal_actions(new_state)
                    max_action = -9999
                    # Bootstrap value: max over next-state actions, unseen
                    # pairs initialised to 0 on the fly.
                    for new_a in new_actions:
                        if (new_state, new_a) not in Q:
                            Q[(new_state, new_a)] = 0
                        if Q[(new_state, new_a)] > max_action:
                            max_action = Q[(new_state, new_a)]
                    # NOTE(review): evaluation keeps *updating* Q here, so
                    # this is not a pure greedy evaluation — confirm intent.
                    Q[(state, action)] += LEARNING_RATE * (reward + DISCOUNT_FACTOR * max_action - Q[(state, action)])
                avg_score += n_score
            eval_scores += [avg_score / EVAL_EPISODES]
    # --------------------------------------------------------------------------
    if FINAL_SHOW:
        # Replay one episode with the greedy policy, rendering each step.
        # NOTE(review): also terminates on the stale training `score`.
        state = deepcopy(initial_state)
        while not is_final_state(state, score):
            action = best_action(Q, state, get_legal_actions(state))
            state, _, msg = apply_action(state, action)
            print(msg); display_state(state); sleep(SLEEP_TIME)
            clear_output(wait=True)

    if PLOT_SCORE:
        from matplotlib import pyplot as plt
        import numpy as np
        plt.xlabel("Episode")
        plt.ylabel("Average score")
        # Training curve smoothed with a 5-point moving average (blue).
        plt.plot(
            np.linspace(1, TRAIN_EPISODES, TRAIN_EPISODES),
            np.convolve(train_scores, [0.2,0.2,0.2,0.2,0.2], "same"),
            linewidth = 1.0, color = "blue"
        )
        # Greedy-evaluation curve, one point per EVAL_EVERY episodes (red).
        plt.plot(
            np.linspace(EVAL_EVERY, TRAIN_EPISODES, len(eval_scores)),
                        eval_scores, linewidth = 2.0, color = "red"
        )
        plt.show()
# Script entry point: train (and optionally evaluate/plot) immediately on run.
q_learning()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement