Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import random

import numpy as np

# Tabular Q-learning on a FrozenLake-style Gym environment.
#
# NOTE(review): `frozen` (the environment, presumably `gym.make("FrozenLake-v0")`)
# and `clear_output` (presumably `from IPython.display import clear_output`) are
# expected to be defined earlier in the notebook/session -- confirm before
# running this standalone. Uses the old Gym API where `reset()` returns only
# the observation and `step()` returns a 4-tuple.

# Q-table: one row per state, one column per action, initialised to zeros.
q_table = np.zeros([frozen.observation_space.n, frozen.action_space.n])

# Hyperparameters
alpha = 0.1    # learning rate
gamma = 0.6    # discount factor
epsilon = 0.1  # exploration probability (epsilon-greedy)

# Plotting metrics (kept for compatibility; never appended to in this chunk)
all_epochs = []
all_penalties = []

done = False
for i in range(1, 100001):
    state = frozen.reset()
    # Visit counts per state (keyed by hash), used to penalize backtracking.
    previous = {hash(state): 1}

    epochs, penalties, reward = 0, 0, 0
    done = False
    while not done:
        # Epsilon-greedy action selection.
        if epsilon > random.uniform(0, 1):
            action = frozen.action_space.sample()   # explore
        else:
            action = np.argmax(q_table[state])      # exploit best known action

        next_state, reward, done, info = frozen.step(action)

        # FrozenLake yields reward 0 on falling in a hole; make that a
        # real penalty so the agent learns to avoid holes.
        if reward == 0 and done:
            reward = -1

        # Penalize backtracking: each prior visit to the state we are
        # moving into costs an extra -0.1.
        # BUGFIX: the original read `previous[hash(state)]`, which raises
        # KeyError for every state after the first (the dict was never
        # updated), so the backtracking penalty could never function.
        count = previous.get(hash(next_state), 0)
        if count:
            reward += -0.1 * count
        previous[hash(next_state)] = count + 1

        # Standard Q-learning update rule.
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        state = next_state
        epochs += 1

    # Periodic progress report (clears the notebook cell output).
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement