Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import random

import numpy as np

# Tabular Q-learning on a FrozenLake-style Gym environment.
#
# NOTE(review): `frozen` (the environment, presumably `gym.make("FrozenLake-v0")`)
# and `clear_output` (presumably `from IPython.display import clear_output`) are
# expected to be defined earlier in the notebook/session -- confirm before
# running this standalone. Uses the old Gym API where `reset()` returns only
# the observation and `step()` returns a 4-tuple.

# Q-table: one row per state, one column per action, initialised to zeros.
q_table = np.zeros([frozen.observation_space.n, frozen.action_space.n])

# Hyperparameters
alpha = 0.1    # learning rate
gamma = 0.6    # discount factor
epsilon = 0.1  # exploration probability (epsilon-greedy)

# Plotting metrics (kept for compatibility; never appended to in this chunk)
all_epochs = []
all_penalties = []

done = False
for i in range(1, 100001):
    state = frozen.reset()
    # Visit counts per state (keyed by hash), used to penalize backtracking.
    previous = {hash(state): 1}

    epochs, penalties, reward = 0, 0, 0
    done = False
    while not done:
        # Epsilon-greedy action selection.
        if epsilon > random.uniform(0, 1):
            action = frozen.action_space.sample()   # explore
        else:
            action = np.argmax(q_table[state])      # exploit best known action

        next_state, reward, done, info = frozen.step(action)

        # FrozenLake yields reward 0 on falling in a hole; make that a
        # real penalty so the agent learns to avoid holes.
        if reward == 0 and done:
            reward = -1

        # Penalize backtracking: each prior visit to the state we are
        # moving into costs an extra -0.1.
        # BUGFIX: the original read `previous[hash(state)]`, which raises
        # KeyError for every state after the first (the dict was never
        # updated), so the backtracking penalty could never function.
        count = previous.get(hash(next_state), 0)
        if count:
            reward += -0.1 * count
        previous[hash(next_state)] = count + 1

        # Standard Q-learning update rule.
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        state = next_state
        epochs += 1

    # Periodic progress report (clears the notebook cell output).
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement