Untitled

a guest
Jul 15th, 2019
import random
import gym
import numpy as np
from IPython.display import clear_output

# The paste references an environment called `frozen` without creating it;
# gym's FrozenLake-v0 is assumed here.
frozen = gym.make("FrozenLake-v0")

q_table = np.zeros([frozen.observation_space.n, frozen.action_space.n])

# hyperparameters
alpha = 0.1    # learning rate
gamma = 0.6    # discount factor
epsilon = 0.1  # exploration rate

# plotting metrics
all_epochs = []
all_penalties = []

for i in range(1, 100001):
    state = frozen.reset()

    # per-episode visit counts, used to penalize backtracking
    previous = {hash(state): 1}

    epochs, penalties, reward = 0, 0, 0
    done = False

    while not done:
        # epsilon-greedy action selection
        if epsilon > random.uniform(0, 1):
            action = frozen.action_space.sample()   # explore
        else:
            action = np.argmax(q_table[state])      # exploit

        next_state, reward, done, info = frozen.step(action)

        # if the episode ends without reaching the goal (fell in a hole), penalize
        if reward == 0 and done:
            reward = -1

        # penalize for backtracking: the more often this state has been visited
        # in the current episode, the larger the penalty
        count = previous.get(hash(state), 0)
        if count:
            reward += -0.1 * count
        previous[hash(state)] = count + 1

        # standard Q-learning update
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])

        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        state = next_state
        epochs += 1

    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")