Advertisement
Guest User

Untitled

a guest
Nov 12th, 2019
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.14 KB | None | 0 0
  1. while True:
  2. prev_value_function = np.copy(value_function)
  3.  
  4. # update each element/state of V
  5. for iS, actions_dict in P.items():
  6. # take max of actions to update V(s)
  7. amax_val = float('-inf')
  8. abest_action = float('-inf')
  9. for iA, next_states_arr in actions_dict.items():
  10. # evaluate value iteration main part of eqn
  11. aexpected_val = 0
  12. for next_state_tuple in next_states_arr:
  13. (probability, nextstate, reward, terminal) = next_state_tuple
  14. if terminal:
  15. aexpected_val += probability * reward
  16. else:
  17. aexpected_val += probability * (reward + gamma * prev_value_function[nextstate])
  18. if aexpected_val > amax_val:
  19. amax_val = aexpected_val
  20. abest_action = iA
  21.  
  22. value_function[iS] = amax_val
  23. policy[iS] = abest_action
  24.  
  25. # check if exit condition true
  26. if np.amax(np.abs(value_function - prev_value_function)) < tol:
  27. break
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement