Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# --- Value iteration main loop ---
# Repeatedly applies the Bellman optimality backup to every state until the
# value function changes by less than `tol` in the sup-norm.
#
# Expects in scope (defined earlier in the file):
#   P              : dict, state -> {action -> [(prob, next_state, reward, terminal), ...]}
#   gamma          : float, discount factor
#   tol            : float, convergence threshold
#   value_function : np.ndarray indexed by state (updated in place per sweep)
#   policy         : array-like indexed by state (greedy action per state)
while True:
    # Snapshot V so the whole sweep reads last iteration's values
    # (synchronous / Jacobi-style update).
    prev_value_function = np.copy(value_function)
    for state, actions in P.items():
        # Take the max over actions to update V(s); track the argmax for the policy.
        best_value = float('-inf')
        best_action = float('-inf')  # placeholder; overwritten by the first real action
        for action, transitions in actions.items():
            # Q(s, a) = sum over s' of p * (r + gamma * V(s'));
            # terminal successors contribute reward only (no bootstrap).
            q_value = 0
            for (prob, next_state, reward, terminal) in transitions:
                if terminal:
                    q_value += prob * reward
                else:
                    q_value += prob * (reward + gamma * prev_value_function[next_state])
            if q_value > best_value:
                best_value = q_value
                best_action = action
        value_function[state] = best_value
        policy[state] = best_action
    # Converged when the largest per-state change is below tol.
    if np.amax(np.abs(value_function - prev_value_function)) < tol:
        break
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement