Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# --- Value iteration main loop ---
# Repeatedly applies the Bellman optimality backup to every state until the
# value function changes by less than `tol` in the sup-norm.
#
# Expects in scope (defined earlier in the file):
#   P              : dict, state -> {action -> [(prob, next_state, reward, terminal), ...]}
#   gamma          : float, discount factor
#   tol            : float, convergence threshold
#   value_function : np.ndarray indexed by state (updated in place per sweep)
#   policy         : array-like indexed by state (greedy action per state)
while True:
    # Snapshot V so the whole sweep reads last iteration's values
    # (synchronous / Jacobi-style update).
    prev_value_function = np.copy(value_function)
    for state, actions in P.items():
        # Take the max over actions to update V(s); track the argmax for the policy.
        best_value = float('-inf')
        best_action = float('-inf')  # placeholder; overwritten by the first real action
        for action, transitions in actions.items():
            # Q(s, a) = sum over s' of p * (r + gamma * V(s'));
            # terminal successors contribute reward only (no bootstrap).
            q_value = 0
            for (prob, next_state, reward, terminal) in transitions:
                if terminal:
                    q_value += prob * reward
                else:
                    q_value += prob * (reward + gamma * prev_value_function[next_state])
            if q_value > best_value:
                best_value = q_value
                best_action = action
        value_function[state] = best_value
        policy[state] = best_action
    # Converged when the largest per-state change is below tol.
    if np.amax(np.abs(value_function - prev_value_function)) < tol:
        break
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement