Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def evaluate_policy_return(T, behavioral_policy, target_policy):
    """Estimate the target policy's expected return by importance sampling.

    Every trajectory's undiscounted return (the plain sum of its rewards) is
    scaled by the product, over the trajectory's transitions, of
    ``target_policy(state, action) / behavioral_policy(state, action)``.
    The estimate is the mean of those weighted returns.

    Args:
        T: Iterable of trajectories. Each trajectory is an iterable of
            transitions, and each transition is a sliceable sequence whose
            first three entries are ``(state, action, reward)``; any further
            entries are ignored.
        behavioral_policy: Callable ``(state, action) -> probability`` for
            the policy that generated the data.
        target_policy: Callable ``(state, action) -> probability`` for the
            policy being evaluated.

    Returns:
        The mean of the importance-weighted trajectory returns, as computed
        by ``np.mean``.

    Note:
        The behavioural probability is used as a divisor without any guard,
        so it must be non-zero for every logged ``(state, action)`` pair.
        ``np`` is expected to be numpy, already imported by the module.
    """
    returns = []
    for trajectory in T:
        # Both accumulators restart for every trajectory.
        importance_weight = 1
        trajectory_return = 0
        for transition in trajectory:
            state, action, reward = transition[:3]
            action_prob_b = behavioral_policy(state, action)
            action_prob_t = target_policy(state, action)
            # Running likelihood ratio of the trajectory so far.
            importance_weight *= action_prob_t / action_prob_b
            trajectory_return += reward
        # One weighted sample per trajectory: full return times full ratio.
        returns.append(trajectory_return * importance_weight)
    return np.mean(returns)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement