Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import random  # not used by the statements in this block; kept from the original

import numpy as np  # `np.random.choice` is used below; harmless if imported earlier

# Simulate two Markov chains driven by the policy-induced transition matrix,
# one per start state, and accumulate the cost incurred along the way.
#
# NOTE(review): `pi`, `U`, `D`, `L`, `R` and `C` are not defined in this block
# and must already exist. Presumably `pi` is a (states x 4) policy matrix,
# `U`/`D`/`L`/`R` are per-action (states x states) transition matrices and `C`
# is a (states x actions) cost matrix -- confirm against their definitions.

# State indices; there are six of them.
X = [0, 1, 2, 3, 4, 5]

# Weight each action's transition matrix by the policy's value for that action
# (one column of `pi` per action, broadcast across the row) and add them up.
Ppi = (
    pi[:, 0, None] * U
    + pi[:, 1, None] * D
    + pi[:, 2, None] * L
    + pi[:, 3, None] * R
)
print(Ppi)

# Start states of the two simulated chains (presumably "state 1" and "state 5"
# in 1-based numbering, i.e. indices 0 and 4 -- confirm).
start_1 = 0
start_5 = 4

# `initial_1` / `initial_5` hold the *current* state of each chain; the names
# are kept from the original so any later code reading them still works.
initial_1 = start_1
initial_5 = start_5

# Running cost totals, summed over every step of every trajectory.
cost_1 = 0
cost_5 = 0

# Generate 100 trajectories of 10,000 steps each, following the policy.
for _ in range(100):
    # Restart both chains at their start states. Without this reset the outer
    # loop would merely continue one single 1,000,000-step trajectory.
    initial_1 = start_1
    initial_5 = start_5
    for _ in range(10000):
        # Sample the next state from the row of Ppi for the current state.
        initial_1 = np.random.choice(X, p=Ppi[initial_1])
        initial_5 = np.random.choice(X, p=Ppi[initial_5])
        # Accumulate the cost of the newly reached state.
        # NOTE(review): no discount factor is applied although the task text
        # mentions a discounted cost, and column 0 of `C` is always used
        # regardless of the action the policy selects -- TODO confirm both.
        cost_1 += C[initial_1, 0]
        cost_5 += C[initial_5, 0]

# Final state of each chain in the last trajectory, then the total costs.
print(initial_1)
print(initial_5)
print(cost_1)
print(cost_5)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement