Advertisement
Guest User

Untitled

a guest
Mar 21st, 2018
63
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.67 KB | None | 0 0
import random

import numpy as np
  2.  
  3. X = [0,1,2,3,4,5]
  4.  
  5. Ppi = pi[:,0,None]*U + pi[:,1,None]*D + pi[:,2,None] * L + pi[:,3,None]* R
  6.  
  7. print(Ppi)
  8.  
  9. initial_1 = 0
  10. initial_5 = 4
  11.  
  12. cost_1 = 0
  13. cost_5 = 0
  14.  
  15. for i in range(100):
  16. for k in range(10000):
  17. # Generate 100 trajectories of 10,000 steps each, following the optimal policy for the MDP.
  18. initial_1 = np.random.choice(X, p=Ppi[initial_1])
  19. initial_5 = np.random.choice(X, p=Ppi[initial_5])
  20.  
  21. # For each trajectory, compute the accumulated (discounted) cost.
  22. cost_1 += C[initial_1,0]
  23. cost_5 += C[initial_5,0]
  24.  
  25. print(initial_1)
  26. print(initial_5)
  27.  
  28. print(cost_1)
  29. print(cost_5)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement