SHARE
TWEET

Untitled

a guest Apr 23rd, 2019 70 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python
  2. # coding=utf-8
  3. import numpy as np
  4.  
  5. """
  6. 1: Procedure Policy_Iteration(S,A,P,R)
  7. 2:           Inputs
  8. 3:                     S is the set of all states
  9. 4:                     A is the set of all actions
  10. 5:                     P is state transition function specifying P(s'|s,a)
  11. 6:                     R is a reward function R(s,a,s')
  12. 7:           Output
  13. 8:                     optimal policy π
  14. 9:           Local
  15. 10:                     action array π[S]
  16. 11:                     Boolean variable noChange
  17. 12:                     real array V[S]
  18. 13:           set π arbitrarily
  19. 14:           repeat
  20. 15:                     noChange ←true
  21. 16:                     Solve V[s] = ∑s'∈S P(s'|s,π[s])(R(s,π[s],s')+γV[s'])
  22. 17:                     for each s∈S do
  23. 18:                               Let QBest=V[s]
  24. 19:                               for each a ∈A do
  25. 20:                                         Let Qsa=∑s'∈S P(s'|s,a)(R(s,a,s')+γV[s'])
  26. 21:                                         if (Qsa > QBest) then
  27. 22:                                                   π[s]←a
  28. 23:                                                   QBest ←Qsa
  29. 24:                                                   noChange ←false
  30. 25:           until noChange
  31. 26:           return π
  32. """
  33.  
  34. states = [0,1,2,3,4]
  35. actions = [0,1]
  36. N_STATES = len(states)
  37. N_ACTIONS = len(actions)
  38. P = np.zeros((N_STATES, N_ACTIONS, N_STATES))  # transition probability
  39. R = np.zeros((N_STATES, N_ACTIONS, N_STATES))  # rewards
  40.  
  41. P[0,0,1] = 1.0
  42. P[1,1,2] = 1.0
  43. P[2,0,3] = 1.0
  44. P[3,1,4] = 1.0
  45. P[4,0,4] = 1.0
  46.  
  47.  
  48. R[0,0,1] = 1
  49. R[1,1,2] = 10
  50. R[2,0,3] = 100
  51. R[3,1,4] = 1000
  52. R[4,0,4] = 1.0
  53.  
  54.  
  55. gamma = 0.75
  56.  
  57. # initialize policy and value arbitrarily
  58. policy = [0 for s in range(N_STATES)]
  59. V = np.zeros(N_STATES)
  60.  
  61. print "Initial policy", policy
  62. # print V
  63. # print P
  64. # print R
  65.  
  66. is_value_changed = True
  67. iterations = 0
  68. while is_value_changed:
  69.     is_value_changed = False
  70.     iterations += 1
  71.     # run value iteration for each state
  72.     for s in range(N_STATES):
  73.         V[s] = sum([P[s,policy[s],s1] * (R[s,policy[s],s1] + gamma*V[s1]) for s1 in range(N_STATES)])
  74.         # print "Run for state", s
  75.  
  76.     for s in range(N_STATES):
  77.         q_best = V[s]
  78.         # print "State", s, "q_best", q_best
  79.         for a in range(N_ACTIONS):
  80.             q_sa = sum([P[s, a, s1] * (R[s, a, s1] + gamma * V[s1]) for s1 in range(N_STATES)])
  81.             if q_sa > q_best:
  82.                 print "State", s, ": q_sa", q_sa, "q_best", q_best
  83.                 policy[s] = a
  84.                 q_best = q_sa
  85.                 is_value_changed = True
  86.  
  87.     print "Iterations:", iterations
  88.     # print "Policy now", policy
  89.  
  90. print "Final policy"
  91. print policy
  92. print V
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Not a member of Pastebin yet?
Sign Up, it unlocks many cool features!
 
Top