#!/usr/bin/env python3
# coding=utf-8
import numpy as np

  5. """
  6. 1: Procedure Policy_Iteration(S,A,P,R)
  7. 2: Inputs
  8. 3: S is the set of all states
  9. 4: A is the set of all actions
  10. 5: P is state transition function specifying P(s'|s,a)
  11. 6: R is a reward function R(s,a,s')
  12. 7: Output
  13. 8: optimal policy π
  14. 9: Local
  15. 10: action array π[S]
  16. 11: Boolean variable noChange
  17. 12: real array V[S]
  18. 13: set π arbitrarily
  19. 14: repeat
  20. 15: noChange ←true
  21. 16: Solve V[s] = ∑s'∈S P(s'|s,π[s])(R(s,a,s')+γV[s'])
  22. 17: for each s∈S do
  23. 18: Let QBest=V[s]
  24. 19: for each a ∈A do
  25. 20: Let Qsa=∑s'∈S P(s'|s,a)(R(s,a,s')+γV[s'])
  26. 21: if (Qsa > QBest) then
  27. 22: π[s]←a
  28. 23: QBest ←Qsa
  29. 24: noChange ←false
  30. 25: until noChange
  31. 26: return π
  32. """

# A small deterministic 5-state chain: in each state exactly one action has a
# defined transition, moving along 0 -> 1 -> 2 -> 3 -> 4; state 4 is absorbing.
# Probabilities for the other action are left at zero.
states = [0, 1, 2, 3, 4]
actions = [0, 1]
N_STATES = len(states)
N_ACTIONS = len(actions)
P = np.zeros((N_STATES, N_ACTIONS, N_STATES))  # transition probabilities P[s, a, s']
R = np.zeros((N_STATES, N_ACTIONS, N_STATES))  # rewards R[s, a, s']

P[0, 0, 1] = 1.0
P[1, 1, 2] = 1.0
P[2, 0, 3] = 1.0
P[3, 1, 4] = 1.0
P[4, 0, 4] = 1.0

R[0, 0, 1] = 1
R[1, 1, 2] = 10
R[2, 0, 3] = 100
R[3, 1, 4] = 1000
R[4, 0, 4] = 1.0

gamma = 0.75  # discount factor

# initialize policy and value function arbitrarily
policy = [0 for s in range(N_STATES)]
V = np.zeros(N_STATES)

print("Initial policy", policy)
# print(V)
# print(P)
# print(R)

is_value_changed = True
iterations = 0
while is_value_changed:
    is_value_changed = False
    iterations += 1
    # policy evaluation: one backup sweep of the Bellman equation for the current policy
    for s in range(N_STATES):
        V[s] = sum([P[s, policy[s], s1] * (R[s, policy[s], s1] + gamma * V[s1]) for s1 in range(N_STATES)])
        # print("Run for state", s)

    # policy improvement: greedily switch to any action with a higher Q-value
    for s in range(N_STATES):
        q_best = V[s]
        # print("State", s, "q_best", q_best)
        for a in range(N_ACTIONS):
            q_sa = sum([P[s, a, s1] * (R[s, a, s1] + gamma * V[s1]) for s1 in range(N_STATES)])
            if q_sa > q_best:
                print("State", s, ": q_sa", q_sa, "q_best", q_best)
                policy[s] = a
                q_best = q_sa
                is_value_changed = True

    print("Iterations:", iterations)
    # print("Policy now", policy)

print("Final policy")
print(policy)
print(V)
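
# --- Optional: exact policy evaluation --------------------------------------
# Step 16 of the pseudocode says to *solve* V[s] = ∑s' P(s'|s,π[s])(R(s,π[s],s') + γV[s']),
# whereas the loop above only applies one backup sweep per iteration. Below is a
# minimal sketch of solving that linear system exactly with numpy, assuming the
# same P, R and gamma defined above; the helper name evaluate_policy_exactly is
# an illustrative addition, not part of the original script.

def evaluate_policy_exactly(policy, P, R, gamma):
    """Solve (I - gamma * P_pi) V = r_pi, where P_pi is the transition matrix
    and r_pi the expected one-step reward under the fixed policy."""
    n = P.shape[0]
    P_pi = np.array([P[s, policy[s], :] for s in range(n)])
    r_pi = np.array([np.dot(P[s, policy[s], :], R[s, policy[s], :]) for s in range(n)])
    return np.linalg.solve(np.eye(n) - gamma * P_pi, r_pi)

print("Exact V under the final policy:", evaluate_policy_exactly(policy, P, R, gamma))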