Untitled

a guest
Jun 19th, 2019
python
def value_iteration(env, discount_factor, epsilon):
    # Greedy policy extracted during the sweeps; this will be the return value.
    policy = dict()

    # Value estimate per (x, y) cell, initialised with the immediate state reward.
    reward = dict()
    for state in env.get_all_states():
        reward[state.x, state.y] = env.get_state_reward(state)

    while True:
        max_difference = 0
        for state in env.get_all_states():
            if env.is_goal_state(state):
                policy[state.x, state.y] = None
                continue

            # Initialize to minus infinity so the first action always passes the comparison below.
            max_value = -float("inf")

            previous_reward = reward[state.x, state.y]

            # Bellman backup: evaluate every action and keep the best one.
            for action in env.get_actions(state):
                state1 = env.get_next_states_and_probs(state, action)
                value = 0
                for i in range(len(state1)):
                    # state1[i] is a (next_state, probability) pair; key the value
                    # table by (x, y) to match how it was initialised above.
                    next_state, prob = state1[i]
                    value += prob * reward[next_state.x, next_state.y]

                if value > max_value:
                    policy[state.x, state.y] = action
                    max_value = value

            reward[state.x, state.y] = env.get_state_reward(state) + discount_factor * max_value

            difference = abs(reward[state.x, state.y] - previous_reward)
            if difference > max_difference:
                max_difference = difference

        # Stop once the largest update in a full sweep drops below the threshold.
        if max_difference < (epsilon * (1 - discount_factor)) / discount_factor:
            break

    return policy

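For context, a minimal sketch of how value_iteration could be driven. The GridState and GridEnv names below are hypothetical stand-ins, not from the paste; they only exist to show the interface the function relies on (get_all_states, get_actions, get_next_states_and_probs, get_state_reward, is_goal_state) on a small deterministic grid.

python
# Hypothetical stand-in environment for value_iteration; illustrative only.
from collections import namedtuple

GridState = namedtuple("GridState", ["x", "y"])

class GridEnv:
    def __init__(self, width=3, height=3, goal=(2, 2)):
        self.width, self.height, self.goal = width, height, goal

    def get_all_states(self):
        return [GridState(x, y) for x in range(self.width) for y in range(self.height)]

    def is_goal_state(self, state):
        return (state.x, state.y) == self.goal

    def get_state_reward(self, state):
        return 0.0 if self.is_goal_state(state) else -1.0

    def get_actions(self, state):
        return ["up", "down", "left", "right"]

    def get_next_states_and_probs(self, state, action):
        dx, dy = {"up": (0, -1), "down": (0, 1), "left": (-1, 0), "right": (1, 0)}[action]
        nx = min(max(state.x + dx, 0), self.width - 1)
        ny = min(max(state.y + dy, 0), self.height - 1)
        # Deterministic moves: a single successor with probability 1.
        return [(GridState(nx, ny), 1.0)]

env = GridEnv()
policy = value_iteration(env, discount_factor=0.9, epsilon=1e-4)
print(policy[(0, 0)])  # e.g. "down" or "right"; both head toward the goal at (2, 2)
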
python
import numpy as np


def policy_evaluation(env, Q, iter_num, reward=-1, dis=1):
    x_dims = env.observation_space.spaces[0].n
    y_dims = env.observation_space.spaces[1].n
    maze_size = tuple((x_dims, y_dims))
    num_actions = env.action_space.n

    # Value tables for the previous sweep and the sweep being built.
    post_value_table = np.zeros([maze_size[0], maze_size[1]], dtype=float)
    next_value_table = np.zeros([maze_size[0], maze_size[1]], dtype=float)

    for iteration in range(iter_num):
        for i in range(maze_size[0]):
            for j in range(maze_size[1]):
                if i == j and ((i == 0) or (i == 3)):
                    # Terminal corner cells keep a value of 0.
                    value_t = 0
                else:
                    value_t = 0
                    for state in env.get_all_states():
                        if env.is_goal_state(state):
                            continue
                        for action in env.get_actions(state):
                            state1 = env.get_next_states_and_probs(state, action)
                            for k in range(len(state1)):
                                # state1[k] is a (next_state, probability) pair;
                                # weight the backed-up value by the policy table Q.
                                value = Q[i][j][k] * (
                                    reward
                                    + dis * post_value_table[state1[k][0].x][state1[k][0].y]
                                )
                                value_t += value
                next_value_table[i][j] = round(value_t, 3)

        print(next_value_table)
        # Copy so the next sweep reads the finished table rather than an alias
        # of the array it is still writing into.
        post_value_table = next_value_table.copy()

    return next_value_table

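A hedged sketch of the call site. The function mixes a gym-style interface (observation_space.spaces, action_space) with the custom maze methods used above, so the MazeStub below is a hypothetical 4x4 environment that exposes both; every name in it is illustrative, and Q is just a uniform policy table of shape (4, 4, num_actions).

python
# Hypothetical 4x4 maze stub for policy_evaluation; illustrative only.
from collections import namedtuple
import numpy as np

State = namedtuple("State", ["x", "y"])
Discrete = namedtuple("Discrete", ["n"])
Space = namedtuple("Space", ["spaces"])

class MazeStub:
    observation_space = Space(spaces=(Discrete(4), Discrete(4)))
    action_space = Discrete(4)

    def get_all_states(self):
        return [State(x, y) for x in range(4) for y in range(4)]

    def is_goal_state(self, state):
        return (state.x, state.y) in [(0, 0), (3, 3)]

    def get_actions(self, state):
        return [0, 1, 2, 3]

    def get_next_states_and_probs(self, state, action):
        dx, dy = [(0, -1), (0, 1), (-1, 0), (1, 0)][action]
        nx, ny = min(max(state.x + dx, 0), 3), min(max(state.y + dy, 0), 3)
        # Deterministic moves: one successor with probability 1.
        return [(State(nx, ny), 1.0)]

env = MazeStub()
# Uniform-random policy table: every entry equally weighted.
Q = np.full((4, 4, env.action_space.n), 1.0 / env.action_space.n)
values = policy_evaluation(env, Q, iter_num=3, reward=-1, dis=1)
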
Printed next_value_table after successive sweeps:

[[ 0.00000000e+00 -5.63167618e+10 -5.63167618e+10 -5.63167618e+10]
 [-5.63167618e+10 -5.63167618e+10 -5.63167618e+10 -5.63167618e+10]
 [-5.63167618e+10 -5.63167618e+10 -5.63167618e+10 -1.64197709e+11]
 [-2.13710094e+11 -2.13710094e+11 -1.19479275e+12  0.00000000e+00]]
[[ 0.00000000e+00 -1.35899046e+12 -1.35899046e+12 -1.35899046e+12]
 [-1.35899046e+12 -1.35899046e+12 -1.35899046e+12 -1.35899046e+12]
 [-1.35899046e+12 -1.35899046e+12 -1.35899046e+12 -3.96228604e+12]
 [-5.15707879e+12 -5.15707879e+12 -2.88317703e+13  0.00000000e+00]]
[[ 0.00000000e+00 -1.35899046e+12 -1.35899046e+12 -1.35899046e+12]
 [-1.35899046e+12 -1.35899046e+12 -1.35899046e+12 -1.35899046e+12]
 [-1.35899046e+12 -1.35899046e+12 -1.35899046e+12 -3.96228604e+12]
 [-5.15707879e+12 -5.15707879e+12 -2.88317703e+13  0.00000000e+00]]
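The magnitudes jump by roughly two orders between the first two printed sweeps. A small helper makes that easy to quantify; it is an illustrative addition rather than part of the paste, and the tolerance mentioned in the comment is an assumption.

python
import numpy as np

def max_change(post_value_table, next_value_table):
    # Largest absolute per-cell change between two consecutive sweeps.
    return np.max(np.abs(next_value_table - post_value_table))

# Between the first two tables printed above the change is on the order of 1e12,
# so a stopping test such as max_change(...) < 1e-6 (tolerance chosen arbitrarily)
# would never trigger for this run.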