The first snippet runs value iteration over a grid-world environment; the one substantive bug was indexing the value table with a state object instead of its coordinates.

```python
def value_iteration(env, discount_factor, epsilon):
    # Returned policy: maps (x, y) -> greedy action (None at goal states).
    policy = dict()
    # Value table keyed by (x, y), initialized with the immediate rewards.
    value = dict()
    for state in env.get_all_states():
        value[state.x, state.y] = env.get_state_reward(state)
    while True:
        max_difference = 0.0
        for state in env.get_all_states():
            if env.is_goal_state(state):
                policy[state.x, state.y] = None
                continue
            # Start below any achievable value so the first action always wins.
            max_value = -float("inf")
            previous_value = value[state.x, state.y]
            for action in env.get_actions(state):
                # List of (next_state, probability) pairs for this action.
                transitions = env.get_next_states_and_probs(state, action)
                action_value = 0.0
                for next_state, prob in transitions:
                    # Fix: index the value table by coordinates,
                    # not by the state object itself.
                    action_value += prob * value[next_state.x, next_state.y]
                if action_value > max_value:
                    policy[state.x, state.y] = action
                    max_value = action_value
            # Bellman optimality backup.
            value[state.x, state.y] = env.get_state_reward(state) + discount_factor * max_value
            difference = abs(value[state.x, state.y] - previous_value)
            if difference > max_difference:
                max_difference = difference
        # Standard stopping rule: bounds the remaining value error by epsilon.
        if max_difference < (epsilon * (1 - discount_factor)) / discount_factor:
            break
    return policy
```
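A minimal sketch of how this might be invoked. `GridWorld` is a hypothetical stand-in (not shown in the paste) for whatever environment class provides the methods used above:

```python
# Hypothetical usage -- `GridWorld` is an assumed name for an environment
# exposing get_all_states(), get_actions(), get_next_states_and_probs(),
# get_state_reward(), and is_goal_state().
env = GridWorld()

# A discount near 1 values future reward; epsilon bounds the final error.
policy = value_iteration(env, discount_factor=0.9, epsilon=1e-4)

# policy maps (x, y) coordinates to the greedy action (None at goal states).
for (x, y), action in sorted(policy.items()):
    print(f"({x}, {y}) -> {action}")
```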
The second snippet is iterative policy evaluation for the same kind of environment. The version as pasted had several bugs: each cell accumulated backups over every state in the grid, `Q` was indexed by the next-state index rather than the action, transition probabilities were dropped, and the two value tables ended up aliased. Those bugs produced the diverging output below; the fixes are marked in comments.

```python
import numpy as np


def policy_evaluation(env, Q, iter_num, reward=-1, dis=1):
    # Grid dimensions and action count from the Gym-style spaces.
    x_dims = env.observation_space.spaces[0].n
    y_dims = env.observation_space.spaces[1].n
    maze_size = (x_dims, y_dims)
    num_actions = env.action_space.n
    # Q holds the policy's action probabilities per cell.
    assert Q.shape == (maze_size[0], maze_size[1], num_actions)
    post_value_table = np.zeros(maze_size, dtype=float)
    next_value_table = np.zeros(maze_size, dtype=float)
    for iteration in range(iter_num):
        for state in env.get_all_states():
            # Fix: update only this state's own cell instead of summing
            # backups for every state in the grid into one cell.
            i, j = state.x, state.y
            if env.is_goal_state(state):
                next_value_table[i][j] = 0.0  # terminal states keep value 0
                continue
            value_t = 0.0
            for action in env.get_actions(state):
                for next_state, prob in env.get_next_states_and_probs(state, action):
                    # Expected update: pi(a|s) * p(s'|s,a) * (r + gamma * V(s')).
                    # Fix: index Q by the action (assumed to be an integer
                    # index), not by the next-state index, and weight by the
                    # transition probability.
                    value_t += Q[i][j][action] * prob * (
                        reward + dis * post_value_table[next_state.x][next_state.y])
            next_value_table[i][j] = round(value_t, 3)
        print(next_value_table)
        # Fix: copy rather than alias, so the next sweep reads a stable table.
        post_value_table = next_value_table.copy()
    return next_value_table
```
Running the original, unfixed version printed the value tables below. The magnitudes blow up toward minus infinity instead of converging, which is the symptom of the bugs noted above: every cell summed backups for the whole grid, and the aliased tables fed half-updated values into each sweep.

```
[[ 0.00000000e+00 -5.63167618e+10 -5.63167618e+10 -5.63167618e+10]
 [-5.63167618e+10 -5.63167618e+10 -5.63167618e+10 -5.63167618e+10]
 [-5.63167618e+10 -5.63167618e+10 -5.63167618e+10 -1.64197709e+11]
 [-2.13710094e+11 -2.13710094e+11 -1.19479275e+12  0.00000000e+00]]
[[ 0.00000000e+00 -1.35899046e+12 -1.35899046e+12 -1.35899046e+12]
 [-1.35899046e+12 -1.35899046e+12 -1.35899046e+12 -1.35899046e+12]
 [-1.35899046e+12 -1.35899046e+12 -1.35899046e+12 -3.96228604e+12]
 [-5.15707879e+12 -5.15707879e+12 -2.88317703e+13  0.00000000e+00]]
[[ 0.00000000e+00 -1.35899046e+12 -1.35899046e+12 -1.35899046e+12]
 [-1.35899046e+12 -1.35899046e+12 -1.35899046e+12 -1.35899046e+12]
 [-1.35899046e+12 -1.35899046e+12 -1.35899046e+12 -3.96228604e+12]
 [-5.15707879e+12 -5.15707879e+12 -2.88317703e+13  0.00000000e+00]]
```
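For completeness, a sketch of invoking the corrected routine under a uniform random policy. `GridWorld` is again the hypothetical environment from the first sketch, and `Q` is the per-cell table of action probabilities the function expects:

```python
import numpy as np

env = GridWorld()  # hypothetical environment class, as above
x_dims = env.observation_space.spaces[0].n
y_dims = env.observation_space.spaces[1].n
num_actions = env.action_space.n

# Equiprobable random policy: every action equally likely in every cell.
Q = np.full((x_dims, y_dims, num_actions), 1.0 / num_actions)

# With the fixes above, the values should settle for a proper policy
# instead of diverging as in the pasted output.
V = policy_evaluation(env, Q, iter_num=50, reward=-1, dis=1.0)
```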