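# Finite-horizon dynamic programming (backward value iteration) for what
# looks like a small inventory/storage problem: the state x is a fill level
# in {0, 1, 2}, the action u is an amount added, and w is a random demand
# drawn from prob_w. Each backward step computes the Bellman recursion
#
#     V_t(x) = max_u  sum_w  prob_w[w] * ( reward(x, u, w) + V_{t+1}(x') ),
#
# with x' = next_state(x, u, w, clip=True).
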
import numpy as np

prob_w = [0.1, 0.2, 0.7]   # demand distribution: P(w=0), P(w=1), P(w=2)
states = [0, 1, 2]         # possible fill levels
actions = [0, 1, 2]        # order quantities
demand = [0, 1, 2]         # demand realizations

def fill_level(x, u, w):
    # Fill level after adding u and serving demand w from state x.
    return x + u - w

def next_state(x, u, w, clip=False):
    # Successor state: the fill level floored at 0 and, if clip is set,
    # capped at the largest state.
    x_next = max(0, fill_level(x, u, w))
    if clip:
        x_next = min(x_next, 2)
    return x_next

def reward(x, u, w):
    # Stage reward: negative quadratic penalty on the (unclipped) fill level,
    # so both leftover stock and unmet demand are penalized.
    return -(fill_level(x, u, w) ** 2)

def one_step(v_old):
    # One backward Bellman step: for every state, evaluate the expected
    # reward-to-go of each action under v_old and keep the best.
    vals, acts = [], []
    for x in states:
        valsx = []
        for u in actions:
            ret = []
            for w in demand:
                x_next = next_state(x, u, w, clip=True)
                # Stage reward plus value-to-go, weighted by P(w).
                ret.append(prob_w[w] * (reward(x, u, w) + v_old[x_next]))
            valsx.append(np.sum(ret))
        vals.append(np.max(valsx))      # optimal value for state x
        acts.append(np.argmax(valsx))   # maximizing action for state x
    return vals, acts

def run_DP(T=20, V_init=(0, -1, -2)):
    # Backward DP over horizon T; values[t] holds V_t, policy[t] the
    # maximizing action per state. The terminal row values[T] is V_init,
    # and policy[T] stays zero because no action is taken at time T.
    values = np.zeros((T + 1, len(states)))
    policy = np.zeros((T + 1, len(states)))
    v_old = list(V_init)
    values[T, :] = v_old
    for t in range(T, 0, -1):
        vals, acts = one_step(v_old)
        values[t - 1, :], policy[t - 1, :] = vals, acts
        v_old = vals
    return values, policy

def main():
    V_T = [-1, -2, -3]
    T = 20
    values, policy = run_DP(T=T, V_init=V_T)
    # Print the computed stages from T-1 down to 0; policy[T] is never
    # filled in, since no action is taken at the terminal time.
    for t in range(T - 1, -1, -1):
        print('Timestep ', t, ' Value for states 0, 1, 2: ', values[t, :],
              ' Optimal actions for states 0, 1, 2: ', policy[t, :])
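
# Illustrative use of the DP output: roll the greedy policy forward on
# demands sampled from prob_w. This helper is a sketch; its name and the
# start state, horizon, and seed defaults are arbitrary choices, not
# something the DP above prescribes.
def rollout(policy, x0=0, T=20, seed=0):
    rng = np.random.default_rng(seed)
    x, total = x0, 0.0
    for t in range(T):
        u = int(policy[t, x])                  # greedy action for stage t, state x
        w = int(rng.choice(demand, p=prob_w))  # sample a demand realization
        total += reward(x, u, w)
        x = next_state(x, u, w, clip=True)
    return total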

if __name__ == '__main__':
    main()