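# Finite-horizon dynamic programming (backward value iteration) for what
# looks like a small inventory/storage problem: the state x is a fill level
# in {0, 1, 2}, the action u is an amount added, and w is a random demand
# drawn from prob_w. Each backward step computes the Bellman recursion
#
#     V_t(x) = max_u  sum_w  prob_w[w] * ( reward(x, u, w) + V_{t+1}(x') ),
#
# with x' = next_state(x, u, w, clip=True).
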
import numpy as np

prob_w = [0.1, 0.2, 0.7]   # demand distribution: P(w=0), P(w=1), P(w=2)
states = [0, 1, 2]         # possible fill levels
actions = [0, 1, 2]        # order quantities
demand = [0, 1, 2]         # demand realizations

def fill_level(x, u, w):
    # Fill level after adding u and serving demand w from state x.
    return x + u - w

def next_state(x, u, w, clip=False):
    # Successor state: the fill level floored at 0 and, if clip is set,
    # capped at the largest state.
    x_next = max(0, fill_level(x, u, w))
    if clip:
        x_next = min(x_next, 2)
    return x_next

def reward(x, u, w):
    # Stage reward: negative quadratic penalty on the (unclipped) fill level,
    # so both leftover stock and unmet demand are penalized.
    return -(fill_level(x, u, w) ** 2)

def one_step(v_old):
    # One backward Bellman step: for every state, evaluate the expected
    # reward-to-go of each action under v_old and keep the best.
    vals, acts = [], []
    for x in states:
        valsx = []
        for u in actions:
            ret = []
            for w in demand:
                x_next = next_state(x, u, w, clip=True)
                # Stage reward plus value-to-go, weighted by P(w).
                ret.append(prob_w[w] * (reward(x, u, w) + v_old[x_next]))
            valsx.append(np.sum(ret))
        vals.append(np.max(valsx))      # optimal value for state x
        acts.append(np.argmax(valsx))   # maximizing action for state x
    return vals, acts

def run_DP(T=20, V_init=(0, -1, -2)):
    # Backward DP over horizon T; values[t] holds V_t, policy[t] the
    # maximizing action per state. The terminal row values[T] is V_init,
    # and policy[T] stays zero because no action is taken at time T.
    values = np.zeros((T + 1, len(states)))
    policy = np.zeros((T + 1, len(states)))
    v_old = list(V_init)
    values[T, :] = v_old
    for t in range(T, 0, -1):
        vals, acts = one_step(v_old)
        values[t - 1, :], policy[t - 1, :] = vals, acts
        v_old = vals
    return values, policy

def main():
    V_T = [-1, -2, -3]
    T = 20
    values, policy = run_DP(T=T, V_init=V_T)
    # Print the computed stages from T-1 down to 0; policy[T] is never
    # filled in, since no action is taken at the terminal time.
    for t in range(T - 1, -1, -1):
        print('Timestep ', t, ' Value for states 0, 1, 2: ', values[t, :],
              ' Optimal actions for states 0, 1, 2: ', policy[t, :])
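
# Illustrative use of the DP output: roll the greedy policy forward on
# demands sampled from prob_w. This helper is a sketch; its name and the
# start state, horizon, and seed defaults are arbitrary choices, not
# something the DP above prescribes.
def rollout(policy, x0=0, T=20, seed=0):
    rng = np.random.default_rng(seed)
    x, total = x0, 0.0
    for t in range(T):
        u = int(policy[t, x])                  # greedy action for stage t, state x
        w = int(rng.choice(demand, p=prob_w))  # sample a demand realization
        total += reward(x, u, w)
        x = next_state(x, u, w, clip=True)
    return total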

if __name__ == '__main__':
    main()