Q-Learning for Inventory Management

import random
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt

# Parameters
T = 360   # time span (days)
RT = 1    # review time (days) - time between two placed orders
LT = 1    # lead time (days) - time from when an order is placed until it arrives
h = 1     # on-hand inventory holding cost (per unit)
b = 5     # backorder cost (per unit)
I0 = 300  # initial stock (units)
dm = 100  # average demand (units per day)
dmdp = 10 # standard deviation of demand
d = []    # daily demand (units per day); drawn uniformly from [90, 110] below (normal version commented out)
random.seed(100)
for j in range(T):
    #d.append(round(np.random.normal(loc=0.0,scale=1.0)*dmdp+dm,0))
    d.append(random.randint(90, 110))
k = 1.96                 # safety factor (e.g. 1.96, 1.64 or 1.28)
ss = sqrt(RT+LT)*dmdp*k  # safety stock (units)
s = []                   # target (order-up-to) stock (units)
for j in range(T):
    s.append((RT+LT)*dm + ss)

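# Added commentary (not in the original paste): with these parameters the safety
# stock is ss = k * dmdp * sqrt(RT+LT) = 1.96 * 10 * sqrt(2) ~= 27.7 units, so the
# order-up-to level is s = (RT+LT)*dm + ss ~= 2*100 + 27.7 ~= 227.7 units on every
# day. k = 1.96 corresponds to roughly a 97.5% no-stockout probability under a
# normal-demand assumption (here the demand is actually uniform on [90, 110]).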
# Variable Initialization
Qt = [0] * T    # quantity to order each day (units)
It = [0] * T    # inventory level at the end of each day (units; negative when demand is not met)
NIt = [0] * T   # net inventory at the start of each day (units) = (on-hand + in-transit) inventory
Ip = [0] * T    # on-hand inventory at the end of each day (units)
Im = [0] * T    # backorders at the end of each day (units)
dsat = [0] * T  # satisfied demand (units)

# Initial Values
It[0] = I0 - d[0]
NIt[0] = I0

if It[0] >= 0:
    Ip[0] = It[0]
else:
    Im[0] = -It[0]

if NIt[0] < s[0]:
    Qt[0] = s[0] - NIt[0]
else:
    Qt[0] = 0

if d[0] >= I0:
    dsat[0] = I0
else:
    dsat[0] = d[0]

def test(T, LT, d, s):
    # simulate the order-up-to-s (base-stock) policy over T days
    i = 1
    while i < T:
        j = 1
        NIt[i] = Ip[i-1]
        while j <= LT:
            if i-j >= 0:
                NIt[i] = NIt[i] + Qt[i-j]
            j += 1
        NIt[i] = round(NIt[i], 0)
        if NIt[i] < s[i]:
            Qt[i] = s[i] - NIt[i]
        else:
            Qt[i] = 0
        Qt[i] = round(Qt[i], 0)
        if i - LT >= 0:
            It[i] = Ip[i-1] - d[i] + Qt[i-LT]
        else:
            It[i] = Ip[i-1] - d[i]
        It[i] = round(It[i], 0)
        if It[i] >= 0:
            Ip[i] = It[i]
        else:
            Im[i] = -It[i]
        if d[i] >= Ip[i-1] + Qt[i-LT]:
            dsat[i] = Ip[i-1] + Qt[i-LT]
        else:
            dsat[i] = d[i]
        i += 1
    return Qt, NIt, It, Ip, Im, dsat

def objective(T, Ip, Im, h, b):
    # total cost h*sum(Ip) + b*sum(Im); also returns the total backordered
    # units (sumIm) and the number of days with a backorder (ym)
    sumIp = Ip[0]
    sumIm = Im[0]
    ym = 0
    if Im[0] > 0:
        ym = 1
    i = 1
    while i < T:
        sumIp = sumIp + Ip[i]
        sumIm = sumIm + Im[i]
        if Im[i] > 0:
            ym += 1
        i += 1
    return h*sumIp + b*sumIm, sumIm, ym

[Qt, NIt, It, Ip, Im, dsat] = test(T, LT, d, s)
[obj, sumIm, ym] = objective(T, Ip, Im, h, b)

sumd = 0
for j in range(T):
    sumd = sumd + d[j]

alpha = ym/T
beta = sumIm/sumd

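# Added commentary (not in the original paste): alpha is the fraction of the T days
# on which a backorder occurred, and beta is the total number of backordered units
# divided by total demand - roughly the complements of a cycle-service-level and a
# fill-rate measure. Both are computed here for reference and are not used below.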
# Training

# Ip - on-hand inventory
# Im - backorders (unmet demand)
# T  - number of days (360)
# Qt - order quantity

def reward(t):
    # daily cost on day t; used as a negative reward in the Q-update below
    return h*Ip[t] + b*Im[t]

Q = np.matrix(np.zeros([9,9]))  # Q-table: 9 inventory states x 9 order-size actions

iteration = 0
t = 0
MAX_ITERATION = 500
alp = 0.2             # learning rate (between 0 and 1)
exploitation_p = 0.15 # exploitation probability (increased after each iteration until it reaches 1)

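# Added commentary (not in the original paste), describing what the loop below does:
# - The state is the on-hand inventory Ip[t], discretized into 9 bins
#   (<=8, 9-14, 15-20, ..., 45-50, >50 units).
# - The 9 actions correspond to the order-quantity multipliers used in the testing
#   section further down (Qbase, 0.95*Qbase, ..., 0.6*Qbase).
# - With probability exploitation_p the greedy action for the current state is taken,
#   otherwise a random action; exploitation_p grows by 0.05 per iteration.
# - Note that the chosen action is not fed back into the simulation: the Q-values are
#   learned from the fixed trajectory produced by the base-stock policy above.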
while iteration <= MAX_ITERATION:
    while t < T-1:
        # discretize the current on-hand inventory into one of 9 states
        if Ip[t] <= 8:
            state = 0
        elif Ip[t] <= 14:
            state = 1
        elif Ip[t] <= 20:
            state = 2
        elif Ip[t] <= 26:
            state = 3
        elif Ip[t] <= 32:
            state = 4
        elif Ip[t] <= 38:
            state = 5
        elif Ip[t] <= 44:
            state = 6
        elif Ip[t] <= 50:
            state = 7
        else:
            state = 8

        rd = random.random()
        if rd < exploitation_p:
            # exploit: pick (one of) the best action(s) for the current state
            action = np.where(Q[state,] == np.max(Q[state,]))[1]
            if np.size(action) > 1:
                action = np.random.choice(action, 1)
        else:
            # explore: pick a random action
            av_act = np.where(Q[state,] < 999999)[1]  # all 9 actions are available
            action = np.random.choice(av_act, 1)
        action = int(action)
        rew = reward(t+1)

        # discretize the next day's on-hand inventory
        if Ip[t+1] <= 8:
            next_state = 0
        elif Ip[t+1] <= 14:
            next_state = 1
        elif Ip[t+1] <= 20:
            next_state = 2
        elif Ip[t+1] <= 26:
            next_state = 3
        elif Ip[t+1] <= 32:
            next_state = 4
        elif Ip[t+1] <= 38:
            next_state = 5
        elif Ip[t+1] <= 44:
            next_state = 6
        elif Ip[t+1] <= 50:
            next_state = 7
        else:
            next_state = 8

        next_action = np.where(Q[next_state,] == np.max(Q[next_state,]))[1]
        if np.size(next_action) > 1:
            next_action = np.random.choice(next_action, 1)
        next_action = int(next_action)
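        # Added commentary (not in the original paste): the line below is the tabular
        # Q-learning update with discount factor 1,
        #     Q(s, a) <- Q(s, a) + alp * (r + max_a' Q(s', a') - Q(s, a)),
        # where the reward r is the negative of the daily cost h*Ip + b*Im (hence the
        # -rew term) and next_action is the greedy action in the next state s'.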
        Q[state, action] = Q[state, action] + alp*(-rew + Q[next_state, next_action] - Q[state, action])

        t += 1
    if exploitation_p < 1:
        exploitation_p = exploitation_p + 0.05
    t = 0
    iteration += 1

# Testing

Ip = [0] * T
Im = [0] * T

It[0] = I0 - d[0]

if It[0] >= 0:
    Ip[0] = It[0]
else:
    Im[0] = -It[0]

Qt[0] = 0
Qbase = 100

sumIp = Ip[0]
sumIm = Im[0]

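# Added commentary (not in the original paste): the test run below replays the same
# demand series, but the daily order quantity is now chosen greedily from the learned
# Q-table. Action a (a = 0..8) maps to an order of Qbase * (1 - 0.05*a) units, i.e.
# 100, 95, 90, ..., 60 units, and the inventory balance each day is
#     It[i] = Ip[i-1] - d[i] + Qt[i-LT].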
i = 1
while i < T:
    if i - LT >= 0:
        It[i] = Ip[i-1] - d[i] + Qt[i-LT]
    else:
        It[i] = Ip[i-1] - d[i]
    It[i] = round(It[i], 0)
    if It[i] >= 0:
        Ip[i] = It[i]
    else:
        Im[i] = -It[i]

    # discretize the on-hand inventory into one of 9 states
    if Ip[i] <= 8:
        state = 0
    elif Ip[i] <= 14:
        state = 1
    elif Ip[i] <= 20:
        state = 2
    elif Ip[i] <= 26:
        state = 3
    elif Ip[i] <= 32:
        state = 4
    elif Ip[i] <= 38:
        state = 5
    elif Ip[i] <= 44:
        state = 6
    elif Ip[i] <= 50:
        state = 7
    else:
        state = 8

    # greedy action from the learned Q-table
    action = np.where(Q[state,] == np.max(Q[state,]))[1]
    if np.size(action) > 1:
        action = np.random.choice(action, 1)
    action = int(action)

    # map the action index to an order quantity
    if action == 0:
        Qt[i] = Qbase
    elif action == 1:
        Qt[i] = Qbase * 0.95
    elif action == 2:
        Qt[i] = Qbase * 0.9
    elif action == 3:
        Qt[i] = Qbase * 0.85
    elif action == 4:
        Qt[i] = Qbase * 0.8
    elif action == 5:
        Qt[i] = Qbase * 0.75
    elif action == 6:
        Qt[i] = Qbase * 0.7
    elif action == 7:
        Qt[i] = Qbase * 0.65
    elif action == 8:
        Qt[i] = Qbase * 0.6

    sumIp = sumIp + Ip[i]
    sumIm = sumIm + Im[i]

    i += 1

objfunc = h*sumIp + b*sumIm  # total cost of the Q-learning policy over the horizon

print(objfunc)
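# Added sketch (not part of the original paste): matplotlib is imported above but
# never used, so assuming the intent was to inspect the result, the lines below
# print both objective values (base-stock run vs. Q-learning test run) and plot the
# on-hand inventory and backorder trajectories of the test run.
print("Base-stock policy cost:", obj)
print("Q-learning policy cost:", objfunc)

plt.plot(range(T), Ip, label="on-hand inventory (units)")
plt.plot(range(T), Im, label="backorders (units)")
plt.xlabel("day")
plt.ylabel("units")
plt.title("Q-learning test run")
plt.legend()
plt.show()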