Q-Learning for Inventory Management

import random
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt

# Parameters
T = 360   # time span (days)
RT = 1    # review time (days) - time between two placed orders
LT = 1    # lead time (days) - time from when an order is placed until it arrives
h = 1     # on-hand inventory holding cost (per unit)
b = 5     # backorder cost (per unit)
I0 = 300  # initial stock (units)
dm = 100  # average demand (units per day)
dmdp = 10 # standard deviation of demand
d = []    # daily demand (units per day); drawn uniformly from [90, 110] below (normal version commented out)
random.seed(100)
for j in range(T):
    #d.append(round(np.random.normal(loc=0.0,scale=1.0)*dmdp+dm,0))
    d.append(random.randint(90, 110))
k = 1.96                 # safety factor (e.g. 1.96, 1.64 or 1.28)
ss = sqrt(RT+LT)*dmdp*k  # safety stock (units)
s = []                   # target (order-up-to) stock (units)
for j in range(T):
    s.append((RT+LT)*dm + ss)

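# Added commentary (not in the original paste): with these parameters the safety
# stock is ss = k * dmdp * sqrt(RT+LT) = 1.96 * 10 * sqrt(2) ~= 27.7 units, so the
# order-up-to level is s = (RT+LT)*dm + ss ~= 2*100 + 27.7 ~= 227.7 units on every
# day. k = 1.96 corresponds to roughly a 97.5% no-stockout probability under a
# normal-demand assumption (here the demand is actually uniform on [90, 110]).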
# Variable Initialization
Qt = [0] * T    # quantity to order each day (units)
It = [0] * T    # inventory level at the end of each day (units; negative when demand is not met)
NIt = [0] * T   # net inventory at the start of each day (units) = (on-hand + in-transit) inventory
Ip = [0] * T    # on-hand inventory at the end of each day (units)
Im = [0] * T    # backorders at the end of each day (units)
dsat = [0] * T  # satisfied demand (units)

# Initial Values
It[0] = I0 - d[0]
NIt[0] = I0

if It[0] >= 0:
    Ip[0] = It[0]
else:
    Im[0] = -It[0]

if NIt[0] < s[0]:
    Qt[0] = s[0] - NIt[0]
else:
    Qt[0] = 0

if d[0] >= I0:
    dsat[0] = I0
else:
    dsat[0] = d[0]

def test(T, LT, d, s):
    # simulate the order-up-to-s (base-stock) policy over T days
    i = 1
    while i < T:
        j = 1
        NIt[i] = Ip[i-1]
        while j <= LT:
            if i-j >= 0:
                NIt[i] = NIt[i] + Qt[i-j]
            j += 1
        NIt[i] = round(NIt[i], 0)
        if NIt[i] < s[i]:
            Qt[i] = s[i] - NIt[i]
        else:
            Qt[i] = 0
        Qt[i] = round(Qt[i], 0)
        if i - LT >= 0:
            It[i] = Ip[i-1] - d[i] + Qt[i-LT]
        else:
            It[i] = Ip[i-1] - d[i]
        It[i] = round(It[i], 0)
        if It[i] >= 0:
            Ip[i] = It[i]
        else:
            Im[i] = -It[i]
        if d[i] >= Ip[i-1] + Qt[i-LT]:
            dsat[i] = Ip[i-1] + Qt[i-LT]
        else:
            dsat[i] = d[i]
        i += 1
    return Qt, NIt, It, Ip, Im, dsat

def objective(T, Ip, Im, h, b):
    # total cost h*sum(Ip) + b*sum(Im); also returns the total backordered
    # units (sumIm) and the number of days with a backorder (ym)
    sumIp = Ip[0]
    sumIm = Im[0]
    ym = 0
    if Im[0] > 0:
        ym = 1
    i = 1
    while i < T:
        sumIp = sumIp + Ip[i]
        sumIm = sumIm + Im[i]
        if Im[i] > 0:
            ym += 1
        i += 1
    return h*sumIp + b*sumIm, sumIm, ym

[Qt, NIt, It, Ip, Im, dsat] = test(T, LT, d, s)
[obj, sumIm, ym] = objective(T, Ip, Im, h, b)

sumd = 0
for j in range(T):
    sumd = sumd + d[j]

alpha = ym/T
beta = sumIm/sumd

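# Added commentary (not in the original paste): alpha is the fraction of the T days
# on which a backorder occurred, and beta is the total number of backordered units
# divided by total demand - roughly the complements of a cycle-service-level and a
# fill-rate measure. Both are computed here for reference and are not used below.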
# Training

# Ip - on-hand inventory
# Im - backorders (unmet demand)
# T  - number of days (360)
# Qt - order quantity

def reward(t):
    # daily cost on day t; used as a negative reward in the Q-update below
    return h*Ip[t] + b*Im[t]

Q = np.matrix(np.zeros([9,9]))  # Q-table: 9 inventory states x 9 order-size actions

iteration = 0
t = 0
MAX_ITERATION = 500
alp = 0.2             # learning rate (between 0 and 1)
exploitation_p = 0.15 # exploitation probability (increased after each iteration until it reaches 1)

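# Added commentary (not in the original paste), describing what the loop below does:
# - The state is the on-hand inventory Ip[t], discretized into 9 bins
#   (<=8, 9-14, 15-20, ..., 45-50, >50 units).
# - The 9 actions correspond to the order-quantity multipliers used in the testing
#   section further down (Qbase, 0.95*Qbase, ..., 0.6*Qbase).
# - With probability exploitation_p the greedy action for the current state is taken,
#   otherwise a random action; exploitation_p grows by 0.05 per iteration.
# - Note that the chosen action is not fed back into the simulation: the Q-values are
#   learned from the fixed trajectory produced by the base-stock policy above.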
while iteration <= MAX_ITERATION:
    while t < T-1:
        # discretize the current on-hand inventory into one of 9 states
        if Ip[t] <= 8:
            state = 0
        elif Ip[t] <= 14:
            state = 1
        elif Ip[t] <= 20:
            state = 2
        elif Ip[t] <= 26:
            state = 3
        elif Ip[t] <= 32:
            state = 4
        elif Ip[t] <= 38:
            state = 5
        elif Ip[t] <= 44:
            state = 6
        elif Ip[t] <= 50:
            state = 7
        else:
            state = 8

        rd = random.random()
        if rd < exploitation_p:
            # exploit: pick (one of) the best action(s) for the current state
            action = np.where(Q[state,] == np.max(Q[state,]))[1]
            if np.size(action) > 1:
                action = np.random.choice(action, 1)
        else:
            # explore: pick a random action
            av_act = np.where(Q[state,] < 999999)[1]  # all 9 actions are available
            action = np.random.choice(av_act, 1)
        action = int(action)
        rew = reward(t+1)

        # discretize the next day's on-hand inventory
        if Ip[t+1] <= 8:
            next_state = 0
        elif Ip[t+1] <= 14:
            next_state = 1
        elif Ip[t+1] <= 20:
            next_state = 2
        elif Ip[t+1] <= 26:
            next_state = 3
        elif Ip[t+1] <= 32:
            next_state = 4
        elif Ip[t+1] <= 38:
            next_state = 5
        elif Ip[t+1] <= 44:
            next_state = 6
        elif Ip[t+1] <= 50:
            next_state = 7
        else:
            next_state = 8

        next_action = np.where(Q[next_state,] == np.max(Q[next_state,]))[1]
        if np.size(next_action) > 1:
            next_action = np.random.choice(next_action, 1)
        next_action = int(next_action)
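        # Added commentary (not in the original paste): the line below is the tabular
        # Q-learning update with discount factor 1,
        #     Q(s, a) <- Q(s, a) + alp * (r + max_a' Q(s', a') - Q(s, a)),
        # where the reward r is the negative of the daily cost h*Ip + b*Im (hence the
        # -rew term) and next_action is the greedy action in the next state s'.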
        Q[state, action] = Q[state, action] + alp*(-rew + Q[next_state, next_action] - Q[state, action])

        t += 1
    if exploitation_p < 1:
        exploitation_p = exploitation_p + 0.05
    t = 0
    iteration += 1

# Testing

Ip = [0] * T
Im = [0] * T

It[0] = I0 - d[0]

if It[0] >= 0:
    Ip[0] = It[0]
else:
    Im[0] = -It[0]

Qt[0] = 0
Qbase = 100

sumIp = Ip[0]
sumIm = Im[0]

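# Added commentary (not in the original paste): the test run below replays the same
# demand series, but the daily order quantity is now chosen greedily from the learned
# Q-table. Action a (a = 0..8) maps to an order of Qbase * (1 - 0.05*a) units, i.e.
# 100, 95, 90, ..., 60 units, and the inventory balance each day is
#     It[i] = Ip[i-1] - d[i] + Qt[i-LT].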
i = 1
while i < T:
    if i - LT >= 0:
        It[i] = Ip[i-1] - d[i] + Qt[i-LT]
    else:
        It[i] = Ip[i-1] - d[i]
    It[i] = round(It[i], 0)
    if It[i] >= 0:
        Ip[i] = It[i]
    else:
        Im[i] = -It[i]

    # discretize the on-hand inventory into one of 9 states
    if Ip[i] <= 8:
        state = 0
    elif Ip[i] <= 14:
        state = 1
    elif Ip[i] <= 20:
        state = 2
    elif Ip[i] <= 26:
        state = 3
    elif Ip[i] <= 32:
        state = 4
    elif Ip[i] <= 38:
        state = 5
    elif Ip[i] <= 44:
        state = 6
    elif Ip[i] <= 50:
        state = 7
    else:
        state = 8

    # greedy action from the learned Q-table
    action = np.where(Q[state,] == np.max(Q[state,]))[1]
    if np.size(action) > 1:
        action = np.random.choice(action, 1)
    action = int(action)

    # map the action index to an order quantity
    if action == 0:
        Qt[i] = Qbase
    elif action == 1:
        Qt[i] = Qbase * 0.95
    elif action == 2:
        Qt[i] = Qbase * 0.9
    elif action == 3:
        Qt[i] = Qbase * 0.85
    elif action == 4:
        Qt[i] = Qbase * 0.8
    elif action == 5:
        Qt[i] = Qbase * 0.75
    elif action == 6:
        Qt[i] = Qbase * 0.7
    elif action == 7:
        Qt[i] = Qbase * 0.65
    elif action == 8:
        Qt[i] = Qbase * 0.6

    sumIp = sumIp + Ip[i]
    sumIm = sumIm + Im[i]

    i += 1

objfunc = h*sumIp + b*sumIm  # total cost of the Q-learning policy over the horizon

print(objfunc)
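# Added sketch (not part of the original paste): matplotlib is imported above but
# never used, so assuming the intent was to inspect the result, the lines below
# print both objective values (base-stock run vs. Q-learning test run) and plot the
# on-hand inventory and backorder trajectories of the test run.
print("Base-stock policy cost:", obj)
print("Q-learning policy cost:", objfunc)

plt.plot(range(T), Ip, label="on-hand inventory (units)")
plt.plot(range(T), Im, label="backorders (units)")
plt.xlabel("day")
plt.ylabel("units")
plt.title("Q-learning test run")
plt.legend()
plt.show()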