Untitled

import numpy as np
from math import *

MAX_NUM_CARS = 20
MAX_MOVE_CARS = 5
RENT_FIRST_LOC = 3
RENT_SECOND_LOC = 4
RETURN_FIRST_LOC = 3
RETURN_SECOND_LOC = 2
DISCOUNT = 0.9
MONEY_PER_CAR = 10
MOVE_CAR_COST = 2

matrixPoisson = dict()
def poisson(n, lambd):
    global matrixPoisson
    key = 10 * n + lambd
    if key not in matrixPoisson.keys():
        matrixPoisson[key] = exp(-lambd) * pow(lambd, n) / factorial(n)
    return matrixPoisson[key]

def cacheProfit(estimated_rented):
    profitRented = []
    for i in range(MAX_NUM_CARS+1):
        profitRented.append(0)

    for car_available in range(MAX_NUM_CARS + 1):
        for car_rented in range(MAX_NUM_CARS + 1):
            profitRented[car_available] += MONEY_PER_CAR * min(car_available, car_rented) * poisson(car_rented, estimated_rented)
    return profitRented

def cacheRentReturnsProbability(estimated_rented, estimated_return):
    probabilityComplexCache = []
    for i in range(MAX_NUM_CARS + 1):
        probabilityComplexCache.append([])
        for j in range(MAX_NUM_CARS + 1):
            probabilityComplexCache[i].append(0)

    for car in range(MAX_NUM_CARS + 1):
        for car_rented in range(MAX_NUM_CARS + 1):
            for car_return in range(MAX_NUM_CARS + 1):
                car_real_rented = min(car, car_rented)
                car_available = max(0, min(MAX_NUM_CARS, car + car_return - car_real_rented))
                probabilityComplexCache[car][car_available] += poisson(car_rented, estimated_rented) * poisson(car_return, estimated_return)
    return probabilityComplexCache


def evaluatePolicy(returnsLoc1, returnsLoc2, rentReturnsLoc1, rentReturnsLoc2, policy):
    valueMatrix = []
    for i in range(MAX_NUM_CARS + 1):
        valueMatrix.append([])
        for j in range(MAX_NUM_CARS + 1):
            valueMatrix[i].append(0)

    delta = 1
    while (delta > 1e-6):
        delta = 0
        for car_loc_1 in range(MAX_NUM_CARS + 1):
            for car_loc_2 in range(MAX_NUM_CARS + 1):
                profit_prev = valueMatrix[car_loc_1][car_loc_2]
                car_moved = int(policy[car_loc_1][car_loc_2])
                profit = -MOVE_CAR_COST * abs(car_moved)
                for car_real_loc_1 in range(MAX_NUM_CARS + 1):
                    for car_real_loc_2 in range(MAX_NUM_CARS + 1):
                        if (car_loc_1 - car_moved >= 0) & (car_loc_2 + car_moved >= 0) & (car_loc_1 - car_moved <= MAX_NUM_CARS) & (car_loc_2 + car_moved <= MAX_NUM_CARS):
                            profit += rentReturnsLoc1[car_loc_1 - car_moved][car_real_loc_1] * rentReturnsLoc2[car_loc_2 + car_moved][car_real_loc_2] * (returnsLoc1[car_loc_1 - car_moved] + returnsLoc2[car_loc_2 + car_moved] + DISCOUNT * valueMatrix[car_real_loc_1][car_real_loc_2])
                valueMatrix[car_loc_1][car_loc_2] = profit
                delta = max(delta, abs(profit_prev - profit))

    return valueMatrix

def improvePolicy(returnsLoc1, returnsLoc2, rentReturnsLoc1, rentReturnsLoc2, valueMatrix):
    policy = []
    for i in range(MAX_NUM_CARS + 1):
        policy.append([])
        for j in range(MAX_NUM_CARS + 1):
            policy[i].append(0)

    for car_loc_1 in range(MAX_NUM_CARS + 1):
        for car_loc_2 in range(MAX_NUM_CARS + 1):
            max_profit = 0
            for car_moved in range(-MAX_MOVE_CARS, MAX_MOVE_CARS+1):
                if (car_loc_1 - car_moved >= 0) & (car_loc_2 + car_moved >= 0) & (car_loc_1 - car_moved <= MAX_NUM_CARS) & (car_loc_2 + car_moved <= MAX_NUM_CARS):
                    profit = -MOVE_CAR_COST * abs(car_moved)
                    for car_real_loc_1 in range(MAX_NUM_CARS + 1):
                        for car_real_loc_2 in range(MAX_NUM_CARS + 1):
                            profit += rentReturnsLoc1[car_loc_1 - car_moved][car_real_loc_1] * rentReturnsLoc2[car_loc_2 + car_moved][car_real_loc_2] * (returnsLoc1[car_loc_1 - car_moved] + returnsLoc2[car_loc_2 + car_moved] + DISCOUNT * valueMatrix[car_real_loc_1][car_real_loc_2])
                    if profit > max_profit:
                        policy[car_loc_1][car_loc_2] = car_moved
                        max_profit = profit
    return policy

def initCache():
    cachedReturnsLoc1 = cacheProfit(RENT_FIRST_LOC)
    cachedReturnsLoc2 = cacheProfit(RENT_SECOND_LOC)

    cachedRentReturnsLoc1 = cacheRentReturnsProbability(RENT_FIRST_LOC, RETURN_FIRST_LOC)
    cachedRentReturnsLoc2 = cacheRentReturnsProbability(RENT_SECOND_LOC, RETURN_SECOND_LOC)

    valueMatrix = []
    for i in range(MAX_NUM_CARS + 1):
        valueMatrix.append([])
        for j in range(MAX_NUM_CARS + 1):
            valueMatrix[i].append(0)

    policy = []
    for i in range(MAX_NUM_CARS + 1):
        policy.append([])
        for j in range(MAX_NUM_CARS + 1):
            policy[i].append(0)

    return cachedReturnsLoc1, cachedReturnsLoc2, cachedRentReturnsLoc1, cachedRentReturnsLoc2, valueMatrix, policy

def main():
    cachedReturnsLoc1, cachedReturnsLoc2, cachedRentReturnsLoc1, cachedRentReturnsLoc2, valueMatrix, policy = initCache()

    while True:
        valueMatrix = evaluatePolicy(cachedReturnsLoc1, cachedReturnsLoc2, cachedRentReturnsLoc1, cachedRentReturnsLoc2, policy)
        newPolicy = improvePolicy(cachedReturnsLoc1, cachedReturnsLoc2, cachedRentReturnsLoc1, cachedRentReturnsLoc2, valueMatrix)
        policyStable = np.sum(newPolicy != policy)
        policy = newPolicy

        if policyStable == 0:
            break

    for i in range(len(policy)):
        for j in range(len(policy[0])):
            print(int(policy[i][j]), end="\t")
        print()

    pass

if __name__ == "__main__":
    main()