Not a member of Pastebin yet? Sign up — it unlocks many cool features!
def Q_0(sa):
    """Initial Q-function for iteration 0: every state-action pair gets value 1."""
    return [1]


Q = Q_0
iterations = 6

# Fitted Q-Iteration: each pass regresses one-step Bellman targets
# (reward + greedy next-state value) onto state-action feature vectors.
for i in range(iterations):
    X, Y = [], []
    for sample in T.reshape((n * (m + 1), m + 1)):
        # sample layout appears to be (state, action, reward, next_state) -- TODO confirm
        a_best, v_best = best_action(Q, sample[3], offers)
        X.append(np.append(sample[0], sample[1]))  # feature = state-action pair
        Y.append(sample[2] + v_best)               # target = reward + value
    regr = RandomForestRegressor(max_depth=4, random_state=0, n_estimators=10)
    regr.fit(X, np.ravel(Y))
    Q = regr.predict  # next iteration queries the freshly fitted forest
# find the optimal action under a greedy policy and corresponding state value
def best_action(Q, state, actions):
    """Return the greedy action under Q for ``state`` and that action's value.

    Q       -- callable mapping a list of feature vectors to a list of values
               (e.g. a regressor's ``predict``).
    state   -- array-like state; each action is appended to it to form the
               state-action feature vector Q expects.
    actions -- iterable of candidate actions.

    Returns (a_best, v_best). If ``actions`` is empty, returns the fallback
    (0, -inf).

    NOTE(review): in the pasted script this function is defined *after* the
    loop that calls it -- confirm the real file defines it first.
    """
    # Start from -inf, not 0: with a 0 floor, any state whose Q-values are all
    # non-positive (common with negative rewards) would wrongly report (0, 0).
    v_best = float("-inf")
    a_best = 0  # fallback retained from the original for the empty-actions case
    for a in actions:
        v = Q([np.append(state, a)])[0]
        if v > v_best:
            v_best = v
            a_best = a
    return a_best, v_best
Advertisement
Add Comment
Please sign in to add a comment.