Not a member of Pastebin yet? Sign up — it unlocks many cool features!
def Q_0(sa):
    """Initial Q-function for iteration 0: every state-action pair gets value 1."""
    return [1]


Q = Q_0
iterations = 6

# Fitted Q-Iteration: each pass regresses one-step Bellman targets
# (reward + greedy next-state value) onto state-action feature vectors.
for i in range(iterations):
    X, Y = [], []
    for sample in T.reshape((n * (m + 1), m + 1)):
        # sample layout appears to be (state, action, reward, next_state) -- TODO confirm
        a_best, v_best = best_action(Q, sample[3], offers)
        X.append(np.append(sample[0], sample[1]))  # feature = state-action pair
        Y.append(sample[2] + v_best)               # target = reward + value
    regr = RandomForestRegressor(max_depth=4, random_state=0, n_estimators=10)
    regr.fit(X, np.ravel(Y))
    Q = regr.predict  # next iteration queries the freshly fitted forest
# find the optimal action under a greedy policy and corresponding state value
def best_action(Q, state, actions):
    """Return the greedy action under Q for ``state`` and that action's value.

    Q       -- callable mapping a list of feature vectors to a list of values
               (e.g. a regressor's ``predict``).
    state   -- array-like state; each action is appended to it to form the
               state-action feature vector Q expects.
    actions -- iterable of candidate actions.

    Returns (a_best, v_best). If ``actions`` is empty, returns the fallback
    (0, -inf).

    NOTE(review): in the pasted script this function is defined *after* the
    loop that calls it -- confirm the real file defines it first.
    """
    # Start from -inf, not 0: with a 0 floor, any state whose Q-values are all
    # non-positive (common with negative rewards) would wrongly report (0, 0).
    v_best = float("-inf")
    a_best = 0  # fallback retained from the original for the empty-actions case
    for a in actions:
        v = Q([np.append(state, a)])[0]
        if v > v_best:
            v_best = v
            a_best = a
    return a_best, v_best
Advertisement
Add Comment
Please sign in to add a comment.