Advertisement
Guest User

Untitled

a guest
May 23rd, 2019
119
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.88 KB | None | 0 0
  1. def Q_0(sa):
  2. return [1]
  3.  
  4. Q = Q_0
  5. iterations = 6
  6. for i in range(iterations): # FQI iterations
  7. X = []
  8. Y = []
  9. for sample in T.reshape((n * (m + 1), m + 1)):
  10. x = np.append(sample[0], sample[1]) # feature vector consists of state-action pairs
  11.  
  12. a_best, v_best = best_action(Q, sample[3], offers)
  13. y = sample[2] + v_best # reward + value
  14.  
  15. X.append(x)
  16. Y.append(y)
  17.  
  18. regr = RandomForestRegressor(max_depth=4, random_state=0, n_estimators=10)
  19. regr.fit(X, np.ravel(Y))
  20. Q = regr.predict
  21.  
  22. # find the optimal action under a greedy policy and corresponding state value
  23. def best_action(Q, state, actions):
  24. v_best = 0
  25. a_best = 0
  26. for a in actions:
  27. v = Q([np.append(state, a)])[0]
  28. if(v > v_best):
  29. v_best = v
  30. a_best = a
  31.  
  32. return a_best, v_best
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement