Advertisement
Guest User

Untitled

a guest
Jan 16th, 2017
67
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.60 KB | None | 0 0
  1. Q_{-1} := Q_0; // Initialization
  2. for k := 0, 1, 2, ..., T−1 do // Main loop
  3.     α_k := 1/(k+1); // Polynomial step size
  4.     for each (x, a) ∈ Z do
  5.         Generate the next state sample y_k ~ P(·|x, a);
  6.         T_k Q_{k−1}(x, a) := r(x, a) + γ M Q_{k−1}(y_k);
  7.         T_k Q_k(x, a) := r(x, a) + γ M Q_k(y_k); // Empirical Bellman operator
  8.         Q_{k+1}(x, a) := Q_k(x, a) + α_k (T_k Q_{k−1}(x, a) − Q_k(x, a)) + (1 − α_k)(T_k Q_k(x, a) − T_k Q_{k−1}(x, a)); // SQL update rule
  9.     end
 10. end
 11. return Q_T
  22.  
  23. previousActionEstimations[action] = ((previousActionEstimations[action]+(learningRate * ( reward + discountFactor * maxNextExpectedReward - previousActionEstimations[action]) )));
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement