Algorithm: Speedy Q-Learning (SQL)

Q_{-1} := Q_0;                                               // Initialization
for k := 0, 1, 2, ..., T-1 do                                // Main loop
    α_k := 1/(k+1);
    for each (x, a) ∈ Z do
        Generate the next-state sample y_k ~ P(·|x, a);
        T_k Q_{k-1}(x, a) := r(x, a) + γ M Q_{k-1}(y_k);
        T_k Q_k(x, a)     := r(x, a) + γ M Q_k(y_k);         // Empirical Bellman operator
        Q_{k+1}(x, a) := Q_k(x, a)
                         + α_k [ T_k Q_{k-1}(x, a) − Q_k(x, a) ]
                         + (1 − α_k) [ T_k Q_k(x, a) − T_k Q_{k-1}(x, a) ];   // SQL update rule
    end
end
return Q_T
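The loop above can be sketched in NumPy. This is a minimal illustration, not a reference implementation: it assumes the finite MDP is given explicitly as a transition tensor `P` of shape (S, A, S) and a reward matrix `r` of shape (S, A), with the state-action set Z being all (x, a) pairs; the function name and parameters are illustrative. `MQ(y) = max_b Q(y, b)` is implemented as a row-wise max.

```python
import numpy as np

def speedy_q_learning(P, r, gamma, T, seed=None):
    """Speedy Q-Learning on a finite MDP (illustrative sketch).

    P: transition tensor, shape (S, A, S), P[x, a, y] = P(y | x, a)
    r: reward matrix, shape (S, A)
    gamma: discount factor in [0, 1)
    T: number of iterations
    """
    rng = np.random.default_rng(seed)
    S, A = r.shape
    Q_prev = np.zeros((S, A))   # Q_{k-1}, initialized to Q_0
    Q = np.zeros((S, A))        # Q_k
    for k in range(T):
        alpha = 1.0 / (k + 1)
        Q_next = np.empty((S, A))
        for x in range(S):
            for a in range(A):
                # Draw one next-state sample y_k ~ P(. | x, a)
                y = rng.choice(S, p=P[x, a])
                # Empirical Bellman operator applied to Q_{k-1} and Q_k
                t_prev = r[x, a] + gamma * Q_prev[y].max()
                t_curr = r[x, a] + gamma * Q[y].max()
                # SQL update rule
                Q_next[x, a] = (Q[x, a]
                                + alpha * (t_prev - Q[x, a])
                                + (1 - alpha) * (t_curr - t_prev))
        Q_prev, Q = Q, Q_next   # shift: Q_k becomes Q_{k-1}
    return Q
```

On a deterministic single-state MDP with r = 1 and γ = 0.5, the returned estimate approaches the true value Q* = r/(1 − γ) = 2 as T grows.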
previousActionEstimations[action] += learningRate * (reward + discountFactor * maxNextExpectedReward - previousActionEstimations[action]); // standard Q-learning update rule
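For comparison with the SQL rule above, the one-line array update is the standard Q-learning step Q(x, a) += α (r + γ max_b Q(y, b) − Q(x, a)). A minimal sketch of that single step, with illustrative names:

```python
def q_update(q, reward, max_next_q, learning_rate, discount_factor):
    """One standard Q-learning update for a single (state, action) estimate.

    q            : current estimate Q(x, a)
    reward       : observed reward r
    max_next_q   : max_b Q(y, b) at the sampled next state y
    Returns the updated estimate.
    """
    return q + learning_rate * (reward + discount_factor * max_next_q - q)
```

With a zero initial estimate, reward 1, and no future value, a learning rate of 0.5 moves the estimate halfway toward the target: `q_update(0.0, 1.0, 0.0, 0.5, 0.9)` returns 0.5.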