Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""
Verifying how Silver et al's Sarsa definition works in a very simple toy problem.

The action-value function is linear: q(s, a) = w[a] . f[s], with one weight
row per action and one feature column per state feature.  Two hand-picked
transitions are pushed through the backward-view Sarsa(lambda) update and the
weights and eligibility traces are printed after each one.
"""
import numpy as np

alpha = 0.1  # step size
gamma = 1    # discount factor
lamda = 0.9  # trace-decay rate ("lambda" is a reserved word in Python)

RIGHT, LEFT, UP, DOWN = (0, 1, 2, 3)


def sarsa_lambda_update(w, e, f, s, a, r, next_s, next_a, *,
                        step_size=alpha, discount=gamma, trace_decay=lamda):
    """Apply one backward-view Sarsa(lambda) update for a linear q-function.

    Args:
        w: weight matrix, one row per action, one column per feature.
        e: eligibility traces, same shape as ``w``.
        f: feature matrix, row ``i`` holds the features of state ``i``.
        s, a: state index and action index that were just executed.
        r: reward received for the transition.
        next_s, next_a: successor state index and the action chosen there.
        step_size, discount, trace_decay: alpha, gamma and lambda; they
            default to the module-level constants.

    Returns:
        ``(new_w, new_e, delta)``.  The input arrays are not modified.
    """
    # TD error, computed with the weights as they were before this update.
    delta = r + discount * w[next_a].dot(f[next_s]) - w[a].dot(f[s])

    # The trace must be decayed and accumulated BEFORE the weights move,
    # otherwise the pair that was just visited gets no credit for its own
    # TD error.  The gradient of q(s, a) = w[a] . f[s] with respect to w is
    # f[s] in row `a` and zero elsewhere, so the trace is indexed by the
    # action, not by the state.
    e = discount * trace_decay * e  # creates a new array; caller's e is untouched
    e[a] += f[s]

    # Out-of-place addition so an integer-typed `w` is promoted to float
    # instead of raising on an in-place cast.
    w = w + step_size * delta * e
    return w, e, delta


e = np.zeros((4, 3), float)
w = np.array([
    [4, -3, 1],    # weights of right
    [0, 0, 0],     # weights of left
    [2, -2, 5],    # weights of up
    [-1, -1, -1],  # weights of down
])
print('w before:\n', w)
f = np.array([
    [1, 0, 0],  # features of s0
    [0, 1, 0],  # features of s1
    [0, 0, 1],  # features of s2
])

# the stage is set. agent starts in s0, performs action right, receives r=+1,
# ends in s1, choosing action up
s = 0
a = RIGHT
next_s = 1
r = 1
next_a = UP
w, e, delta = sarsa_lambda_update(w, e, f, s, a, r, next_s, next_a)
print('delta:', delta)  # should equal -5
print('w after:\n', w)  # the value of (RIGHT, s0) drops from 4 to 3.5
print('e:\n', e)

# agent is in s1, performs up, reaches s2, gains r = -2 choosing action right
s = next_s
a = next_a
next_s = 2
r = -2
next_a = RIGHT
print('w before:\n', w)
w, e, delta = sarsa_lambda_update(w, e, f, s, a, r, next_s, next_a)
print('delta:', delta)  # should be 1
# value of (RIGHT, s0) rises a little bit through its decayed trace
# (3.5 -> 3.59); value of (UP, s1) rises a little bit as well (-2 -> -1.9)
print('w after:\n', w)
print('e:\n', e)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement