# Untitled

"""Verify how Silver et al.'s Sarsa(lambda) definition works in a very simple toy problem.

The action-value function is linear: q(s, a) = w[a] . f[s], where ``w`` holds
one weight row per action and ``f`` holds one (one-hot) feature row per state.
Two hand-picked transitions are replayed and the weights and eligibility
traces are printed after each one.
"""
import numpy as np

alpha = 0.1  # step size
gamma = 1    # discount factor
lamda = 0.9  # trace-decay rate (lambda)

RIGHT, LEFT, UP, DOWN = (0, 1, 2, 3)

# Eligibility traces, laid out like w: one row per action, one column per feature.
e = np.zeros((4, 3), float)
w = np.array([
    [4, -3, 1],    # weights of right
    [0, 0, 0],     # weights of left
    [2, -2, 5],    # weights of up
    [-1, -1, -1],  # weights of down
])

print('w before:\n', w)

f = np.array([
    [1, 0, 0],  # features of s0
    [0, 1, 0],  # features of s1
    [0, 0, 1],  # features of s2
])


def _sarsa_lambda_step(w, e, s, a, r, next_s, next_a):
    """Apply one backward-view Sarsa(lambda) update for (s, a, r, next_s, next_a).

    Uses the module-level ``alpha``, ``gamma``, ``lamda`` and feature table ``f``.
    The inputs ``w`` and ``e`` are not modified; new arrays are returned.

    Returns:
        A tuple ``(w, e, delta)`` with the updated weights, the updated
        eligibility traces and the TD error of this transition.
    """
    # TD error, computed with the weights as they were before this update.
    delta = r + gamma * w[next_a].dot(f[next_s]) - w[a].dot(f[s])

    # Decay every trace, then accumulate the gradient of q(s, a) w.r.t. w.
    # For q(s, a) = w[a] . f[s] that gradient is f[s] in the row of the action
    # taken (row ``a``, not row ``s``) and zero everywhere else.
    e = gamma * lamda * e
    e[a] = e[a] + f[s]

    # The trace must be refreshed BEFORE the weight update; otherwise the
    # pair that was just visited receives no credit for its own TD error.
    w = w + alpha * delta * e
    return w, e, delta


# the stage is set. agent starts in s0, performs action right, receives r=+1,
# ends in s1, choosing action up
s = 0
a = RIGHT
next_s = 1
r = 1
next_a = UP

# delta should equal -5, so the value of (RIGHT, s0) drops from 4 to 3.5
w, e, delta = _sarsa_lambda_step(w, e, s, a, r, next_s, next_a)

print('w after:\n', w)
print('e:\n', e)


# agent is in s1, performs up, reaches s2, gains r = -2 choosing action right
s = next_s
a = next_a

next_s = 2
r = -2
next_a = RIGHT

print('w before:\n', w)

# delta should be 1: the value of (RIGHT, s0) rises a little bit through its
# decayed trace, and the value of (UP, s1) rises a little bit as well
w, e, delta = _sarsa_lambda_step(w, e, s, a, r, next_s, next_a)

print('w after:\n', w)
print('e:\n', e)
