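# Evolution-strategies-style training of a linear policy on CartPole-v1 (in the spirit of
# parameter-exploring policy gradients): sample parameter vectors from per-parameter Gaussians,
# score each with a full episode, and shift the Gaussian means and spreads toward
# higher-reward samples, using a moving validation-score baseline.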
from __future__ import print_function, division

import logging

import gym
import numpy as np
import matplotlib.pyplot as plt

logging.disable(logging.CRITICAL)
np.seterr('raise')  # raise on floating-point errors instead of warning
plt.ion()           # interactive mode so the figure can be refreshed inside the training loop
problem = 'CartPole-v1'
env = gym.make(problem)

# Separate environment used to score the mean parameter vector; Monitor records the episodes.
validation_env = gym.make(problem)
validation_env = gym.wrappers.Monitor(validation_env, directory='/tmp/es1/4')

n_features = env.observation_space.shape[0]
n_actions = env.action_space.n

P = (n_features * n_actions) + n_actions  # number of parameters in the linear policy (weights + biases)
N = 75                                    # number of parameter samples (histories) per iteration

T = np.zeros((P, N))  # perturbations theta - u, one column per sample
S = np.zeros((P, N))  # per-parameter terms driving the sigma update, one column per sample

alpha = 0.00001       # learning rate for both u and sigma
HISTORY_SIZE = 50     # window of validation scores used for the reward baseline

def softmax(x):
    """Compute softmax values for each set of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()


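# unpack() splits a flat parameter vector into the policy's weight matrix and bias row.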
def unpack(flat_params):
    shapes = [
        (n_features, n_actions),
        (1, n_actions),
        # (hidden_layer_size, output_layer_size),
        # (1, output_layer_size),
    ]
    result = []
    start = 0
    for i, offset in enumerate(np.prod(shape) for shape in shapes):
        result.append(flat_params[start:start + offset].reshape(shapes[i]))
        start += offset
    return result


def sigmoid(x):  # currently unused helper
    return 1 / (1 + np.exp(-x))

def choose_action(a1):
    # Greedy action selection; the stochastic softmax alternative is kept below, disabled.
    return np.argmax(a1)
    # probs = softmax(a1[0])
    # return np.random.choice(np.arange(n_actions), p=probs)


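# The policy itself: a single linear layer followed by tanh, with the action picked by choose_action().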
def model(theta, state):
    w, b = unpack(theta)

    z = state.dot(w) + b
    a1 = np.tanh(z)

    return choose_action(a1)


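# Run one full episode with the given parameter vector and return the cumulative reward.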
def evaluate_policy(theta, env=None):
    if env is None:
        env = gym.make(problem)
    creward = 0
    state = env.reset()

    while True:
        action = model(theta, state)

        new_state, reward, done, _ = env.step(action)
        state = new_state
        creward += reward

        if done:
            return creward


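# Search distribution: each parameter has its own mean u[i] and spread sigma[i];
# b is a moving baseline of recent validation scores.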
u = 0.0
sigma = 0.05
b = 0.0

u = np.repeat(u, P)
assert u.shape == (P,)
sigma = np.repeat(sigma, P)
assert sigma.shape == (P,)

r_history = []      # validation scores (used for the baseline)
val_history = []    # validation score per iteration
mean_history = []   # mean reward of the sampled population per iteration
cross_history = []  # running mean of the last 100 validation scores

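# Main training loop: sample N parameter vectors around u, score each with a rollout,
# then nudge u and sigma along the baseline-subtracted, reward-weighted perturbations.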
for _ in range(350):
    theta = np.zeros((N, P))
    r = np.zeros(N)
    assert theta.shape == (N, P)
    for n in range(N):
        # np.random.normal takes the standard deviation, so pass sigma itself,
        # consistent with the sigma-update terms computed below.
        theta[n, :] = np.random.normal(u, sigma)
        r[n] = evaluate_policy(theta[n])

    # T: perturbations (theta - u); S: per-parameter terms for the sigma update.
    for i in range(P):
        for j in range(N):
            T[i, j] = theta[j, i] - u[i]

    for i in range(P):
        for j in range(N):
            S[i, j] = np.divide(np.square(T[i, j]) - np.square(sigma[i]),
                                sigma[i])

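    # Record the population's average episode reward for plotting.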
    mean_history.append(np.mean(r))

    # Baseline-subtracted returns.
    r = r - b

    # Score the current mean parameter vector on the (monitored) validation environment.
    val_score = evaluate_policy(u, validation_env)
    r_history.append(val_score)

    # Moving-average baseline over recent validation scores.
    b = np.mean(r_history[-HISTORY_SIZE:])

    # Move u and sigma along the reward-weighted perturbation terms.
    u += alpha * np.matmul(T, r)
    sigma += alpha * np.matmul(S, r)

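    # Live plot: validation score, population mean, and the 100-episode validation average.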
    val_history.append(val_score)
    cross_history.append(np.mean(val_history[-100:]))
    print(val_score)
    plt.clf()
    plt.plot(val_history)
    plt.plot(mean_history)
    plt.plot(cross_history)
    plt.legend(['validation', 'population mean', '100-episode validation mean'])
    plt.pause(0.05)

validation_env.close()