Advertisement
Guest User

Untitled

a guest
Jun 28th, 2016
67
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.62 KB | None | 0 0
  1. import gym
  2. import numpy as np
  3.  
  4. env = gym.make('CartPole-v0')
  5.  
  6. env.monitor.start('/tmp/cartpole-experiment-1',force=True)
  7.  
  8. #how many big iteration to do - big iteration is updating the means and variance
  9. big_its=20
  10. #how many small iterations to do for a fixed theta distribution
  11. small_its=20
  12. #what fraction of results (sorted by score) to keep
  13. keep=0.2
  14.  
  15. #run this then sample then fit the best 20% and re-do-it
  16. observation = env.reset()
  17. #initialize the parameter distribution
  18. meta_param = {'u':np.random.normal(0, 1, observation.size),'o':np.diag(np.ones(observation.size))}
  19. for bi in xrange(big_its):
  20. #sample the parameter distribution
  21. params = np.random.multivariate_normal(meta_param['u'], meta_param['o'], small_its)
  22. rewards = []
  23. avg_score=0
  24. #run for each small iteration
  25. for i in xrange(small_its):
  26. param=params[i]
  27. reward_total=0
  28. while True:
  29. action = 1 if np.dot(param, observation) > 0 else 0
  30. observation, reward, done, _ = env.step(action)
  31. reward_total += reward
  32. if done:
  33. break
  34. observation = env.reset()
  35. avg_score+=reward_total/float(small_its)
  36. rewards.append((reward_total,i))
  37. rewards.sort()
  38. #update the parameter distribution
  39. tops=np.vstack([ params[i] for x,i in rewards[int(len(rewards)*(1-keep)):] ])
  40. meta_param['u']=tops.mean(0)
  41. meta_param['o']=np.cov(tops,rowvar=0)
  42. print(avg_score)
  43. print(meta_param)
  44. env.monitor.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement