Advertisement
Guest User

Untitled

a guest
Jul 20th, 2017
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.69 KB | None | 0 0
  1. #####
  2. # Monte Carlo
  3. #####
  4. import gym
  5. from gym.wrappers.monitoring import Monitor
  6. from random import randint
  7. import numpy as np
  8.  
  9. def learn(episodeCount):
  10. for i_episode in xrange(episodeCount):
  11. obs = env.reset()
  12. episodeStatesActions = []
  13. totalRewards = 0
  14. for t in range(200):
  15. action = policy(obs, i_episode)
  16. obs, reward, done, _ = env.step(action['action'])
  17. episodeStatesActions.append(action)
  18. totalRewards += reward
  19. if done:
  20. for i, a in enumerate(episodeStatesActions):
  21. updatePolicy(a, totalRewards-i)
  22. break
  23.  
  24.  
  25. def actionExists(obs, action):
  26. for a in history[obs]:
  27. if actionAreEqual(a['action'], action['action']):
  28. return True
  29. return False
  30.  
  31.  
  32. def actionAreEqual(action1, action2):
  33. return action1[0] == action2[0] and action1[1] == action2[1] and action1[2] == action2[2]
  34.  
  35.  
  36. def updatePolicy(action, G):
  37. action['value'] = (action['value'] * action['count'] + G) / (action['count'] + 1)
  38. action['count'] += 1
  39.  
  40.  
  41. def policy(obs, i_episode):
  42. # Create obs in history if it does not exist
  43. if obs not in history: history[obs] = []
  44. # Get the most valued action
  45. maxValueAction = None
  46. for a in history[obs]:
  47. if maxValueAction == None or a['value'] > maxValueAction['value']:
  48. maxValueAction = a
  49. # If no maxValueAction or if it's time for exploration, return random action
  50. if maxValueAction == None or randint(0, 100) < 20*0.99**i_episode:
  51. randomAction = {'value': 0, 'action': env.action_space.sample(), 'count': 0}
  52. if not actionExists(obs, randomAction):
  53. history[obs].append(randomAction)
  54. return randomAction
  55. else:
  56. return maxValueAction
  57.  
  58.  
  59. nbEpisodes = 100000
  60.  
  61. env = gym.make('Copy-v0')
  62. env = Monitor(env, 'tmp/copy', force=True)
  63.  
  64. history = {}
  65. learn(nbEpisodes)
  66.  
  67. env.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement