import keras
from keras.models import Sequential
from keras.layers import Dense, Activation

import numpy as np
import random
from collections import deque

import gym
from gym import wrappers

env = gym.make('MountainCar-v0')
# env = wrappers.Monitor(env, 'experiment', force=True)

episodes = 1000000  # number of training 'episodes' to run
memory_size = 20000  # number of frames to store in memory
batch_size = 50  # number of training frames to grab from memory and train on

discount_factor = 0.98  # the q-learning future discount parameter

inputs = env.observation_space.shape[0]

# Set up our NN and training environment in keras
model = Sequential()
# model.add(Dense(units=50, input_dim=env.observation_space.shape[0],
#                 kernel_initializer='zeros'))
model.add(Dense(units=64*4, input_dim=env.observation_space.shape[0]))
model.add(Activation('tanh'))
model.add(Dense(units=64*4))
model.add(Activation('tanh'))
model.add(Dense(units=env.action_space.n))
model.add(Activation('linear'))

model.compile(loss=keras.losses.mean_squared_error,
              optimizer=keras.optimizers.RMSprop(lr=0.001))

# note: the 'models' directory must already exist, or save() will raise an error
model.save('models/0.h5')
# store experiences as a list of (s, a, r, s', done) tuples
experiences = deque([], maxlen=memory_size)
experience_weights = deque([], maxlen=memory_size)
weight_sum = 0

ep_lens = []  # store the length of each episode
steps = 1  # total number of frames seen across all episodes
e = 1  # exploration rate (epsilon)
epsilon_decay = 0.995


def train_on_batch(model, batch, batch_size=batch_size):
    states = np.array([b[0] for b in batch])
    next_states = np.array([b[3] for b in batch])
    # feed our samples through the network to get our
    # predictions, which once updated with the target q vals will be
    # our targets
    outs = model.predict_on_batch(states)
    targets = np.copy(outs)
    # now, using the Q-value formula, come up with our list of
    # 'better' outputs
    next_qs = model.predict_on_batch(next_states)
    for i in range(batch_size):
        # now, set the target output for the action taken to be the
        # updated Q val
        max_q = max(next_qs[i])
        targets[i][batch[i][1]] = (batch[i][2]
                                   + (discount_factor * max_q
                                      if not batch[i][4] else 0))
    model.train_on_batch(states, targets)


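# train_on_batch above applies the standard one-step Q-learning target to each
# sampled transition (s, a, r, s', done):
#     target[a] = r + discount_factor * max(Q(s', a') over a')
# with the future term dropped when done is True; every other action's output
# is left at the network's own prediction, so only the taken action's Q-value
# is pulled toward the bootstrapped target.

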
def batch_weighted_selection(items, weights, weight_sum, num_selections):
    selection_numbers = sorted([random.randint(0, weight_sum-1) for i in range(num_selections)])
    selections = []
    running_weight_sum = 0
    for i in range(len(items)):
        running_weight_sum += weights[i]
        while selection_numbers[0] <= running_weight_sum:
            selections.append(items[i])
            selection_numbers = selection_numbers[1:]
            if not selection_numbers:
                return selections


episode_reward = -200
train = False
saved = False
for n in range(episodes):
    # e = -episode_reward/200
    # e = 1/((episode_reward+201.))
    e *= epsilon_decay
    # store the total reward and survival time for this episode
    episode_reward = 0
    episode_survival = 0

    # first observation
    observation = env.reset()
    start_point = observation[0]
    episode_experiences = []

    while True:
        episode_survival += 1

        action = None
        outputs = model.predict_on_batch(np.array([observation]))[0]
        # print(outputs)
        # epsilon-greedy: explore with probability e, otherwise act greedily
        if random.uniform(0, 1) < e:
            action = env.action_space.sample()
        else:
            action = np.argmax(outputs)
        new_observation, reward, done, info = env.step(action)
        episode_experiences.append(
            [observation, action, reward, new_observation, done])
        # reward = (1 if done and episode_survival < 200 else
        #           -1+abs(observation[0]-start_point)+abs(observation[1]))
        reward = 1 if done else reward
        observation = new_observation
        episode_reward += reward
        steps += 1
        if len(experiences) < 2*batch_size or n < 5:
            pass
        else:
            # selections = batch_weighted_selection(experiences, experience_weights, weight_sum, batch_size)
            selections = [random.choice(experiences) for i in range(batch_size)]
            train_on_batch(model, selections)
        if done:
            break

    # ep_weight = 201-episode_survival
    # for i in range(episode_survival):
    #     if len(experience_weights) >= memory_size:
    #         weight_sum -= experience_weights.popleft()
    #     experience_weights.append(ep_weight)
    # weight_sum += episode_survival*(201-episode_survival)
    experiences += episode_experiences

    ep_lens.append(episode_survival)
    print("%s,%s,%s,%s" % (n, episode_reward, episode_survival, e))
    if episode_survival < 100 and not saved:
        saved = True
        model.save('models/a.h5')
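

# ----------------------------------------------------------------------------
# A minimal evaluation sketch, not part of the training loop above: it assumes
# the run lasted long enough for 'models/a.h5' to be written, reloads that
# checkpoint, and plays one fully greedy episode (no exploration) on the same
# MountainCar-v0 env.
# ----------------------------------------------------------------------------
eval_model = keras.models.load_model('models/a.h5')
obs = env.reset()
done = False
total_reward = 0
while not done:
    # pick the action with the highest predicted Q-value
    q_values = eval_model.predict_on_batch(np.array([obs]))[0]
    obs, reward, done, info = env.step(np.argmax(q_values))
    total_reward += reward
print("evaluation reward: %s" % total_reward)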