from collections import deque

import numpy as np

# queue for game memory, holding (state, action, reward, next_state, done) tuples
memory = deque(maxlen=1000)

# start an empty queue for stacking frames; maxlen matches stack_size so the
# oldest frame drops off as each new one arrives
stacked_frames = deque([np.zeros((90, 70), dtype=np.uint8) for _ in range(stack_size)], maxlen=stack_size)
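
# stack_frames isn't included in this paste, so here is a minimal sketch of
# what it plausibly does, assuming raw frames are grayscaled and downsampled
# to 90x70 to match the zero-frames above; the preprocess helper below is
# hypothetical, not the author's code.
def preprocess(frame):
    # grayscale by averaging the color channels, then downsample/crop the
    # 210x160 Atari frame to 90x70 with simple slicing
    gray = frame.mean(axis=2).astype(np.uint8)
    return gray[::2, ::2][:90, :70]

def stack_frames(stacked_frames, frame, new_game):
    frame = preprocess(frame)
    if new_game:
        # no history at the start of an episode, so fill the whole queue
        # with copies of the first frame
        for _ in range(stack_size):
            stacked_frames.append(frame)
    else:
        # otherwise push the newest frame; the deque drops the oldest
        stacked_frames.append(frame)
    # stack along a channel axis so the net can see motion across frames
    stacked_state = np.stack(stacked_frames, axis=2)
    return stacked_state, stacked_frames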

# list for the total reward from each episode
rewards_list = []

# decay_step drives epsilon decay: the more steps we've taken so far, the
# less often we move randomly
decay_step = 0
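
# predict_action isn't shown in this paste either. a plausible sketch of an
# epsilon-greedy policy with exponentially decaying exploration follows; the
# explore_start, explore_stop, and decay_rate constants are assumptions, as
# is reading the current (global) state, state_size, and action_size.
explore_start = 1.0   # assumed: initial exploration probability
explore_stop = 0.01   # assumed: exploration floor
decay_rate = 0.0001   # assumed: how fast exploration decays

def predict_action(model, decay_step):
    # epsilon shrinks exponentially as decay_step grows
    epsilon = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)
    if np.random.rand() < epsilon:
        # explore: return a random one-hot action vector
        out = np.zeros(action_size)
        out[np.random.randint(action_size)] = 1
        return out
    # exploit: return the model's Q-values for the current stacked state
    return model.predict(state.reshape(1, *state_size))[0]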

# minibatch size for each training step; play around with this value
batch_size = 20
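
# sampleMemory also isn't in the paste; under the assumption that it's a
# uniform random draw from the replay memory, it's a one-liner on top of
# random.sample.
import random

def sampleMemory(memory, batch_size):
    # pull batch_size distinct transitions uniformly at random
    return random.sample(memory, batch_size)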

# now for the main loop. the episode count here is arbitrary but vitally
# important: it decides how many games we run through. each game is set up
# the usual way, by resetting the environment and zeroing total_reward, with
# one new twist: we also build the initial stacked state by calling
# stack_frames with the new_game parameter set to True
for episode in range(2000):
    state = env.reset()
    total_reward = 0
    state, stacked_frames = stack_frames(stacked_frames, state, True)

    # like previous games, how many moves should we let it make per episode?
    for step in range(1001):

        # if you wish, uncomment the render line to watch the game train
        #env.render()
        # increment decay_step on every move so exploration keeps decaying
        decay_step += 1

        # next, we'll pick an action using the predict_action helper
        # sketched above. it spits out an array of scores (one-hot or
        # Q-values), and Gym needs an int for an action, so we take the
        # argmax of whatever the function returns
        act = np.argmax(predict_action(model, decay_step))

        # now, we'll update the game environment
        obs, reward, done, info = env.step(act)

        # we'll add the reward to our existing total_reward
        total_reward += reward

        # if the game is done, we'll make sure the system knows it's game
        # over: append total_reward to rewards_list and store the transition
        # (stacked state, action taken, reward, new stacked state, done)
        if done:
            # there's no real next frame at game over, so use a blank one
            obs = np.zeros((210, 160, 3))
            obs, stacked_frames = stack_frames(stacked_frames, obs, False)
            rewards_list.append(total_reward)
            memory.append((state, act, reward, obs, done))
            break

        # if the game isn't done, store the transition but don't add
        # total_reward to rewards_list yet
        else:
            obs, stacked_frames = stack_frames(stacked_frames, obs, False)
            memory.append((state, act, reward, obs, done))

        # after all of this, update state to BE the stacked version of obs,
        # and keep on keeping on
        state = obs

    # here's where the learning kicks in. to start, we need to decide when
    # learning should even begin: how many stored memories is a good
    # starting point? i'm arbitrarily starting at 100
    if len(memory) > 100:

        # pull a batch of memories and split out each piece before we use it
        # to structure the training data. note that next_states comes from
        # index 3 of each tuple (the stored next state), not index 0
        batch = sampleMemory(memory, batch_size=batch_size)
        states = np.array([item[0] for item in batch], ndmin=3)
        actions = [item[1] for item in batch]
        rewards = [item[2] for item in batch]
        next_states = np.array([item[3] for item in batch], ndmin=3)

        # build the targets the net will be trained toward: the immediate
        # reward plus the discounted best predicted value of the next state.
        # learning_rate here is acting as the discount factor (usually gamma)
        targets = [learning_rate * np.max(item) for item in model.predict(next_states)]
        targets = [targets[i] + rewards[i] for i in range(len(targets))]

        # create the outputs to fit to: copy the model's current predictions,
        # then overwrite the entry for the action actually taken with its
        # target, so only that action's value gets nudged
        target_f = [item for item in model.predict(states)]
        for i in range(len(target_f)):
            target_f[i][actions[i]] = targets[i]

        # train on the whole batch!
        model.train_on_batch(x=np.array(states).reshape(-1, *state_size),
                             y=np.array(target_f).reshape(-1, action_size))
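
    # optional addition: rewards_list is collected above but never read in
    # this paste, so one simple use is a running average printed every 50
    # episodes (the window size is an arbitrary choice)
    if episode % 50 == 0 and rewards_list:
        recent = rewards_list[-50:]
        print("episode", episode, "- mean reward over last", len(recent), "episodes:", np.mean(recent))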