from collections import deque

import numpy as np

# queue for game memory, holding (state, action, reward, next_state, done) tuples
memory = deque(maxlen=1000)

# start an empty queue for stacking frames; maxlen matches stack_size so the
# oldest frame drops off as each new one arrives
stacked_frames = deque([np.zeros((90, 70), dtype=np.uint8) for _ in range(stack_size)], maxlen=stack_size)
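
# stack_frames isn't included in this paste, so here is a minimal sketch of
# what it plausibly does, assuming raw frames are grayscaled and downsampled
# to 90x70 to match the zero-frames above; the preprocess helper below is
# hypothetical, not the author's code.
def preprocess(frame):
    # grayscale by averaging the color channels, then downsample/crop the
    # 210x160 Atari frame to 90x70 with simple slicing
    gray = frame.mean(axis=2).astype(np.uint8)
    return gray[::2, ::2][:90, :70]

def stack_frames(stacked_frames, frame, new_game):
    frame = preprocess(frame)
    if new_game:
        # no history at the start of an episode, so fill the whole queue
        # with copies of the first frame
        for _ in range(stack_size):
            stacked_frames.append(frame)
    else:
        # otherwise push the newest frame; the deque drops the oldest
        stacked_frames.append(frame)
    # stack along a channel axis so the net can see motion across frames
    stacked_state = np.stack(stacked_frames, axis=2)
    return stacked_state, stacked_frames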

# list for the total reward from each episode
rewards_list = []

# decay_step drives epsilon decay: the more steps we've taken so far, the
# less often we move randomly
decay_step = 0
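
# predict_action isn't shown in this paste either. a plausible sketch of an
# epsilon-greedy policy with exponentially decaying exploration follows; the
# explore_start, explore_stop, and decay_rate constants are assumptions, as
# is reading the current (global) state, state_size, and action_size.
explore_start = 1.0   # assumed: initial exploration probability
explore_stop = 0.01   # assumed: exploration floor
decay_rate = 0.0001   # assumed: how fast exploration decays

def predict_action(model, decay_step):
    # epsilon shrinks exponentially as decay_step grows
    epsilon = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)
    if np.random.rand() < epsilon:
        # explore: return a random one-hot action vector
        out = np.zeros(action_size)
        out[np.random.randint(action_size)] = 1
        return out
    # exploit: return the model's Q-values for the current stacked state
    return model.predict(state.reshape(1, *state_size))[0]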

# minibatch size for each training step; play around with this value
batch_size = 20
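
# sampleMemory also isn't in the paste; under the assumption that it's a
# uniform random draw from the replay memory, it's a one-liner on top of
# random.sample.
import random

def sampleMemory(memory, batch_size):
    # pull batch_size distinct transitions uniformly at random
    return random.sample(memory, batch_size)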

# now for the main loop. the episode count here is arbitrary but vitally
# important: it decides how many games we run through. each game is set up
# the usual way, by resetting the environment and zeroing total_reward, with
# one new twist: we also build the initial stacked state by calling
# stack_frames with the new_game parameter set to True
for episode in range(2000):
    state = env.reset()
    total_reward = 0
    state, stacked_frames = stack_frames(stacked_frames, state, True)

    # like previous games, how many moves should we let it make per episode?
    for step in range(1001):

        # if you wish, uncomment the render line to watch the game train
        #env.render()
        # increment decay_step on every move so exploration keeps decaying
        decay_step += 1

        # next, we'll pick an action using the predict_action helper
        # sketched above. it spits out an array of scores (one-hot or
        # Q-values), and Gym needs an int for an action, so we take the
        # argmax of whatever the function returns
        act = np.argmax(predict_action(model, decay_step))

        # now, we'll update the game environment
        obs, reward, done, info = env.step(act)

        # we'll add the reward to our existing total_reward
        total_reward += reward

        # if the game is done, we'll make sure the system knows it's game
        # over: append total_reward to rewards_list and store the transition
        # (stacked state, action taken, reward, new stacked state, done)
        if done:
            # there's no real next frame at game over, so use a blank one
            obs = np.zeros((210, 160, 3))
            obs, stacked_frames = stack_frames(stacked_frames, obs, False)
            rewards_list.append(total_reward)
            memory.append((state, act, reward, obs, done))
            break

        # if the game isn't done, store the transition but don't add
        # total_reward to rewards_list yet
        else:
            obs, stacked_frames = stack_frames(stacked_frames, obs, False)
            memory.append((state, act, reward, obs, done))

        # after all of this, update state to BE the stacked version of obs,
        # and keep on keeping on
        state = obs

    # here's where the learning kicks in. to start, we need to decide when
    # learning should even begin: how many stored memories is a good
    # starting point? i'm arbitrarily starting at 100
    if len(memory) > 100:

        # pull a batch of memories and split out each piece before we use it
        # to structure the training data. note that next_states comes from
        # index 3 of each tuple (the stored next state), not index 0
        batch = sampleMemory(memory, batch_size=batch_size)
        states = np.array([item[0] for item in batch], ndmin=3)
        actions = [item[1] for item in batch]
        rewards = [item[2] for item in batch]
        next_states = np.array([item[3] for item in batch], ndmin=3)

        # build the targets the net will be trained toward: the immediate
        # reward plus the discounted best predicted value of the next state.
        # learning_rate here is acting as the discount factor (usually gamma)
        targets = [learning_rate * np.max(item) for item in model.predict(next_states)]
        targets = [targets[i] + rewards[i] for i in range(len(targets))]

        # create the outputs to fit to: copy the model's current predictions,
        # then overwrite the entry for the action actually taken with its
        # target, so only that action's value gets nudged
        target_f = [item for item in model.predict(states)]
        for i in range(len(target_f)):
            target_f[i][actions[i]] = targets[i]

        # train on the whole batch!
        model.train_on_batch(x=np.array(states).reshape(-1, *state_size),
                             y=np.array(target_f).reshape(-1, action_size))
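
    # optional addition: rewards_list is collected above but never read in
    # this paste, so one simple use is a running average printed every 50
    # episodes (the window size is an arbitrary choice)
    if episode % 50 == 0 and rewards_list:
        recent = rewards_list[-50:]
        print("episode", episode, "- mean reward over last", len(recent), "episodes:", np.mean(recent))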