import numpy as np
import tensorflow as tf


def train_single_step(self, state, action, reward, next_state, done):
    """
    Train the model on a single transition.
    Arguments:
        state {list} -- the current state of the game
        action {list} -- one-hot encoding of the action that was taken
        reward {float} -- the reward that was received
        next_state {list} -- the next state of the game
        done {bool} -- whether the game is over
    """
    state = tf.convert_to_tensor(state, dtype=tf.float32)
    next_state = tf.convert_to_tensor(next_state, dtype=tf.float32)
    action = tf.convert_to_tensor(action, dtype=tf.int32)
    reward = tf.convert_to_tensor(reward, dtype=tf.float32)

    # Add a batch dimension, because the model expects a batch of data
    state = tf.expand_dims(state, 0)
    next_state = tf.expand_dims(next_state, 0)
    action = tf.expand_dims(action, 0)
    reward = tf.expand_dims(reward, 0)
    done = (done,)

    """
    Remember the Bellman equation:
        Q(s, a) = r + gamma * max(Q(s', a'))
    We use a simplified version:
        Q = model(state0)
        Q_new = reward + gamma * max(Q(state1))
    """
    # Part One: Get the Q values for the current state
    with tf.GradientTape() as tape:
        predicted = self.model(state)  # the predicted Q values
        target = predicted.numpy()     # copy used to build the target Q values

        Q_new = reward[0]  # if the game is over, Q_new is just the reward

        if not done[0]:  # if the game is not over, apply the Bellman equation
            targetState = tf.expand_dims(next_state[0], axis=0)  # add a batch dimension to the next state
            Q_new = reward[0] + self.gamma * tf.reduce_max(self.model(targetState))

        # Update the Q value only for the action that was taken
        target[0][np.argmax(action[0])] = Q_new

        loss = self.loss(target, predicted)

    # Part Two: Calculate the gradients
    gradients = tape.gradient(loss, self.model.trainable_variables)

    # Part Three: Update the weights
    self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))


def train_multiple_steps(self, state, action, reward, next_state, done):
    """
    Train the model on a sequence of transitions.
    Arguments:
        state {list} -- list of game states
        action {list} -- list of one-hot encoded actions that were taken
        reward {list} -- list of rewards that were received
        next_state {list} -- list of next game states
        done {list} -- list of flags indicating whether the game was over
    """
    state = tf.convert_to_tensor(state, dtype=tf.float32)
    next_state = tf.convert_to_tensor(next_state, dtype=tf.float32)
    action = tf.convert_to_tensor(action, dtype=tf.int32)
    reward = tf.convert_to_tensor(reward, dtype=tf.float32)

    """
    Remember the Bellman equation:
        Q(s, a) = r + gamma * max(Q(s', a'))
    We use a simplified version:
        Q = model(state0)
        Q_new = reward + gamma * max(Q(state1))
    """
    # Part One: Get the Q values for the current state
    predictedTotal = []
    targetTotal = []
    for i in range(len(done)):
        with tf.GradientTape() as tape:
            # The input is the states from index 0 to i while i is less than 50,
            # otherwise it is the window of the last 50 states
            curState = state[:i+1] if i < 50 else state[i-49:i+1]
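            # Illustrative indices (not from the original code): at i = 2 the
            # window is state[0:3], i.e. the first three states; at i = 120 it
            # is state[71:121], i.e. only the 50 most recent states.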
            predicted = self.model(curState)  # the predicted Q values
            target = predicted.numpy()        # copy used to build the target Q values

            # If the game is over, Q_new is just the reward
            Q_new = reward[i]

            if not done[i]:  # if the game is not over, apply the Bellman equation
                targetState = tf.expand_dims(next_state[i], axis=0)  # add a time dimension to the target state
                Q_new = reward[i] + self.gamma * tf.reduce_max(self.model(targetState))

            # Update the Q value only for the action that was taken
            target[0][np.argmax(action[i])] = Q_new

            # Calculate the loss
            loss = self.loss(target, predicted)

        # Part Two: Calculate the gradients
        gradients = tape.gradient(loss, self.model.trainable_variables)

        # Part Three: Update the weights
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

    return loss.numpy()
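

# ---------------------------------------------------------------------------
# Minimal usage sketch (an assumption, not part of the original code): the two
# functions above are written as methods, so they expect to be attached to an
# agent object that provides self.model, self.gamma, self.loss and
# self.optimizer. The network shape, state size, action count and gamma below
# are made-up placeholders.
# ---------------------------------------------------------------------------
class Agent:
    def __init__(self, state_size=11, num_actions=3, gamma=0.9):
        self.gamma = gamma
        self.model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="relu", input_shape=(state_size,)),
            tf.keras.layers.Dense(num_actions),
        ])
        self.loss = tf.keras.losses.MeanSquaredError()
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

    # Attach the module-level functions as methods of the class
    train_single_step = train_single_step
    train_multiple_steps = train_multiple_steps


# Example call with dummy data for a single transition
agent = Agent()
agent.train_single_step(
    state=[0.0] * 11,
    action=[1, 0, 0],   # one-hot: the first action was taken
    reward=1.0,
    next_state=[0.1] * 11,
    done=False,
)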