import numpy as np
import tensorflow as tf


def train_single_step(self, state, action, reward, next_state, done):
    """
    Train the model on a single transition (one step of Q-learning).

    Arguments:
        state {list} -- the current state of the game
        action {list} -- the (one-hot) action that was taken
        reward {int} -- the reward that was received
        next_state {list} -- the next state of the game
        done {bool} -- whether the game is over
    """
    state = tf.convert_to_tensor(state, dtype=tf.float32)
    next_state = tf.convert_to_tensor(next_state, dtype=tf.float32)
    action = tf.convert_to_tensor(action, dtype=tf.int32)
    reward = tf.convert_to_tensor(reward, dtype=tf.float32)
    # Add a leading dimension because the model expects a batch of data
    state = tf.expand_dims(state, 0)
    next_state = tf.expand_dims(next_state, 0)
    action = tf.expand_dims(action, 0)
    reward = tf.expand_dims(reward, 0)
    done = (done,)
    """
    Remember the Bellman equation:
        Q(s, a) = r + gamma * max(Q(s', a'))
    We use a simplified version:
        Q = model(state0)
        Q_new = reward + gamma * max(Q(state1))
    """
    # Part One: get the Q values for the current state
    with tf.GradientTape() as tape:
        predicted = self.model(state)  # predicted Q values
        target = predicted.numpy()     # target Q values (detached copy)
        Q_new = reward[0]  # if the game is over, Q_new is just the reward
        if not done[0]:  # otherwise apply the Bellman update
            targetState = tf.expand_dims(next_state[0], axis=0)  # re-add the leading dimension for the next state
            Q_new = reward[0] + self.gamma * tf.reduce_max(self.model(targetState))  # Bellman equation
        target[0][np.argmax(action[0])] = Q_new  # update the Q value only for the action that was taken
        loss = self.loss(target, predicted)
    # Part Two: calculate the gradients
    gradients = tape.gradient(loss, self.model.trainable_variables)
    # Part Three: update the weights
    self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
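
# --- Worked example of the Bellman target (added illustration; not from the original paste) ---
# Suppose gamma = 0.9, the step reward is 10, the game is not over, and the
# model's Q values for next_state are [1.0, 4.0, 2.0]. Then:
#     Q_new = reward + gamma * max(Q(next_state))
#           = 10 + 0.9 * 4.0
#           = 13.6
# Only the entry of `target` for the action that was actually taken is
# overwritten with Q_new; the other entries keep the model's own predictions,
# so the loss only nudges the Q value of the chosen action.
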
def train_multiple_steps(self, state, action, reward, next_state, done):
    """
    Train the model on a sequence of transitions.

    Arguments:
        state {list} -- list of game states
        action {list} -- list of the (one-hot) actions that were taken
        reward {list} -- list of the rewards that were received
        next_state {list} -- list of the next states of the game
        done {list} -- list of flags indicating whether the game was over
    """
    state = tf.convert_to_tensor(state, dtype=tf.float32)
    next_state = tf.convert_to_tensor(next_state, dtype=tf.float32)
    action = tf.convert_to_tensor(action, dtype=tf.int32)
    reward = tf.convert_to_tensor(reward, dtype=tf.float32)
    """
    Remember the Bellman equation:
        Q(s, a) = r + gamma * max(Q(s', a'))
    We use a simplified version:
        Q = model(state0)
        Q_new = reward + gamma * max(Q(state1))
    """
    # Part One: get the Q values for the current state
    for i in range(len(done)):
        with tf.GradientTape() as tape:
            # The current input is the states from index 0 to i if i is less than 50,
            # otherwise a sliding window of the most recent 50 states
            curState = state[:i + 1] if i < 50 else state[i - 49:i + 1]
            predicted = self.model(curState)  # predicted Q values
            target = predicted.numpy()        # target Q values (detached copy)
            Q_new = reward[i]  # if the game is over, Q_new is just the reward
            if not done[i]:  # otherwise apply the Bellman update
                targetState = tf.expand_dims(next_state[i], axis=0)  # add a leading dimension so next_state[i] matches the model's input shape
                Q_new = reward[i] + self.gamma * tf.reduce_max(self.model(targetState))  # Bellman equation
            target[-1][np.argmax(action[i])] = Q_new  # update the Q value of the action taken in the most recent state of the window
            # Calculate the loss
            loss = self.loss(target, predicted)
        # Part Two: calculate the gradients
        gradients = tape.gradient(loss, self.model.trainable_variables)
        # Part Three: update the weights
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
    return loss.numpy()
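
# --- Usage sketch (added illustration; not part of the original paste) ---
# The two functions above read self.model, self.gamma, self.loss and
# self.optimizer, so they presumably live on some trainer/agent class in the
# original project. The QTrainer class below, the 11-input / 3-output network,
# and the one-hot action are assumptions made purely for this sketch.

class QTrainer:
    def __init__(self, gamma=0.9, learning_rate=0.001):
        self.gamma = gamma  # discount factor used in the Bellman target
        self.model = tf.keras.Sequential([
            tf.keras.Input(shape=(11,)),
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(3),  # one Q value per action
        ])
        self.loss = tf.keras.losses.MeanSquaredError()
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)


trainer = QTrainer()
# The defs above are shown without their class here, so pass the trainer
# explicitly as `self`.
train_single_step(
    trainer,
    state=[0.0] * 11,        # current observation (hypothetical 11-value state)
    action=[1, 0, 0],        # one-hot encoding of the action that was taken
    reward=10,
    next_state=[0.0] * 11,   # observation after the move
    done=False,
)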