import numpy as np
import tensorflow as tf


def train_single_step(self, state, action, reward, next_state, done):
    """
    Train the model on a single transition.
    Arguments:
        state {list} -- the current state of the game
        action {list} -- one-hot encoding of the action that was taken
        reward {float} -- the reward that was received
        next_state {list} -- the next state of the game
        done {bool} -- whether the game is over
    """
    state = tf.convert_to_tensor(state, dtype=tf.float32)
    next_state = tf.convert_to_tensor(next_state, dtype=tf.float32)
    action = tf.convert_to_tensor(action, dtype=tf.int32)
    reward = tf.convert_to_tensor(reward, dtype=tf.float32)

    # Add a batch dimension, because the model expects a batch of data
    state = tf.expand_dims(state, 0)
    next_state = tf.expand_dims(next_state, 0)
    action = tf.expand_dims(action, 0)
    reward = tf.expand_dims(reward, 0)
    done = (done,)

    """
    Remember the Bellman equation:
        Q(s, a) = r + gamma * max(Q(s', a'))
    We use a simplified version:
        Q = model(state0)
        Q_new = reward + gamma * max(Q(state1))
    """
    # Part One: Get the Q values for the current state
    with tf.GradientTape() as tape:
        predicted = self.model(state)  # the predicted Q values
        target = predicted.numpy()     # copy used to build the target Q values

        Q_new = reward[0]  # if the game is over, Q_new is just the reward

        if not done[0]:  # if the game is not over, apply the Bellman equation
            targetState = tf.expand_dims(next_state[0], axis=0)  # add a batch dimension to the next state
            Q_new = reward[0] + self.gamma * tf.reduce_max(self.model(targetState))

        # Update the Q value only for the action that was taken
        target[0][np.argmax(action[0])] = Q_new

        loss = self.loss(target, predicted)

    # Part Two: Calculate the gradients
    gradients = tape.gradient(loss, self.model.trainable_variables)

    # Part Three: Update the weights
    self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))


def train_multiple_steps(self, state, action, reward, next_state, done):
    """
    Train the model on a sequence of transitions.
    Arguments:
        state {list} -- list of game states
        action {list} -- list of one-hot encoded actions that were taken
        reward {list} -- list of rewards that were received
        next_state {list} -- list of next game states
        done {list} -- list of flags indicating whether the game was over
    """
    state = tf.convert_to_tensor(state, dtype=tf.float32)
    next_state = tf.convert_to_tensor(next_state, dtype=tf.float32)
    action = tf.convert_to_tensor(action, dtype=tf.int32)
    reward = tf.convert_to_tensor(reward, dtype=tf.float32)

    """
    Remember the Bellman equation:
        Q(s, a) = r + gamma * max(Q(s', a'))
    We use a simplified version:
        Q = model(state0)
        Q_new = reward + gamma * max(Q(state1))
    """
    # Part One: Get the Q values for the current state
    predictedTotal = []
    targetTotal = []
    for i in range(len(done)):
        with tf.GradientTape() as tape:
            # The input is the states from index 0 to i while i is less than 50,
            # otherwise it is the window of the last 50 states
            curState = state[:i+1] if i < 50 else state[i-49:i+1]
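            # Illustrative indices (not from the original code): at i = 2 the
            # window is state[0:3], i.e. the first three states; at i = 120 it
            # is state[71:121], i.e. only the 50 most recent states.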
            predicted = self.model(curState)  # the predicted Q values
            target = predicted.numpy()        # copy used to build the target Q values

            # If the game is over, Q_new is just the reward
            Q_new = reward[i]

            if not done[i]:  # if the game is not over, apply the Bellman equation
                targetState = tf.expand_dims(next_state[i], axis=0)  # add a time dimension to the target state
                Q_new = reward[i] + self.gamma * tf.reduce_max(self.model(targetState))

            # Update the Q value only for the action that was taken
            target[0][np.argmax(action[i])] = Q_new

            # Calculate the loss
            loss = self.loss(target, predicted)

        # Part Two: Calculate the gradients
        gradients = tape.gradient(loss, self.model.trainable_variables)

        # Part Three: Update the weights
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

    return loss.numpy()
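

# ---------------------------------------------------------------------------
# Minimal usage sketch (an assumption, not part of the original code): the two
# functions above are written as methods, so they expect to be attached to an
# agent object that provides self.model, self.gamma, self.loss and
# self.optimizer. The network shape, state size, action count and gamma below
# are made-up placeholders.
# ---------------------------------------------------------------------------
class Agent:
    def __init__(self, state_size=11, num_actions=3, gamma=0.9):
        self.gamma = gamma
        self.model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="relu", input_shape=(state_size,)),
            tf.keras.layers.Dense(num_actions),
        ])
        self.loss = tf.keras.losses.MeanSquaredError()
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

    # Attach the module-level functions as methods of the class
    train_single_step = train_single_step
    train_multiple_steps = train_multiple_steps


# Example call with dummy data for a single transition
agent = Agent()
agent.train_single_step(
    state=[0.0] * 11,
    action=[1, 0, 0],   # one-hot: the first action was taken
    reward=1.0,
    next_state=[0.1] * 11,
    done=False,
)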