import tensorflow as tf

# Helper functions used below (universeHead, linear, normalized_columns_initializer,
# cosineLoss) and the constants dict are assumed to be provided elsewhere in the
# surrounding codebase.


class Mario_AI(object):
    def __init__(self, ob_space, ac_space, designHead='universe'):

        # Step 1 - Receive the input from the game (s1, s2 - current state and next state)
        input_shape = [None] + list(ob_space)
        self.s1 = phi1 = tf.placeholder(tf.float32, input_shape)
        self.s2 = phi2 = tf.placeholder(tf.float32, input_shape)
        self.asample = asample = tf.placeholder(tf.float32, [None, ac_space])

        # Step 2 - Encode both states with the same convolutional head (shared weights)
        size = 256
        phi1 = universeHead(phi1)
        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
            phi2 = universeHead(phi2)

        # Step 3 - Feed the encoded states to the inverse and forward models

        # Inverse model - Neural Network 1: predicts the action taken between s1 and s2
        g = tf.concat([phi1, phi2], 1)
        g = tf.nn.relu(linear(g, size, "g1", normalized_columns_initializer(0.01)))
        aindex = tf.argmax(asample, axis=1)  # aindex: [batch_size,]
        logits = linear(g, ac_space, "glast", normalized_columns_initializer(0.01))
        self.invloss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=aindex),
            name="invloss")
        self.ainvprobs = tf.nn.softmax(logits, dim=-1)

        # Forward model - Neural Network 2: predicts the encoding of s2 from (phi1, action)
        # Note: no backprop to asample of policy: it is treated as fixed for predictor training
        f = tf.concat([phi1, asample], 1)
        f = tf.nn.relu(linear(f, size, "f1", normalized_columns_initializer(0.01)))
        f = linear(f, phi1.get_shape()[1].value, "flast", normalized_columns_initializer(0.01))
        self.forwardloss = 0.5 * tf.reduce_mean(tf.square(tf.subtract(f, phi2)), name='forwardloss')
        # self.forwardloss = 0.5 * tf.reduce_mean(tf.sqrt(tf.abs(tf.subtract(f, phi2))), name='forwardloss')
        # self.forwardloss = cosineLoss(f, phi2, name='forwardloss')
        self.forwardloss = self.forwardloss * 288.0  # lenFeatures=288. Factored out to make hyperparams not depend on it.

    # Step 4 - Predict the action using the inverse model
    def pred_act(self, s1, s2):
        '''
        returns action probability distribution predicted by inverse model
        input: s1,s2: [h, w, ch]
        output: ainvprobs: [ac_space]
        '''
        sess = tf.get_default_session()
        return sess.run(self.ainvprobs, {self.s1: [s1], self.s2: [s2]})[0, :]

    # Step 5 - Predict the curiosity reward (bonus) from the forward-model error
    def pred_bonus(self, s1, s2, asample):
        '''
        returns bonus predicted by forward model
        input: s1,s2: [h, w, ch], asample: [ac_space] 1-hot encoding
        output: scalar bonus
        '''
        sess = tf.get_default_session()
        # error = sess.run([self.forwardloss, self.invloss],
        #                  {self.s1: [s1], self.s2: [s2], self.asample: [asample]})
        # print('ErrorF: ', error[0], ' ErrorI:', error[1])
        error = sess.run(self.forwardloss,
                         {self.s1: [s1], self.s2: [s2], self.asample: [asample]})
        error = error * constants['PREDICTION_BETA']
        return error