Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import numpy as np
import tensorflow as tf
class Model:
    """Linear Q-value approximator trained by semi-gradient Q-learning.

    The Q-value of a (state, action) pair is a linear function of four
    features: a constant bias, the action index, and the two state
    coordinates.  Weights live in ``self.W`` (a NumPy array of length 4).
    """

    def __init__(self, alpha=0.1, discount=0.9):
        """
        Args:
            alpha: learning rate (step size) for weight updates.
            discount: discount factor applied to the bootstrapped target.
        """
        # Weights: W[0]=bias, W[1]=action, W[2]=state[0], W[3]=state[1].
        self.W = np.random.random(4)
        self.alpha = alpha
        self.discount = discount

    def predict(self, a, y, x, W=None):
        """Return the linear Q-value estimate b*W[0] + a*W[1] + y*W[2] + x*W[3].

        Args:
            a: action index.
            y, x: state coordinates.
            W: optional explicit weight vector (e.g. a watched tf.Tensor
               inside a GradientTape); defaults to the model's weights.
        """
        if W is None:
            W = self.W
        b = 1  # constant bias feature
        return b * W[0] + a * W[1] + y * W[2] + x * W[3]

    def gradient(self, a, y, x):
        """Gradient of the Q-value w.r.t. the weights, as a NumPy array.

        Uses tf.GradientTape for autodiff.  The result is converted back
        to NumPy so that ``self.W`` stays an ndarray after updates — the
        original returned a tf.Tensor, which silently turned ``self.W``
        into a Tensor on the first call to ``updateW``.
        """
        W = tf.convert_to_tensor(self.W)
        with tf.GradientTape() as tape:
            tape.watch(W)
            out = self.predict(a, y, x, W)
        return tape.gradient(out, W).numpy()

    def updateW(self, oldState, action, newState, reward, terminal=False):
        """One semi-gradient Q-learning weight update.

        Args:
            oldState: (y, x) pair of the state the action was taken in.
            action: action index taken in oldState.
            newState: (y, x) pair of the resulting state.
            reward: reward received for the transition.
            terminal: True if newState is terminal (no bootstrapping).
        """
        # Qold and its gradient are needed on both branches; compute once.
        Qold = self.predict(action, oldState[0], oldState[1])
        Qoldgrad = self.gradient(action, oldState[0], oldState[1])
        if terminal:
            # No future value past a terminal state.
            target = reward
        else:
            actionspace = 4
            # Greedy bootstrap: highest Q-value over actions 1..actionspace
            # in the new state (only the max value is used, not the argmax).
            Qnew = max(
                self.predict(a, newState[0], newState[1])
                for a in range(1, actionspace + 1)
            )
            target = reward + self.discount * Qnew
        # Semi-gradient TD update: W += alpha * (target - Qold) * dQ/dW.
        self.W = self.W + self.alpha * (target - Qold) * Qoldgrad
if __name__ == "__main__":
    # Smoke-test: run ten Q-learning updates on one fixed transition
    # (state [3,2], action 1, next state [3,4], reward 0).  Guarded so
    # importing this module no longer triggers training as a side effect.
    model = Model()
    for _ in range(10):
        model.updateW([3, 2], 1, [3, 4], 0)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement