Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import numpy as np
import tensorflow as tf
class Model:
    """Linear Q-value approximator trained by semi-gradient Q-learning.

    The Q-value of a (state, action) pair is a linear function of four
    features: a constant bias, the action index, and the two state
    coordinates.  Weights live in ``self.W`` (a NumPy array of length 4).
    """

    def __init__(self, alpha=0.1, discount=0.9):
        """
        Args:
            alpha: learning rate (step size) for weight updates.
            discount: discount factor applied to the bootstrapped target.
        """
        # Weights: W[0]=bias, W[1]=action, W[2]=state[0], W[3]=state[1].
        self.W = np.random.random(4)
        self.alpha = alpha
        self.discount = discount

    def predict(self, a, y, x, W=None):
        """Return the linear Q-value estimate b*W[0] + a*W[1] + y*W[2] + x*W[3].

        Args:
            a: action index.
            y, x: state coordinates.
            W: optional explicit weight vector (e.g. a watched tf.Tensor
               inside a GradientTape); defaults to the model's weights.
        """
        if W is None:
            W = self.W
        b = 1  # constant bias feature
        return b * W[0] + a * W[1] + y * W[2] + x * W[3]

    def gradient(self, a, y, x):
        """Gradient of the Q-value w.r.t. the weights, as a NumPy array.

        Uses tf.GradientTape for autodiff.  The result is converted back
        to NumPy so that ``self.W`` stays an ndarray after updates — the
        original returned a tf.Tensor, which silently turned ``self.W``
        into a Tensor on the first call to ``updateW``.
        """
        W = tf.convert_to_tensor(self.W)
        with tf.GradientTape() as tape:
            tape.watch(W)
            out = self.predict(a, y, x, W)
        return tape.gradient(out, W).numpy()

    def updateW(self, oldState, action, newState, reward, terminal=False):
        """One semi-gradient Q-learning weight update.

        Args:
            oldState: (y, x) pair of the state the action was taken in.
            action: action index taken in oldState.
            newState: (y, x) pair of the resulting state.
            reward: reward received for the transition.
            terminal: True if newState is terminal (no bootstrapping).
        """
        # Qold and its gradient are needed on both branches; compute once.
        Qold = self.predict(action, oldState[0], oldState[1])
        Qoldgrad = self.gradient(action, oldState[0], oldState[1])
        if terminal:
            # No future value past a terminal state.
            target = reward
        else:
            actionspace = 4
            # Greedy bootstrap: highest Q-value over actions 1..actionspace
            # in the new state (only the max value is used, not the argmax).
            Qnew = max(
                self.predict(a, newState[0], newState[1])
                for a in range(1, actionspace + 1)
            )
            target = reward + self.discount * Qnew
        # Semi-gradient TD update: W += alpha * (target - Qold) * dQ/dW.
        self.W = self.W + self.alpha * (target - Qold) * Qoldgrad
if __name__ == "__main__":
    # Smoke-test: run ten Q-learning updates on one fixed transition
    # (state [3,2], action 1, next state [3,4], reward 0).  Guarded so
    # importing this module no longer triggers training as a side effect.
    model = Model()
    for _ in range(10):
        model.updateW([3, 2], 1, [3, 4], 0)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement