Untitled

def __init__(self, mdp, discount = 0.9, iterations = 20):
        """
          Run value iteration on ``mdp`` at construction time and cache
          the resulting state values and greedy policy.

          Every sweep is a batch update: the new value of each state is
          computed from the values produced by the previous sweep, and
          the value table is swapped only once the sweep has finished.

          mdp:        the decision process. This class relies on
                          mdp.getStates()
                          mdp.getPossibleActions(state)
                          mdp.getTransitionStatesAndProbs(state, action)
                          mdp.getReward(state, action, nextState)
                          mdp.isTerminal(state)
          discount:   factor applied to the value of successor states.
          iterations: number of sweeps to run.

          After construction:
              self.values   maps state -> value after the last sweep
              self.actions  maps state -> best action under self.values,
                            or None for a state with no legal actions
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        self.actions = util.Counter()
        states = self.mdp.getStates()

        for _ in range(self.iterations):
            # Collect the new values separately so that every Q-value in
            # this sweep is computed from the previous sweep's values.
            next_values = util.Counter()
            for state in states:
                legal = list(self.mdp.getPossibleActions(state))
                if legal:
                    # A true max: no sentinel that a very negative
                    # Q-value could fall below.
                    next_values[state] = max(
                        self.computeQValueFromValues(state, action)
                        for action in legal
                    )
                # States without legal actions keep the Counter default 0.
            self.values = next_values

        # Build the policy cache from the FINAL values so that it agrees
        # with self.values. max() keeps the first maximal element, so ties
        # go to the action listed first.
        for state in states:
            legal = list(self.mdp.getPossibleActions(state))
            if legal:
                self.actions[state] = max(
                    legal,
                    key=lambda action: self.computeQValueFromValues(state, action),
                )
            else:
                self.actions[state] = None


    def getValue(self, state):
        """
          Look up the value currently stored for ``state`` in
          self.values and return it.
        """
        value_table = self.values
        return value_table[state]


    def computeQValueFromValues(self, state, action):
        """
          Return the Q-value of taking ``action`` in ``state`` under the
          value function held in self.values: the probability-weighted
          sum, over every possible successor, of the immediate reward
          plus the discounted value of that successor.
        """
        "*** YOUR CODE HERE ***"
        mdp = self.mdp
        gamma = self.discount
        total = 0
        # Each outcome is indexed as (next state, transition probability).
        for outcome in mdp.getTransitionStatesAndProbs(state, action):
            successor = outcome[0]
            likelihood = outcome[1]
            reward = mdp.getReward(state, action, successor)
            total = total + likelihood * (reward + gamma * self.getValue(successor))

        return total

    def computeActionFromValues(self, state):
        """
          Return the best action in ``state`` according to the values
          currently stored in self.values, i.e. the legal action with
          the highest Q-value.

          Ties go to the action that mdp.getPossibleActions lists first.
          Returns None when ``state`` is terminal or has no legal
          actions.
        """
        if self.mdp.isTerminal(state):
            return None

        legal = list(self.mdp.getPossibleActions(state))
        if not legal:
            return None

        # Evaluate against the current values rather than a cached choice,
        # so the answer always reflects what is in self.values right now.
        # max() keeps the first maximal element, which settles ties.
        return max(
            legal,
            key=lambda action: self.computeQValueFromValues(state, action),
        )

    def getPolicy(self, state):
        """
          Return the policy's action for ``state``; delegates to
          computeActionFromValues.
        """
        chosen = self.computeActionFromValues(state)
        return chosen

    def getAction(self, state):
        """
          Return the action the policy picks in ``state`` (no
          exploration); delegates to computeActionFromValues.
        """
        chosen = self.computeActionFromValues(state)
        return chosen

    def getQValue(self, state, action):
        """
          Return the Q-value of ``action`` in ``state``; delegates to
          computeQValueFromValues.
        """
        q_value = self.computeQValueFromValues(state, action)
        return q_value