Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # valueIterationAgents.py
- # -----------------------
- # Licensing Information: You are free to use or extend these projects for
- # educational purposes provided that (1) you do not distribute or publish
- # solutions, (2) you retain this notice, and (3) you provide clear
- # attribution to UC Berkeley, including a link to http://ai.berkeley.edu.
- #
- # Attribution Information: The Pacman AI projects were developed at UC Berkeley.
- # The core projects and autograders were primarily created by John DeNero
- # (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu).
- # Student side autograding was added by Brad Miller, Nick Hay, and
- # Pieter Abbeel (pabbeel@cs.berkeley.edu).
- import mdp, util
- from learningAgents import ValueEstimationAgent
- class ValueIterationAgent(ValueEstimationAgent):
- """
- * Please read learningAgents.py before reading this.*
- A ValueIterationAgent takes a Markov decision process
- (see mdp.py) on initialization and runs value iteration
- for a given number of iterations using the supplied
- discount factor.
- """
- def __init__(self, mdp, discount = 0.9, iterations = 100):
- """
- Your value iteration agent should take an mdp on
- construction, run the indicated number of iterations
- and then act according to the resulting policy.
- Some useful mdp methods you will use:
- mdp.getStates()
- mdp.getPossibleActions(state)
- mdp.getTransitionStatesAndProbs(state, action)
- mdp.getReward(state, action, nextState)
- mdp.isTerminal(state)
- """
- self.mdp = mdp
- self.discount = discount
- self.iterations = iterations
- self.values = util.Counter() # A Counter is a dict with default 0
- # Write value iteration code here
- "*** YOUR CODE HERE ***"
- currIteration = 0
- states = mdp.getStates()
- while currIteration < iterations:
- helpVals = util.Counter()
- for state in states:
- tempAction = self.computeActionFromValues(state)
- if not tempAction == None:
- helpVals[state] = self.computeQValueFromValues(state,tempAction)
- self.values = helpVals
- currIteration += 1
- def getValue(self, state):
- """
- Return the value of the state (computed in __init__).
- """
- return self.values[state]
- def computeQValueFromValues(self, state, action):
- """
- Compute the Q-value of action in state from the
- value function stored in self.values.
- """
- "*** YOUR CODE HERE ***"
- nextStateAndProb = self.mdp.getTransitionStatesAndProbs(state,action)
- qVal = 0
- for nextState, probability in nextStateAndProb:
- qVal += probability*( self.mdp.getReward(state,action,nextState)
- + self.discount*self.values[nextState])
- return qVal
- util.raiseNotDefined()
- def computeActionFromValues(self, state):
- """
- The policy is the best action in the given state
- according to the values currently stored in self.values.
- You may break ties any way you see fit. Note that if
- there are no legal actions, which is the case at the
- terminal state, you should return None.
- """
- "*** YOUR CODE HERE ***"
- if self.mdp.isTerminal(state):
- return None
- possibleActions = self.mdp.getPossibleActions(state)
- maxAction = possibleActions[0]
- qVal = self.computeQValueFromValues(state,maxAction)
- for action in possibleActions:
- tempVal = self.computeQValueFromValues(state,action)
- if tempVal > qVal:
- qVal = tempVal
- maxAction = action
- return maxAction
- util.raiseNotDefined()
- def getPolicy(self, state):
- return self.computeActionFromValues(state)
- def getAction(self, state):
- "Returns the policy at the state (no exploration)."
- return self.computeActionFromValues(state)
- def getQValue(self, state, action):
- return self.computeQValueFromValues(state, action)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement