# -*- coding: utf-8 -*-
# AI: MDP Value Iteration (Naïve)

from types import ListType
from types import FloatType

# Grid world, one string per row (row 0 is the top row); each character is
# one cell.  Legend as used by the rest of this file:
#   ' '  open cell the agent can move from
#   '#'  blocked cell that can never be entered
#   'G'  cell with reward +100
#   'H'  cell with reward -100
W = [
  list('   G'),
  list(' # H'),
  list('    '),
]

# Per-step penalty: every cell other than 'G' and 'H' has reward -Cost.
Cost = 3

def RewardOfCell(C):
  """Return the immediate reward associated with the cell marker C.

  'G' yields 100, 'H' yields -100, and any other marker yields -Cost.
  """
  if C == 'G':
    return 100
  if C == 'H':
    return -100
  return -Cost

# Headings are numbered 0..3 in the order north, east, south, west.
Dir_North, Dir_East, Dir_South, Dir_West = range(4)

# Every action the agent may choose: one per heading.
ActionSet = [Dir_North, Dir_East, Dir_South, Dir_West]

# (dx, dy) step for each heading, indexed by the heading's number.
# y is the row index, so north (dy = -1) moves towards row 0.
DirDeltas = [
  (0, -1),  # north
  (1, 0),   # east
  (0, 1),   # south
  (-1, 0),  # west
]

def Dim(A):
  """Return the dimensions of the nested list A as a tuple of lengths.

  The shape is found by repeatedly descending into the first element, so
  A is assumed to be rectangular.  A non-list argument yields (); an empty
  list at any level ends the tuple with 0 instead of raising IndexError.
  """
  Result = ()
  Z = A
  while isinstance(Z, list):
    Result += (len(Z),)
    if not Z:
      # Nothing to descend into: an empty list is the innermost dimension.
      break
    Z = Z[0]
  return Result

def NewArrayOfDim(Dim):
  """Return a new nested list of zeros whose shape is given by Dim.

  Dim is a sequence of lengths, outermost dimension first.  An empty Dim
  yields [].  Every sub-list is a distinct object, so writing to one
  row never affects another.
  """
  def Build(Level):
    # Build the sub-array for dimension index Level and everything below it.
    Extent = Dim[Level]
    if Level + 1 < len(Dim):
      return [Build(Level + 1) for _ in range(Extent)]
    return [0] * Extent
  if len(Dim) > 0:
    return Build(0)
  return []

class tEnv (object):
  """Bundle of the parameters that describe one MDP environment.

  World is the grid of cell markers, PForward the probability that a
  chosen action is carried out as intended, and DiscountFactor the
  multiplier applied to the expected value of successor states.
  """
  def __init__(self, World, PForward, DiscountFactor):
    # DiscountFactor: 1 => no cost, 0.9 => 10% penalty
    self.World, self.PForward, self.DiscountFactor = (
      World, PForward, DiscountFactor
    )

def CellAt(Env, State):
  """Return the marker stored in Env.World at State.

  State is indexed as (x, y): State[1] selects the row and State[0]
  the column within that row.
  """
  Column = State[0]
  Row = State[1]
  return Env.World[Row][Column]

def RewardAt(Env, State):
  """Return the immediate reward of the cell that State refers to."""
  Marker = CellAt(Env, State)
  return RewardOfCell(Marker)

def LeftFrom(Dir):
  """Return the heading to the left of Dir (north -> west, east -> north, ...)."""
  LeftTurns = (Dir_West, Dir_North, Dir_East, Dir_South)
  return LeftTurns[Dir]
def RightFrom(Dir):
  """Return the heading to the right of Dir (north -> east, east -> south, ...)."""
  RightTurns = (Dir_East, Dir_South, Dir_West, Dir_North)
  return RightTurns[Dir]
def BackFrom(Dir):
  """Return the heading opposite to Dir (north -> south, east -> west, ...)."""
  Reversals = (Dir_South, Dir_West, Dir_North, Dir_East)
  return Reversals[Dir]

def InitalValueMap(Env):
  """Return a zero-filled value map with the same shape as Env.World.

  The shape is taken from the environment that was passed in rather than
  from the module-level grid, so the map always matches Env.
  """
  return NewArrayOfDim(Dim(Env.World))

def StateAfterAction(Env, State, Action):
  """Return the state reached by applying Action in State with no noise.

  State is an (x, y) pair and Action is an index into DirDeltas.  The
  move is clamped to the bounds of Env.World.  The agent stays in State
  when the target cell is '#', and also when State itself is not an open
  (' ') cell.  The result is always a state, never None.
  """
  # Only open cells can be left; any other cell keeps the agent in place.
  if CellAt(Env, State) != ' ':
    return State
  D = Dim(Env.World)  # D[0] = number of rows, D[1] = number of columns
  Delta = DirDeltas[Action]
  NewState = (
    max(0, min(D[1] - 1, State[0] + Delta[0])),
    max(0, min(D[0] - 1, State[1] + Delta[1]))
  )
  # Blocked cells cannot be entered.
  if CellAt(Env, NewState) == '#':
    return State
  return NewState

def ActionStates_LR(Env, State, Action):
  """List the possible outcomes of Action when it may slip left or right.

  The intended action happens with probability Env.PForward; the rest is
  split evenly between the headings to its left and right.  Returns a
  list of (probability, resulting state, effective action) tuples in the
  order forward, left, right, leaving out outcomes whose probability is
  not positive.  The list is empty when State is not an open cell.
  """
  SlipProbability = (1.0 - Env.PForward) * 0.5
  if CellAt(Env, State) != ' ':
    return []
  # Each candidate is (probability, turn function); None means "no turn".
  Candidates = (
    (Env.PForward, None),
    (SlipProbability, LeftFrom),
    (SlipProbability, RightFrom),
  )
  Outcomes = []
  for Probability, Turn in Candidates:
    if Probability > 0:
      EffectiveAction = Action if Turn is None else Turn(Action)
      Successor = StateAfterAction(Env, State, EffectiveAction)
      Outcomes.append((Probability, Successor, EffectiveAction))
  return Outcomes

def ActionStates_B(Env, State, Action):
  """List the possible outcomes of Action when it may be reversed.

  The intended action happens with probability Env.PForward and the
  opposite heading with the remaining probability.  Returns a list of
  (probability, resulting state, effective action) tuples, forward first,
  leaving out outcomes whose probability is not positive.  The list is
  empty when State is not an open cell.
  """
  ReverseProbability = 1.0 - Env.PForward
  if CellAt(Env, State) != ' ':
    return []
  Outcomes = []
  if Env.PForward > 0:
    Forward = StateAfterAction(Env, State, Action)
    Outcomes.append((Env.PForward, Forward, Action))
  if ReverseProbability > 0:
    Opposite = BackFrom(Action)
    Backward = StateAfterAction(Env, State, Opposite)
    Outcomes.append((ReverseProbability, Backward, Opposite))
  return Outcomes

def ValueForAction(Env, V, State, Action):
  """Return the value of taking Action in State given the value map V.

  This is the reward of State plus Env.DiscountFactor times the expected
  value of the successor states, using the left/right slip model of
  ActionStates_LR.  V is indexed as V[y][x].
  """
  Expected = sum(
    Probability * V[Successor[1]][Successor[0]]
    for Probability, Successor, _ in ActionStates_LR(Env, State, Action)
  )
  return Expected * Env.DiscountFactor + RewardAt(Env, State)

def ValueOfState(Env, V, State):
  """Return the best action value available in State, or None for a wall.

  For every cell other than '#', the result is the maximum of
  ValueForAction over all actions in ActionSet.  For a '#' cell (or if
  ActionSet is empty) the result is None.
  """
  Result = None
  if CellAt(Env, State) != '#':
    for Action in ActionSet:
      ActionValue = ValueForAction(Env, V, State, Action)
      # Identity test for the "nothing seen yet" sentinel.
      if Result is None or ActionValue > Result:
        Result = ActionValue
  return Result

def UpdateValueAt(Env, V, State):
  """Recompute the value of State, store it in V in place, and return it."""
  Column, Row = State
  V[Row][Column] = NewValue = ValueOfState(Env, V, (Column, Row))
  return NewValue

def UpdateValues(Env, V):
  """Perform one sweep over the grid, updating every entry of V in place.

  Cells are visited row by row; each update sees the values already
  written earlier in the same sweep.
  """
  Shape = Dim(Env.World)
  for Row in range(Shape[0]):
    for Column in range(Shape[1]):
      UpdateValueAt(Env, V, (Column, Row))

def PolicyAt(Env, V, State):
  """Return the best action for State as one of 'N', 'E', 'S', 'W'.

  The action with the highest ValueForAction wins; on ties the action
  that comes first in ActionSet is kept.  Cells that are not open (' ')
  have no policy and yield 'X'.
  """
  Result = 'X'
  if CellAt(Env, State) == ' ':
    BestValue = None  # no action evaluated yet
    for Action in ActionSet:
      ActionValue = ValueForAction(Env, V, State, Action)
      if BestValue is None or ActionValue > BestValue:
        BestValue = ActionValue
        # Action numbers 0..3 map onto the letters N, E, S, W.
        Result = 'NESW'[Action]
  return Result

def Policy(Env, V):
  """Return a grid shaped like V holding the PolicyAt result for each cell."""
  Shape = Dim(V)
  return [
    [PolicyAt(Env, V, (Column, Row)) for Column in range(Shape[1])]
    for Row in range(Shape[0])
  ]


def PrintNice(A):
  """Print the two-dimensional list A, one row per line, in bracketed form.

  Floats are shown with four significant digits ("%.4g"); every other
  value is shown via str().  The first line opens the outer bracket and
  the last line closes it.  Each row is printed in full, and an empty A
  prints nothing.

  print is called with a single parenthesised argument, so the function
  produces the same output on Python 2 and Python 3.
  """
  RowCount = len(A)
  for y, Row in enumerate(A):
    Cells = []
    for Value in Row:
      if isinstance(Value, float):
        Cells.append("%.4g" % Value)
      else:
        Cells.append(str(Value))
    PrefixStr = '[' if y == 0 else ' '
    SuffixStr = ']' if y + 1 == RowCount else ''
    print(PrefixStr + "[" + ", ".join(Cells) + "]" + SuffixStr)

# Environment over the grid W: the chosen action is carried out as intended
# with probability 0.8, and the discount factor is 1.0.
Env = tEnv(W, 0.8, 1.0)

# Value map with the grid's shape, initially all zeros.
V = InitalValueMap(Env)

# Run a fixed number of sweeps (500); each sweep updates V in place.
for i in range(500):
  UpdateValues(Env, V)

# Print the resulting value map, then the best action per cell derived from it.
PrintNice(V)
PrintNice(Policy(Env, V))

# 85.18  89.40  93.15   100
# 81.43  #####  68.37  -100
# 77.21  73.46  69.56  47.39