import random
import pickle
import sys
from pprint import pprint
import time

import numpy as np

GAMMA = 0.9
GRID = None
class State:
    def __init__(self, evader, pursuer1, pursuer2, grid=None):
        self.evader = evader
        self.pursuer1 = pursuer1
        self.pursuer2 = pursuer2
        if grid is not None:
            self.V = 0
            # key = (evader, pursuer1, pursuer2)
            # self.my_actions = actionsE(grid, key)
            # self.op_actions = actionsP(grid, key)

    def get_my_actions(self):
        key = (self.evader, self.pursuer1, self.pursuer2)
        return actionsE(GRID, key)

    def get_op_actions(self):
        key = (self.evader, self.pursuer1, self.pursuer2)
        return actionsP(GRID, key)

    def getV0(self):
        # Immediate reward from the evader's point of view:
        # -1 when caught, +1 for every step it stays free.
        if self.isEnd():
            return -1
        else:
            return 1

    def isEnd(self):
        return self.evader == self.pursuer1 or self.evader == self.pursuer2

    def getV(self):
        return self.V

    def present(self):
        print("[" + str(self.evader) + "|" + str(self.pursuer1) + "," + str(self.pursuer2) +
              "] V=" + str(self.V))  # + str(self.best_action_evader) + "|" + str(self.best_action_pursuer)
    def computeQ(self, states, myA, opA):
        p1s, p2s = opA
        es = myA
        action = sort_key((es, p1s, p2s))
        s = states[action]  # successor state after the joint move
        return self.getV0() + GAMMA * s.getV()

    def updateV(self, states):
        my_actions = self.get_my_actions()
        op_actions = self.get_op_actions()
        if len(my_actions) == 0:
            self.best_action_evader = (-1, -1)
            return
        if self.isEnd():
            self.V = self.getV0()
            return
        # The evader maximises over its moves; the pursuers minimise over joint moves.
        my_max = -10000
        for m in my_actions:
            op_min = 10000
            for o in op_actions:
                q = self.computeQ(states, m, o)
                if q < op_min:
                    op_min = q
            if op_min > my_max:
                my_max = op_min
        self.V = my_max
    def set_best_action_evader(self, states):
        best_value = -10000
        best_action = (-1, -1)
        for a in self.get_my_actions():
            e = a
            p1, p2 = self.pursuer1, self.pursuer2
            key = sort_key((e, p1, p2))
            s = states[key]
            if s.V > best_value:
                best_value = s.V
                best_action = a
        self.best_action_evader = best_action

    def set_best_action_pursuer(self, states):
        best_value = 10000
        best_action = ((-1, -1), (-1, -1))
        for a in self.get_op_actions():
            p1, p2 = a
            e = self.evader
            key = sort_key((e, p1, p2))
            s = states[key]
            if s.V < best_value:
                best_value = s.V
                best_action = a
        self.best_action_pursuer = best_action
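
# Note on the backup implemented by State.updateV above: it is the minimax
# value-iteration update for the evader playing against both pursuers,
#   V(s) = max over evader moves  min over joint pursuer moves  [ getV0(s) + GAMMA * V(s') ],
# where s' is the successor state looked up through sort_key so that the two
# pursuers are always stored in a canonical order.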
class ValueIteration:
    def __init__(self, grid, n):
        global GRID
        print("ValueIteration")
        GRID = grid
        PIK = "pacman.policy"
        try:
            with open(PIK, "rb") as f:
                print("opening a pickle " + PIK + "...")
                arr = pickle.load(f)
                self.states = self.decompress(arr)
            print("pickle loaded")
        except (OSError, pickle.UnpicklingError, EOFError):
            # No saved policy (or it is unreadable): compute one from scratch.
            self.learn(grid, PIK)
        print("DONE")
    def create_states(self, grid):
        self.states = dict()
        for e in grid.all_nodes():
            sys.stdout.write("\r creating state " + str(e))
            sys.stdout.flush()
            e = (np.int8(e[0]), np.int8(e[1]))
            for p1 in grid.all_nodes():
                for p2 in grid.all_nodes():
                    p1s, p2s = sort(p1, p2)
                    if (e, p1s, p2s) in self.states:
                        continue
                    p1s = (np.int8(p1s[0]), np.int8(p1s[1]))
                    p2s = (np.int8(p2s[0]), np.int8(p2s[1]))
                    self.states[(e, p1s, p2s)] = State(e, p1s, p2s, grid)
        print()
    def learn(self, grid, PIK):
        print("computing a policy")
        self.grid = grid
        print("creating states...")
        self.create_states(grid)
        print("created %d states" % len(self.states))
        start_time = time.time()
        total_time = 60 * 10 * 10000  # generous wall-clock budget, in seconds
        i = 0
        # Run value-iteration sweeps until the time budget runs out or 20 sweeps are done.
        while time.time() - start_time < total_time and i < 20:
            sys.stdout.write(
                "\r updating value, run %4d, %5ds / %5ds" % (i, int(time.time() - start_time), total_time))
            sys.stdout.flush()
            self.updateAllV()
            i += 1
        print()
        # Extract the greedy policy for both players from the computed values.
        for s in self.states.values():
            s.set_best_action_evader(self.states)
            s.set_best_action_pursuer(self.states)
        print("compressing...")
        data = self.compress()
        with open(PIK, "wb") as f:
            pickle.dump(data, f)
        print("policy computed and saved")
    def updateAllV(self):
        # Sweep over all states in a random order and back up each value once.
        keys = list(self.states.keys())
        random.shuffle(keys)
        for key in keys:
            s = self.states[key]
            s.updateV(self.states)
        del keys
    def get_best_action_evader(self, e, p1, p2):
        key = sort_key((e, p1, p2))
        return self.states[key].best_action_evader

    def get_best_action_pursuer(self, e, my_p, p2):
        key = sort_key((e, my_p, p2))
        a1, a2 = self.states[key].best_action_pursuer
        # The pursuers are stored in canonical (sorted) order, so return the
        # action that belongs to the calling pursuer's position.
        if my_p == key[1]:
            return a1
        else:
            return a2
    def compress(self):
        # Pack each state into one int8 row: the state key (6 ints), the best
        # evader action (2 ints) and the best pursuer joint action (4 ints).
        new_arr = np.zeros((len(self.states), 12), dtype=np.int8)
        for s in self.states.values():
            # Free memory; only the best actions are still needed below.
            del s.evader
            del s.pursuer1
            del s.pursuer2
            del s.V
        i = 0
        for k in self.states.keys():
            s = self.states[k]
            a = [k[0][0], k[0][1],
                 k[1][0], k[1][1],
                 k[2][0], k[2][1],
                 s.best_action_evader[0], s.best_action_evader[1],
                 s.best_action_pursuer[0][0], s.best_action_pursuer[0][1],
                 s.best_action_pursuer[1][0], s.best_action_pursuer[1][1]]
            new_arr[i, :] = a
            i += 1
        return new_arr
    def decompress(self, arr):
        print("decompressing...")
        states = dict()
        (n, _) = arr.shape
        arr = arr.astype(int)
        print("n = " + str(n))
        for i in range(n):
            s = arr[i, :]
            ex, ey = s[0], s[1]
            p1x, p1y, p2x, p2y = s[2], s[3], s[4], s[5]
            bex, bey = s[6], s[7]
            bp1x, bp1y, bp2x, bp2y = s[8], s[9], s[10], s[11]
            state = State((ex, ey), (p1x, p1y), (p2x, p2y))
            state.best_action_evader = (bex, bey)
            state.best_action_pursuer = ((bp1x, bp1y), (bp2x, bp2y))
            states[((ex, ey), (p1x, p1y), (p2x, p2y))] = state
        print("decompressed")
        return states
def sort(id1, id2):
    # Order two grid positions lexicographically so that pursuer pairs get a
    # canonical ordering.
    x1, y1 = id1
    x2, y2 = id2
    if x1 < x2 or (x1 == x2 and y1 < y2):
        return id1, id2
    else:
        return id2, id1


def sort_key(key):
    e, p1, p2 = key
    p1s, p2s = sort(p1, p2)
    return e, p1s, p2s


def actions(grid, s0):
    # All passable 4-neighbour moves from s0.
    a = grid.neighbors4(s0)
    return [s for s in a if grid.passable(s)]


def actionsE(grid, key):
    # Evader actions: passable moves that do not step onto a pursuer's position.
    e, p1, p2 = key
    return [s for s in actions(grid, e) if s != p1 and s != p2]


def actionsP(grid, key):
    # Pursuer actions: all possible joint (pursuer1, pursuer2) move combinations.
    _, p1, p2 = key
    a = []
    for a1 in actions(grid, p1):
        for a2 in actions(grid, p2):
            a.append((a1, a2))
    return a
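
# Minimal usage sketch. Assumption: a grid object exposing all_nodes(),
# neighbors4(pos) and passable(pos), as used above; the SquareGrid stub below
# is hypothetical and only illustrates that expected interface.
if __name__ == "__main__":
    class SquareGrid:
        def __init__(self, width, height, walls=()):
            self.width = width
            self.height = height
            self.walls = set(walls)

        def all_nodes(self):
            return [(x, y) for x in range(self.width) for y in range(self.height)
                    if (x, y) not in self.walls]

        def neighbors4(self, pos):
            x, y = pos
            return [(x + 1, y), (x - 1, y), (x, y + 1), (x, y - 1)]

        def passable(self, pos):
            x, y = pos
            return 0 <= x < self.width and 0 <= y < self.height and (x, y) not in self.walls

    # Computes (or loads) the policy, then queries it for one configuration.
    vi = ValueIteration(SquareGrid(4, 4), 0)
    print(vi.get_best_action_evader((0, 0), (3, 3), (2, 3)))
    print(vi.get_best_action_pursuer((0, 0), (3, 3), (2, 3)))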