VoxelEngine/labirinth_ai/Subject.py

import random
import numpy as np
import tensorflow as tf
from tensorflow import keras

from labirinth_ai.LabyrinthWorld import LabyrinthWorld
from labirinth_ai.Models.EvolutionModel import EvolutionModel
from labirinth_ai.loss import loss2, loss3
from labirinth_ai.Models.BaseModel import BaseModel, train, create_optimizer, device, from_numpy

# import torch
# dtype = torch.float
# device = torch.device("cpu")


class Subject:
    """Baseline labyrinth agent that steps onto a randomly chosen open neighbour.

    Class attributes hold display/meta data (``name``, ``col``, the ``r``/``g``/``b``
    colour components), the ``random`` flag, and ``num``, a counter used to hand
    out instance ids.
    """
    name = 'random'
    col = 8
    num = 0
    random = True
    r = 255
    g = 255
    b = 255

    def __init__(self, x, y):
        """Create a living subject at board cell ``(x, y)``."""
        self.x = x
        self.y = y
        self.alive = True
        self.kills = 0
        self.lives = 1
        self.tick = 0

        # Use the current counter value as id, then advance the counter on Subject.
        self.id = self.num
        Subject.num += 1

    def update(self, world: LabyrinthWorld):
        """Move one cell in a random open direction, killing whoever stands there.

        A direction is open when the neighbouring cell is on the board and its
        ``world.board`` value is non-zero. Nothing happens when no direction is
        open or when this subject is not alive.
        """
        # 0, 0 is top left
        px, py = self.x, self.y

        # Probe order (left, right, up, down) determines the indices used below.
        options = []
        if px - 1 >= 0 and world.board[px - 1, py] != 0:
            options.append((-1, 0))
        if px + 1 < world.board_shape[0] and world.board[px + 1, py] != 0:
            options.append((1, 0))
        if py - 1 >= 0 and world.board[px, py - 1] != 0:
            options.append((0, -1))
        if py + 1 < world.board_shape[1] and world.board[px, py + 1] != 0:
            options.append((0, 1))

        if not options or not self.alive:
            return

        # Only draw a random number when there is an actual choice to make.
        if len(options) > 1:
            step = options[random.randint(0, len(options) - 1)]
        else:
            step = options[0]

        destination = (px + step[0], py + step[1])
        for occupant in world.subjectDict[destination]:
            # Only occupants that were still alive count as kills.
            if occupant.alive:
                self.kills += 1
            occupant.alive = False
            self.alive = True

        # Leave the old cell (marking the trail) and register on the new one.
        world.subjectDict[(px, py)].remove(self)
        world.trailMix[px, py] += 1
        self.x = destination[0]
        self.y = destination[1]
        world.subjectDict[destination].append(self)

    def respawnUpdate(self, x, y, world: LabyrinthWorld):
        """Re-register this subject at ``(x, y)``, revive it and count one more life."""
        occupancy = world.subjectDict
        occupancy[(self.x, self.y)].remove(self)
        self.x, self.y = x, y
        occupancy[(x, y)].append(self)
        self.lives += 1
        self.alive = True


class QLearner(Subject):
    """Tabular Q-learning subject.

    States are the string form of a small occupancy grid around the agent (see
    ``createState``); actions are the step tuples ``(dx, dy)``.
    """
    name = 'QLearner'
    col = 14
    learningRate = 0.25
    discountFactor = 0.5
    random = False

    # Class-level on purpose: a single Q-table (state key -> {action: value}) that
    # every QLearner instance reads and writes.
    Q = {}

    def __init__(self, x, y):
        """Create the learner at ``(x, y)`` with an empty transition memory."""
        super(QLearner, self).__init__(x, y)
        self.viewD = 3
        self.lastAction = None
        self.lastState = None
        self.lastReward = 0

    def respawnUpdate(self, x, y, world: LabyrinthWorld):
        """Respawn at ``(x, y)`` and charge a penalty of 20 to the pending reward."""
        super(QLearner, self).respawnUpdate(x, y, world)
        self.lastReward -= 20

    def createState(self, world: LabyrinthWorld):
        """Return a ``(2*viewD+1, 2*viewD+1)`` integer grid of nearby subjects.

        The agent sits at the centre. Every subject in ``world.subjects`` whose
        x and y offsets are both strictly smaller than ``viewD`` bumps its cell
        via ``value * 100 + 1``, so stacked subjects yield 1, 101, 10101, ...
        """
        size = 2 * self.viewD + 1
        # Builtin int replaces the np.int alias, which NumPy removed in 1.24.
        state = np.zeros((size, size), int)

        for sub in world.subjects:
            dx = sub.x - self.x
            dy = sub.y - self.y
            if abs(dx) < self.viewD and abs(dy) < self.viewD:
                cell = (self.viewD + dx, self.viewD + dy)
                if state[cell] != 3:
                    state[cell] = state[cell] * 100 + 1  # sub.col

        return state

    def _open_directions(self, world):
        """Return the steps (left, right, up, down order) onto non-zero board cells."""
        steps = []
        if self.x - 1 >= 0 and world.board[self.x - 1, self.y] != 0:
            steps.append((-1, 0))
        if self.x + 1 < world.board_shape[0] and world.board[self.x + 1, self.y] != 0:
            steps.append((1, 0))
        if self.y - 1 >= 0 and world.board[self.x, self.y - 1] != 0:
            steps.append((0, -1))
        if self.y + 1 < world.board_shape[1] and world.board[self.x, self.y + 1] != 0:
            steps.append((0, 1))
        return steps

    def update(self, world: LabyrinthWorld):
        """Pick the greedy action, learn from the previous transition, then move.

        Nothing happens when no direction is open or the learner is not alive.
        Unseen (state, action) pairs are initialised with ``random.randint(0, 5)``.
        """
        # 0, 0 is top left
        directions = self._open_directions(world)
        if not directions or not self.alive:
            return

        state = self.createState(world)
        key = str(state)

        q_values = self.Q.setdefault(key, {})
        for direction in directions:
            if direction not in q_values:
                q_values[direction] = random.randint(0, 5)

        allowedActions = {act: val for act, val in q_values.items() if act in directions}
        action = max(allowedActions, key=allowedActions.get)

        # Q-learning backup for the PREVIOUS transition: the reward collected since
        # the last decision belongs to (lastState, lastAction), and the bootstrap
        # term is the best value reachable from the state we are in now.
        if self.learningRate != 0 and self.lastState is not None:
            previous = self.Q[str(self.lastState)]
            previous[self.lastAction] = (
                (1 - self.learningRate) * previous[self.lastAction]
                + self.learningRate * (self.lastReward
                                       + self.discountFactor * allowedActions[action])
            )

        self.lastAction = action
        self.lastState = state
        self.lastReward = 0

        target = (self.x + action[0], self.y + action[1])
        for sub in world.subjectDict[target]:
            if sub.alive:
                self.kills += 1
            sub.alive = False
            self.alive = True
            # NOTE(review): the reward is granted for every occupant, alive or not.
            self.lastReward += 10

        world.subjectDict[(self.x, self.y)].remove(self)
        self.x += action[0]
        self.y += action[1]
        world.subjectDict[(self.x, self.y)].append(self)


class DoubleQLearner(QLearner):
    """Double Q-learning variant that keeps two tables, ``QA`` and ``QB``.

    The action is chosen greedily on the sum of both tables; on each learning
    step one table (picked by a coin flip) is updated using the other table's
    estimate for the follow-up state.
    """
    name = 'DoubleQLearner'
    col = 11
    learningRate = 0.5
    discountFactor = 0.5
    random = False

    # Class-level on purpose: both tables are shared by all instances.
    QA = {}
    QB = {}

    def __init__(self, x, y):
        """Create the learner at ``(x, y)`` with an empty transition memory."""
        super(DoubleQLearner, self).__init__(x, y)
        self.viewD = 3
        self.lastAction = None
        self.lastState = None
        self.lastReward = 0

    def respawnUpdate(self, x, y, world: LabyrinthWorld):
        """Respawn at ``(x, y)``; defers entirely to the parent implementation."""
        super(DoubleQLearner, self).respawnUpdate(x, y, world)

    def update(self, world: LabyrinthWorld):
        """Pick the greedy action, learn from the previous transition, then move.

        Nothing happens when no direction is open or the learner is not alive.
        Unseen (state, action) pairs are initialised with ``random.randint(0, 5)``
        in both tables.
        """
        # 0, 0 is top left; probe order is left, right, up, down.
        directions = []
        if self.x - 1 >= 0 and world.board[self.x - 1, self.y] != 0:
            directions.append((-1, 0))
        if self.x + 1 < world.board_shape[0] and world.board[self.x + 1, self.y] != 0:
            directions.append((1, 0))
        if self.y - 1 >= 0 and world.board[self.x, self.y - 1] != 0:
            directions.append((0, -1))
        if self.y + 1 < world.board_shape[1] and world.board[self.x, self.y + 1] != 0:
            directions.append((0, 1))

        if not directions or not self.alive:
            return

        state = self.createState(world)
        key = str(state)

        if key not in self.QA:
            self.QA[key] = {}
            self.QB[key] = {}
        for direction in directions:
            if direction not in self.QA[key]:
                self.QA[key][direction] = random.randint(0, 5)
                self.QB[key][direction] = random.randint(0, 5)

        allowedActionsA = {act: val for act, val in self.QA[key].items() if act in directions}
        allowedActionsB = {act: val for act, val in self.QB[key].items() if act in directions}
        allowedActions = {act: allowedActionsA[act] + allowedActionsB[act]
                          for act in allowedActionsA}

        actionA = max(allowedActionsA, key=allowedActionsA.get)
        actionB = max(allowedActionsB, key=allowedActionsB.get)
        action = max(allowedActions, key=allowedActions.get)

        # Double-Q backup for the PREVIOUS transition: the pending reward belongs to
        # (lastState, lastAction); the bootstrap value comes from the other table,
        # evaluated at the current state for the updated table's greedy action.
        if self.learningRate != 0 and self.lastState is not None:
            prev_key = str(self.lastState)
            prev_action = self.lastAction
            if random.randint(0, 1) == 0:
                valA = self.QA[prev_key][prev_action]
                self.QA[prev_key][prev_action] = valA + self.learningRate * (
                    self.lastReward + self.discountFactor * self.QB[key][actionA] - valA)
            else:
                valB = self.QB[prev_key][prev_action]
                self.QB[prev_key][prev_action] = valB + self.learningRate * (
                    self.lastReward + self.discountFactor * self.QA[key][actionB] - valB)

        self.lastAction = action
        self.lastState = state
        self.lastReward = 0

        target = (self.x + action[0], self.y + action[1])
        for sub in world.subjectDict[target]:
            if sub.alive:
                self.kills += 1
            sub.alive = False
            self.alive = True
            self.lastReward += 10

        world.subjectDict[(self.x, self.y)].remove(self)
        self.x += action[0]
        self.y += action[1]
        world.subjectDict[(self.x, self.y)].append(self)


# Switch read by NetLearner.generateSamples: when True, the value of the follow-up
# state is recomputed with the current model while building training targets;
# when False, the value stored in the sample history is reused.
RECALCULATE = False
class NetLearner(Subject):
    """Subject whose moves are scored by a neural model (``EvolutionModel``).

    A step is split in two calls: ``calculateAction`` chooses ``self.action``
    (greedy on the model output, or through ``randomAct`` while ``self.random``
    is set) and ``update`` records the previous transition, triggers training
    once enough samples were collected and finally delegates the move to
    ``executeAction`` (a no-op here; subclasses implement it).
    """
    right = (1, 0)
    left = (-1, 0)
    up = (0, -1)
    down = (0, 1)
    # Index of each step tuple in the model's per-action outputs.
    act2IDict = {right: 0, left: 1, up: 2, down: 3}

    name = 'NetLearner'
    col = 15
    viewD = 3            # the observed window is (2*viewD+1) cells wide
    historyLength = 2    # consecutive transitions stored per training sample
    channels = 4

    learningRate = 0.001
    discountFactor = 0.5
    randomBuffer = 0
    batchsize = 1000
    randomBuffer = max(4*batchsize, randomBuffer)
    randomChance = 9

    historySizeMul = 20  # train() keeps at most historySizeMul * batchsize samples

    def __init__(self, x, y, genes=None, genotype_class=None):
        """Create the learner at ``(x, y)``.

        ``genes`` and ``genotype_class`` are passed through to ``EvolutionModel``.
        """
        super(NetLearner, self).__init__(x, y)

        self.action = None
        self.state = None
        self.actDict = {}

        self.history = []
        self.lastAction = None
        self.lastState = None
        self.lastReward = 0
        self.lastVal = 0
        self.lastRewards = []
        self.accumulated_rewards = 0
        self.strikes = 0
        self.nextTrain = self.randomBuffer

        self.samples = []

        self.x_in = []
        self.actions = []
        self.target = []

        # self.model = BaseModel(self.viewD, 4, 4).to(device)
        self.model = EvolutionModel(self.viewD, 4, 4, genes=genes, genotype_class=genotype_class).to(device)
        self.optimizer = create_optimizer(self.model)

        # The sample buffer starts empty, so this is True whenever randomBuffer > 0.
        # NOTE(review): calculateAction clears the flag again while strikes <= 0.
        self.random = len(self.samples) < self.randomBuffer

    def respawnUpdate(self, x, y, world: LabyrinthWorld):
        """Respawn at ``(x, y)``, re-decide random vs. model-driven mode, reset strikes."""
        super(NetLearner, self).respawnUpdate(x, y, world)

        # randint(0, 10) is inclusive, so with randomChance = 9 only a roll of 10
        # forces random mode once the buffer is filled.
        self.random = len(self.samples) < self.randomBuffer or random.randint(0, 10) > self.randomChance
        self.strikes = 0

    def visualize(self):
        """Plot, per model output, the input pattern obtained by back-projecting a
        one-hot output through the weights returned by ``self.model.get_weights()``.

        NOTE(review): assumes ``get_weights()`` returns a list of arrays laid out as
        indexed by ``layersN`` below - confirm against the model class in use.
        """
        import matplotlib.pyplot as plt

        print(self.name)
        layers = self.model.get_weights()
        layersN = [[0, 1, 8, 9, 16], [2, 3, 10, 11, 17], [4, 5, 12, 13, 18], [6, 7, 14, 15, 19]]
        size = 2 * self.viewD + 1
        # right, left, up, down
        direction_names = {0: 'right', 1: 'left', 2: 'up', 3: 'down'}

        for action in range(8):
            v = np.zeros((1, 2))
            v[0][0 if action < 4 else 1] = 1.0
            # Walk the branch of this action from the output back to the input.
            for n in reversed(layersN[action % 4]):
                weights = layers[n]
                if len(weights.shape) == 2:
                    v = np.dot(v, np.transpose(weights))
                else:
                    v = v + np.array([weights])
            # The last two entries belong to the previous-action inputs; drop them.
            v = np.reshape(v[0, :-2], (4, size, size))

            # 0-3 current, 4-7 future
            when = 'current ' if action < 4 else 'future '

            fig, axs = plt.subplots(2, 2, figsize=(5, 5))
            fig.suptitle(when + direction_names[action % 4])
            panels = (('board', v[0]), ('subjects', v[1]), ('trail', v[2]), ('grass', v[3]))
            for ax, (title, layer) in zip(axs.flat, panels):
                # Each colorbar is tied to its own plot so its scale matches the data.
                mappable = ax.pcolor(np.rot90(layer))
                fig.colorbar(mappable, ax=ax)
                ax.set_title(title)
            plt.show(block=True)

    def createState(self, world: LabyrinthWorld):
        """Return the model input, shape ``(1, 4 * (2*viewD+1)**2 + 2)``.

        Four ``(2*viewD+1)`` square channels centred on the agent are flattened
        in this order: ``world.board``, subject occupancy (1 where
        ``world.subjectDict`` has an entry), ``world.trailMix`` and
        ``world.hunter_grass``. Cells outside the board stay 0. The last action
        (``[0, 0]`` when there is none) is appended at the end.
        """
        size = 2 * self.viewD + 1

        # How far the window reaches in each direction before hitting the border.
        maxdirleft = self.x - max(self.x - self.viewD, 0)
        maxdirright = min(self.x + self.viewD, world.board_shape[0] - 1) - self.x
        maxdirup = self.y - max(self.y - self.viewD, 0)
        maxdirdown = min(self.y + self.viewD, world.board_shape[1] - 1) - self.y

        # The reach values are inclusive, hence the +1 on every slice end.
        view_x = slice(self.viewD - maxdirleft, self.viewD + maxdirright + 1)
        view_y = slice(self.viewD - maxdirup, self.viewD + maxdirdown + 1)
        world_x = slice(self.x - maxdirleft, self.x + maxdirright + 1)
        world_y = slice(self.y - maxdirup, self.y + maxdirdown + 1)

        # Builtin float replaces the np.float alias, which NumPy removed in 1.24.
        board = np.zeros((size, size), float)
        subjects = np.zeros((size, size), float)
        trail = np.zeros((size, size), float)
        grass = np.zeros((size, size), float)

        board[view_x, view_y] = world.board[world_x, world_y]
        trail[view_x, view_y] = world.trailMix[world_x, world_y]
        grass[view_x, view_y] = world.hunter_grass[world_x, world_y]

        for dx in range(-maxdirleft, maxdirright + 1):
            for dy in range(-maxdirup, maxdirdown + 1):
                if world.subjectDict[(self.x + dx, self.y + dy)] != []:
                    # Centre-relative index keeps this channel aligned with the others.
                    subjects[self.viewD + dx, self.viewD + dy] = 1

        area = np.stack((board, subjects, trail, grass)).reshape(-1)
        action = [0, 0]
        if self.lastAction is not None:
            action = self.lastAction
        return np.concatenate((area, action)).reshape(1, -1)

    def generate_valid_directions(self, world: LabyrinthWorld):
        """Return the steps (left, right, up, down order) onto non-zero board cells."""
        directions = []
        if self.x - 1 >= 0 and world.board[self.x - 1, self.y] != 0:
            directions.append(self.left)
        if self.x + 1 < world.board_shape[0] and world.board[self.x + 1, self.y] != 0:
            directions.append(self.right)
        if self.y - 1 >= 0 and world.board[self.x, self.y - 1] != 0:
            directions.append(self.up)
        if self.y + 1 < world.board_shape[1] and world.board[self.x, self.y + 1] != 0:
            directions.append(self.down)
        return directions

    def calculateAction(self, world: LabyrinthWorld, vals=None, state=None):
        """Choose ``self.action`` and remember ``self.state`` for the next ``update``.

        ``vals`` (shape ``(1, 8)``) and ``state`` may be supplied by the caller;
        otherwise the state is built with ``createState`` and evaluated by the
        model. Does nothing when no direction is valid or the subject is dead.
        """
        # 0, 0 is top left
        directions = self.generate_valid_directions(world)

        if not directions:
            print('Wut?')
            return
        if not self.alive:
            return

        if state is None:
            state = self.createState(world)
        if vals is None:
            vals = self.model(from_numpy(state)).detach().numpy()
            vals = np.reshape(np.transpose(np.reshape(vals, (4, 2)), (1, 0)),
                              (1, 8))

        # Score of a step = entry i plus entry i + 4 of the reshaped output.
        self.actDict = {self.right: vals[0][0] + vals[0][4],
                        self.left: vals[0][1] + vals[0][5],
                        self.up: vals[0][2] + vals[0][6],
                        self.down: vals[0][3] + vals[0][7]}

        allowedActions = {act: val for act, val in self.actDict.items() if act in directions}

        if self.strikes <= 0:
            self.random = False

        if not self.random:
            self.action = max(allowedActions, key=allowedActions.get)
        else:
            self.action = self.randomAct(world)

        self.state = state

    def update(self, world: LabyrinthWorld, doTrain=True):
        """Record the previous transition, train if due, then execute ``self.action``.

        ``calculateAction`` must have set ``self.action`` beforehand. Stepping
        straight back (new action is the inverse of the last one) adds a strike;
        more than 100 strikes switch the subject to random mode.
        """
        if self.lastAction is not None:
            if not self.random:
                if self.lastAction[0] + self.action[0] == 0 and self.lastAction[1] + self.action[1] == 0:
                    self.strikes += 1
                else:
                    self.strikes -= 1
                if self.strikes > 100:
                    self.random = True
            else:
                self.strikes -= 1

            if len(self.history) >= self.historyLength:
                self.history.pop(0)
            # NOTE(review): int(self.lastVal) truncates the stored value estimate,
            # which generateSamples later uses as a target - confirm this is intended.
            self.history.append((self.lastState.copy(), int(self.act2IDict[self.lastAction]), int(self.lastVal), float(self.lastReward), np.array(self.lastRewards)))

            if len(self.history) == self.historyLength:
                self.samples.append(self.history.copy())

            if len(self.samples) > self.nextTrain and doTrain:
                print('train', len(self.samples))
                self.train()
                self.nextTrain = len(self.samples)
                self.nextTrain = min(self.batchsize + self.nextTrain, (self.historySizeMul + 1) * self.batchsize)
                print(len(self.samples), self.nextTrain)

        if not self.random:
            self.accumulated_rewards += self.lastReward

        self.lastAction = self.action
        self.lastState = self.state
        self.lastReward = 0
        self.lastVal = self.actDict[self.action]

        self.executeAction(world, self.action)

    def randomAct(self, world: LabyrinthWorld):
        """Return a uniformly chosen valid step, or ``(0, 0)`` when there is none."""
        directions = self.generate_valid_directions(world)

        if len(directions) == 0:
            return 0, 0

        return directions[random.randint(0, len(directions) - 1)]

    def executeAction(self, world: LabyrinthWorld, action):
        """Apply ``action`` to the world; intentionally empty in the base class."""
        pass

    def generateSamples(self):
        """Build training inputs and targets, one group per action index 0..3.

        Returns ``(states, targets)``. ``states[i]`` concatenates the older
        state of each drawn sample; ``targets[i]`` has shape
        ``(batchsize // 4, 2, 1)`` with the follow-up value in ``[:, 0, 0]`` and
        the received reward in ``[:, 1, 0]``. The newest ``batchsize`` samples
        are excluded from sampling.
        """
        # history element: (lastState, action index, lastVal, lastReward, lastRewards)
        # history: [t-2, t-1]
        true_batch = int(self.batchsize / 4)
        # dtype=object: every entry mixes arrays and scalars, which NumPy >= 1.24
        # refuses to turn into an array implicitly. Built once for all four groups.
        pool = np.array(self.samples[:-self.batchsize], dtype=object)

        states = []
        targets = []
        for i in range(4):
            target = np.zeros((true_batch, 2, 1))
            samples = pool[pool[:, 0, 1] == i]
            partTwo = True
            if len(samples) == 0:
                # Fall back to all samples and leave the follow-up value at 0.
                print('No samples for:' + str(i))
                partTwo = False
                samples = pool
            index = np.random.choice(np.arange(len(samples)),
                                     size=true_batch,
                                     replace=True)
            samples = samples[index]
            target[:, 1, 0] = samples[:, 0, 3]  # reward t-2 got
            if partTwo:
                if RECALCULATE:
                    nextState = np.concatenate(samples[:, 1, 0])  # states of t-1
                    nextVals = self.model(from_numpy(nextState)).detach().numpy()

                    nextVals2 = np.max(nextVals[:, :, 0] + nextVals[:, :, 1], axis=1)
                    target[:, 0, 0] = nextVals2  # best q t-1
                else:
                    target[:, 0, 0] = samples[:, 1, 2]  # best q t-1

            targets.append(target)
            states.append(np.concatenate(samples[:, 0, 0]))  # states of t-2

        return states, targets

    def train(self):
        """Run one training pass on freshly drawn samples, then trim the buffer."""
        print(self.name)
        states, target = self.generateSamples()
        train(states, target, self.model, self.optimizer)

        self.samples = self.samples[-self.historySizeMul*self.batchsize:]


class Herbivore(NetLearner):
    """NetLearner that is rewarded for stepping onto cells holding ``world.grass``.

    It only moves onto unoccupied cells, clears the grass (and hunter grass) of
    the cell it enters, and dies when it steps onto a living ``Hunter``.
    """
    name = 'Herbivore'
    col = 9
    r = 255
    g = 255
    b = 0
    viewD = 3
    historyLength = 2

    learningRate = 0.001
    discountFactor = 0.5
    randomBuffer = 0
    batchsize = 1000
    randomBuffer = max(2 * batchsize, randomBuffer)
    randomChance = 9

    # Kept for compatibility with code reading Herbivore.samples; instances get
    # their own list assigned in NetLearner.__init__.
    samples = []

    def createState(self, world: LabyrinthWorld):
        """Return the model input, shape ``(1, 4 * (2*viewD+1)**2 + 2)``.

        Four ``(2*viewD+1)`` square channels centred on the agent are flattened
        in this order: ``world.board``, subject occupancy (1 where
        ``world.subjectDict`` has an entry), ``world.trailMix`` and
        ``world.grass``. Cells outside the board stay 0. The last action
        (``[0, 0]`` when there is none) is appended at the end.
        """
        size = 2 * self.viewD + 1

        # How far the window reaches in each direction before hitting the border.
        maxdirleft = self.x - max(self.x - self.viewD, 0)
        maxdirright = min(self.x + self.viewD, world.board_shape[0] - 1) - self.x
        maxdirup = self.y - max(self.y - self.viewD, 0)
        maxdirdown = min(self.y + self.viewD, world.board_shape[1] - 1) - self.y

        # The reach values are inclusive, hence the +1 on every slice end.
        view_x = slice(self.viewD - maxdirleft, self.viewD + maxdirright + 1)
        view_y = slice(self.viewD - maxdirup, self.viewD + maxdirdown + 1)
        world_x = slice(self.x - maxdirleft, self.x + maxdirright + 1)
        world_y = slice(self.y - maxdirup, self.y + maxdirdown + 1)

        # Builtin float replaces the np.float alias, which NumPy removed in 1.24.
        board = np.zeros((size, size), float)
        subjects = np.zeros((size, size), float)
        trail = np.zeros((size, size), float)
        grass = np.zeros((size, size), float)

        board[view_x, view_y] = world.board[world_x, world_y]
        trail[view_x, view_y] = world.trailMix[world_x, world_y]
        grass[view_x, view_y] = world.grass[world_x, world_y]

        for dx in range(-maxdirleft, maxdirright + 1):
            for dy in range(-maxdirup, maxdirdown + 1):
                if world.subjectDict[(self.x + dx, self.y + dy)] != []:
                    # Centre-relative index keeps this channel aligned with the others.
                    subjects[self.viewD + dx, self.viewD + dy] = 1

        area = np.stack((board, subjects, trail, grass)).reshape(-1)
        action = [0, 0]
        if self.lastAction is not None:
            action = self.lastAction
        return np.concatenate((area, action)).reshape(1, -1)

    def executeAction(self, world: LabyrinthWorld, action):
        """Move by ``action``, collect the grass of the entered cell as reward.

        Before moving, ``self.lastRewards`` is filled with the grass value of the
        right, left, up and down neighbours (0 for directions that are not
        valid). A living ``Hunter`` on the target cell scores a kill and this
        herbivore is marked dead; the move is carried out regardless.
        """
        directions = self.generate_valid_directions(world)
        target = (self.x + action[0], self.y + action[1])

        for sub in world.subjectDict[target]:
            if isinstance(sub, Hunter) and sub.alive:
                sub.kills += 1
                sub.lastReward += 10
                self.alive = False

        # Same order as NetLearner.act2IDict: right, left, up, down.
        self.lastRewards = [
            world.grass[self.x + step[0], self.y + step[1]] if step in directions else 0
            for step in (self.right, self.left, self.up, self.down)
        ]

        world.subjectDict[(self.x, self.y)].remove(self)
        self.x += action[0]
        self.y += action[1]
        world.subjectDict[(self.x, self.y)].append(self)
        world.trailMix[self.x, self.y] = max(1.0, world.trailMix[self.x, self.y])
        self.lastReward += (world.grass[self.x, self.y] - 0.0)
        # The cell is grazed empty for herbivores and hunters alike.
        world.grass[self.x, self.y] = 0
        world.hunter_grass[self.x, self.y] = 0

    def generate_valid_directions(self, world: LabyrinthWorld):
        """Return the steps (left, right, up, down order) onto non-zero board
        cells that no subject currently occupies."""
        directions = []
        if (self.x - 1 >= 0 and world.board[self.x - 1, self.y] != 0
                and not world.subjectDict[(self.x - 1, self.y)]):
            directions.append(self.left)
        if (self.x + 1 < world.board_shape[0] and world.board[self.x + 1, self.y] != 0
                and not world.subjectDict[(self.x + 1, self.y)]):
            directions.append(self.right)
        if (self.y - 1 >= 0 and world.board[self.x, self.y - 1] != 0
                and not world.subjectDict[(self.x, self.y - 1)]):
            directions.append(self.up)
        if (self.y + 1 < world.board_shape[1] and world.board[self.x, self.y + 1] != 0
                and not world.subjectDict[(self.x, self.y + 1)]):
            directions.append(self.down)
        return directions

    def randomAct(self, world: LabyrinthWorld):
        """Return the valid step with the most grass, or ``(0, 0)`` when boxed in.

        Ties go to the first direction in left, right, up, down order.
        """
        directions = self.generate_valid_directions(world)
        if len(directions) == 0:
            return 0, 0

        return max(directions, key=lambda step: world.grass[self.x + step[0], self.y + step[1]])

    def respawnUpdate(self, x, y, world: LabyrinthWorld):
        """Respawn at ``(x, y)``; defers entirely to the parent implementation."""
        super(Herbivore, self).respawnUpdate(x, y, world)


class Hunter(NetLearner):
    """NetLearner whose heuristic move scoring rewards approaching and reaching subjects of another ``col``."""
    name = 'Hunter'
    # Weight applied to world.hunter_grass when randomAct scores a neighbouring cell.
    hunterGrassScale = 0.5
    # Presumably the RGB display colour of this subject type (cyan) - confirm against the renderer.
    r = 0
    g = 255
    b = 255
    def randomAct(self, world: LabyrinthWorld):
        """Choose the best-scoring walkable neighbour using a hand-written heuristic.

        A neighbour is a candidate when it lies inside the board and its
        ``world.board`` value exceeds 0.01. Its score is

            trailMix + hunter_grass * hunterGrassScale + (viewD - distance)

        where ``distance`` is the Euclidean distance from that neighbour to
        the subject returned by ``getClosestSubject`` (or ``viewD`` when there
        is none). Every subject stored on the neighbour whose ``col`` differs
        from ours adds another 10; the ``alive`` flag is not consulted here.

        Candidates are evaluated in the order left, right, up, down and ties
        go to the earliest one.

        Args:
            world: Provides ``board``, ``board_shape``, ``trailMix``,
                ``hunter_grass`` and ``subjectDict``.

        Returns:
            The chosen direction, or whatever the parent class's ``randomAct``
            returns when no neighbour is walkable.
        """
        # (move, is the neighbour inside the board?, neighbour x, neighbour y)
        neighbours = (
            (self.left, self.x - 1 >= 0, self.x - 1, self.y),
            (self.right, self.x + 1 < world.board_shape[0], self.x + 1, self.y),
            (self.up, self.y - 1 >= 0, self.x, self.y - 1),
            (self.down, self.y + 1 < world.board_shape[1], self.x, self.y + 1),
        )

        scores = {}
        for move, inside, cx, cy in neighbours:
            if not inside:
                continue
            if not world.board[cx, cy] > 0.01:
                continue

            # Closeness bonus: zero when nothing is in view, growing as the
            # nearest foreign subject gets closer to the candidate cell.
            nearest = self.getClosestSubject(world, cx, cy)
            if nearest is None:
                gap = self.viewD
            else:
                gap = np.sqrt(np.square(cx - nearest.x) + np.square(cy - nearest.y))
            closeness = self.viewD - gap

            scores[move] = (
                world.trailMix[cx, cy]
                + world.hunter_grass[cx, cy] * self.hunterGrassScale
                + closeness
            )

            # Flat bonus for each foreign-coloured subject standing on the cell.
            for occupant in world.subjectDict[(self.x + move[0], self.y + move[1])]:
                if occupant.col != self.col:
                    scores[move] += 10

        if not scores:
            # Boxed in: let the parent class decide.
            return super().randomAct(world)

        # max() keeps the first maximal key, i.e. insertion order breaks ties.
        return max(scores, key=scores.get)

    def respawnUpdate(self, x, y, world: LabyrinthWorld):
        """Run the parent's respawn handling, then lower ``lastReward`` by 1.

        Args:
            x: Forwarded unchanged to the parent ``respawnUpdate``.
            y: Forwarded unchanged to the parent ``respawnUpdate``.
            world: Forwarded unchanged to the parent ``respawnUpdate``.
        """
        super().respawnUpdate(x, y, world)
        # Presumably a penalty for having died - applied after the parent has
        # finished its own bookkeeping.
        self.lastReward -= 1

    def getClosestSubject(self, world, x, y):
        """Find a living subject of another ``col`` near ``(x, y)``.

        Square rings around ``(x, y)`` are scanned outward for
        ``dist = 1 .. self.viewD - 1``; the centre cell itself is never
        inspected. Within a ring the sides are visited in the order
        ``dy = +dist``, ``dy = -dist``, ``dx = +dist``, ``dx = -dist``, so the
        result is the first hit on the nearest ring, not necessarily the
        subject with the smallest Euclidean distance.

        Each side now covers the inclusive span ``-dist .. dist``. The former
        half-open ``range(-dist, dist)`` on all four sides left the
        ``(+dist, +dist)`` corner of every ring unchecked, so a subject
        diagonally down-right of the centre was never seen.

        Args:
            world: Provides ``board_shape`` and ``subjectDict``; the latter is
                indexed with ``(x, y)`` tuples for every in-bounds cell
                visited (assumes an entry exists for each such cell, e.g. a
                defaultdict - confirm against LabyrinthWorld).
            x: Column of the centre cell.
            y: Row of the centre cell.

        Returns:
            The first matching subject, or ``None`` if no ring contains one.
        """
        width = world.board_shape[0]
        height = world.board_shape[1]

        for dist in range(1, self.viewD):
            # Inclusive span so that all four corners of the ring are covered.
            span = range(-dist, dist + 1)
            ring = (
                [(dx, dist) for dx in span]
                + [(dx, -dist) for dx in span]
                + [(dist, dy) for dy in span]
                + [(-dist, dy) for dy in span]
            )
            for dx, dy in ring:
                cx, cy = x + dx, y + dy
                if not (width > cx >= 0 and height > cy >= 0):
                    # Never index subjectDict with off-board coordinates.
                    continue
                for sub in world.subjectDict[(cx, cy)]:
                    if sub.alive and sub.col != self.col:
                        return sub
        return None

    def executeAction(self, world: LabyrinthWorld, action):
        grass_factor = 0.5

        directions = self.generate_valid_directions(world)

        if len(action) == 2:
            right_kill = left_kill = up_kill = down_kill = False
            if self.right in directions:
                for sub in world.subjectDict[(self.x + self.right[0], self.y + self.right[1])]:
                    if sub.alive:
                        if sub.col != self.col:
                            right_kill = True
            if self.left in directions:
                for sub in world.subjectDict[(self.x + self.left[0], self.y + self.left[1])]:
                    if sub.alive:
                        if sub.col != self.col:
                            left_kill = True
            if self.up in directions:
                for sub in world.subjectDict[(self.x + self.up[0], self.y + self.up[1])]:
                    if sub.alive:
                        if sub.col != self.col:
                            up_kill = True
            if self.down in directions:
                for sub in world.subjectDict[(self.x + self.down[0], self.y + self.down[1])]:
                    if sub.alive:
                        if sub.col != self.col:
                            down_kill = True

            if len(world.subjectDict[(self.x + action[0], self.y + action[1])]) > 0:
                for sub in world.subjectDict[(self.x + action[0], self.y + action[1])]:
                    if sub.alive:
                        self.kills += 1
                        if sub.col != self.col:
                            self.lastReward += 10
                    sub.alive = False
                    self.alive = True

            self.lastRewards = []
            if self.right in directions:
                sub = self.getClosestSubject(world, self.x + 1, self.y)
                dist = self.viewD
                if sub is not None:
                    dist = np.sqrt(np.square(self.x + 1 - sub.x) + np.square(self.y - sub.y))
                distReward = self.viewD - dist
                if right_kill:
                    self.lastRewards.append(10 + world.trailMix[self.x + 1, self.y] + world.hunter_grass[self.x + 1, self.y] * grass_factor + distReward)
                else:
                    self.lastRewards.append(world.trailMix[self.x + 1, self.y] + world.hunter_grass[self.x + 1, self.y] * grass_factor + distReward)
            else:
                self.lastRewards.append(0)
            if self.left in directions:
                sub = self.getClosestSubject(world, self.x - 1, self.y)
                dist = self.viewD
                if sub is not None:
                    dist = np.sqrt(np.square(self.x - 1 - sub.x) + np.square(self.y - sub.y))
                distReward = self.viewD - dist
                if left_kill:
                    self.lastRewards.append(10 + world.trailMix[self.x - 1, self.y] + world.hunter_grass[self.x - 1, self.y] * grass_factor + distReward)
                else:
                    self.lastRewards.append(world.trailMix[self.x - 1, self.y] + world.hunter_grass[self.x - 1, self.y] * grass_factor + distReward)
            else:
                self.lastRewards.append(0)
            if self.up in directions:
                sub = self.getClosestSubject(world, self.x, self.y - 1)
                dist = self.viewD
                if sub is not None:
                    dist = np.sqrt(np.square(self.x - sub.x) + np.square(self.y - sub.y - 1))
                distReward = self.viewD - dist
                if up_kill:
                    self.lastRewards.append(10 + world.trailMix[self.x, self.y - 1] + world.hunter_grass[self.x, self.y - 1] * grass_factor + distReward)
                else:
                    self.lastRewards.append(world.trailMix[self.x, self.y - 1] + world.hunter_grass[self.x, self.y - 1] * grass_factor + distReward)
            else:
                self.lastRewards.append(0)
            if self.down in directions:
                sub = self.getClosestSubject(world, self.x, self.y + 1)
                dist = self.viewD
                if sub is not None:
                    dist = np.sqrt(np.square(self.x - sub.x) + np.square(self.y + 1 - sub.y))
                distReward = self.viewD - dist
                if down_kill:
                    self.lastRewards.append(10 + world.trailMix[self.x, self.y + 1] + world.hunter_grass[self.x, self.y + 1] * grass_factor + distReward)
                else:
                    self.lastRewards.append(world.trailMix[self.x, self.y + 1] + world.hunter_grass[self.x, self.y + 1] * grass_factor + distReward)
            else:
                self.lastRewards.append(0)
            assert len(self.lastRewards) == 4, 'Last Rewards not filled correctly!'

            world.subjectDict[(self.x, self.y)].remove(self)
            self.x += action[0]
            self.y += action[1]
            self.lastReward += world.trailMix[self.x, self.y]
            world.subjectDict[(self.x, self.y)].append(self)
            self.lastReward += (world.hunter_grass[self.x, self.y] * 0.1)
            world.hunter_grass[self.x, self.y] = 0

            sub = self.getClosestSubject(world, self.x, self.y)
            dist = self.viewD
            if sub is not None:
                dist = np.sqrt(np.square(self.x - sub.x) + np.square(self.y - sub.y))
            distReward = self.viewD - dist

            self.lastReward += distReward