diff --git a/labirinth_ai/LabyrinthClient.py b/labirinth_ai/LabyrinthClient.py
index fdcb22e..daa8e93 100644
--- a/labirinth_ai/LabyrinthClient.py
+++ b/labirinth_ai/LabyrinthClient.py
@@ -17,9 +17,9 @@ class LabyrinthClient(Client):
                 if self.world_provider.world.board[x, y] in [1, 2]:
                     r, g, b = 57, 92, 152
                     if 1.5 >= self.world_provider.world.hunter_grass[x, y] > 0.5:
-                        r, g, b = 25, 149, 156
-                    if 3 >= self.world_provider.world.hunter_grass[x, y] > 1.5:
                         r, g, b = 112, 198, 169
+                    if 3 >= self.world_provider.world.hunter_grass[x, y] > 1.5:
+                        r, g, b = 25, 149, 156
                 self.world_provider.world.set_color(x, y, 0, r / 255.0, g / 255.0, b / 255.0)
                 if self.world_provider.world.board[x, y] == 3:
                     self.world_provider.world.set_color(x, y, 0, 139 / 255.0, 72 / 255.0, 82 / 255.0)
diff --git a/labirinth_ai/Models/BaseModel.py b/labirinth_ai/Models/BaseModel.py
new file mode 100644
index 0000000..e4210ad
--- /dev/null
+++ b/labirinth_ai/Models/BaseModel.py
@@ -0,0 +1,125 @@
+import torch
+from torch import nn
+import numpy as np
+import tqdm
+from torch.utils.data import Dataset, DataLoader
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using {device} device")
+
+
+# Define model
+class BaseModel(nn.Module):
+    def __init__(self, view_dimension, action_num, channels):
+        super(BaseModel, self).__init__()
+        self.flatten = nn.Flatten()
+        self.actions = []
+        self.action_num = action_num
+        self.viewD = view_dimension
+        self.channels = channels
+        for action in range(action_num):
+            action_sequence = nn.Sequential(
+                nn.Linear(channels * (2 * self.viewD + 1) * (2 * self.viewD + 1) + 2,
+                          (2 * self.viewD + 1) * (2 * self.viewD + 1)),
+                nn.ELU(),
+                nn.Linear((2 * self.viewD + 1) * (2 * self.viewD + 1), (self.viewD + 1) * (self.viewD + 1)),
+                nn.ELU(),
+                nn.Linear((self.viewD + 1) * (self.viewD + 1), 2)
+            )
+            self.add_module('action_' + str(action), action_sequence)
+            self.actions.append(action_sequence)
+
+    def forward(self, x):
+        x_flat = self.flatten(x)
+        actions = []
+        for action in range(self.action_num):
+            actions.append(self.actions[action](x_flat))
+        return torch.stack(actions, dim=1)
+
+
+class BaseDataSet(Dataset):
+    def __init__(self, states, targets):
+        assert len(states) == len(targets), "Needs to have as many states as targets!"
+        self.states = torch.tensor(states, dtype=torch.float32)
+        self.targets = torch.tensor(targets, dtype=torch.float32)
+
+    def __len__(self):
+        return len(self.states)
+
+    def __getitem__(self, idx):
+        return self.states[idx], self.targets[idx]
+
+
+def create_optimizer(model):
+    return torch.optim.RMSprop(model.parameters(), lr=1e-3)
+
+
+def create_loss_function(action):
+    def custom_loss(prediction, target):
+        return torch.mean(0.5 * torch.square(
+            0.1 * target[:, 0, 0] + target[:, 1, 0] - (
+                    prediction[:, action, 0] + prediction[:, action, 1])) + 0.5 * torch.square(
+            target[:, 1, 0] - prediction[:, action, 0]), dim=0)
+
+    return custom_loss
+
+
+def from_numpy(x):
+    return torch.tensor(x, dtype=torch.float32)
+
+
+def train(states, targets, model, optimizer):
+    for action in range(model.action_num):
+        data_set = BaseDataSet(states[action], targets[action])
+        dataloader = DataLoader(data_set, batch_size=64, shuffle=True)
+        loss_fn = create_loss_function(action)
+
+        size = len(dataloader.dataset)
+        model.train()
+        for batch, (X, y) in enumerate(dataloader):
+            X, y = X.to(device), y.to(device)
+
+            # Compute prediction error
+            pred = model(X)
+            loss = loss_fn(pred, y)
+
+            # Backpropagation
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            if batch % 100 == 0:
+                loss, current = loss.item(), batch * len(X)
+                print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
+    model.eval()
+
+
+if __name__ == '__main__':
+    sample = np.random.random((1, 4 * 11 * 11 + 2))  # flattened 4-channel 11x11 view plus 2 extra inputs
+
+    model = BaseModel(5, 4, 4).to(device)
+    print(model)
+
+    test = model(torch.tensor(sample, dtype=torch.float32, device=device))
+    # test = test.cpu().detach().numpy()
+    print(test)
+
+    state = np.random.random((4 * 11 * 11 + 2,))
+    target = np.random.random((2, 1))  # [next-state value estimate, observed reward]
+    states = [
+        [state],
+        [state],
+        [state],
+        [state],
+    ]
+    targets = [
+        [target],
+        [target],
+        [target],
+        [target],
+    ]
+
+    optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3)
+
+    train(states, targets, model, optimizer)
+
diff --git a/labirinth_ai/Models/__init__.py b/labirinth_ai/Models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/labirinth_ai/Subject.py b/labirinth_ai/Subject.py
index ec0593c..3fe291a 100644
--- a/labirinth_ai/Subject.py
+++ b/labirinth_ai/Subject.py
@@ -5,6 +5,7 @@ from tensorflow import keras
 
 from labirinth_ai.LabyrinthWorld import LabyrinthWorld
 from labirinth_ai.loss import loss2, loss3
+from labirinth_ai.Models.BaseModel import BaseModel, train, create_optimizer, device, from_numpy
 
 # import torch
 # dtype = torch.float
@@ -369,22 +370,9 @@ class NetLearner(Subject):
         self.x_in = []
         self.actions = []
         self.target = []
-        for i in range(4):
-            x_in = keras.Input(shape=(self.channels * (2 * self.viewD + 1) * (2 * self.viewD + 1) + 2))
-            self.x_in.append(x_in)
-            inVec = keras.layers.Flatten()(x_in)
-            actions = keras.layers.Dense(((2 * self.viewD + 1) * (2 * self.viewD + 1)), activation='elu',
-                                         kernel_regularizer=keras.regularizers.l2(0.001),
-                                         name=self.name + str(self.id) + 'Dense' + str(i) + 'l1')(inVec)
-            actions = keras.layers.Dense(((self.viewD + 1) * (self.viewD + 1)), activation='elu',
-                                         kernel_regularizer=keras.regularizers.l2(0.001))(actions)
-            self.target.append(keras.Input(shape=(2, 1)))
-            self.actions.append(keras.layers.Dense(2, activation='linear', use_bias=False, kernel_regularizer=keras.regularizers.l2(0.001))(actions))
-
-        self.model = keras.Model(inputs=self.x_in, outputs=self.actions)
-
-        self.model.compile(optimizer=tf.keras.optimizers.RMSprop(self.learningRate), loss=loss3,
-                           target_tensors=self.target)
+        self.model = BaseModel(self.viewD, 4, 4)
+        self.model.to(device)
+        self.optimizer = create_optimizer(self.model)
 
         if len(self.samples) < self.randomBuffer:
             self.random = True
@@ -508,7 +496,7 @@ class NetLearner(Subject):
         if state is None:
             state = self.createState(world)
         if vals is None:
-            vals = self.model.predict([state, state, state, state])
+            vals = self.model(from_numpy(state)).detach().numpy()
 
         vals = np.reshape(np.transpose(np.reshape(vals, (4, 2)), (1, 0)), (1, 8))
 
@@ -623,9 +611,9 @@
                 target[:, 1, 0] = samples[:, 1, 3] #reward t-2 got
 
                 nextState = np.concatenate(samples[:, 1, 0]) #states of t-1
-                nextVals = self.model.predict([nextState, nextState, nextState, nextState])
+                nextVals = self.model(from_numpy(nextState)).detach().numpy()
 
-                nextVals2 = nextVals[i][:, 0] + nextVals[i][:, 1]
+                nextVals2 = nextVals[:, i, 0] + nextVals[:, i, 1]
                 target[:, 0, 0] = nextVals2 #best q t-1
             else:
                 target[:, 1, 0] = np.array(list(map(lambda elem: list(elem), list(np.array(samples[:, 1, 4])))))[:, i] # reward t-2 got
@@ -639,7 +627,7 @@ class NetLearner(Subject):
     def train(self):
        print(self.name)
         states, target = self.generateSamples()
-        self.model.fit(states, target, epochs=1)
+        train(states, target, self.model, self.optimizer)
 
         self.samples = self.samples[-self.historySizeMul*self.batchsize:]
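For reference, a minimal smoke test of the new PyTorch port is sketched below. It is illustrative only: it assumes the state layout NetLearner feeds the network (a flattened channels * (2 * viewD + 1)^2 view plus two extra scalar inputs) and per-sample targets of shape (2, 1) holding the next-state value estimate and the observed reward, as built in generateSamples. The constants VIEW_D, ACTIONS, CHANNELS, STATE_SIZE and the random data are made up for the example; only the imported names come from labirinth_ai/Models/BaseModel.py.

# Hypothetical smoke test for the new BaseModel module (random data, assumed dimensions).
import numpy as np
import torch

from labirinth_ai.Models.BaseModel import (BaseModel, create_loss_function,
                                            create_optimizer, device, from_numpy, train)

VIEW_D, ACTIONS, CHANNELS = 5, 4, 4                 # assumed values; NetLearner calls BaseModel(self.viewD, 4, 4)
STATE_SIZE = CHANNELS * (2 * VIEW_D + 1) ** 2 + 2   # flattened view plus 2 extra inputs

model = BaseModel(VIEW_D, ACTIONS, CHANNELS).to(device)
optimizer = create_optimizer(model)

# One small batch of fake transitions per action head.
# target[:, 0, 0] = next-state value estimate, target[:, 1, 0] = observed reward,
# mirroring how NetLearner.generateSamples fills the target array.
states = [np.random.random((8, STATE_SIZE)) for _ in range(ACTIONS)]
targets = [np.random.random((8, 2, 1)) for _ in range(ACTIONS)]

# Forward pass: two outputs per action head.
with torch.no_grad():
    q_values = model(from_numpy(states[0]).to(device))
print(q_values.shape)  # torch.Size([8, 4, 2])

# The per-action loss fits output 0 to the reward and output 0 + output 1
# to reward + 0.1 * next-state value.
loss = create_loss_function(0)(q_values.cpu(), from_numpy(targets[0]))
print(float(loss))

train(states, targets, model, optimizer)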