From c328f982d738c231553d38b07a8279ecc0c830d2 Mon Sep 17 00:00:00 2001 From: Bam4d Date: Wed, 10 Mar 2021 19:49:51 +0000 Subject: [PATCH 01/45] working on conditional action distributions --- .../rllib_single_agent_conditional_actions.py | 63 +++++++++++++ .../agents/global_average_pooling_agent.py | 2 + .../torch/conditional_actions/__init__.py | 0 .../conditional_action_exploration.py | 91 +++++++++++++++++++ .../conditional_action_mixin.py | 66 ++++++++++++++ .../conditional_action_policy_trainer.py | 26 ++++++ python/griddly/util/rllib/wrappers/core.py | 5 +- 7 files changed, 251 insertions(+), 2 deletions(-) create mode 100644 python/examples/rllib/rllib_single_agent_conditional_actions.py create mode 100644 python/griddly/util/rllib/torch/conditional_actions/__init__.py create mode 100644 python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py create mode 100644 python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py create mode 100644 python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py diff --git a/python/examples/rllib/rllib_single_agent_conditional_actions.py b/python/examples/rllib/rllib_single_agent_conditional_actions.py new file mode 100644 index 000000000..0a7ed1ae4 --- /dev/null +++ b/python/examples/rllib/rllib_single_agent_conditional_actions.py @@ -0,0 +1,63 @@ +import os +import sys + +import ray +from ray import tune +from ray.rllib.models import ModelCatalog +from ray.tune.registry import register_env + +from griddly import gd +from griddly.util.rllib.torch import GAPAgent +from griddly.util.rllib.torch.conditional_actions.conditional_action_policy_trainer import ConditionalActionImpalaTrainer +from griddly.util.rllib.wrappers.core import RLlibEnv + +if __name__ == '__main__': + sep = os.pathsep + os.environ['PYTHONPATH'] = sep.join(sys.path) + + ray.init(num_gpus=1, local_mode=True) + + env_name = "ray-griddly-env" + + register_env(env_name, RLlibEnv) + ModelCatalog.register_custom_model("GAP", GAPAgent) + + max_training_steps = 100000000 + + config = { + 'framework': 'torch', + 'num_workers': 1, + 'num_envs_per_worker': 1, + + 'model': { + 'custom_model': 'GAP', + 'custom_model_config': {} + }, + 'env': env_name, + 'env_config': { + 'record_video_config': { + 'frequency': 100000 + }, + + 'conditional_action_sampling': True, + 'invalid_action_masking': True, + 'random_level_on_reset': True, + 'yaml_file': 'Single-Player/GVGAI/clusters_partially_observable.yaml', + 'global_observer_type': gd.ObserverType.SPRITE_2D, + 'max_steps': 1000, + }, + 'entropy_coeff_schedule': [ + [0, 0.01], + [max_training_steps, 0.0] + ], + 'lr_schedule': [ + [0, 0.005], + [max_training_steps, 0.0] + ] + } + + stop = { + "timesteps_total": max_training_steps, + } + + result = tune.run(ConditionalActionImpalaTrainer, config=config, stop=stop) diff --git a/python/griddly/util/rllib/torch/agents/global_average_pooling_agent.py b/python/griddly/util/rllib/torch/agents/global_average_pooling_agent.py index 2929508bb..d2249abcb 100644 --- a/python/griddly/util/rllib/torch/agents/global_average_pooling_agent.py +++ b/python/griddly/util/rllib/torch/agents/global_average_pooling_agent.py @@ -1,4 +1,5 @@ import numpy as np +from gym.spaces import Dict from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 from torch import nn @@ -32,6 +33,7 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, name): nn.Module.__init__(self) self._num_objects = obs_space.shape[2] + self._num_actions = 
num_outputs self.network = nn.Sequential( diff --git a/python/griddly/util/rllib/torch/conditional_actions/__init__.py b/python/griddly/util/rllib/torch/conditional_actions/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py new file mode 100644 index 000000000..341ab2879 --- /dev/null +++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py @@ -0,0 +1,91 @@ +import torch +from gym.spaces import Discrete, MultiDiscrete +from ray.rllib.models.torch.torch_action_dist import TorchCategorical, TorchMultiCategorical +from torch.distributions import Categorical +import numpy as np + +class TorchConditionalMaskingExploration(): + + def __init__(self, model, dist_inputs, valid_action_trees, explore=False): + self._valid_action_trees = valid_action_trees + + self._num_inputs = dist_inputs.shape[0] + if isinstance(model.action_space, Discrete): + self._action_space_shape = [model.action_space.n] + elif isinstance(model.action_space, MultiDiscrete): + self._action_space_shape = model.action_space.nvec + + self._num_action_logits = np.sum(self._action_space_shape) + self._num_action_parts = len(self._action_space_shape) + + self._explore = explore + + self._inputs_split = dist_inputs.split(tuple(self._action_space_shape), dim=1) + + def _mask_and_sample(self, options, logits): + + mask = torch.zeros([logits.shape[0]]) + mask[options] = 1 + + logits += torch.log(mask) + dist = Categorical(logits=logits) + sampled = dist.sample() + logp = dist.log_prob(sampled) + + return sampled, logits, logp, mask + + def get_actions_and_mask(self): + + actions = torch.zeros([self._num_inputs, self._num_action_parts]) + masked_logits = torch.zeros([self._num_inputs, self._num_action_logits]) + mask = torch.zeros([self._num_inputs, self._num_action_logits]) + logp_sums = torch.zeros([self._num_inputs]) + + if self._valid_action_trees is not None: + + for i in range(self._num_inputs): + if len(self._valid_action_trees) >= 1: + + subtree = self._valid_action_trees[i] + subtree_options = list(subtree.keys()) + + # In the case there are no available actions for the player + if len(subtree_options) == 0: + subtree = {} + for _ in range(self._num_action_parts): + subtree[0] = {} + subtree_options = [0] + + logp_parts = torch.zeros([self._num_action_parts]) + mask_offset = 0 + for a in range(self._num_action_parts): + dist_part = self._inputs_split[a] + sampled, masked_part_logits, logp, mask_part = self._mask_and_sample(subtree_options, dist_part[i]) + + # Set the action and the mask for each part of the action + actions[i, a] = sampled + masked_logits[i, mask_offset:mask_offset + self._action_space_shape[a]] = masked_part_logits + mask[i, mask_offset:mask_offset + self._action_space_shape[a]] = mask_part + + logp_parts[a] = logp + + if mask_part.sum() == 0: + raise RuntimeError('mask calculated incorrectly') + + mask_offset += self._action_space_shape[a] + + if isinstance(subtree, dict): + subtree = subtree[int(sampled)] + if isinstance(subtree, dict): + subtree_options = list(subtree.keys()) + else: + # Leaf nodes with action_id list + subtree_options = subtree + + logp_sums[i] = torch.sum(logp_parts) + + # if its a discrete then flatten the space + if self._num_action_parts == 1: + actions = actions.flatten() + + return actions, masked_logits, logp_sums, mask \ No newline at end of file diff 
--git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py new file mode 100644 index 000000000..ebe5b8baa --- /dev/null +++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py @@ -0,0 +1,66 @@ +import numpy as np +import torch +from ray.rllib import Policy, SampleBatch +from ray.rllib.utils import override +from ray.rllib.utils.torch_ops import convert_to_non_torch_type + +from griddly.util.rllib.torch.conditional_actions.conditional_action_exploration import TorchConditionalMaskingExploration + + +class ConditionalActionMixin: + + @override(Policy) + def compute_actions_from_input_dict( + self, + input_dict, + explore=None, + timestep = None, + **kwargs): + + explore = explore if explore is not None else self.config["explore"] + timestep = timestep if timestep is not None else self.global_timestep + + with torch.no_grad(): + # Pass lazy (torch) tensor dict to Model as `input_dict`. + input_dict = self._lazy_tensor_dict(input_dict) + # Pack internal state inputs into (separate) list. + state_batches = [ + input_dict[k] for k in input_dict.keys() if "state_in" in k[:8] + ] + # Calculate RNN sequence lengths. + seq_lens = np.array([1] * len(input_dict["obs"])) \ + if state_batches else None + + self._is_recurrent = state_batches is not None and state_batches != [] + + # Switch to eval mode. + self.model.eval() + + dist_inputs, state_out = self.model(input_dict, state_batches, + seq_lens) + + infos = input_dict[SampleBatch.INFOS] if SampleBatch.INFOS in input_dict else {} + + valid_action_trees = infos[0]['valid_action_trees'] if isinstance(infos, np.ndarray) and 'valid_action_trees' in infos[0] else None + + exploration = TorchConditionalMaskingExploration( + self.model, + dist_inputs, + valid_action_trees, + explore, + ) + + actions, masked_logits, logp, mask = exploration.get_actions_and_mask() + + input_dict[SampleBatch.ACTIONS] = actions + + extra_fetches = { + SampleBatch.ACTION_DIST_INPUTS: dist_inputs, + SampleBatch.ACTION_PROB: torch.exp(logp.float()), + SampleBatch.ACTION_LOGP: logp + } + + # Update our global timestep by the batch size. 
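# A note on the `valid_action_trees` consumed above (illustrative sketch; the
# example values are hypothetical, not taken from the patch): each player's tree
# is a nested dict keyed by x, then y, then action-type index, with the list of
# valid action_ids at the leaves, e.g.
#
#   example_tree = {2: {3: {0: [1, 2]}}}  # unit at (2, 3): action type 0, id 1 or 2
#
# TorchConditionalMaskingExploration descends one level of this dict per part of
# the MultiDiscrete action, masking each part's logits to the keys (or leaf ids)
# available at that level before sampling.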
+ self.global_timestep += len(input_dict[SampleBatch.CUR_OBS]) + + return convert_to_non_torch_type((actions, state_out, extra_fetches)) diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py new file mode 100644 index 000000000..440b7360a --- /dev/null +++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py @@ -0,0 +1,26 @@ +from ray.rllib.agents.impala import ImpalaTrainer +from ray.rllib.agents.impala.vtrace_torch_policy import VTraceTorchPolicy +from ray.rllib.policy.torch_policy import LearningRateSchedule, EntropyCoeffSchedule + +from griddly.util.rllib.torch.conditional_actions.conditional_action_mixin import ConditionalActionMixin + +def setup_mixins(policy, obs_space, action_space, config): + ConditionalActionMixin.__init__(policy) + EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"], + config["entropy_coeff_schedule"]) + LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) + +ConditionalActionVTraceTorchPolicy = VTraceTorchPolicy.with_updates( + name="ConditionalMaskingVTraceTorchPolicy", + before_init=setup_mixins, + mixins=[LearningRateSchedule, EntropyCoeffSchedule, ConditionalActionMixin] +) + +def get_vtrace_policy_class(config): + if config['framework'] == 'torch': + return ConditionalActionVTraceTorchPolicy + else: + raise NotImplementedError('Tensorflow not supported') + +ConditionalActionImpalaTrainer = ImpalaTrainer.with_updates(default_policy=ConditionalActionVTraceTorchPolicy, + get_policy_class=get_vtrace_policy_class) \ No newline at end of file diff --git a/python/griddly/util/rllib/wrappers/core.py b/python/griddly/util/rllib/wrappers/core.py index 3c96ee659..d504b57c7 100644 --- a/python/griddly/util/rllib/wrappers/core.py +++ b/python/griddly/util/rllib/wrappers/core.py @@ -58,6 +58,7 @@ def __init__(self, env_config): super().__init__(**env_config) self.invalid_action_masking = env_config.get('invalid_action_masking', False) + self.conditional_action_sampling = env_config.get('conditional_action_sampling', False) self._record_video_config = env_config.get('record_video_config', None) self._random_level_on_reset = env_config.get('random_level_on_reset', False) @@ -149,7 +150,7 @@ def reset(self, **kwargs): observation = super().reset(**kwargs) self.set_transform() - if self.invalid_action_masking: + if self.conditional_action_sampling: self.last_valid_action_trees = self._build_valid_action_trees() return self._transform(observation) @@ -161,7 +162,7 @@ def step(self, action): self._env_steps += 1 - if self.invalid_action_masking: + if self.conditional_action_sampling: self.last_valid_action_trees = self._build_valid_action_trees() info['valid_action_trees'] = self.last_valid_action_trees From 8f62d4fa37a9ea19e261828d75f4dac9126b5432 Mon Sep 17 00:00:00 2001 From: Bam4d Date: Thu, 11 Mar 2021 20:17:07 +0000 Subject: [PATCH 02/45] building tree in c++ side for speed --- bindings/python.cpp | 1 + bindings/wrapper/GameWrapper.cpp | 74 +++++- python/examples/rllib/rllib_multi_agent.py | 2 +- python/examples/rllib/rllib_single_agent.py | 2 +- .../rllib_single_agent_conditional_actions.py | 18 +- .../conditional_action_exploration.py | 27 +- .../conditional_action_mixin.py | 10 +- .../conditional_action_policy_trainer.py | 11 +- .../griddly/util/rllib/wrappers/__init__.py | 0 python/griddly/util/rllib/wrappers/core.py | 236 ------------------ 10 files changed, 
108 insertions(+), 273 deletions(-)
 delete mode 100644 python/griddly/util/rllib/wrappers/__init__.py
 delete mode 100644 python/griddly/util/rllib/wrappers/core.py

diff --git a/bindings/python.cpp b/bindings/python.cpp
index ce84016ff..a5fde9813 100644
--- a/bindings/python.cpp
+++ b/bindings/python.cpp
@@ -56,6 +56,7 @@ PYBIND11_MODULE(python_griddly, m) {
   // Get available actions for objects in the current game
   game_process.def("get_available_actions", &Py_GameWrapper::getAvailableActionNames);
   game_process.def("get_available_action_ids", &Py_GameWrapper::getAvailableActionIds);
+  game_process.def("build_valid_action_trees", &Py_GameWrapper::buildValidActionTrees);

   // Width and height of the game grid
   game_process.def("get_width", &Py_GameWrapper::getWidth);
diff --git a/bindings/wrapper/GameWrapper.cpp b/bindings/wrapper/GameWrapper.cpp
index 2b9510f20..da04ae768 100644
--- a/bindings/wrapper/GameWrapper.cpp
+++ b/bindings/wrapper/GameWrapper.cpp
@@ -39,8 +39,65 @@ class Py_GameWrapper {
     return player;
   }

-  uint32_t getNumPlayers() const {
-    return gameProcess_->getNumPlayers();
+  std::vector<py::dict> buildValidActionTrees() const {
+
+    std::vector<py::dict> valid_action_trees;
+    auto externalActionNames = gdyFactory_->getExternalActionNames();
+    for (int playerId = 1; playerId <= playerCount_; playerId++) {
+      py::dict valid_action_tree;
+      for (auto actionNamesAtLocation : gameProcess_->getAvailableActionNames(playerId)) {
+        auto location = actionNamesAtLocation.first;
+        auto actionNames = actionNamesAtLocation.second;
+
+        for (auto actionName : actionNames) {
+          auto& treePtr = valid_action_tree;
+          auto actionInputsDefinitions = gdyFactory_->getActionInputsDefinitions();
+          if (actionInputsDefinitions.find(actionName) != actionInputsDefinitions.end()) {
+            auto locationVec = glm::ivec2{location[0], location[1]};
+            auto actionIdsForName = gameProcess_->getAvailableActionIdsAtLocation(locationVec, actionName);
+
+            if (actionIdsForName.size() > 0) {
+              if (gdyFactory_->getAvatarObject().length() == 0) {
+                auto py_x = py::cast(locationVec[0]);
+                auto py_y = py::cast(locationVec[1]);
+                if(!treePtr.contains(py_x)) {
+                  treePtr[py_x] = py::dict();
+                }
+
+                treePtr = treePtr[py_x];
+
+                if(!treePtr.contains(py_y)) {
+                  treePtr[py_y] = py::dict();
+                }
+
+                treePtr = treePtr[py_y];
+              }
+
+              if (externalActionNames.size() > 1) {
+                auto py_actionName = py::cast(actionName);
+                if(!treePtr.contains(py_actionName)) {
+                  treePtr[py_actionName] = py::dict();
+                }
+
+                treePtr = treePtr[py_actionName];
+              }
+
+              for(auto id : actionIdsForName) {
+                auto py_id = py::cast(id);
+                treePtr[py_id] = py::dict();
+              }
+
+              auto py_nop = py::cast(0);
+              treePtr[py_nop] = py::dict();
+
+            }
+          }
+        }
+      }
+      valid_action_trees.push_back(valid_action_tree);
+    }
+
+    return valid_action_trees;
   }

   py::dict getAvailableActionNames(int playerId) const {
@@ -106,8 +163,6 @@ class Py_GameWrapper {
   }

   py::tuple stepParallel(py::buffer stepArray) {
-
-
     auto stepArrayInfo = stepArray.request();
     if (stepArrayInfo.format != "l" && stepArrayInfo.format != "i") {
       auto error = fmt::format("Invalid data type {0}, must be an integer.", stepArrayInfo.format);
@@ -130,7 +185,7 @@
     }

     auto externalActionNames = gdyFactory_->getExternalActionNames();
-
+
     std::vector<int32_t> playerRewards;
     bool terminated;
     py::dict info;
@@ -138,7 +193,7 @@
     for (int p = 0; p < playerSize; p++) {
       std::string actionName;
       std::vector<int32_t> actionArray;
-      auto pStr = (int32_t *)stepArrayInfo.ptr + p * playerStride;
+      auto pStr = (int32_t*)stepArrayInfo.ptr + p * playerStride;

       bool lastPlayer = p == (playerSize - 1);
@@ -173,7 +228,7 @@
       auto playerStepResult = players_[p]->stepSingle(actionName, actionArray, lastPlayer);

       playerRewards.push_back(playerStepResult[0].cast<int32_t>());
-      if(lastPlayer) {
+      if (lastPlayer) {
         terminated = playerStepResult[1].cast<bool>();
         info = playerStepResult[2];
       }
@@ -253,7 +308,6 @@
   }

   py::dict getGlobalVariables(std::vector<std::string> variables) const {
-
     py::dict py_globalVariables;
     auto globalVariables = gameProcess_->getGrid()->getGlobalVariables();
@@ -262,7 +316,7 @@
       auto globalVariableMap = globalVariables[variableNameIt];

-      for(auto playerVariableIt : globalVariableMap) {
+      for (auto playerVariableIt : globalVariableMap) {
         resolvedGlobalVariableMap.insert({playerVariableIt.first, *playerVariableIt.second});
       }
@@ -280,7 +334,7 @@
     py::dict py_event;

     py::dict rewards;
-    for (auto& reward: historyEvent.rewards) {
+    for (auto& reward : historyEvent.rewards) {
       rewards[py::cast(reward.first)] = reward.second;
     }
diff --git a/python/examples/rllib/rllib_multi_agent.py b/python/examples/rllib/rllib_multi_agent.py
index 0b80a9ca3..1560a127f 100644
--- a/python/examples/rllib/rllib_multi_agent.py
+++ b/python/examples/rllib/rllib_multi_agent.py
@@ -10,7 +10,7 @@
 from griddly import gd
 from griddly.util.rllib.torch.agents.conv_agent import SimpleConvAgent
-from griddly.util.rllib.wrappers.core import RLlibMultiAgentWrapper, RLlibEnv
+from griddly.util.rllib.env.core import RLlibMultiAgentWrapper, RLlibEnv

 if __name__ == '__main__':
     sep = os.pathsep
diff --git a/python/examples/rllib/rllib_single_agent.py b/python/examples/rllib/rllib_single_agent.py
index c9a603505..eda9d41e3 100644
--- a/python/examples/rllib/rllib_single_agent.py
+++ b/python/examples/rllib/rllib_single_agent.py
@@ -9,7 +9,7 @@
 from griddly import gd
 from griddly.util.rllib.torch import GAPAgent
-from griddly.util.rllib.wrappers.core import RLlibEnv
+from griddly.util.rllib.env.core import RLlibEnv

 if __name__ == '__main__':
     sep = os.pathsep
diff --git a/python/examples/rllib/rllib_single_agent_conditional_actions.py b/python/examples/rllib/rllib_single_agent_conditional_actions.py
index 0a7ed1ae4..8e1920751 100644
--- a/python/examples/rllib/rllib_single_agent_conditional_actions.py
+++ b/python/examples/rllib/rllib_single_agent_conditional_actions.py
@@ -9,25 +9,25 @@
 from griddly import gd
 from griddly.util.rllib.torch import GAPAgent
 from griddly.util.rllib.torch.conditional_actions.conditional_action_policy_trainer import ConditionalActionImpalaTrainer
-from griddly.util.rllib.wrappers.core import RLlibEnv
+from griddly.util.rllib.env.core import RLlibEnv

 if __name__ == '__main__':
     sep = os.pathsep
     os.environ['PYTHONPATH'] = sep.join(sys.path)

-    ray.init(num_gpus=1, local_mode=True)
+    ray.init(num_gpus=1)

     env_name = "ray-griddly-env"

     register_env(env_name, RLlibEnv)
     ModelCatalog.register_custom_model("GAP", GAPAgent)

-    max_training_steps = 100000000
+    max_training_steps = 5000000

     config = {
         'framework': 'torch',
-        'num_workers': 1,
-        'num_envs_per_worker': 1,
+        'num_workers': 6,
+        'num_envs_per_worker': 2,

         'model': {
             'custom_model': 'GAP',
             'custom_model_config': {}
@@ -39,8 +39,8 @@
                 'frequency': 100000
             },

-            'conditional_action_sampling': True,
-            'invalid_action_masking': True,
+            'invalid_action_masking': tune.grid_search([True, False]),
+            'generate_valid_action_trees': tune.grid_search([True, False]),
             'random_level_on_reset': True,
             'yaml_file': 'Single-Player/GVGAI/clusters_partially_observable.yaml',
'global_observer_type': gd.ObserverType.SPRITE_2D, @@ -53,7 +53,9 @@ 'lr_schedule': [ [0, 0.005], [max_training_steps, 0.0] - ] + ], + + } stop = { diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py index 341ab2879..4f6a513d0 100644 --- a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py +++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py @@ -6,7 +6,7 @@ class TorchConditionalMaskingExploration(): - def __init__(self, model, dist_inputs, valid_action_trees, explore=False): + def __init__(self, model, dist_inputs, valid_action_trees, explore=False, invalid_action_masking=False): self._valid_action_trees = valid_action_trees self._num_inputs = dist_inputs.shape[0] @@ -18,16 +18,21 @@ def __init__(self, model, dist_inputs, valid_action_trees, explore=False): self._num_action_logits = np.sum(self._action_space_shape) self._num_action_parts = len(self._action_space_shape) + self._invalid_action_masking = invalid_action_masking + self._explore = explore self._inputs_split = dist_inputs.split(tuple(self._action_space_shape), dim=1) def _mask_and_sample(self, options, logits): - mask = torch.zeros([logits.shape[0]]) + #if self._invalid_action_masking: + mask = torch.zeros([logits.shape[0]]).to(logits.device) mask[options] = 1 - logits += torch.log(mask) + #else: + # mask = torch.ones([logits.shape[0]]) + dist = Categorical(logits=logits) sampled = dist.sample() logp = dist.log_prob(sampled) @@ -51,10 +56,11 @@ def get_actions_and_mask(self): # In the case there are no available actions for the player if len(subtree_options) == 0: - subtree = {} + build_tree = subtree for _ in range(self._num_action_parts): - subtree[0] = {} - subtree_options = [0] + build_tree[0] = {} + build_tree = build_tree[0] + subtree_options = list(subtree.keys()) logp_parts = torch.zeros([self._num_action_parts]) mask_offset = 0 @@ -74,13 +80,8 @@ def get_actions_and_mask(self): mask_offset += self._action_space_shape[a] - if isinstance(subtree, dict): - subtree = subtree[int(sampled)] - if isinstance(subtree, dict): - subtree_options = list(subtree.keys()) - else: - # Leaf nodes with action_id list - subtree_options = subtree + subtree = subtree[int(sampled)] + subtree_options = list(subtree.keys()) logp_sums[i] = torch.sum(logp_parts) diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py index ebe5b8baa..199d3f825 100644 --- a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py +++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py @@ -41,13 +41,21 @@ def compute_actions_from_input_dict( infos = input_dict[SampleBatch.INFOS] if SampleBatch.INFOS in input_dict else {} - valid_action_trees = infos[0]['valid_action_trees'] if isinstance(infos, np.ndarray) and 'valid_action_trees' in infos[0] else None + valid_action_trees = [] + for info in infos: + if isinstance(info, dict) and 'valid_action_tree' in info: + valid_action_trees.append(info['valid_action_tree']) + else: + valid_action_trees.append({}) + + invalid_action_masking = self.config["env_config"].get("invalid_action_masking", False) exploration = TorchConditionalMaskingExploration( self.model, dist_inputs, valid_action_trees, explore, + invalid_action_masking, ) actions, 
masked_logits, logp, mask = exploration.get_actions_and_mask() diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py index 440b7360a..441da7d74 100644 --- a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py +++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py @@ -1,26 +1,31 @@ -from ray.rllib.agents.impala import ImpalaTrainer +from ray.rllib.agents import with_common_config +from ray.rllib.agents.impala import ImpalaTrainer, DEFAULT_CONFIG as IMPALA_CONFIG from ray.rllib.agents.impala.vtrace_torch_policy import VTraceTorchPolicy from ray.rllib.policy.torch_policy import LearningRateSchedule, EntropyCoeffSchedule from griddly.util.rllib.torch.conditional_actions.conditional_action_mixin import ConditionalActionMixin + def setup_mixins(policy, obs_space, action_space, config): ConditionalActionMixin.__init__(policy) EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"], config["entropy_coeff_schedule"]) LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) + ConditionalActionVTraceTorchPolicy = VTraceTorchPolicy.with_updates( - name="ConditionalMaskingVTraceTorchPolicy", + name="ConditionalActionVTraceTorchPolicy", before_init=setup_mixins, mixins=[LearningRateSchedule, EntropyCoeffSchedule, ConditionalActionMixin] ) + def get_vtrace_policy_class(config): if config['framework'] == 'torch': return ConditionalActionVTraceTorchPolicy else: raise NotImplementedError('Tensorflow not supported') + ConditionalActionImpalaTrainer = ImpalaTrainer.with_updates(default_policy=ConditionalActionVTraceTorchPolicy, - get_policy_class=get_vtrace_policy_class) \ No newline at end of file + get_policy_class=get_vtrace_policy_class) diff --git a/python/griddly/util/rllib/wrappers/__init__.py b/python/griddly/util/rllib/wrappers/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/python/griddly/util/rllib/wrappers/core.py b/python/griddly/util/rllib/wrappers/core.py deleted file mode 100644 index d504b57c7..000000000 --- a/python/griddly/util/rllib/wrappers/core.py +++ /dev/null @@ -1,236 +0,0 @@ -from uuid import uuid1 -from collections import defaultdict -from enum import Enum -from typing import Tuple - -import gym -from gym.spaces import Dict -from ray.rllib import MultiAgentEnv -from ray.rllib.utils.typing import MultiAgentDict - -from griddly import GymWrapper -import numpy as np - -from griddly.RenderTools import VideoRecorder - - -class RecordingState(Enum): - NOT_RECORDING = 1 - WAITING_FOR_EPISODE_START = 2 - BEFORE_RECORDING = 3 - RECORDING = 4 - - -class RLlibEnv(GymWrapper): - """ - Wraps a Griddly environment for compatibility with RLLib. 
- - Use the `env_config` in the rllib config to provide Griddly Environment Parameters - - Example: - - Firstly register the RLlibWrapper using rllib's - - env_name = "my_env_name" - - register_env(env_name, RLlibWrapper) - - you can then configure it - - rllib_config = { - 'env_config': { - 'yaml_file': 'Single-Player/GVGAI/butterflies.yaml', - 'level": 6, - 'player_observer_type': gd.ObserverType.SPRITE_2D, - 'global_observer_type': gd.ObserverType.ISOMETRIC, - 'max_steps': 1000, - }, - # Other configuration options - } - - Create the rllib trainer using this config: - - trainer = ImpalaTrainer(rllib_config, env=env_name) - - """ - - def __init__(self, env_config): - super().__init__(**env_config) - - self.invalid_action_masking = env_config.get('invalid_action_masking', False) - self.conditional_action_sampling = env_config.get('conditional_action_sampling', False) - self._record_video_config = env_config.get('record_video_config', None) - self._random_level_on_reset = env_config.get('random_level_on_reset', False) - - super().reset() - - self._recording_state = None - self._env_steps = 0 - - if self._record_video_config is not None: - self._recording_state = RecordingState.BEFORE_RECORDING - self._record_frequency = self._record_video_config.get('frequency', 1000) - - self.set_transform() - - def _get_player_action_tree(self, player_id): - - valid_action_tree = defaultdict(lambda: defaultdict(lambda: defaultdict(defaultdict))) - for location, action_names in self.game.get_available_actions(player_id).items(): - for action_name, action_ids in self.game.get_available_action_ids(location, list(action_names)).items(): - if len(action_ids) > 0: - valid_action_tree[location[0]][location[1]][self.action_names.index(action_name)] = action_ids - return valid_action_tree - - def _build_valid_action_trees(self): - player_valid_action_trees = [] - - if self.player_count > 0: - for p in range(self.player_count): - player_valid_action_trees.append(self._get_player_action_tree(p + 1)) - - else: - player_valid_action_trees.append(self._get_player_action_tree(1)) - - return player_valid_action_trees - - def _transform(self, observation): - - if self.player_count > 1: - transformed_obs = [obs.transpose(1, 2, 0).astype(np.float) for obs in observation] - else: - transformed_obs = observation.transpose(1, 2, 0).astype(np.float) - - return transformed_obs - - def _after_step(self, observation, reward, done, info): - if self._recording_state is not None: - if self._recording_state is RecordingState.NOT_RECORDING and self._env_steps % self._record_frequency == 0: - self._recording_state = RecordingState.WAITING_FOR_EPISODE_START - - if self._recording_state == RecordingState.BEFORE_RECORDING: - global_obs = self.render(observer='global', mode='rgb_array') - self._global_recorder = VideoRecorder() - self._global_recorder.start(f'global_video_{uuid1()}_{self._env_steps}.mp4', global_obs.shape) - self._recording_state = RecordingState.RECORDING - - if self._recording_state == RecordingState.RECORDING: - global_obs = self.render(observer='global', mode='rgb_array') - self._global_recorder.add_frame(global_obs) - if done: - self._recording_state = RecordingState.NOT_RECORDING - self._global_recorder.close() - - if self._recording_state == RecordingState.WAITING_FOR_EPISODE_START: - if done: - self._recording_state = RecordingState.BEFORE_RECORDING - - def set_transform(self): - """ - Create the transform for rllib based on the observation space - """ - - if self.player_count > 1: - self.observation_space = 
self.observation_space[0] - self.action_space = self.action_space[0] - - self.observation_space = gym.spaces.Box( - self.observation_space.low.transpose((1, 2, 0)).astype(np.float), - self.observation_space.high.transpose((1, 2, 0)).astype(np.float), - dtype=np.float, - ) - - self.height = self.observation_space.shape[0] - self.width = self.observation_space.shape[1] - - def reset(self, **kwargs): - - if self._random_level_on_reset: - kwargs['level_id'] = np.random.choice(self.level_count) - observation = super().reset(**kwargs) - self.set_transform() - - if self.conditional_action_sampling: - self.last_valid_action_trees = self._build_valid_action_trees() - - return self._transform(observation) - - def step(self, action): - observation, reward, done, info = super().step(action) - - self._after_step(observation, reward, done, info) - - self._env_steps += 1 - - if self.conditional_action_sampling: - self.last_valid_action_trees = self._build_valid_action_trees() - info['valid_action_trees'] = self.last_valid_action_trees - - return self._transform(observation), reward, done, info - - def render(self, mode='human', observer=0): - return super().render(mode, observer='global') - - -class RLlibMultiAgentWrapper(gym.Wrapper, MultiAgentEnv): - - def __init__(self, env, env_config): - super().__init__(env) - - self._player_done_variable = env_config.get('player_done_variable', None) - - # Used to keep track of agents that are active in the environment - self._active_agents = set() - - assert self.player_count > 1, 'RLlibMultiAgentWrapper can only be used with environments that have multiple agents' - - def _to_multi_agent_map(self, data): - return {a: data[a - 1] for a in self._active_agents} - - def reset(self, **kwargs): - obs = super().reset(**kwargs) - self._active_agents.update([a + 1 for a in range(self.player_count)]) - return self._to_multi_agent_map(obs) - - def _resolve_player_done_variable(self): - resolved_variables = self.game.get_global_variable([self._player_done_variable]) - return resolved_variables[self._player_done_variable] - - def step(self, action_dict: MultiAgentDict): - actions_array = np.zeros((self.player_count, *self.action_space.shape)) - for agent_id, action in action_dict.items(): - actions_array[agent_id - 1] = action - - obs, reward, all_done, info = super().step(actions_array) - - done_map = {'__all__': all_done} - - if self._player_done_variable is not None: - griddly_players_done = self._resolve_player_done_variable() - - for agent_id in self._active_agents: - done_map[agent_id] = griddly_players_done[agent_id] == 1 or all_done - else: - for p in range(self.player_count): - done_map[p] = False - - if self.invalid_action_masking: - info_map = self._to_multi_agent_map([ - {'valid_action_tree': valid_action_tree} for valid_action_tree in info['valid_action_trees'] - ]) - else: - info_map = self._to_multi_agent_map(defaultdict(dict)) - - obs_map = self._to_multi_agent_map(obs) - reward_map = self._to_multi_agent_map(reward) - - # Finally remove any agent ids that are done - for agent_id, is_done in done_map.items(): - if is_done: - self._active_agents.discard(agent_id) - - assert len(obs_map) == len(reward_map) - assert len(obs_map) == len(done_map) - 1 - assert len(obs_map) == len(info_map) - - return obs_map, reward_map, done_map, info_map From cc9ee0367cb07d7ce3b346f7b31fd705a694f3cc Mon Sep 17 00:00:00 2001 From: Bam4d Date: Tue, 16 Mar 2021 15:10:56 +0000 Subject: [PATCH 03/45] experiments and associated code for action space work --- .gitignore | 3 + 
bindings/wrapper/GameWrapper.cpp | 80 ++--
 .../clusters_po.yaml | 310 +++++++++++++
 .../clusters_po_with_push.yaml | 330 ++++++++++++++
 ...rs_po_with_push_separate_colors_units.yaml | 0
 ...clusters_po_with_push_seperate_colors.yaml | 409 ++++++++++++++++++
 .../clusters_po_with_push_units.yaml | 0
 .../rllib_conditional_actions.py | 81 ++++
 python/examples/rllib/rllib_single_agent.py | 2 +-
 .../rllib_single_agent_conditional_actions.py | 28 +-
 python/griddly/GymWrapper.py | 3 +
 python/griddly/RenderTools.py | 1 +
 python/griddly/util/rllib/callbacks.py | 58 +++
 .../conditional_action_exploration.py | 79 +++-
 .../conditional_action_mixin.py | 57 ++-
 .../conditional_action_policy_trainer.py | 3 +-
 16 files changed, 1361 insertions(+), 83 deletions(-)
 create mode 100644 python/examples/experiments/conditional_action_spaces/clusters_po.yaml
 create mode 100644 python/examples/experiments/conditional_action_spaces/clusters_po_with_push.yaml
 create mode 100644 python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors_units.yaml
 create mode 100644 python/examples/experiments/conditional_action_spaces/clusters_po_with_push_seperate_colors.yaml
 create mode 100644 python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml
 create mode 100644 python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
 create mode 100644 python/griddly/util/rllib/callbacks.py

diff --git a/.gitignore b/.gitignore
index fdba0dac0..2531ed319 100644
--- a/.gitignore
+++ b/.gitignore
@@ -137,3 +137,6 @@ rules.ninja

 # misc
 bin/
+
+# wandb
+wandb/
diff --git a/bindings/wrapper/GameWrapper.cpp b/bindings/wrapper/GameWrapper.cpp
index da04ae768..39ff52cd5 100644
--- a/bindings/wrapper/GameWrapper.cpp
+++ b/bindings/wrapper/GameWrapper.cpp
@@ -7,6 +7,30 @@
 #include "StepPlayerWrapper.cpp"

 namespace griddly {
+
+class ValidActionNode {
+ public:
+  std::unordered_map<uint32_t, std::shared_ptr<ValidActionNode>> children;
+
+  bool contains(uint32_t value) {
+    return children.find(value) != children.end();
+  }
+
+  void add(uint32_t value) {
+    children[value] = std::shared_ptr<ValidActionNode>(new ValidActionNode());
+  }
+
+  static py::dict toPyDict(std::shared_ptr<ValidActionNode> node) {
+    py::dict py_dict;
+    for(auto child: node->children) {
+      py_dict[py::cast(child.first)] = toPyDict(child.second);
+    }
+
+    return py_dict;
+  }
+};
+
+
 class Py_GameWrapper {
  public:
  Py_GameWrapper(ObserverType globalObserverType, std::shared_ptr<GDYFactory> gdyFactory)
@@ -39,62 +63,68 @@ class Py_GameWrapper {
     return player;
   }

+  const uint32_t getActionTypeId(std::string actionName) const {
+    auto actionNames = gdyFactory_->getExternalActionNames();
+    for(int i = 0; i < actionNames.size(); i++) {
+      if(actionNames[i] == actionName) {
+        return i;
+      }
+    }
+    throw std::runtime_error("unregistered action");
+  }
+
   std::vector<py::dict> buildValidActionTrees() const {
     std::vector<py::dict> valid_action_trees;
     auto externalActionNames = gdyFactory_->getExternalActionNames();
     for (int playerId = 1; playerId <= playerCount_; playerId++) {
-      py::dict valid_action_tree;
+      std::shared_ptr<ValidActionNode> node = std::shared_ptr<ValidActionNode>(new ValidActionNode());
       for (auto actionNamesAtLocation : gameProcess_->getAvailableActionNames(playerId)) {
         auto location = actionNamesAtLocation.first;
         auto actionNames = actionNamesAtLocation.second;

         for (auto actionName : actionNames) {
-          auto& treePtr = valid_action_tree;
+          std::shared_ptr<ValidActionNode> treePtr = node;
           auto actionInputsDefinitions = gdyFactory_->getActionInputsDefinitions();
           if (actionInputsDefinitions.find(actionName) != actionInputsDefinitions.end()) {
             auto locationVec = glm::ivec2{location[0], location[1]};
             auto actionIdsForName = gameProcess_->getAvailableActionIdsAtLocation(locationVec, actionName);

             if
(actionIdsForName.size() > 0) { - if (gdyFactory_->getAvatarObject().length() == 0) { - auto py_x = py::cast(locationVec[0]); - auto py_y = py::cast(locationVec[1]); - if(!treePtr.contains(py_x)) { - treePtr[py_x] = py::dict(); - } + // if (gdyFactory_->getAvatarObject().length() == 0) { + // auto py_x = py::cast(locationVec[0]); + // auto py_y = py::cast(locationVec[1]); + // if(!treePtr.contains(py_x)) { + // (*treePtr)[py_x] = py::dict(); + // } - treePtr = treePtr[py_x]; + // treePtr = treePtr[py_x]; - if(!treePtr.contains(py_y)) { - treePtr[py_y] = py::dict(); - } + // if(!treePtr.contains(py_y)) { + // treePtr[py_y] = py::dict(); + // } - treePtr = treePtr[py_y]; - } + // treePtr = treePtr[py_y]; + // } if (externalActionNames.size() > 1) { - auto py_actionName = py::cast(actionName); - if(!treePtr.contains(py_actionName)) { - treePtr[py_actionName] = py::dict(); + auto actionTypeId = getActionTypeId(actionName); + if(!treePtr->contains(actionTypeId)) { + treePtr->add(actionTypeId); } - treePtr = treePtr[py_actionName]; + treePtr = treePtr->children[actionTypeId]; } for(auto id : actionIdsForName) { - auto py_id = py::cast(id); - treePtr[py_id] = py::dict(); + treePtr->add(id); } - - auto py_nop = py::cast(0); - treePtr[py_nop] = py::dict(); - + treePtr->add(0); } } } } - valid_action_trees.push_back(valid_action_tree); + valid_action_trees.push_back(ValidActionNode::toPyDict(node)); } return valid_action_trees; diff --git a/python/examples/experiments/conditional_action_spaces/clusters_po.yaml b/python/examples/experiments/conditional_action_spaces/clusters_po.yaml new file mode 100644 index 000000000..3d8b70722 --- /dev/null +++ b/python/examples/experiments/conditional_action_spaces/clusters_po.yaml @@ -0,0 +1,310 @@ +Version: "0.1" +Environment: + Name: Partially Observable Clusters + Description: Cluster the coloured objects together by pushing them against the static coloured blocks. + Observers: + Sprite2D: + TileSize: 24 + BackgroundTile: oryx/oryx_fantasy/floor1-2.png + Variables: + - Name: box_count + InitialValue: 0 + Player: + Observer: + RotateWithAvatar: true + TrackAvatar: true + Height: 5 + Width: 5 + OffsetX: 0 + OffsetY: 2 + AvatarObject: avatar # The player can only control a single avatar in the game + Termination: + Win: + - eq: [box_count, 0] + Lose: + - eq: [broken_box:count, 1] + - eq: [avatar:count, 0] + Levels: + - | + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . 1 1 . . . 2 . 2 . w + w . . . . 1 . . . . . . w + w . . . a . . . . . 2 . w + w . . . . . . . h . . . w + w . . . . 1 . . . . b . w + w . . . . . . 1 . . . . w + w . . . . . . . . A . . w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . 1 . . 2 . c 3 . . w + w . . . . h . . h . . . w + w . . . 2 . . 3 . . 1 . w + w . . . . b . . h . . . w + w . . 3 . . . 2 . . 1 . w + w . . h . h . . . a . . w + w . . . . . A . . . . . w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . . a . . b . . c . . w + w . . . . . . . . . . . w + w . . . . . . . . . . . w + w h h h h h . h h h h h w + w . . . . h . h . . . . w + w . 1 2 . h . h . 1 3 . w + w . 3 . . . . . . . 2 . w + w . . . . . A . . . . . w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . . 1 . 2 . . c . . w + w . . . . . 3 . . 3 . . w + w . . a . 2 . . . h . . w + w . . . . h h . 3 . . . w + w . . 1 . . . . . 2 . . w + w . . . . . 1 . . b . . w + w . . . . . A . . . . . 
w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . . . . . 1 . . . . w + w . . h . . b . . h . . w + w . . . . 1 . . . . . . w + w . . 3 . . . . 2 . . . w + w . . . a . h . . c . . w + w . . . . 3 . . . . 2 . w + w . . . . . A . . . . . w + w w w w w w w w w w w w w + +Actions: + + # A simple action to count the number of boxes in the game at the start + # Not currently a way to do complex things in termination conditions like combine multiple conditions + - Name: box_counter + InputMapping: + Internal: true + Inputs: + 1: + Description: "The only action here is to increment the box count" + Behaviours: + - Src: + Object: [blue_box, red_box, green_box] + Commands: + - incr: box_count + Dst: + Object: [blue_box, red_box, green_box] + + # Define the move action + - Name: move + InputMapping: + Inputs: + 1: + Description: Rotate left + OrientationVector: [-1, 0] + 2: + Description: Move forwards + OrientationVector: [0, -1] + VectorToDest: [0, -1] + 3: + Description: Rotate right + OrientationVector: [1, 0] + Relative: true + Behaviours: + + # Avatar rotates + - Src: + Object: avatar + Commands: + - rot: _dir + Dst: + Object: avatar + + # Avatar and boxes can move into empty space + - Src: + Object: [avatar, blue_box, green_box, red_box] + Commands: + - mov: _dest + Dst: + Object: _empty + + # Boxes can be pushed by the avatar + - Src: + Object: avatar + Commands: + - mov: _dest + Dst: + Object: [blue_box, green_box, red_box] + Commands: + - cascade: _dest + + # When boxes are pushed against the blocks they change + - Src: + Object: blue_box + Commands: + - change_to: blue_block + - reward: 1 + - decr: box_count + Dst: + Object: blue_block + - Src: + Object: red_box + Commands: + - reward: 1 + - change_to: red_block + - decr: box_count + Dst: + Object: red_block + - Src: + Object: green_box + Commands: + - reward: 1 + - change_to: green_block + - decr: box_count + Dst: + Object: green_block + + # Boxes break if they hit the spikes + - Src: + Object: [blue_box, green_box, red_box] + Commands: + - change_to: broken_box + - reward: -1 + Dst: + Object: spike + + # Avatar dies if it hits the spikes + - Src: + Object: avatar + Commands: + - remove: true + - reward: -1 + Dst: + Object: spike + +Objects: + - Name: avatar + MapCharacter: A + Observers: + Sprite2D: + - Image: gvgai/oryx/knight1.png + Block2D: + - Shape: triangle + Color: [0.0, 1.0, 0.0] + Scale: 0.8 + + - Name: wall + MapCharacter: w + Observers: + Sprite2D: + - TilingMode: WALL_16 + Image: + - oryx/oryx_fantasy/wall1-0.png + - oryx/oryx_fantasy/wall1-1.png + - oryx/oryx_fantasy/wall1-2.png + - oryx/oryx_fantasy/wall1-3.png + - oryx/oryx_fantasy/wall1-4.png + - oryx/oryx_fantasy/wall1-5.png + - oryx/oryx_fantasy/wall1-6.png + - oryx/oryx_fantasy/wall1-7.png + - oryx/oryx_fantasy/wall1-8.png + - oryx/oryx_fantasy/wall1-9.png + - oryx/oryx_fantasy/wall1-10.png + - oryx/oryx_fantasy/wall1-11.png + - oryx/oryx_fantasy/wall1-12.png + - oryx/oryx_fantasy/wall1-13.png + - oryx/oryx_fantasy/wall1-14.png + - oryx/oryx_fantasy/wall1-15.png + Block2D: + - Shape: square + Color: [0.5, 0.5, 0.5] + Scale: 0.9 + + - Name: spike + MapCharacter: h + Observers: + Sprite2D: + - Image: gvgai/oryx/spike2.png + Block2D: + - Shape: triangle + Color: [0.9, 0.1, 0.1] + Scale: 0.5 + + - Name: red_box + MapCharacter: "2" + InitialActions: + - Action: box_counter + ActionId: 1 + Observers: + Sprite2D: + - Image: gvgai/newset/blockR.png + Block2D: + - Shape: square + Color: [0.5, 0.2, 0.2] + Scale: 0.5 + - Name: 
red_block + MapCharacter: b + Observers: + Sprite2D: + - Image: gvgai/newset/blockR2.png + Block2D: + - Shape: square + Color: [1.0, 0.0, 0.0] + Scale: 1.0 + + - Name: green_box + MapCharacter: "3" + InitialActions: + - Action: box_counter + ActionId: 1 + Observers: + Sprite2D: + - Image: gvgai/newset/blockG.png + Block2D: + - Shape: square + Color: [0.2, 0.5, 0.2] + Scale: 0.5 + - Name: green_block + MapCharacter: c + Observers: + Sprite2D: + - Image: gvgai/newset/blockG2.png + Block2D: + - Shape: square + Color: [0.0, 1.0, 0.0] + Scale: 1.0 + + - Name: blue_box + MapCharacter: "1" + InitialActions: + - Action: box_counter + ActionId: 1 + Observers: + Sprite2D: + - Image: gvgai/newset/blockB.png + Block2D: + - Shape: square + Color: [0.2, 0.2, 0.5] + Scale: 0.5 + - Name: blue_block + MapCharacter: a + Observers: + Sprite2D: + - Image: gvgai/newset/blockB2.png + Block2D: + - Shape: square + Color: [0.0, 0.0, 1.0] + Scale: 1.0 + + - Name: broken_box + Observers: + Sprite2D: + - Image: gvgai/newset/block3.png + Block2D: + - Shape: triangle + Color: [1.0, 0.0, 1.0] + Scale: 1.0 diff --git a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push.yaml b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push.yaml new file mode 100644 index 000000000..9904e87e7 --- /dev/null +++ b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push.yaml @@ -0,0 +1,330 @@ +Version: "0.1" +Environment: + Name: Partially Observable Clusters + Description: Cluster the coloured objects together by pushing them against the static coloured blocks. + Observers: + Sprite2D: + TileSize: 24 + BackgroundTile: oryx/oryx_fantasy/floor1-2.png + Variables: + - Name: box_count + InitialValue: 0 + Player: + Observer: + RotateWithAvatar: true + TrackAvatar: true + Height: 5 + Width: 5 + OffsetX: 0 + OffsetY: 2 + AvatarObject: avatar # The player can only control a single avatar in the game + Termination: + Win: + - eq: [box_count, 0] + Lose: + - eq: [broken_box:count, 1] + - eq: [avatar:count, 0] + Levels: + - | + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . 1 1 . . . 2 . 2 . w + w . . . . 1 . . . . . . w + w . . . a . . . . . 2 . w + w . . . . . . . h . . . w + w . . . . 1 . . . . b . w + w . . . . . . 1 . . . . w + w . . . . . . . . A . . w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . 1 . . 2 . c 3 . . w + w . . . . h . . h . . . w + w . . . 2 . . 3 . . 1 . w + w . . . . b . . h . . . w + w . . 3 . . . 2 . . 1 . w + w . . h . h . . . a . . w + w . . . . . A . . . . . w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . . a . . b . . c . . w + w . . . . . . . . . . . w + w . . . . . . . . . . . w + w h h h h h . h h h h h w + w . . . . h . h . . . . w + w . 1 2 . h . h . 1 3 . w + w . 3 . . . . . . . 2 . w + w . . . . . A . . . . . w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . . 1 . 2 . . c . . w + w . . . . . 3 . . 3 . . w + w . . a . 2 . . . h . . w + w . . . . h h . 3 . . . w + w . . 1 . . . . . 2 . . w + w . . . . . 1 . . b . . w + w . . . . . A . . . . . w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . . . . . 1 . . . . w + w . . h . . b . . h . . w + w . . . . 1 . . . . . . w + w . . 3 . . . . 2 . . . w + w . . . a . h . . c . . w + w . . . . 3 . . . . 2 . w + w . . . . . A . . . . . 
w + w w w w w w w w w w w w w + +Actions: + + # A simple action to count the number of boxes in the game at the start + # Not currently a way to do complex things in termination conditions like combine multiple conditions + - Name: box_counter + InputMapping: + Internal: true + Inputs: + 1: + Description: "The only action here is to increment the box count" + Behaviours: + - Src: + Object: [blue_box, red_box, green_box] + Commands: + - incr: box_count + Dst: + Object: [blue_box, red_box, green_box] + + # Define the move action + - Name: move + InputMapping: + Inputs: + 1: + Description: Rotate left + OrientationVector: [-1, 0] + 2: + Description: Move forwards + OrientationVector: [0, -1] + VectorToDest: [0, -1] + 3: + Description: Rotate right + OrientationVector: [1, 0] + Relative: true + Behaviours: + + # Avatar rotates + - Src: + Object: avatar + Commands: + - rot: _dir + Dst: + Object: avatar + + # Avatar can move into empty space + - Src: + Object: avatar + Commands: + - mov: _dest + Dst: + Object: _empty + + # Avatar dies if it hits the spikes + - Src: + Object: avatar + Commands: + - remove: true + - reward: -1 + Dst: + Object: spike + + + - Name: push + InputMapping: + Inputs: + 1: + Description: Push Forwards + OrientationVector: [ 0, -1 ] + VectorToDest: [ 0, -1 ] + Relative: true + Behaviours: + + # Boxes can be pushed by the avatar + - Src: + Object: avatar + Commands: + - mov: _dest + Dst: + Object: [blue_box, green_box, red_box] + Commands: + - cascade: _dest + + # Boxes break if they hit the spikes + - Src: + Object: [ blue_box, green_box, red_box ] + Commands: + - change_to: broken_box + - reward: -1 + Dst: + Object: spike + + # Boxes can pushed into empty space + - Src: + Object: [blue_box, green_box, red_box] + Commands: + - mov: _dest + Dst: + Object: _empty + + # When boxes are pushed against the blocks they change + - Src: + Object: blue_box + Commands: + - change_to: blue_block + - reward: 1 + - decr: box_count + Dst: + Object: blue_block + - Src: + Object: red_box + Commands: + - reward: 1 + - change_to: red_block + - decr: box_count + Dst: + Object: red_block + - Src: + Object: green_box + Commands: + - reward: 1 + - change_to: green_block + - decr: box_count + Dst: + Object: green_block + + +Objects: + - Name: avatar + MapCharacter: A + Observers: + Sprite2D: + - Image: gvgai/oryx/knight1.png + Block2D: + - Shape: triangle + Color: [0.0, 1.0, 0.0] + Scale: 0.8 + + - Name: wall + MapCharacter: w + Observers: + Sprite2D: + - TilingMode: WALL_16 + Image: + - oryx/oryx_fantasy/wall1-0.png + - oryx/oryx_fantasy/wall1-1.png + - oryx/oryx_fantasy/wall1-2.png + - oryx/oryx_fantasy/wall1-3.png + - oryx/oryx_fantasy/wall1-4.png + - oryx/oryx_fantasy/wall1-5.png + - oryx/oryx_fantasy/wall1-6.png + - oryx/oryx_fantasy/wall1-7.png + - oryx/oryx_fantasy/wall1-8.png + - oryx/oryx_fantasy/wall1-9.png + - oryx/oryx_fantasy/wall1-10.png + - oryx/oryx_fantasy/wall1-11.png + - oryx/oryx_fantasy/wall1-12.png + - oryx/oryx_fantasy/wall1-13.png + - oryx/oryx_fantasy/wall1-14.png + - oryx/oryx_fantasy/wall1-15.png + Block2D: + - Shape: square + Color: [0.5, 0.5, 0.5] + Scale: 0.9 + + - Name: spike + MapCharacter: h + Observers: + Sprite2D: + - Image: gvgai/oryx/spike2.png + Block2D: + - Shape: triangle + Color: [0.9, 0.1, 0.1] + Scale: 0.5 + + - Name: red_box + MapCharacter: "2" + InitialActions: + - Action: box_counter + ActionId: 1 + Observers: + Sprite2D: + - Image: gvgai/newset/blockR.png + Block2D: + - Shape: square + Color: [0.5, 0.2, 0.2] + Scale: 0.5 + - Name: red_block + 
MapCharacter: b + Observers: + Sprite2D: + - Image: gvgai/newset/blockR2.png + Block2D: + - Shape: square + Color: [1.0, 0.0, 0.0] + Scale: 1.0 + + - Name: green_box + MapCharacter: "3" + InitialActions: + - Action: box_counter + ActionId: 1 + Observers: + Sprite2D: + - Image: gvgai/newset/blockG.png + Block2D: + - Shape: square + Color: [0.2, 0.5, 0.2] + Scale: 0.5 + - Name: green_block + MapCharacter: c + Observers: + Sprite2D: + - Image: gvgai/newset/blockG2.png + Block2D: + - Shape: square + Color: [0.0, 1.0, 0.0] + Scale: 1.0 + + - Name: blue_box + MapCharacter: "1" + InitialActions: + - Action: box_counter + ActionId: 1 + Observers: + Sprite2D: + - Image: gvgai/newset/blockB.png + Block2D: + - Shape: square + Color: [0.2, 0.2, 0.5] + Scale: 0.5 + - Name: blue_block + MapCharacter: a + Observers: + Sprite2D: + - Image: gvgai/newset/blockB2.png + Block2D: + - Shape: square + Color: [0.0, 0.0, 1.0] + Scale: 1.0 + + - Name: broken_box + Observers: + Sprite2D: + - Image: gvgai/newset/block3.png + Block2D: + - Shape: triangle + Color: [1.0, 0.0, 1.0] + Scale: 1.0 diff --git a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors_units.yaml b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors_units.yaml new file mode 100644 index 000000000..e69de29bb diff --git a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_seperate_colors.yaml b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_seperate_colors.yaml new file mode 100644 index 000000000..bb173e3bc --- /dev/null +++ b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_seperate_colors.yaml @@ -0,0 +1,409 @@ +Version: "0.1" +Environment: + Name: Partially Observable Clusters + Description: Cluster the coloured objects together by pushing them against the static coloured blocks. + Observers: + Sprite2D: + TileSize: 24 + BackgroundTile: oryx/oryx_fantasy/floor1-2.png + Variables: + - Name: box_count + InitialValue: 0 + Player: + Observer: + RotateWithAvatar: true + TrackAvatar: true + Height: 5 + Width: 5 + OffsetX: 0 + OffsetY: 2 + AvatarObject: avatar # The player can only control a single avatar in the game + Termination: + Win: + - eq: [box_count, 0] + Lose: + - eq: [broken_box:count, 1] + - eq: [avatar:count, 0] + Levels: + - | + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . 1 1 . . . 2 . 2 . w + w . . . . 1 . . . . . . w + w . . . a . . . . . 2 . w + w . . . . . . . h . . . w + w . . . . 1 . . . . b . w + w . . . . . . 1 . . . . w + w . . . . . . . . A . . w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . 1 . . 2 . c 3 . . w + w . . . . h . . h . . . w + w . . . 2 . . 3 . . 1 . w + w . . . . b . . h . . . w + w . . 3 . . . 2 . . 1 . w + w . . h . h . . . a . . w + w . . . . . A . . . . . w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . . a . . b . . c . . w + w . . . . . . . . . . . w + w . . . . . . . . . . . w + w h h h h h . h h h h h w + w . . . . h . h . . . . w + w . 1 2 . h . h . 1 3 . w + w . 3 . . . . . . . 2 . w + w . . . . . A . . . . . w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . . 1 . 2 . . c . . w + w . . . . . 3 . . 3 . . w + w . . a . 2 . . . h . . w + w . . . . h h . 3 . . . w + w . . 1 . . . . . 2 . . w + w . . . . . 1 . . b . . w + w . . . . . A . . . . . w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . 
. . . . . . . . . . w + w . . . . . . 1 . . . . w + w . . h . . b . . h . . w + w . . . . 1 . . . . . . w + w . . 3 . . . . 2 . . . w + w . . . a . h . . c . . w + w . . . . 3 . . . . 2 . w + w . . . . . A . . . . . w + w w w w w w w w w w w w w + +Actions: + + # A simple action to count the number of boxes in the game at the start + # Not currently a way to do complex things in termination conditions like combine multiple conditions + - Name: box_counter + InputMapping: + Internal: true + Inputs: + 1: + Description: "The only action here is to increment the box count" + Behaviours: + - Src: + Object: [blue_box, red_box, green_box] + Commands: + - incr: box_count + Dst: + Object: [blue_box, red_box, green_box] + + # Define the move action + - Name: move + InputMapping: + Inputs: + 1: + Description: Rotate left + OrientationVector: [-1, 0] + 2: + Description: Move forwards + OrientationVector: [0, -1] + VectorToDest: [0, -1] + 3: + Description: Rotate right + OrientationVector: [1, 0] + Relative: true + Behaviours: + + # Avatar rotates + - Src: + Object: avatar + Commands: + - rot: _dir + Dst: + Object: avatar + + # Avatar can move into empty space + - Src: + Object: avatar + Commands: + - mov: _dest + Dst: + Object: _empty + + + # Avatar dies if it hits the spikes + - Src: + Object: avatar + Commands: + - remove: true + - reward: -1 + Dst: + Object: spike + + + - Name: push_blue + InputMapping: + Inputs: + 1: + Description: Push Blue + OrientationVector: [ 0, -1 ] + VectorToDest: [ 0, -1 ] + Relative: true + Behaviours: + + # Boxes can be pushed by the avatar + - Src: + Object: avatar + Commands: + - mov: _dest + Dst: + Object: blue_box + Commands: + - cascade: _dest + + # Boxes break if they are pushed into the spikes + - Src: + Object: blue_box + Commands: + - change_to: broken_box + - reward: -1 + Dst: + Object: spike + + # Boxes can pushed into empty space + - Src: + Object: blue_box + Commands: + - mov: _dest + Dst: + Object: _empty + + # When boxes are pushed against the blocks they change + - Src: + Object: blue_box + Commands: + - change_to: blue_block + - reward: 1 + - decr: box_count + Dst: + Object: blue_block + + - Name: push_red + InputMapping: + Inputs: + 1: + Description: Push Red + OrientationVector: [ 0, -1 ] + VectorToDest: [ 0, -1 ] + Relative: true + Behaviours: + + # Boxes can be pushed by the avatar + - Src: + Object: avatar + Commands: + - mov: _dest + Dst: + Object: red_box + Commands: + - cascade: _dest + + # Boxes break if they are pushed into the spikes + - Src: + Object: red_box + Commands: + - change_to: broken_box + - reward: -1 + Dst: + Object: spike + + # Boxes can pushed into empty space + - Src: + Object: red_box + Commands: + - mov: _dest + Dst: + Object: _empty + + # When boxes are pushed against the blocks they change + - Src: + Object: red_box + Commands: + - reward: 1 + - change_to: red_block + - decr: box_count + Dst: + Object: red_block + + - Name: push_green + InputMapping: + Inputs: + 1: + Description: Push Green + OrientationVector: [ 0, -1 ] + VectorToDest: [ 0, -1 ] + Relative: true + Behaviours: + + # Boxes can be pushed by the avatar + - Src: + Object: avatar + Commands: + - mov: _dest + Dst: + Object: green_box + Commands: + - cascade: _dest + + # Boxes break if they are pushed into the spikes + - Src: + Object: green_box + Commands: + - change_to: broken_box + - reward: -1 + Dst: + Object: spike + + # Boxes can pushed into empty space + - Src: + Object: green_box + Commands: + - mov: _dest + Dst: + Object: _empty + + # When boxes are pushed 
+
+Objects:
+  - Name: avatar
+    MapCharacter: A
+    Observers:
+      Sprite2D:
+        - Image: gvgai/oryx/knight1.png
+      Block2D:
+        - Shape: triangle
+          Color: [0.0, 1.0, 0.0]
+          Scale: 0.8
+
+  - Name: wall
+    MapCharacter: w
+    Observers:
+      Sprite2D:
+        - TilingMode: WALL_16
+          Image:
+            - oryx/oryx_fantasy/wall1-0.png
+            - oryx/oryx_fantasy/wall1-1.png
+            - oryx/oryx_fantasy/wall1-2.png
+            - oryx/oryx_fantasy/wall1-3.png
+            - oryx/oryx_fantasy/wall1-4.png
+            - oryx/oryx_fantasy/wall1-5.png
+            - oryx/oryx_fantasy/wall1-6.png
+            - oryx/oryx_fantasy/wall1-7.png
+            - oryx/oryx_fantasy/wall1-8.png
+            - oryx/oryx_fantasy/wall1-9.png
+            - oryx/oryx_fantasy/wall1-10.png
+            - oryx/oryx_fantasy/wall1-11.png
+            - oryx/oryx_fantasy/wall1-12.png
+            - oryx/oryx_fantasy/wall1-13.png
+            - oryx/oryx_fantasy/wall1-14.png
+            - oryx/oryx_fantasy/wall1-15.png
+      Block2D:
+        - Shape: square
+          Color: [0.5, 0.5, 0.5]
+          Scale: 0.9
+
+  - Name: spike
+    MapCharacter: h
+    Observers:
+      Sprite2D:
+        - Image: gvgai/oryx/spike2.png
+      Block2D:
+        - Shape: triangle
+          Color: [0.9, 0.1, 0.1]
+          Scale: 0.5
+
+  - Name: red_box
+    MapCharacter: "2"
+    InitialActions:
+      - Action: box_counter
+        ActionId: 1
+    Observers:
+      Sprite2D:
+        - Image: gvgai/newset/blockR.png
+      Block2D:
+        - Shape: square
+          Color: [0.5, 0.2, 0.2]
+          Scale: 0.5
+  - Name: red_block
+    MapCharacter: b
+    Observers:
+      Sprite2D:
+        - Image: gvgai/newset/blockR2.png
+      Block2D:
+        - Shape: square
+          Color: [1.0, 0.0, 0.0]
+          Scale: 1.0
+
+  - Name: green_box
+    MapCharacter: "3"
+    InitialActions:
+      - Action: box_counter
+        ActionId: 1
+    Observers:
+      Sprite2D:
+        - Image: gvgai/newset/blockG.png
+      Block2D:
+        - Shape: square
+          Color: [0.2, 0.5, 0.2]
+          Scale: 0.5
+  - Name: green_block
+    MapCharacter: c
+    Observers:
+      Sprite2D:
+        - Image: gvgai/newset/blockG2.png
+      Block2D:
+        - Shape: square
+          Color: [0.0, 1.0, 0.0]
+          Scale: 1.0
+
+  - Name: blue_box
+    MapCharacter: "1"
+    InitialActions:
+      - Action: box_counter
+        ActionId: 1
+    Observers:
+      Sprite2D:
+        - Image: gvgai/newset/blockB.png
+      Block2D:
+        - Shape: square
+          Color: [0.2, 0.2, 0.5]
+          Scale: 0.5
+  - Name: blue_block
+    MapCharacter: a
+    Observers:
+      Sprite2D:
+        - Image: gvgai/newset/blockB2.png
+      Block2D:
+        - Shape: square
+          Color: [0.0, 0.0, 1.0]
+          Scale: 1.0
+
+  - Name: broken_box
+    Observers:
+      Sprite2D:
+        - Image: gvgai/newset/block3.png
+      Block2D:
+        - Shape: triangle
+          Color: [1.0, 0.0, 1.0]
+          Scale: 1.0
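The GDY above can be sanity-checked outside of RLlib with the plain gym wrapper before launching a full training run. A minimal sketch, assuming the YAML file sits in the working directory (the observer choice is illustrative):

    from griddly import GymWrapper, gd

    env = GymWrapper('clusters_po_with_push_seperate_colors.yaml',
                     global_observer_type=gd.ObserverType.SPRITE_2D)
    env.reset()
    # one random step; the action space covers move plus the three push actions
    obs, reward, done, info = env.step(env.action_space.sample())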
diff --git a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
new file mode 100644
index 000000000..789bc4c6c
--- /dev/null
+++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
@@ -0,0 +1,81 @@
+import os
+import sys
+
+import ray
+from ray import tune
+from ray.rllib.models import ModelCatalog
+from ray.tune.integration.wandb import WandbLoggerCallback
+from ray.tune.registry import register_env
+
+from griddly import gd
+from griddly.util.rllib.env.core import RLlibEnv
+# from griddly.util.rllib.callbacks import GriddlyCallbacks
+from griddly.util.rllib.torch import GAPAgent
+from griddly.util.rllib.torch.conditional_actions.conditional_action_policy_trainer import \
+    ConditionalActionImpalaTrainer
+
+if __name__ == '__main__':
+    sep = os.pathsep
+    os.environ['PYTHONPATH'] = sep.join(sys.path)
+
+    yaml_file = os.path.realpath('clusters_po_with_push_seperate_colors.yaml')
+
+    ray.init(num_gpus=1)
+    # ray.init(num_gpus=1, local_mode=True)
+
+    env_name = "ray-griddly-env"
+
+    register_env(env_name, RLlibEnv)
+    ModelCatalog.register_custom_model("GAP", GAPAgent)
+
+    wandbLoggerCallback = WandbLoggerCallback(
+        project='conditional_actions',
+        api_key_file='~/.wandb_rc'
+    )
+
+    max_training_steps = 5000000
+
+    config = {
+        'framework': 'torch',
+        'num_workers': 4,
+        'num_envs_per_worker': 4,
+
+        # 'callbacks': GriddlyCallbacks,
+
+        'model': {
+            'custom_model': 'GAP',
+            'custom_model_config': {}
+        },
+        'env': env_name,
+        'env_config': {
+            'record_video_config': {
+                'frequency': 100000,
+                'directory': 'videos'
+            },
+
+            'allow_nop': tune.grid_search([True, False]),
+            'invalid_action_masking': tune.grid_search(['none', 'conditional', 'collapsed']),
+            # 'invalid_action_masking': 'collapsed',
+            # 'allow_nop': False,
+            'generate_valid_action_trees': tune.grid_search([True, False]),
+            'random_level_on_reset': True,
+            'yaml_file': yaml_file,
+            'global_observer_type': gd.ObserverType.SPRITE_2D,
+            'max_steps': 1000,
+        },
+        'entropy_coeff_schedule': [
+            [0, 0.01],
+            [max_training_steps, 0.0]
+        ],
+        'lr_schedule': [
+            [0, 0.0005],
+            [max_training_steps, 0.0]
+        ],
+
+    }
+
+    stop = {
+        "timesteps_total": max_training_steps,
+    }
+
+    result = tune.run(ConditionalActionImpalaTrainer, config=config, stop=stop, callbacks=[wandbLoggerCallback])
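Note that the grid search in this config multiplies out quickly: two values for allow_nop, three for invalid_action_masking and two for generate_valid_action_trees give 2 x 3 x 2 = 12 tune trials, each running to max_training_steps timesteps.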
diff --git a/python/examples/rllib/rllib_single_agent.py b/python/examples/rllib/rllib_single_agent.py
index eda9d41e3..de96b6589 100644
--- a/python/examples/rllib/rllib_single_agent.py
+++ b/python/examples/rllib/rllib_single_agent.py
@@ -49,7 +49,7 @@
             [max_training_steps, 0.0]
         ],
         'lr_schedule': [
-            [0, 0.005],
+            [0, 0.0005],
             [max_training_steps, 0.0]
         ]
     }
diff --git a/python/examples/rllib/rllib_single_agent_conditional_actions.py b/python/examples/rllib/rllib_single_agent_conditional_actions.py
index 8e1920751..a7d0b4a61 100644
--- a/python/examples/rllib/rllib_single_agent_conditional_actions.py
+++ b/python/examples/rllib/rllib_single_agent_conditional_actions.py
@@ -16,18 +16,19 @@
     os.environ['PYTHONPATH'] = sep.join(sys.path)
 
     ray.init(num_gpus=1)
+    #ray.init(num_gpus=1, local_mode=True)
 
     env_name = "ray-griddly-env"
 
     register_env(env_name, RLlibEnv)
     ModelCatalog.register_custom_model("GAP", GAPAgent)
 
-    max_training_steps = 5000000
+    max_training_steps = 2000000
 
     config = {
         'framework': 'torch',
-        'num_workers': 6,
-        'num_envs_per_worker': 2,
+        'num_workers': 8,
+        'num_envs_per_worker': 4,
 
         'model': {
             'custom_model': 'GAP',
@@ -39,21 +40,22 @@
                 'frequency': 100000
             },
 
-            'invalid_action_masking': tune.grid_search([True, False]),
-            'generate_valid_action_trees': tune.grid_search([True, False]),
+            'allow_nop': tune.grid_search([True, False]),
+            'invalid_action_masking': tune.grid_search(['none', 'conditional']),
+            'generate_valid_action_trees': True,
             'random_level_on_reset': True,
             'yaml_file': 'Single-Player/GVGAI/clusters_partially_observable.yaml',
             'global_observer_type': gd.ObserverType.SPRITE_2D,
             'max_steps': 1000,
         },
-        'entropy_coeff_schedule': [
-            [0, 0.01],
-            [max_training_steps, 0.0]
-        ],
-        'lr_schedule': [
-            [0, 0.005],
-            [max_training_steps, 0.0]
-        ],
+        #'entropy_coeff_schedule': [
+        #    [0, 0.01],
+        #    [max_training_steps, 0.0]
+        #],
+        #'lr_schedule': [
+        #    [0, 0.005],
+        #    [max_training_steps, 0.0]
+        #],
     }
diff --git a/python/griddly/GymWrapper.py b/python/griddly/GymWrapper.py
index 1879b82b2..d4922f659 100644
--- a/python/griddly/GymWrapper.py
+++ b/python/griddly/GymWrapper.py
@@ -39,6 +39,7 @@ def __init__(self, yaml_file=None, level=0, global_observer_type=gd.ObserverType
 
             if level is not None:
                 self.game.load_level(level)
+                self.level_id = level
 
         # if we are loading a copy of the game
         elif gdy is not None and game is not None:
@@ -150,8 +151,10 @@ def reset(self, level_id=None, level_string=None, global_observations=False):
 
         if level_string is not None:
             self.game.load_level_string(level_string)
+            self.level_id = 'custom'
         elif level_id is not None:
             self.game.load_level(level_id)
+            self.level_id = level_id
 
         self.game.reset()
diff --git a/python/griddly/RenderTools.py b/python/griddly/RenderTools.py
index 6fe4714bd..73029561d 100644
--- a/python/griddly/RenderTools.py
+++ b/python/griddly/RenderTools.py
@@ -83,6 +83,7 @@ def start(self, output_file, observation_shape, fps=30):
         :param fps:
         :return:
         """
+        self.output_file = output_file
         self._image_encoder = ImageEncoder(output_file, observation_shape, fps, fps)
 
     def add_frame(self, observation):
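The recorder that gains the output_file attribute above has a small API; a rough usage sketch with a dummy frame (shapes and filename are illustrative):

    import numpy as np
    from griddly.RenderTools import VideoRecorder

    frame = np.zeros((128, 128, 3), dtype=np.uint8)   # stand-in for a global observation
    recorder = VideoRecorder()
    recorder.start('episode.mp4', frame.shape, fps=30)
    recorder.add_frame(frame)             # called once per step while recording
    recorder.close()
    print(recorder.output_file)           # readable afterwards thanks to the change above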
diff --git a/python/griddly/util/rllib/callbacks.py b/python/griddly/util/rllib/callbacks.py
new file mode 100644
index 000000000..b1c0587f1
--- /dev/null
+++ b/python/griddly/util/rllib/callbacks.py
@@ -0,0 +1,58 @@
+from typing import Optional, Dict
+
+from ray.rllib import Policy, SampleBatch, BaseEnv
+from ray.rllib.agents.callbacks import DefaultCallbacks
+from ray.rllib.evaluation import MultiAgentEpisode
+from ray.rllib.utils.typing import AgentID, PolicyID
+
+
+# from wandb import Video
+
+
+class GriddlyCallbacks(DefaultCallbacks):
+
+    def __init__(self, legacy_callbacks_dict: Dict[str, callable] = None):
+        super().__init__(legacy_callbacks_dict)
+
+        # self._videos = {}
+
+    def on_episode_start(self, *, worker: "RolloutWorker", base_env: BaseEnv, policies: Dict[PolicyID, Policy],
+                         episode: MultiAgentEpisode, env_index: Optional[int] = None, **kwargs) -> None:
+        super().on_episode_start(worker=worker, base_env=base_env, policies=policies, episode=episode,
+                                 env_index=env_index, **kwargs)
+
+    def on_episode_step(self, *, worker: "RolloutWorker", base_env: BaseEnv, episode: MultiAgentEpisode,
+                        env_index: Optional[int] = None, **kwargs) -> None:
+        super().on_episode_step(worker=worker, base_env=base_env, episode=episode, env_index=env_index, **kwargs)
+
+    def on_episode_end(self, *, worker: "RolloutWorker", base_env: BaseEnv, policies: Dict[PolicyID, Policy],
+                       episode: MultiAgentEpisode, env_index: Optional[int] = None, **kwargs) -> None:
+        super().on_episode_end(worker=worker, base_env=base_env, policies=policies, episode=episode,
+                               env_index=env_index, **kwargs)
+        # if not worker.multiagent:
+        #     info = episode.last_info_for()
+        #     if 'video' in info:
+        #         video_info = info['video']
+        #         self._videos[video_info['level']] = video_info['path']
+
+    def on_postprocess_trajectory(self, *, worker: "RolloutWorker", episode: MultiAgentEpisode, agent_id: AgentID,
+                                  policy_id: PolicyID, policies: Dict[PolicyID, Policy],
+                                  postprocessed_batch: SampleBatch, original_batches: Dict[AgentID, SampleBatch],
+                                  **kwargs) -> None:
+        super().on_postprocess_trajectory(worker=worker, episode=episode, agent_id=agent_id, policy_id=policy_id,
+                                          policies=policies, postprocessed_batch=postprocessed_batch,
+                                          original_batches=original_batches, **kwargs)
+
+    def on_sample_end(self, *, worker: "RolloutWorker", samples: SampleBatch, **kwargs) -> None:
+        super().on_sample_end(worker=worker, samples=samples, **kwargs)
+
+    def on_learn_on_batch(self, *, policy: Policy, train_batch: SampleBatch, **kwargs) -> None:
+        # TODO: extract any video from infos when this API is updated
+        super().on_learn_on_batch(policy=policy, train_batch=train_batch, **kwargs)
+
+    def on_train_result(self, *, trainer, result: dict, **kwargs) -> None:
+        super().on_train_result(trainer=trainer, result=result, **kwargs)
+
+        # for level, path in self._videos.items():
+        #     result[f'level_{level}'] = Video(path)
+        #     del self._videos[level]
diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py
index 4f6a513d0..be92f86a1 100644
--- a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py
+++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py
@@ -1,12 +1,13 @@
+from collections import defaultdict
+
 import torch
 from gym.spaces import Discrete, MultiDiscrete
-from ray.rllib.models.torch.torch_action_dist import TorchCategorical, TorchMultiCategorical
 from torch.distributions import Categorical
 import numpy as np
 
 
 class TorchConditionalMaskingExploration():
 
-    def __init__(self, model, dist_inputs, valid_action_trees, explore=False, invalid_action_masking=False):
+    def __init__(self, model, dist_inputs, valid_action_trees, explore=False, invalid_action_masking='none', allow_nop=False):
         self._valid_action_trees = valid_action_trees
 
         self._num_inputs = dist_inputs.shape[0]
@@ -19,25 +20,68 @@ def __init__(self, model, dist_inputs, valid_action_trees, explore=False, invali
         self._num_action_parts = len(self._action_space_shape)
 
         self._invalid_action_masking = invalid_action_masking
+        self._allow_nop = allow_nop
         self._explore = explore
 
         self._inputs_split = dist_inputs.split(tuple(self._action_space_shape), dim=1)
 
-    def _mask_and_sample(self, options, logits):
+    def _mask_and_sample(self, options, logits, is_parameters=False):
 
-        #if self._invalid_action_masking:
         mask = torch.zeros([logits.shape[0]]).to(logits.device)
         mask[options] = 1
-        logits += torch.log(mask)
-        #else:
-        #    mask = torch.ones([logits.shape[0]])
 
-        dist = Categorical(logits=logits)
+        if is_parameters:
+            if not self._allow_nop and len(options) > 1:
+                mask[0] = 0
+
+        masked_logits = logits + torch.log(mask)
+
+        dist = Categorical(logits=masked_logits)
         sampled = dist.sample()
-        logp = dist.log_prob(sampled)
 
-        return sampled, logits, logp, mask
+        if self._invalid_action_masking != 'none':
+            logp = dist.log_prob(sampled)
+            out_logits = masked_logits
+        else:
+            mask = torch.ones([logits.shape[0]])
+            dist = Categorical(logits=logits)
+            logp = dist.log_prob(sampled)
+            out_logits = logits
+
+        return sampled, out_logits, logp, mask
+
+    def _merge_all_branches(self, tree):
+        all_nodes = {}
+        merged_tree = {}
+        for k, v in tree.items():
+            v = self._merge_all_branches(v)
+            all_nodes.update(v)
+
+        for k in tree.keys():
+            merged_tree[k] = all_nodes
+
+        return merged_tree
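_merge_all_branches is what the 'collapsed' mode uses: at every depth the available options become the union of the options across all sibling branches, so each part of the action is masked independently and the conditionality between parts is deliberately discarded. A small worked example with hypothetical tree values:

    tree = {1: {0: {3: {}}},
            2: {1: {4: {}}}}
    # _merge_all_branches(tree) ->
    # {1: {0: {3: {}}, 1: {4: {}}},
    #  2: {0: {3: {}}, 1: {4: {}}}}
    # both action types now appear to accept both parameter branches
    # (branches that share a key overwrite each other in the union)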
+    def _process_valid_action_tree(self, valid_action_tree):
+        subtree = valid_action_tree
+        subtree_options = list(subtree.keys())
+
+        # In the case there are no available actions for the player
+        if len(subtree_options) == 0:
+            build_tree = subtree
+            for _ in range(self._num_action_parts):
+                build_tree[0] = {}
+                build_tree = build_tree[0]
+            subtree_options = list(subtree.keys())
+
+        # If we want very basic action masking where parameterized masks are superimposed we use this
+        if self._invalid_action_masking == 'collapsed':
+            subtree = self._merge_all_branches(valid_action_tree)
+            subtree_options = list(subtree.keys())
+
+        return subtree, subtree_options
 
     def get_actions_and_mask(self):
 
@@ -51,22 +95,15 @@ def get_actions_and_mask(self):
 
         for i in range(self._num_inputs):
             if len(self._valid_action_trees) >= 1:
 
-                subtree = self._valid_action_trees[i]
-                subtree_options = list(subtree.keys())
-
-                # In the case there are no available actions for the player
-                if len(subtree_options) == 0:
-                    build_tree = subtree
-                    for _ in range(self._num_action_parts):
-                        build_tree[0] = {}
-                        build_tree = build_tree[0]
-                    subtree_options = list(subtree.keys())
+                subtree, subtree_options = self._process_valid_action_tree(self._valid_action_trees[i])
 
                 logp_parts = torch.zeros([self._num_action_parts])
                 mask_offset = 0
                 for a in range(self._num_action_parts):
+
                     dist_part = self._inputs_split[a]
-                    sampled, masked_part_logits, logp, mask_part = self._mask_and_sample(subtree_options, dist_part[i])
+                    is_parameters = a==(self._num_action_parts-1)
+                    sampled, masked_part_logits, logp, mask_part = self._mask_and_sample(subtree_options, dist_part[i], is_parameters)
 
                     # Set the action and the mask for each part of the action
                     actions[i, a] = sampled
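The masking in _mask_and_sample relies on log(0) being -inf, which drives the softmax probability of masked options to exactly zero. A self-contained illustration:

    import torch
    from torch.distributions import Categorical

    logits = torch.tensor([1.0, 2.0, 3.0])
    mask = torch.tensor([0.0, 1.0, 1.0])              # option 0 is invalid
    dist = Categorical(logits=logits + torch.log(mask))
    # dist.probs -> tensor([0.0000, 0.2689, 0.7311]); option 0 can never be sampled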
diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py
index 199d3f825..5a3c09277 100644
--- a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py
+++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py
@@ -39,31 +39,46 @@ def compute_actions_from_input_dict(
 
             dist_inputs, state_out = self.model(input_dict, state_batches, seq_lens)
 
-            infos = input_dict[SampleBatch.INFOS] if SampleBatch.INFOS in input_dict else {}
-
-            valid_action_trees = []
-            for info in infos:
-                if isinstance(info, dict) and 'valid_action_tree' in info:
-                    valid_action_trees.append(info['valid_action_tree'])
-                else:
-                    valid_action_trees.append({})
-
-            invalid_action_masking = self.config["env_config"].get("invalid_action_masking", False)
-
-            exploration = TorchConditionalMaskingExploration(
-                self.model,
-                dist_inputs,
-                valid_action_trees,
-                explore,
-                invalid_action_masking,
-            )
-
-            actions, masked_logits, logp, mask = exploration.get_actions_and_mask()
+            generate_valid_action_trees = self.config['env_config'].get('generate_valid_action_trees', False)
+            invalid_action_masking = self.config["env_config"].get("invalid_action_masking", 'none')
+            allow_nop = self.config["env_config"].get("allow_nop", False)
+
+            if generate_valid_action_trees:
+                infos = input_dict[SampleBatch.INFOS] if SampleBatch.INFOS in input_dict else {}
+
+                valid_action_trees = []
+                for info in infos:
+                    if isinstance(info, dict) and 'valid_action_tree' in info:
+                        valid_action_trees.append(info['valid_action_tree'])
+                    else:
+                        valid_action_trees.append({})
+
+                exploration = TorchConditionalMaskingExploration(
+                    self.model,
+                    dist_inputs,
+                    valid_action_trees,
+                    explore,
+                    invalid_action_masking,
+                    allow_nop
+                )
+
+                actions, masked_logits, logp, mask = exploration.get_actions_and_mask()
+            else:
+                action_dist = self.dist_class(dist_inputs, self.model)
+
+                # Get the exploration action from the forward results.
+                actions, logp = \
+                    self.exploration.get_exploration_action(
+                        action_distribution=action_dist,
+                        timestep=timestep,
+                        explore=explore)
+
+                masked_logits = dist_inputs
 
             input_dict[SampleBatch.ACTIONS] = actions
 
             extra_fetches = {
-                SampleBatch.ACTION_DIST_INPUTS: dist_inputs,
+                SampleBatch.ACTION_DIST_INPUTS: masked_logits,
                 SampleBatch.ACTION_PROB: torch.exp(logp.float()),
                 SampleBatch.ACTION_LOGP: logp
             }
diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py
index 441da7d74..20d82ead6 100644
--- a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py
+++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py
@@ -1,5 +1,4 @@
-from ray.rllib.agents import with_common_config
-from ray.rllib.agents.impala import ImpalaTrainer, DEFAULT_CONFIG as IMPALA_CONFIG
+from ray.rllib.agents.impala import ImpalaTrainer
 from ray.rllib.agents.impala.vtrace_torch_policy import VTraceTorchPolicy
 from ray.rllib.policy.torch_policy import LearningRateSchedule, EntropyCoeffSchedule

From 684be8b235c4d6ff486df003fc3655fa78ba276c Mon Sep 17 00:00:00 2001
From: Bam4d
Date: Tue, 16 Mar 2021 15:20:44 +0000
Subject: [PATCH 04/45] add template arguments for centos build

---
 bindings/wrapper/GameWrapper.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bindings/wrapper/GameWrapper.cpp b/bindings/wrapper/GameWrapper.cpp
index 39ff52cd5..4a74ca412 100644
--- a/bindings/wrapper/GameWrapper.cpp
+++ b/bindings/wrapper/GameWrapper.cpp
@@ -380,8 +380,8 @@ class Py_GameWrapper {
           py_event["SourceObjectPlayerId"] = historyEvent.sourceObjectPlayerId;
           py_event["DestinationObjectPlayerId"] = historyEvent.destinationObjectPlayerId;
 
-          py_event["SourceLocation"] = std::array{historyEvent.sourceLocation.x, historyEvent.sourceLocation.y};
-          py_event["DestinationLocation"] = std::array{historyEvent.destLocation.x, historyEvent.destLocation.y};
+          py_event["SourceLocation"] = std::array<int32_t, 2>{historyEvent.sourceLocation.x, historyEvent.sourceLocation.y};
+          py_event["DestinationLocation"] = std::array<int32_t, 2>{historyEvent.destLocation.x, historyEvent.destLocation.y};
 
           py_events.push_back(py_event);
         }

From 71168fb92bff15bce6ea4bae8dc201084ecc88da Mon Sep 17 00:00:00 2001
From: Bam4d
Date: Tue, 16 Mar 2021 19:36:50 +0000
Subject: [PATCH 05/45] shrinking networks a bit for simplicity

---
 .../rllib_baseline.py                          | 84 +++++++++++++++++++
 .../rllib_conditional_actions.py               |  2 +-
 .../util/rllib/torch/agents/conv_agent.py      |  9 +-
 .../agents/global_average_pooling_agent.py     |  9 +-
 4 files changed, 89 insertions(+), 15 deletions(-)
 create mode 100644 python/examples/experiments/conditional_action_spaces/rllib_baseline.py
diff --git a/python/examples/experiments/conditional_action_spaces/rllib_baseline.py b/python/examples/experiments/conditional_action_spaces/rllib_baseline.py
new file mode 100644
index 000000000..9b7aa7a98
--- /dev/null
+++ b/python/examples/experiments/conditional_action_spaces/rllib_baseline.py
@@ -0,0 +1,84 @@
+import os
+import sys
+
+import ray
+from ray import tune
+from ray.rllib.models import ModelCatalog
+from ray.tune.integration.wandb import WandbLoggerCallback
+from ray.tune.registry import register_env
+
+from griddly import gd
+from griddly.util.rllib.env.core import RLlibEnv
+from griddly.util.rllib.torch.agents.conv_agent import SimpleConvAgent
+# from griddly.util.rllib.callbacks import GriddlyCallbacks
+from griddly.util.rllib.torch.conditional_actions.conditional_action_policy_trainer import \
+    ConditionalActionImpalaTrainer
+
+if __name__ == '__main__':
+    sep = os.pathsep
+    os.environ['PYTHONPATH'] = sep.join(sys.path)
+
+    yaml_files = [
+        os.path.realpath('clusters_po.yaml'),
+        os.path.realpath('clusters_po_with_push.yaml'),
+        os.path.realpath('clusters_po_with_push_seperate_colors.yaml')
+    ]
+
+    ray.init(num_gpus=1)
+    # ray.init(num_gpus=1, local_mode=True)
+
+    env_name = "ray-griddly-env"
+
+    register_env(env_name, RLlibEnv)
+    ModelCatalog.register_custom_model("SimpleConv", SimpleConvAgent)
+
+    wandbLoggerCallback = WandbLoggerCallback(
+        project='conditional_actions',
+        group='baseline',
+        api_key_file='~/.wandb_rc'
+    )
+
+    max_training_steps = 5000000
+
+    config = {
+        'framework': 'torch',
+        'num_workers': 8,
+        'num_envs_per_worker': 4,
+
+        # 'callbacks': GriddlyCallbacks,
+
+        'model': {
+            'custom_model': 'SimpleConv',
+            'custom_model_config': {}
+        },
+        'env': env_name,
+        'env_config': {
+            'record_video_config': {
+                'frequency': 100000,
+                'directory': 'baseline_videos'
+            },
+
+            # Put this here so it shows up in wandb
+            'generate_valid_action_trees': False,
+            'random_level_on_reset': True,
+            'yaml_file': tune.grid_search(yaml_files),
+            'global_observer_type': gd.ObserverType.SPRITE_2D,
+            'max_steps': 1000,
+        },
+        'entropy_coeff_schedule': [
+            [0, 0.01],
+            [max_training_steps, 0.0]
+        ],
+        'lr_schedule': [
+            [0, 0.0005],
+            [max_training_steps, 0.0]
+        ],
+
+    }
+
+    stop = {
+        "timesteps_total": max_training_steps,
+    }
+
+    result = tune.run(ConditionalActionImpalaTrainer, config=config, stop=stop, callbacks=[wandbLoggerCallback])
diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
index 789bc4c6c..79f5a0b85 100644
--- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
+++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
@@ -18,7 +18,7 @@
     sep = os.pathsep
     os.environ['PYTHONPATH'] = sep.join(sys.path)
 
-    yaml_file = os.path.realpath('clusters_po_with_push_seperate_colors.yaml')
+    yaml_files = os.path.realpath('clusters_po_with_push_seperate_colors.yaml')
 
     ray.init(num_gpus=1)
     # ray.init(num_gpus=1, local_mode=True)
diff --git a/python/griddly/util/rllib/torch/agents/conv_agent.py b/python/griddly/util/rllib/torch/agents/conv_agent.py
index 52dccf80a..ff1ddd9b4 100644
--- a/python/griddly/util/rllib/torch/agents/conv_agent.py
+++ b/python/griddly/util/rllib/torch/agents/conv_agent.py
@@ -23,22 +23,17 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, name):
             nn.ReLU(),
             layer_init(nn.Conv2d(32, 64, 3, padding=1)),
             nn.ReLU(),
-            layer_init(nn.Conv2d(64, 64, 3, padding=1)),
-            nn.ReLU(),
-            layer_init(nn.Conv2d(64, 64, 3, padding=1)),
-            nn.ReLU(),
             nn.Flatten(),
             layer_init(nn.Linear(linear_flatten, 1024)),
             nn.ReLU(),
             layer_init(nn.Linear(1024, 512)),
             nn.ReLU(),
-            layer_init(nn.Linear(512, 512))
         )
 
         self._actor_head = nn.Sequential(
-            layer_init(nn.Linear(512, 512), std=0.01),
+            layer_init(nn.Linear(512, 256), std=0.01),
             nn.ReLU(),
-            layer_init(nn.Linear(512, self._num_actions), std=0.01)
+            layer_init(nn.Linear(256, self._num_actions), std=0.01)
         )
 
         self._critic_head = nn.Sequential(
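For a sense of scale, each 3x3, 64-channel convolution removed here held 64 * 64 * 9 + 64 = 36,928 parameters and the dropped Linear(512, 512) another 512 * 512 + 512 = 262,656, so together with halving the actor head this trims roughly 340k parameters from the network.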
diff --git a/python/griddly/util/rllib/torch/agents/global_average_pooling_agent.py b/python/griddly/util/rllib/torch/agents/global_average_pooling_agent.py
index d2249abcb..8fab31c26 100644
--- a/python/griddly/util/rllib/torch/agents/global_average_pooling_agent.py
+++ b/python/griddly/util/rllib/torch/agents/global_average_pooling_agent.py
@@ -41,22 +41,17 @@ def __init__(self, obs_space, action_space, num_outputs, model_config, name):
             nn.ReLU(),
             layer_init(nn.Conv2d(32, 64, 3, padding=1)),
             nn.ReLU(),
-            layer_init(nn.Conv2d(64, 64, 3, padding=1)),
-            nn.ReLU(),
-            layer_init(nn.Conv2d(64, 64, 3, padding=1)),
-            nn.ReLU(),
             GlobalAvePool(2048),
             layer_init(nn.Linear(2048, 1024)),
             nn.ReLU(),
             layer_init(nn.Linear(1024, 512)),
             nn.ReLU(),
-            layer_init(nn.Linear(512, 512))
        )
 
         self._actor_head = nn.Sequential(
-            layer_init(nn.Linear(512, 512), std=0.01),
+            layer_init(nn.Linear(512, 256), std=0.01),
             nn.ReLU(),
-            layer_init(nn.Linear(512, self._num_actions), std=0.01)
+            layer_init(nn.Linear(256, self._num_actions), std=0.01)
         )
 
         self._critic_head = nn.Sequential(
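The GlobalAvePool stage is what makes the GAP agent tolerant of different level sizes: it averages the convolutional feature map over its spatial dimensions, so grids of any width and height reduce to a fixed-length vector. A rough equivalent of the pooling step (assuming the module averages over H and W):

    import torch

    x = torch.randn(1, 64, 7, 9)      # N, C, H, W out of the conv stack
    pooled = x.mean(dim=(2, 3))       # -> shape (1, 64), independent of H and W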
From 923a4893d26d586a1bd7a22186f59b22d21fe445 Mon Sep 17 00:00:00 2001
From: Bam4d
Date: Thu, 18 Mar 2021 08:12:26 +0000
Subject: [PATCH 06/45] some work on fixes for videos

---
 python/.gitignore                              |   3 +
 .../rllib_baseline.py                          |   8 +-
 .../rllib_conditional_actions.py               |  30 +--
 python/examples/rllib/rllib_multi_agent.py     |   2 +-
 python/examples/rllib/rllib_single_agent.py    |   2 +-
 .../rllib_single_agent_conditional_actions.py  |   2 +-
 python/griddly/RenderTools.py                  |   1 +
 python/griddly/util/rllib/callbacks.py         |  36 +--
 .../util/rllib/environment/__init__.py        |   0
 python/griddly/util/rllib/environment/core.py  | 239 ++++++++++++++++++
 .../conditional_action_mixin.py                |   1 +
 11 files changed, 286 insertions(+), 38 deletions(-)
 create mode 100644 python/griddly/util/rllib/environment/__init__.py
 create mode 100644 python/griddly/util/rllib/environment/core.py

diff --git a/python/.gitignore b/python/.gitignore
index bfed1620f..5391f8bed 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -143,3 +143,6 @@ griddly/resources/
 
 # Hacky stuff
 scratchpad/
+
+# Video folders
+.video/
diff --git a/python/examples/experiments/conditional_action_spaces/rllib_baseline.py b/python/examples/experiments/conditional_action_spaces/rllib_baseline.py
index 9b7aa7a98..f023a7b5f 100644
--- a/python/examples/experiments/conditional_action_spaces/rllib_baseline.py
+++ b/python/examples/experiments/conditional_action_spaces/rllib_baseline.py
@@ -8,7 +8,8 @@
 from ray.tune.registry import register_env
 
 from griddly import gd
-from griddly.util.rllib.env.core import RLlibEnv
+from griddly.util.rllib.environment.core import RLlibEnv
+from griddly.util.rllib.torch import GAPAgent
 from griddly.util.rllib.torch.agents.conv_agent import SimpleConvAgent
 # from griddly.util.rllib.callbacks import GriddlyCallbacks
 from griddly.util.rllib.torch.conditional_actions.conditional_action_policy_trainer import \
@@ -32,6 +33,7 @@
 
     register_env(env_name, RLlibEnv)
     ModelCatalog.register_custom_model("SimpleConv", SimpleConvAgent)
+    ModelCatalog.register_custom_model("GAP", GAPAgent)
 
     wandbLoggerCallback = WandbLoggerCallback(
         project='conditional_actions',
@@ -39,7 +41,7 @@
         api_key_file='~/.wandb_rc'
     )
 
-    max_training_steps = 5000000
+    max_training_steps = 20000000
 
     config = {
         'framework': 'torch',
@@ -49,7 +51,7 @@
         # 'callbacks': GriddlyCallbacks,
 
         'model': {
-            'custom_model': 'SimpleConv',
+            'custom_model': tune.grid_search(['SimpleConv', 'GAP']),
             'custom_model_config': {}
         },
         'env': env_name,
diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
index 79f5a0b85..62d355312 100644
--- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
+++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
@@ -8,9 +8,9 @@
 from ray.tune.registry import register_env
 
 from griddly import gd
-from griddly.util.rllib.env.core import RLlibEnv
-# from griddly.util.rllib.callbacks import GriddlyCallbacks
-from griddly.util.rllib.torch import GAPAgent
+from griddly.util.rllib.callbacks import GriddlyCallbacks
+from griddly.util.rllib.environment.core import RLlibEnv
+from griddly.util.rllib.torch.agents.conv_agent import SimpleConvAgent
 from griddly.util.rllib.torch.conditional_actions.conditional_action_policy_trainer import \
     ConditionalActionImpalaTrainer
 
@@ -18,46 +18,46 @@
     sep = os.pathsep
     os.environ['PYTHONPATH'] = sep.join(sys.path)
 
-    yaml_files = os.path.realpath('clusters_po_with_push_seperate_colors.yaml')
+    yaml_file = os.path.realpath('clusters_po.yaml')
 
     ray.init(num_gpus=1)
-    # ray.init(num_gpus=1, local_mode=True)
+    #ray.init(num_gpus=1, local_mode=True)
 
     env_name = "ray-griddly-env"
 
     register_env(env_name, RLlibEnv)
-    ModelCatalog.register_custom_model("GAP", GAPAgent)
+    ModelCatalog.register_custom_model("SimpleConv", SimpleConvAgent)
 
     wandbLoggerCallback = WandbLoggerCallback(
         project='conditional_actions',
         api_key_file='~/.wandb_rc'
     )
 
-    max_training_steps = 5000000
+    max_training_steps = 1000000
 
     config = {
         'framework': 'torch',
-        'num_workers': 4,
-        'num_envs_per_worker': 4,
+        'num_workers': 1,
+        'num_envs_per_worker': 1,
 
-        # 'callbacks': GriddlyCallbacks,
+        'callbacks': GriddlyCallbacks,
 
         'model': {
-            'custom_model': 'GAP',
+            'custom_model': 'SimpleConv',
             'custom_model_config': {}
         },
         'env': env_name,
         'env_config': {
             'record_video_config': {
-                'frequency': 100000,
+                'frequency': 1000,
                 'directory': 'videos'
             },
 
-            'allow_nop': tune.grid_search([True, False]),
-            'invalid_action_masking': tune.grid_search(['none', 'conditional', 'collapsed']),
+            #'allow_nop': tune.grid_search([True, False]),
+            #'invalid_action_masking': tune.grid_search(['none', 'conditional', 'collapsed']),
             # 'invalid_action_masking': 'collapsed',
             # 'allow_nop': False,
-            'generate_valid_action_trees': tune.grid_search([True, False]),
+            'generate_valid_action_trees': True,
             'random_level_on_reset': True,
             'yaml_file': yaml_file,
             'global_observer_type': gd.ObserverType.SPRITE_2D,
diff --git a/python/examples/rllib/rllib_multi_agent.py b/python/examples/rllib/rllib_multi_agent.py
index 1560a127f..713910dbc 100644
--- a/python/examples/rllib/rllib_multi_agent.py
+++ b/python/examples/rllib/rllib_multi_agent.py
@@ -10,7 +10,7 @@
 
 from griddly import gd
 from griddly.util.rllib.torch.agents.conv_agent import SimpleConvAgent
-from griddly.util.rllib.env.core import RLlibMultiAgentWrapper, RLlibEnv
+from griddly.util.rllib.environment.core import RLlibMultiAgentWrapper, RLlibEnv
 
 if __name__ == '__main__':
     sep = os.pathsep
diff --git a/python/examples/rllib/rllib_single_agent.py b/python/examples/rllib/rllib_single_agent.py
index de96b6589..c69697a81 100644
--- a/python/examples/rllib/rllib_single_agent.py
+++ b/python/examples/rllib/rllib_single_agent.py
@@ -9,7 +9,7 @@
 
 from griddly import gd
 from griddly.util.rllib.torch import GAPAgent
-from griddly.util.rllib.env.core import RLlibEnv
+from griddly.util.rllib.environment.core import RLlibEnv
 
 if __name__ == '__main__':
     sep = os.pathsep
diff --git a/python/examples/rllib/rllib_single_agent_conditional_actions.py b/python/examples/rllib/rllib_single_agent_conditional_actions.py
index a7d0b4a61..1075d8f1b 100644
--- a/python/examples/rllib/rllib_single_agent_conditional_actions.py
+++ b/python/examples/rllib/rllib_single_agent_conditional_actions.py
@@ -9,7 +9,7 @@
 from griddly import gd
 from griddly.util.rllib.torch import GAPAgent
 from griddly.util.rllib.torch.conditional_actions.conditional_action_policy_trainer import ConditionalActionImpalaTrainer
-from griddly.util.rllib.env.core import RLlibEnv
+from griddly.util.rllib.environment.core import RLlibEnv
 
 if __name__ == '__main__':
     sep = os.pathsep
diff --git a/python/griddly/RenderTools.py b/python/griddly/RenderTools.py
index 73029561d..b95386afd 100644
--- a/python/griddly/RenderTools.py
+++ b/python/griddly/RenderTools.py
@@ -1,5 +1,6 @@
 import imageio
 from gym.wrappers.monitoring.video_recorder import ImageEncoder
+import os
 
 
 class RenderWindow():
diff --git a/python/griddly/util/rllib/callbacks.py b/python/griddly/util/rllib/callbacks.py
index b1c0587f1..ab7ce09e0 100644
--- a/python/griddly/util/rllib/callbacks.py
+++ b/python/griddly/util/rllib/callbacks.py
@@ -4,9 +4,7 @@
 from ray.rllib.agents.callbacks import DefaultCallbacks
 from ray.rllib.evaluation import MultiAgentEpisode
 from ray.rllib.utils.typing import AgentID, PolicyID
-
-
-# from wandb import Video
+from wandb import Video
 
 
 class GriddlyCallbacks(DefaultCallbacks):
@@ -14,8 +12,6 @@ class GriddlyCallbacks(DefaultCallbacks):
     def __init__(self, legacy_callbacks_dict: Dict[str, callable] = None):
         super().__init__(legacy_callbacks_dict)
 
-        # self._videos = {}
-
     def on_episode_start(self, *, worker: "RolloutWorker", base_env: BaseEnv, policies: Dict[PolicyID, Policy],
                          episode: MultiAgentEpisode, env_index: Optional[int] = None, **kwargs) -> None:
         super().on_episode_start(worker=worker, base_env=base_env, policies=policies, episode=episode,
@@ -29,11 +25,14 @@ def on_episode_end(self, *, worker: "RolloutWorker", base_env: BaseEnv, policies
                        episode: MultiAgentEpisode, env_index: Optional[int] = None, **kwargs) -> None:
         super().on_episode_end(worker=worker, base_env=base_env, policies=policies, episode=episode,
                                env_index=env_index, **kwargs)
-        # if not worker.multiagent:
-        #     info = episode.last_info_for()
-        #     if 'video' in info:
-        #         video_info = info['video']
-        #         self._videos[video_info['level']] = video_info['path']
+        if not worker.multiagent:
+            info = episode.last_info_for()
+            if 'video' in info:
+                level = info['video']['level']
+                path = info['video']['path']
+                print(f'creating video with path: {path}')
+                episode.media['video_test'] = 'here is some test data'
+                episode.media[f'level_{level}'] = Video(path)
 
     def on_postprocess_trajectory(self, *, worker: "RolloutWorker", episode: MultiAgentEpisode, agent_id: AgentID,
                                   policy_id: PolicyID, policies: Dict[PolicyID, Policy],
@@ -46,13 +45,16 @@ def on_sample_end(self, *, worker: "RolloutWorker", samples: SampleBatch, **kwar
     def on_sample_end(self, *, worker: "RolloutWorker", samples: SampleBatch, **kwargs) -> None:
         super().on_sample_end(worker=worker, samples=samples, **kwargs)
 
-    def on_learn_on_batch(self, *, policy: Policy, train_batch: SampleBatch, **kwargs) -> None:
-        # TODO: extract any video from infos when this API is updated
-        super().on_learn_on_batch(policy=policy, train_batch=train_batch, **kwargs)
+    def on_learn_on_batch(self, *, policy: Policy, train_batch: SampleBatch, result: dict, **kwargs) -> None:
+        pass
+        # Loop through the 'info' keys looking for 'video'
+        # for info_dict in train_batch[SampleBatch.INFOS]:
+        #     if 'video' in info_dict:
+        #         level = info_dict['video']['level']
+        #         path = info_dict['video']['path']
+        #         print(f'creating video with path: {path}')
+        #         result['video_test'] = 1
+        #         result[f'level_{level}'] = Video(path)
 
     def on_train_result(self, *, trainer, result: dict, **kwargs) -> None:
         super().on_train_result(trainer=trainer, result=result, **kwargs)
-
-        # for level, path in self._videos.items():
-        #     result[f'level_{level}'] = Video(path)
-        #     del self._videos[level]
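The contract between the environment and these callbacks is a small dict that RLlibEnv places in the step info once a recording finishes, for example:

    info['video'] = {
        'level': 0,                                           # level that was recorded
        'path': '/videos/global_video_<uuid>_0_100000.mp4'    # illustrative path
    }

on_episode_end reads it back via episode.last_info_for() and wraps the file in a wandb Video object so the clip is logged alongside the training metrics.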
diff --git a/python/griddly/util/rllib/environment/__init__.py b/python/griddly/util/rllib/environment/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/griddly/util/rllib/environment/core.py b/python/griddly/util/rllib/environment/core.py
new file mode 100644
index 000000000..097f6c7b0
--- /dev/null
+++ b/python/griddly/util/rllib/environment/core.py
@@ -0,0 +1,239 @@
+import os
+from collections import defaultdict
+from enum import Enum
+from uuid import uuid1
+
+import gym
+import numpy as np
+from ray.rllib import MultiAgentEnv
+from ray.rllib.utils.typing import MultiAgentDict
+
+from griddly import GymWrapper
+from griddly.RenderTools import VideoRecorder
+
+
+class RecordingState(Enum):
+    NOT_RECORDING = 1
+    WAITING_FOR_EPISODE_START = 2
+    BEFORE_RECORDING = 3
+    RECORDING = 4
+
+
+class RLlibEnv(GymWrapper):
+    """
+    Wraps a Griddly environment for compatibility with RLLib.
+
+    Use the `env_config` in the rllib config to provide Griddly Environment Parameters
+
+    Example:
+
+    Firstly register the RLlibWrapper using rllib's
+
+    env_name = "my_env_name"
+
+    register_env(env_name, RLlibWrapper)
+
+    you can then configure it
+
+    rllib_config = {
+        'env_config': {
+            'yaml_file': 'Single-Player/GVGAI/butterflies.yaml',
+            'level': 6,
+            'player_observer_type': gd.ObserverType.SPRITE_2D,
+            'global_observer_type': gd.ObserverType.ISOMETRIC,
+            'max_steps': 1000,
+        },
+        # Other configuration options
+    }
+
+    Create the rllib trainer using this config:
+
+    trainer = ImpalaTrainer(rllib_config, env=env_name)
+
+    """
+
+    def __init__(self, env_config):
+        super().__init__(**env_config)
+
+        self.generate_valid_action_trees = env_config.get('generate_valid_action_trees', False)
+        self._record_video_config = env_config.get('record_video_config', None)
+        self._random_level_on_reset = env_config.get('random_level_on_reset', False)
+
+        super().reset()
+
+        self._recording_state = None
+        self._env_steps = 0
+
+        if self._record_video_config is not None:
+            self._recording_state = RecordingState.BEFORE_RECORDING
+            self._video_frequency = self._record_video_config.get('frequency', 1000)
+            self._video_directory = os.path.realpath(self._record_video_config.get('directory', '.'))
+            os.makedirs(self._video_directory, exist_ok=True)
+
+        self.set_transform()
+
+    def _transform(self, observation):
+
+        if self.player_count > 1:
+            transformed_obs = [obs.transpose(1, 2, 0).astype(np.float) for obs in observation]
+        else:
+            transformed_obs = observation.transpose(1, 2, 0).astype(np.float)
+
+        return transformed_obs
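Griddly emits observations channels-first (C, H, W) while the models above expect channels-last floats, which is all _transform does:

    import numpy as np

    obs_chw = np.zeros((3, 10, 12), dtype=np.uint8)          # C, H, W from Griddly
    obs_hwc = obs_chw.transpose(1, 2, 0).astype(np.float)    # -> (10, 12, 3)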
+    def _after_step(self, observation, reward, done, info):
+        extra_info = {}
+        if self._recording_state is not None:
+            if self._recording_state is RecordingState.NOT_RECORDING and self._env_steps % self._video_frequency == 0:
+                self._recording_state = RecordingState.WAITING_FOR_EPISODE_START
+
+            if self._recording_state == RecordingState.BEFORE_RECORDING:
+                global_obs = self.render(observer='global', mode='rgb_array')
+                self._global_recorder = VideoRecorder()
+
+                video_filename = os.path.join(
+                    self._video_directory,
+                    f'global_video_{uuid1()}_{self.level_id}_{self._env_steps}.mp4'
+                )
+
+                self._global_recorder.start(video_filename, global_obs.shape)
+                self._recording_state = RecordingState.RECORDING
+
+            if self._recording_state == RecordingState.RECORDING:
+                global_obs = self.render(observer='global', mode='rgb_array')
+                self._global_recorder.add_frame(global_obs)
+                if done:
+                    self._recording_state = RecordingState.NOT_RECORDING
+                    self._global_recorder.close()
+
+                    print(f'finished recording {self._global_recorder.output_file}')
+
+                    extra_info['video'] = {
+                        'level': self.level_id,
+                        'path': self._global_recorder.output_file
+                    }
+
+            if self._recording_state == RecordingState.WAITING_FOR_EPISODE_START:
+                if done:
+                    self._recording_state = RecordingState.BEFORE_RECORDING
+
+        return extra_info
+
+    def set_transform(self):
+        """
+        Create the transform for rllib based on the observation space
+        """
+
+        if self.player_count > 1:
+            self.observation_space = self.observation_space[0]
+            self.action_space = self.action_space[0]
+
+        self.observation_space = gym.spaces.Box(
+            self.observation_space.low.transpose((1, 2, 0)).astype(np.float),
+            self.observation_space.high.transpose((1, 2, 0)).astype(np.float),
+            dtype=np.float,
+        )
+
+        self.height = self.observation_space.shape[0]
+        self.width = self.observation_space.shape[1]
+
+    def _get_valid_action_trees(self):
+        valid_action_trees = self.game.build_valid_action_trees()
+        if self.player_count == 1:
+            return valid_action_trees[0]
+        return valid_action_trees
+
+    def reset(self, **kwargs):
+
+        if self._random_level_on_reset:
+            kwargs['level_id'] = np.random.choice(self.level_count)
+        observation = super().reset(**kwargs)
+        self.set_transform()
+
+        if self.generate_valid_action_trees:
+            self.last_valid_action_trees = self._get_valid_action_trees()
+
+        return self._transform(observation)
+
+    def step(self, action):
+        observation, reward, done, info = super().step(action)
+
+        extra_info = self._after_step(observation, reward, done, info)
+
+        if 'video' in extra_info:
+            info['video'] = extra_info['video']
+
+        self._env_steps += 1
+
+        if self.generate_valid_action_trees:
+            self.last_valid_action_trees = self._get_valid_action_trees()
+            info['valid_action_tree'] = self.last_valid_action_trees
+
+        return self._transform(observation), reward, done, info
+
+    def render(self, mode='human', observer=0):
+        return super().render(mode, observer='global')
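The recording logic above is a four-state machine: BEFORE_RECORDING starts a recorder on the next step, RECORDING appends a global-observer frame every step until the episode ends, NOT_RECORDING waits for the step counter to cross the next multiple of the configured frequency, and WAITING_FOR_EPISODE_START then holds off until the current episode finishes, so a recording always starts from a reset.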
+
+
+class RLlibMultiAgentWrapper(gym.Wrapper, MultiAgentEnv):
+
+    def __init__(self, env, env_config):
+        super().__init__(env)
+
+        self._player_done_variable = env_config.get('player_done_variable', None)
+
+        # Used to keep track of agents that are active in the environment
+        self._active_agents = set()
+
+        assert self.player_count > 1, 'RLlibMultiAgentWrapper can only be used with environments that have multiple agents'
+
+    def _to_multi_agent_map(self, data):
+        return {a: data[a - 1] for a in self._active_agents}
+
+    def reset(self, **kwargs):
+        obs = super().reset(**kwargs)
+        self._active_agents.update([a + 1 for a in range(self.player_count)])
+        return self._to_multi_agent_map(obs)
+
+    def _resolve_player_done_variable(self):
+        resolved_variables = self.game.get_global_variable([self._player_done_variable])
+        return resolved_variables[self._player_done_variable]
+
+    def step(self, action_dict: MultiAgentDict):
+        actions_array = np.zeros((self.player_count, *self.action_space.shape))
+        for agent_id, action in action_dict.items():
+            actions_array[agent_id - 1] = action
+
+        obs, reward, all_done, info = super().step(actions_array)
+
+        done_map = {'__all__': all_done}
+
+        if self._player_done_variable is not None:
+            griddly_players_done = self._resolve_player_done_variable()
+
+            for agent_id in self._active_agents:
+                done_map[agent_id] = griddly_players_done[agent_id] == 1 or all_done
+        else:
+            for p in range(self.player_count):
+                done_map[p] = False
+
+        if self.generate_valid_action_trees:
+            info_map = self._to_multi_agent_map([
+                {'valid_action_tree': valid_action_tree} for valid_action_tree in info['valid_action_trees']
+            ])
+        else:
+            info_map = self._to_multi_agent_map(defaultdict(dict))
+
+        obs_map = self._to_multi_agent_map(obs)
+        reward_map = self._to_multi_agent_map(reward)
+
+        # Finally remove any agent ids that are done
+        for agent_id, is_done in done_map.items():
+            if is_done:
+                self._active_agents.discard(agent_id)
+
+        assert len(obs_map) == len(reward_map)
+        assert len(obs_map) == len(done_map) - 1
+        assert len(obs_map) == len(info_map)
+
+        return obs_map, reward_map, done_map, info_map
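Griddly players are 1-indexed while the underlying observation and reward lists are 0-indexed; _to_multi_agent_map bridges the two:

    active_agents = {1, 2, 3}
    rewards = [0.0, 1.0, -1.0]
    reward_map = {a: rewards[a - 1] for a in active_agents}
    # -> {1: 0.0, 2: 1.0, 3: -1.0}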
diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py
index 5a3c09277..c9ee92568 100644
--- a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py
+++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py
@@ -50,6 +50,7 @@ def compute_actions_from_input_dict(
                 for info in infos:
                     if isinstance(info, dict) and 'valid_action_tree' in info:
                         valid_action_trees.append(info['valid_action_tree'])
+                        print('valid_action_tree_found')
                     else:
                         valid_action_trees.append({})
 

From 9c73bb8458c8fd403e4b127dc61a54e8ab033f7b Mon Sep 17 00:00:00 2001
From: Bam4d
Date: Thu, 18 Mar 2021 16:25:40 +0000
Subject: [PATCH 07/45] removing some code and updating other examples

---
 .../conditional_action_spaces/rllib_baseline.py |  1 -
 .../rllib_conditional_actions.py                | 16 ++++++++++------
 python/examples/rllib/rllib_multi_agent.py      |  3 ++-
 python/examples/rllib/rllib_single_agent.py     |  3 ++-
 .../rllib_single_agent_conditional_actions.py   |  5 +++--
 python/griddly/util/rllib/callbacks.py          | 10 ----------
 python/griddly/util/rllib/environment/core.py   |  2 --
 .../conditional_action_mixin.py                 |  1 -
 8 files changed, 17 insertions(+), 24 deletions(-)

diff --git a/python/examples/experiments/conditional_action_spaces/rllib_baseline.py b/python/examples/experiments/conditional_action_spaces/rllib_baseline.py
index f023a7b5f..a89f160c8 100644
--- a/python/examples/experiments/conditional_action_spaces/rllib_baseline.py
+++ b/python/examples/experiments/conditional_action_spaces/rllib_baseline.py
@@ -33,7 +33,6 @@
 
     register_env(env_name, RLlibEnv)
     ModelCatalog.register_custom_model("SimpleConv", SimpleConvAgent)
-    ModelCatalog.register_custom_model("GAP", GAPAgent)
 
     wandbLoggerCallback = WandbLoggerCallback(
         project='conditional_actions',
diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
index 62d355312..002e40308 100644
--- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
+++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
@@ -18,7 +18,11 @@
     sep = os.pathsep
     os.environ['PYTHONPATH'] = sep.join(sys.path)
 
-    yaml_file = os.path.realpath('clusters_po.yaml')
+    yaml_files = [
+        os.path.realpath('clusters_po.yaml'),
+        os.path.realpath('clusters_po_with_push.yaml'),
+        os.path.realpath('clusters_po_with_push_seperate_colors.yaml')
+    ]
 
     ray.init(num_gpus=1)
     #ray.init(num_gpus=1, local_mode=True)
@@ -37,7 +41,7 @@
     config = {
         'framework': 'torch',
-        'num_workers': 1,
+        'num_workers': 8,
         'num_envs_per_worker': 1,
 
         'callbacks': GriddlyCallbacks,
@@ -49,17 +53,17 @@
         'env': env_name,
         'env_config': {
             'record_video_config': {
-                'frequency': 1000,
+                'frequency': 100000,
                 'directory': 'videos'
             },
 
-            #'allow_nop': tune.grid_search([True, False]),
-            #'invalid_action_masking': tune.grid_search(['none', 'conditional', 'collapsed']),
+            'allow_nop': tune.grid_search([True, False]),
+            'invalid_action_masking': tune.grid_search(['none', 'conditional', 'collapsed']),
             # 'invalid_action_masking': 'collapsed',
             # 'allow_nop': False,
             'generate_valid_action_trees': True,
             'random_level_on_reset': True,
-            'yaml_file': yaml_file,
+            'yaml_file': tune.grid_search(yaml_files),
             'global_observer_type': gd.ObserverType.SPRITE_2D,
             'max_steps': 1000,
         },
diff --git a/python/examples/rllib/rllib_multi_agent.py b/python/examples/rllib/rllib_multi_agent.py
index 713910dbc..efa82c5a7 100644
--- a/python/examples/rllib/rllib_multi_agent.py
+++ b/python/examples/rllib/rllib_multi_agent.py
@@ -49,7 +49,8 @@ def _create_env(env_config):
         # 'player_done_variable': 'player_done',
 
         'record_video_config': {
-            'frequency': 20000  # number of rollouts
+            'frequency': 20000,  # number of rollouts
+            'directory': 'videos'
         },
 
         'random_level_on_reset': True,
diff --git a/python/examples/rllib/rllib_single_agent.py b/python/examples/rllib/rllib_single_agent.py
index c69697a81..4f6b47a1d 100644
--- a/python/examples/rllib/rllib_single_agent.py
+++ b/python/examples/rllib/rllib_single_agent.py
@@ -36,7 +36,8 @@
         'env': env_name,
         'env_config': {
             'record_video_config': {
-                'frequency': 100000
+                'frequency': 100000,
+                'directory': 'videos'
             },
 
             'random_level_on_reset': True,
diff --git a/python/examples/rllib/rllib_single_agent_conditional_actions.py b/python/examples/rllib/rllib_single_agent_conditional_actions.py
index 1075d8f1b..b2ff4efca 100644
--- a/python/examples/rllib/rllib_single_agent_conditional_actions.py
+++ b/python/examples/rllib/rllib_single_agent_conditional_actions.py
@@ -23,7 +23,7 @@
     register_env(env_name, RLlibEnv)
     ModelCatalog.register_custom_model("GAP", GAPAgent)
 
-    max_training_steps = 2000000
+    max_training_steps = 20000000
 
     config = {
         'framework': 'torch',
@@ -37,7 +37,8 @@
         'env': env_name,
         'env_config': {
             'record_video_config': {
-                'frequency': 100000
+                'frequency': 100000,
+                'directory': 'videos'
             },
 
             'allow_nop': tune.grid_search([True, False]),
diff --git a/python/griddly/util/rllib/callbacks.py b/python/griddly/util/rllib/callbacks.py
index ab7ce09e0..354c90c3f 100644
--- a/python/griddly/util/rllib/callbacks.py
+++ b/python/griddly/util/rllib/callbacks.py
@@ -30,8 +30,6 @@ def on_episode_end(self, *, worker: "RolloutWorker", base_env: BaseEnv, policies
             if 'video' in info:
                 level = info['video']['level']
                 path = info['video']['path']
-                print(f'creating video with path: {path}')
-                episode.media['video_test'] = 'here is some test data'
                 episode.media[f'level_{level}'] = Video(path)
 
     def on_postprocess_trajectory(self, *, worker: "RolloutWorker", episode: MultiAgentEpisode, agent_id: AgentID,
                                   policy_id: PolicyID, policies: Dict[PolicyID, Policy],
@@ -47,13 +45,6 @@ def on_learn_on_batch(self, *, policy: Policy, train_batch: SampleBatch, result:
     def on_learn_on_batch(self, *, policy: Policy, train_batch: SampleBatch, result: dict, **kwargs) -> None:
         pass
-        # Loop through the 'info' keys looking for 'video'
-        # for info_dict in train_batch[SampleBatch.INFOS]:
-        #     if 'video' in info_dict:
-        #         level = info_dict['video']['level']
-        #         path = info_dict['video']['path']
-        #         print(f'creating video with path: {path}')
-        #         result['video_test'] = 1
-        #         result[f'level_{level}'] = Video(path)
 
     def on_train_result(self, *, trainer, result: dict, **kwargs) -> None:
         super().on_train_result(trainer=trainer, result=result, **kwargs)
diff --git a/python/griddly/util/rllib/environment/core.py b/python/griddly/util/rllib/environment/core.py
index 097f6c7b0..86672c862 100644
--- a/python/griddly/util/rllib/environment/core.py
+++ b/python/griddly/util/rllib/environment/core.py
@@ -106,8 +106,6 @@ def _after_step(self, observation, reward, done, info):
                     self._recording_state = RecordingState.NOT_RECORDING
                     self._global_recorder.close()
 
-                    print(f'finished recording {self._global_recorder.output_file}')
-
                     extra_info['video'] = {
                         'level': self.level_id,
diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py
index c9ee92568..5a3c09277 100644
--- a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py
+++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py
@@ -50,7 +50,6 @@
                 for info in infos:
                     if isinstance(info, dict) and 'valid_action_tree' in info:
                         valid_action_trees.append(info['valid_action_tree'])
-                        print('valid_action_tree_found')
                     else:
                         valid_action_trees.append({})
From 28a107dcf1d2210b1a7b00cbfc6c8c3f5684bf7c Mon Sep 17 00:00:00 2001
From: Bam4d
Date: Sat, 20 Mar 2021 15:18:57 +0000
Subject: [PATCH 08/45] added more options for running/testing on cluster

---
 .../rllib_conditional_actions.py       | 45 ++++++++++++++-----
 python/griddly/util/rllib/callbacks.py | 35 +---------------
 2 files changed, 36 insertions(+), 44 deletions(-)

diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
index 002e40308..1ac7e9cc0 100644
--- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
+++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
@@ -14,7 +14,28 @@
 from griddly.util.rllib.torch.conditional_actions.conditional_action_policy_trainer import \
     ConditionalActionImpalaTrainer
 
+import argparse
+
+parser = argparse.ArgumentParser(description='Run experiments')
+
+parser.add_argument('--root-directory', default=os.path.expanduser("~/ray_results"), help='root directory for all data associated with the run')
+parser.add_argument('--num-gpus', default=1, type=int, help='Number of GPUs to make available')
+
+parser.add_argument('--num-workers', default=8, type=int, help='Number of workers')
+parser.add_argument('--num-envs-per-worker', default=2, type=int, help='Number of environments per worker')
+parser.add_argument('--num-gpus-per-worker', default=0, type=float, help='Number of GPUs per worker')
+parser.add_argument('--num-cpus-per-worker', default=1, type=float, help='Number of CPUs per worker')
+parser.add_argument('--max-training-steps', default=20000000, type=int, help='Maximum number of training steps')
+
+parser.add_argument('--video-directory', default='videos', help='directory of video')
+parser.add_argument('--video-frequency', type=int, default=10000, help='Frequency of videos')
+
+parser.add_argument('--lr', type=float, default=0.0005, help='learning rate')
+
 if __name__ == '__main__':
+
+    args = parser.parse_args()
+
     sep = os.pathsep
     os.environ['PYTHONPATH'] = sep.join(sys.path)
@@ -24,8 +45,8 @@
         os.path.realpath('clusters_po_with_push_seperate_colors.yaml')
     ]
 
-    ray.init(num_gpus=1)
-    #ray.init(num_gpus=1, local_mode=True)
+    ray.init(num_gpus=args.num_gpus)
+    #ray.init(num_gpus=args.num_gpus, local_mode=True)
 
     env_name = "ray-griddly-env"
 
@@ -58,14 +79,16 @@
-    max_training_steps = 1000000
+    max_training_steps = args.max_training_steps
 
     config = {
         'framework': 'torch',
-        'num_workers': 8,
-        'num_envs_per_worker': 1,
+        'num_workers': args.num_workers,
+        'num_envs_per_worker': args.num_envs_per_worker,
+        'num_gpus_per_worker': float(args.num_gpus_per_worker),
+        'num_cpus_per_worker': args.num_cpus_per_worker,
 
         'callbacks': GriddlyCallbacks,
 
@@ -76,12 +99,12 @@
         'env': env_name,
         'env_config': {
             'record_video_config': {
-                'frequency': 100000,
-                'directory': 'videos'
+                'frequency': args.video_frequency,
+                'directory': os.path.join(args.root_directory, args.video_directory)
             },
 
-            'allow_nop': tune.grid_search([True, False]),
-            'invalid_action_masking': tune.grid_search(['none', 'conditional', 'collapsed']),
+            #'allow_nop': tune.grid_search([True, False]),
+            #'invalid_action_masking': tune.grid_search(['none', 'conditional', 'collapsed']),
             # 'invalid_action_masking': 'collapsed',
             # 'allow_nop': False,
             'generate_valid_action_trees': True,
@@ -95,7 +118,7 @@
         'lr_schedule': [
-            [0, 0.0005],
+            [0, args.lr],
             [max_training_steps, 0.0]
         ],
 
@@ -105,4 +128,4 @@
     stop = {
         "timesteps_total": max_training_steps,
     }
 
-    result = tune.run(ConditionalActionImpalaTrainer, config=config, stop=stop, callbacks=[wandbLoggerCallback])
+    result = tune.run(ConditionalActionImpalaTrainer, local_dir=args.root_directory, config=config, stop=stop, callbacks=[wandbLoggerCallback])
diff --git a/python/griddly/util/rllib/callbacks.py b/python/griddly/util/rllib/callbacks.py
index 354c90c3f..dfb1865fe 100644
--- a/python/griddly/util/rllib/callbacks.py
+++ b/python/griddly/util/rllib/callbacks.py
@@ -1,50 +1,19 @@
 from typing import Optional, Dict
 
-from ray.rllib import Policy, SampleBatch, BaseEnv
+from ray.rllib import Policy, BaseEnv
 from ray.rllib.agents.callbacks import DefaultCallbacks
 from ray.rllib.evaluation import MultiAgentEpisode
-from ray.rllib.utils.typing import AgentID, PolicyID
+from ray.rllib.utils.typing import PolicyID
 from wandb import Video
 
 
 class GriddlyCallbacks(DefaultCallbacks):
 
-    def __init__(self, legacy_callbacks_dict: Dict[str, callable] = None):
-        super().__init__(legacy_callbacks_dict)
-
-    def on_episode_start(self, *, worker: "RolloutWorker", base_env: BaseEnv, policies: Dict[PolicyID, Policy],
-                         episode: MultiAgentEpisode, env_index: Optional[int] = None, **kwargs) -> None:
-        super().on_episode_start(worker=worker, base_env=base_env, policies=policies, episode=episode,
-                                 env_index=env_index, **kwargs)
-
-    def on_episode_step(self, *, worker: "RolloutWorker", base_env: BaseEnv, episode: MultiAgentEpisode,
-                        env_index: Optional[int] = None, **kwargs) -> None:
-        super().on_episode_step(worker=worker, base_env=base_env, episode=episode, env_index=env_index, **kwargs)
-
     def on_episode_end(self, *, worker: "RolloutWorker", base_env: BaseEnv, policies: Dict[PolicyID, Policy],
                        episode: MultiAgentEpisode, env_index: Optional[int] = None, **kwargs) -> None:
-        super().on_episode_end(worker=worker, base_env=base_env, policies=policies, episode=episode,
-                               env_index=env_index, **kwargs)
         if not worker.multiagent:
             info = episode.last_info_for()
             if 'video' in info:
                 level = info['video']['level']
                 path = info['video']['path']
                 episode.media[f'level_{level}'] = Video(path)
-
-    def on_postprocess_trajectory(self, *, worker: "RolloutWorker", episode: MultiAgentEpisode, agent_id: AgentID,
-                                  policy_id: PolicyID, policies: Dict[PolicyID, Policy],
-                                  postprocessed_batch: SampleBatch, original_batches: Dict[AgentID, SampleBatch],
-                                  **kwargs) -> None:
-        super().on_postprocess_trajectory(worker=worker, episode=episode, agent_id=agent_id, policy_id=policy_id,
-                                          policies=policies, postprocessed_batch=postprocessed_batch,
-                                          original_batches=original_batches, **kwargs)
-
-    def on_sample_end(self, *, worker: "RolloutWorker", samples: SampleBatch, **kwargs) -> None:
-        super().on_sample_end(worker=worker, samples=samples, **kwargs)
-
-    def on_learn_on_batch(self, *, policy: Policy, train_batch: SampleBatch, result: dict, **kwargs) -> None:
-        pass
-
-    def on_train_result(self, *, trainer, result: dict, **kwargs) -> None:
-        super().on_train_result(trainer=trainer, result=result, **kwargs)
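With this patch the experiment is driven from the command line; an illustrative invocation using only the flags defined above:

    python rllib_conditional_actions.py \
        --root-directory ~/experiments \
        --num-gpus 1 --num-workers 8 --num-envs-per-worker 2 \
        --max-training-steps 20000000 --lr 0.0005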
From 05abd17ee4dea3400c8eb16601a0cc220c88bd6d Mon Sep 17 00:00:00 2001
From: Bam4d
Date: Sat, 20 Mar 2021 15:45:46 +0000
Subject: [PATCH 09/45] don't ever need dashboard

---
 .../conditional_action_spaces/rllib_conditional_actions.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
index 1ac7e9cc0..554a2386e 100644
--- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
+++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
@@ -45,8 +45,8 @@
         os.path.realpath('clusters_po_with_push_seperate_colors.yaml')
     ]
 
-    ray.init(num_gpus=args.num_gpus)
-    #ray.init(num_gpus=args.num_gpus, local_mode=True)
+    ray.init(include_dashboard=False, num_gpus=args.num_gpus)
+    #ray.init(include_dashboard=False, num_gpus=args.num_gpus, local_mode=True)

From 10f5873e1e7f5065924cfc1195aea776bcf4c928 Mon Sep 17 00:00:00 2001
From: Bam4d
Date: Sat, 20 Mar 2021 15:53:25 +0000
Subject: [PATCH 10/45] turn these tune hyperparams back on

---
 .../conditional_action_spaces/rllib_conditional_actions.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
index 554a2386e..d416d00a6 100644
--- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
+++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
@@ -80,8 +80,8 @@
 
-            #'allow_nop': tune.grid_search([True, False]),
-            #'invalid_action_masking': tune.grid_search(['none', 'conditional', 'collapsed']),
+            'allow_nop': tune.grid_search([True, False]),
+            'invalid_action_masking': tune.grid_search(['none', 'conditional', 'collapsed']),
             # 'invalid_action_masking': 'collapsed',
             # 'allow_nop': False,
             'generate_valid_action_trees': True,
From da337feb7b0f0e471d56f66bf0612dc2a972c53f Mon Sep 17 00:00:00 2001
From: Bam4d
Date: Sat, 20 Mar 2021 16:06:45 +0000
Subject: [PATCH 11/45] option for turning off videos

---
 .../rllib_conditional_actions.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
index d416d00a6..957790a45 100644
--- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
+++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
@@ -18,7 +18,8 @@
 
 parser = argparse.ArgumentParser(description='Run experiments')
 
-parser.add_argument('--root-directory', default=os.path.expanduser("~/ray_results"), help='root directory for all data associated with the run')
+parser.add_argument('--root-directory', default=os.path.expanduser("~/ray_results"),
+                    help='root directory for all data associated with the run')
 parser.add_argument('--num-gpus', default=1, type=int, help='Number of GPUs to make available')
 
 parser.add_argument('--num-workers', default=8, type=int, help='Number of workers')
@@ -27,6 +28,7 @@
 parser.add_argument('--num-cpus-per-worker', default=1, type=float, help='Number of CPUs per worker')
 parser.add_argument('--max-training-steps', default=20000000, type=int, help='Maximum number of training steps')
 
+parser.add_argument('--capture-video', action='store_true', help='enable video capture')
 parser.add_argument('--video-directory', default='videos', help='directory of video')
 parser.add_argument('--video-frequency', type=int, default=10000, help='Frequency of videos')
 
@@ -46,7 +48,7 @@
     ]
 
     ray.init(include_dashboard=False, num_gpus=args.num_gpus)
-    #ray.init(include_dashboard=False, num_gpus=args.num_gpus, local_mode=True)
+    # ray.init(include_dashboard=False, num_gpus=args.num_gpus, local_mode=True)
 
     env_name = "ray-griddly-env"
 
@@ -75,10 +77,6 @@
         },
         'env': env_name,
         'env_config': {
-            'record_video_config': {
-                'frequency': args.video_frequency,
-                'directory': os.path.join(args.root_directory, args.video_directory)
-            },
 
             'allow_nop': tune.grid_search([True, False]),
             'invalid_action_masking': tune.grid_search(['none', 'conditional', 'collapsed']),
@@ -101,8 +99,15 @@
 
     }
 
+    if args.capture_video:
+        config['env_config']['record_video_config'] = {
+            'frequency': args.video_frequency,
+            'directory': os.path.join(args.root_directory, args.video_directory)
+        }
+
     stop = {
         "timesteps_total": max_training_steps,
     }
 
-    result = tune.run(ConditionalActionImpalaTrainer, local_dir=args.root_directory, config=config, stop=stop, callbacks=[wandbLoggerCallback])
+    result = tune.run(ConditionalActionImpalaTrainer, local_dir=args.root_directory, config=config, stop=stop,
+                      callbacks=[wandbLoggerCallback])
b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py index d416d00a6..957790a45 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py @@ -18,7 +18,8 @@ parser = argparse.ArgumentParser(description='Run experiments') -parser.add_argument('--root-directory', default=os.path.expanduser("~/ray_results"), help='root directory for all data associated with the run') +parser.add_argument('--root-directory', default=os.path.expanduser("~/ray_results"), + help='root directory for all data associated with the run') parser.add_argument('--num-gpus', default=1, type=int, help='Number of GPUs to make available') parser.add_argument('--num-workers', default=8, type=int, help='Number of workers') @@ -27,6 +28,7 @@ parser.add_argument('--num-cpus-per-worker', default=1, type=float, help='Number of gpus per worker') parser.add_argument('--max-training-steps', default=20000000, type=int, help='Number of workers') +parser.add_argument('--capture-video', action='store_true', help='enable video capture') parser.add_argument('--video-directory', default='videos', help='directory of video') parser.add_argument('--video-frequency', type=int, default=10000, help='Frequency of videos') @@ -46,7 +48,7 @@ ] ray.init(include_dashboard=False, num_gpus=args.num_gpus) - #ray.init(include_dashboard=False, num_gpus=args.num_gpus, local_mode=True) + # ray.init(include_dashboard=False, num_gpus=args.num_gpus, local_mode=True) env_name = "ray-griddly-env" @@ -75,10 +77,6 @@ }, 'env': env_name, 'env_config': { - 'record_video_config': { - 'frequency': args.video_frequency, - 'directory': os.path.join(args.root_directory, args.video_directory) - }, 'allow_nop': tune.grid_search([True, False]), 'invalid_action_masking': tune.grid_search(['none', 'conditional', 'collapsed']), @@ -101,8 +99,15 @@ } + if args.capture_video: + config['env_config']['record_video_config'] = { + 'frequency': args.video_frequency, + 'directory': os.path.join(args.root_directory, args.video_directory) + } + stop = { "timesteps_total": max_training_steps, } - result = tune.run(ConditionalActionImpalaTrainer, local_dir=args.root_directory, config=config, stop=stop, callbacks=[wandbLoggerCallback]) + result = tune.run(ConditionalActionImpalaTrainer, local_dir=args.root_directory, config=config, stop=stop, + callbacks=[wandbLoggerCallback]) From d771199bfdaf97924b956b20ea2adab7e60f75ba Mon Sep 17 00:00:00 2001 From: Bam4d Date: Sun, 21 Mar 2021 11:45:15 +0000 Subject: [PATCH 12/45] adding in option for wandb default directory and set level with argument --- .../conditional_action_spaces/rllib_baseline.py | 4 ++-- .../rllib_conditional_actions.py | 11 +++-------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/python/examples/experiments/conditional_action_spaces/rllib_baseline.py b/python/examples/experiments/conditional_action_spaces/rllib_baseline.py index a89f160c8..46a86849f 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_baseline.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_baseline.py @@ -26,8 +26,8 @@ ] - ray.init(num_gpus=1) - # ray.init(num_gpus=1, local_mode=True) + #ray.init(num_gpus=1) + ray.init(num_gpus=1, local_mode=True) env_name = "ray-griddly-env" diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py 
b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
index 957790a45..4cf15a7bd 100644
--- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
+++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
@@ -41,12 +41,6 @@
     sep = os.pathsep
     os.environ['PYTHONPATH'] = sep.join(sys.path)

-    yaml_files = [
-        os.path.realpath('clusters_po.yaml'),
-        os.path.realpath('clusters_po_with_push.yaml'),
-        os.path.realpath('clusters_po_with_push_seperate_colors.yaml')
-    ]
-
     ray.init(include_dashboard=False, num_gpus=args.num_gpus)
     # ray.init(include_dashboard=False, num_gpus=args.num_gpus, local_mode=True)

@@ -57,7 +51,8 @@

     wandbLoggerCallback = WandbLoggerCallback(
         project='conditional_actions',
-        api_key_file='~/.wandb_rc'
+        api_key_file='~/.wandb_rc',
+        dir=os.path.join(args.root_directory, 'wandb')
     )

     max_training_steps = args.max_training_steps

@@ -84,7 +79,7 @@
             # 'allow_nop': False,
             'generate_valid_action_trees': True,
             'random_level_on_reset': True,
-            'yaml_file': tune.grid_search(yaml_files),
+            'yaml_file': args.yaml_file,
             'global_observer_type': gd.ObserverType.SPRITE_2D,
             'max_steps': 1000,
         },

From 899e1e687cc723d93d31f5c59a6c37b97ea8b68d Mon Sep 17 00:00:00 2001
From: Bam4d
Date: Sun, 21 Mar 2021 11:50:58 +0000
Subject: [PATCH 13/45] yaml file

---
 .../conditional_action_spaces/rllib_conditional_actions.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
index 4cf15a7bd..58f8e585e 100644
--- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
+++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
@@ -18,6 +18,8 @@

 parser = argparse.ArgumentParser(description='Run experiments')

+parser.add_argument('--yaml-file', help='YAML file containing GDY for the game')
+
 parser.add_argument('--root-directory', default=os.path.expanduser("~/ray_results"),
                     help='root directory for all data associated with the run')
 parser.add_argument('--num-gpus', default=1, type=int, help='Number of GPUs to make available')

From 8723b6dad1466f51e6941b7f295e4f78f2a13fc8 Mon Sep 17 00:00:00 2001
From: Bam4d
Date: Sun, 21 Mar 2021 11:56:38 +0000
Subject: [PATCH 14/45] fixing wandb data dir

---
 .../conditional_action_spaces/rllib_conditional_actions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
index 58f8e585e..0353f15e2 100644
--- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
+++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py
@@ -54,7 +54,7 @@
     wandbLoggerCallback = WandbLoggerCallback(
         project='conditional_actions',
         api_key_file='~/.wandb_rc',
-        dir=os.path.join(args.root_directory, 'wandb')
+        dir=args.root_directory
     )

     max_training_steps = args.max_training_steps

From 5d3538b9aab7e2adde0db9fce98077f112e76e9b Mon Sep 17 00:00:00 2001
From: Bam4d
Date: Mon, 22 Mar 2021 13:44:56 +0000
Subject: [PATCH 15/45] vtrace masking option

---
 .../clusters_po_with_push_units.yaml | 330 ++++++++++++++++++
 .../rllib_conditional_actions.py | 9 +-
 python/griddly/util/rllib/callbacks.py | 13 +-
 .../conditional_action_mixin.py | 3 +-
.../conditional_action_policy_trainer.py | 93 ++++- 5 files changed, 436 insertions(+), 12 deletions(-) diff --git a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml index e69de29bb..9904e87e7 100644 --- a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml +++ b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml @@ -0,0 +1,330 @@ +Version: "0.1" +Environment: + Name: Partially Observable Clusters + Description: Cluster the coloured objects together by pushing them against the static coloured blocks. + Observers: + Sprite2D: + TileSize: 24 + BackgroundTile: oryx/oryx_fantasy/floor1-2.png + Variables: + - Name: box_count + InitialValue: 0 + Player: + Observer: + RotateWithAvatar: true + TrackAvatar: true + Height: 5 + Width: 5 + OffsetX: 0 + OffsetY: 2 + AvatarObject: avatar # The player can only control a single avatar in the game + Termination: + Win: + - eq: [box_count, 0] + Lose: + - eq: [broken_box:count, 1] + - eq: [avatar:count, 0] + Levels: + - | + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . 1 1 . . . 2 . 2 . w + w . . . . 1 . . . . . . w + w . . . a . . . . . 2 . w + w . . . . . . . h . . . w + w . . . . 1 . . . . b . w + w . . . . . . 1 . . . . w + w . . . . . . . . A . . w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . 1 . . 2 . c 3 . . w + w . . . . h . . h . . . w + w . . . 2 . . 3 . . 1 . w + w . . . . b . . h . . . w + w . . 3 . . . 2 . . 1 . w + w . . h . h . . . a . . w + w . . . . . A . . . . . w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . . a . . b . . c . . w + w . . . . . . . . . . . w + w . . . . . . . . . . . w + w h h h h h . h h h h h w + w . . . . h . h . . . . w + w . 1 2 . h . h . 1 3 . w + w . 3 . . . . . . . 2 . w + w . . . . . A . . . . . w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . . 1 . 2 . . c . . w + w . . . . . 3 . . 3 . . w + w . . a . 2 . . . h . . w + w . . . . h h . 3 . . . w + w . . 1 . . . . . 2 . . w + w . . . . . 1 . . b . . w + w . . . . . A . . . . . w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . . . . . 1 . . . . w + w . . h . . b . . h . . w + w . . . . 1 . . . . . . w + w . . 3 . . . . 2 . . . w + w . . . a . h . . c . . w + w . . . . 3 . . . . 2 . w + w . . . . . A . . . . . 
w + w w w w w w w w w w w w w + +Actions: + + # A simple action to count the number of boxes in the game at the start + # Not currently a way to do complex things in termination conditions like combine multiple conditions + - Name: box_counter + InputMapping: + Internal: true + Inputs: + 1: + Description: "The only action here is to increment the box count" + Behaviours: + - Src: + Object: [blue_box, red_box, green_box] + Commands: + - incr: box_count + Dst: + Object: [blue_box, red_box, green_box] + + # Define the move action + - Name: move + InputMapping: + Inputs: + 1: + Description: Rotate left + OrientationVector: [-1, 0] + 2: + Description: Move forwards + OrientationVector: [0, -1] + VectorToDest: [0, -1] + 3: + Description: Rotate right + OrientationVector: [1, 0] + Relative: true + Behaviours: + + # Avatar rotates + - Src: + Object: avatar + Commands: + - rot: _dir + Dst: + Object: avatar + + # Avatar can move into empty space + - Src: + Object: avatar + Commands: + - mov: _dest + Dst: + Object: _empty + + # Avatar dies if it hits the spikes + - Src: + Object: avatar + Commands: + - remove: true + - reward: -1 + Dst: + Object: spike + + + - Name: push + InputMapping: + Inputs: + 1: + Description: Push Forwards + OrientationVector: [ 0, -1 ] + VectorToDest: [ 0, -1 ] + Relative: true + Behaviours: + + # Boxes can be pushed by the avatar + - Src: + Object: avatar + Commands: + - mov: _dest + Dst: + Object: [blue_box, green_box, red_box] + Commands: + - cascade: _dest + + # Boxes break if they hit the spikes + - Src: + Object: [ blue_box, green_box, red_box ] + Commands: + - change_to: broken_box + - reward: -1 + Dst: + Object: spike + + # Boxes can pushed into empty space + - Src: + Object: [blue_box, green_box, red_box] + Commands: + - mov: _dest + Dst: + Object: _empty + + # When boxes are pushed against the blocks they change + - Src: + Object: blue_box + Commands: + - change_to: blue_block + - reward: 1 + - decr: box_count + Dst: + Object: blue_block + - Src: + Object: red_box + Commands: + - reward: 1 + - change_to: red_block + - decr: box_count + Dst: + Object: red_block + - Src: + Object: green_box + Commands: + - reward: 1 + - change_to: green_block + - decr: box_count + Dst: + Object: green_block + + +Objects: + - Name: avatar + MapCharacter: A + Observers: + Sprite2D: + - Image: gvgai/oryx/knight1.png + Block2D: + - Shape: triangle + Color: [0.0, 1.0, 0.0] + Scale: 0.8 + + - Name: wall + MapCharacter: w + Observers: + Sprite2D: + - TilingMode: WALL_16 + Image: + - oryx/oryx_fantasy/wall1-0.png + - oryx/oryx_fantasy/wall1-1.png + - oryx/oryx_fantasy/wall1-2.png + - oryx/oryx_fantasy/wall1-3.png + - oryx/oryx_fantasy/wall1-4.png + - oryx/oryx_fantasy/wall1-5.png + - oryx/oryx_fantasy/wall1-6.png + - oryx/oryx_fantasy/wall1-7.png + - oryx/oryx_fantasy/wall1-8.png + - oryx/oryx_fantasy/wall1-9.png + - oryx/oryx_fantasy/wall1-10.png + - oryx/oryx_fantasy/wall1-11.png + - oryx/oryx_fantasy/wall1-12.png + - oryx/oryx_fantasy/wall1-13.png + - oryx/oryx_fantasy/wall1-14.png + - oryx/oryx_fantasy/wall1-15.png + Block2D: + - Shape: square + Color: [0.5, 0.5, 0.5] + Scale: 0.9 + + - Name: spike + MapCharacter: h + Observers: + Sprite2D: + - Image: gvgai/oryx/spike2.png + Block2D: + - Shape: triangle + Color: [0.9, 0.1, 0.1] + Scale: 0.5 + + - Name: red_box + MapCharacter: "2" + InitialActions: + - Action: box_counter + ActionId: 1 + Observers: + Sprite2D: + - Image: gvgai/newset/blockR.png + Block2D: + - Shape: square + Color: [0.5, 0.2, 0.2] + Scale: 0.5 + - Name: red_block + 
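+    # Static target block: pushing a red_box into this object converts the box to a
+    # red_block, gives a +1 reward and decrements box_count (see the push action above).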
MapCharacter: b + Observers: + Sprite2D: + - Image: gvgai/newset/blockR2.png + Block2D: + - Shape: square + Color: [1.0, 0.0, 0.0] + Scale: 1.0 + + - Name: green_box + MapCharacter: "3" + InitialActions: + - Action: box_counter + ActionId: 1 + Observers: + Sprite2D: + - Image: gvgai/newset/blockG.png + Block2D: + - Shape: square + Color: [0.2, 0.5, 0.2] + Scale: 0.5 + - Name: green_block + MapCharacter: c + Observers: + Sprite2D: + - Image: gvgai/newset/blockG2.png + Block2D: + - Shape: square + Color: [0.0, 1.0, 0.0] + Scale: 1.0 + + - Name: blue_box + MapCharacter: "1" + InitialActions: + - Action: box_counter + ActionId: 1 + Observers: + Sprite2D: + - Image: gvgai/newset/blockB.png + Block2D: + - Shape: square + Color: [0.2, 0.2, 0.5] + Scale: 0.5 + - Name: blue_block + MapCharacter: a + Observers: + Sprite2D: + - Image: gvgai/newset/blockB2.png + Block2D: + - Shape: square + Color: [0.0, 0.0, 1.0] + Scale: 1.0 + + - Name: broken_box + Observers: + Sprite2D: + - Image: gvgai/newset/block3.png + Block2D: + - Shape: triangle + Color: [1.0, 0.0, 1.0] + Scale: 1.0 diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py index 0353f15e2..ffc7e3c28 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py @@ -24,8 +24,8 @@ help='root directory for all data associated with the run') parser.add_argument('--num-gpus', default=1, type=int, help='Number of GPUs to make available') -parser.add_argument('--num-workers', default=8, type=int, help='Number of workers') -parser.add_argument('--num-envs-per-worker', default=2, type=int, help='Number of workers') +parser.add_argument('--num-workers', default=11, type=int, help='Number of workers') +parser.add_argument('--num-envs-per-worker', default=5, type=int, help='Number of workers') parser.add_argument('--num-gpus-per-worker', default=0, type=float, help='Number of gpus per worker') parser.add_argument('--num-cpus-per-worker', default=1, type=float, help='Number of gpus per worker') parser.add_argument('--max-training-steps', default=20000000, type=int, help='Number of workers') @@ -44,7 +44,7 @@ os.environ['PYTHONPATH'] = sep.join(sys.path) ray.init(include_dashboard=False, num_gpus=args.num_gpus) - # ray.init(include_dashboard=False, num_gpus=args.num_gpus, local_mode=True) + #ray.init(include_dashboard=False, num_gpus=1, local_mode=True) env_name = "ray-griddly-env" @@ -77,12 +77,13 @@ 'allow_nop': tune.grid_search([True, False]), 'invalid_action_masking': tune.grid_search(['none', 'conditional', 'collapsed']), + 'vtrace_masking': tune.grid_search([True, False]), # 'invalid_action_masking': 'collapsed', # 'allow_nop': False, 'generate_valid_action_trees': True, 'random_level_on_reset': True, 'yaml_file': args.yaml_file, - 'global_observer_type': gd.ObserverType.SPRITE_2D, + 'global_observer_type': gd.ObserverType.VECTOR, 'max_steps': 1000, }, 'entropy_coeff_schedule': [ diff --git a/python/griddly/util/rllib/callbacks.py b/python/griddly/util/rllib/callbacks.py index dfb1865fe..37767b835 100644 --- a/python/griddly/util/rllib/callbacks.py +++ b/python/griddly/util/rllib/callbacks.py @@ -11,9 +11,10 @@ class GriddlyCallbacks(DefaultCallbacks): def on_episode_end(self, *, worker: "RolloutWorker", base_env: BaseEnv, policies: Dict[PolicyID, Policy], episode: MultiAgentEpisode, env_index: Optional[int] = 
None, **kwargs) -> None: - if not worker.multiagent: - info = episode.last_info_for() - if 'video' in info: - level = info['video']['level'] - path = info['video']['path'] - episode.media[f'level_{level}'] = Video(path) + if worker.worker_index == 0 and env_index == 0: + if not worker.multiagent: + info = episode.last_info_for() + if 'video' in info: + level = info['video']['level'] + path = info['video']['path'] + episode.media[f'level_{level}'] = Video(path) diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py index 5a3c09277..9d714fa45 100644 --- a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py +++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py @@ -80,7 +80,8 @@ def compute_actions_from_input_dict( extra_fetches = { SampleBatch.ACTION_DIST_INPUTS: masked_logits, SampleBatch.ACTION_PROB: torch.exp(logp.float()), - SampleBatch.ACTION_LOGP: logp + SampleBatch.ACTION_LOGP: logp, + 'invalid_action_mask': mask } # Update our global timestep by the batch size. diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py index 20d82ead6..20252f4e9 100644 --- a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py +++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py @@ -1,10 +1,100 @@ +import gym +import numpy as np +import torch +from ray.rllib import SampleBatch from ray.rllib.agents.impala import ImpalaTrainer -from ray.rllib.agents.impala.vtrace_torch_policy import VTraceTorchPolicy +from ray.rllib.agents.impala.vtrace_tf_policy import build_vtrace_loss +from ray.rllib.agents.impala.vtrace_torch_policy import VTraceTorchPolicy, VTraceLoss, make_time_major +from ray.rllib.models.torch.torch_action_dist import TorchCategorical from ray.rllib.policy.torch_policy import LearningRateSchedule, EntropyCoeffSchedule +from tensorflow import sequence_mask from griddly.util.rllib.torch.conditional_actions.conditional_action_mixin import ConditionalActionMixin +def build_invalid_masking_vtrace_loss(policy, model, dist_class, train_batch): + + if not policy.config['env_config'].get('vtrace_masking', False): + return build_vtrace_loss(policy, model, dist_class, train_batch) + + model_out, _ = model.from_batch(train_batch) + + if isinstance(policy.action_space, gym.spaces.Discrete): + is_multidiscrete = False + output_hidden_shape = [policy.action_space.n] + elif isinstance(policy.action_space, gym.spaces.MultiDiscrete): + is_multidiscrete = True + output_hidden_shape = policy.action_space.nvec.astype(np.int32) + else: + is_multidiscrete = False + output_hidden_shape = 1 + + def _make_time_major(*args, **kw): + return make_time_major(policy, train_batch.get("seq_lens"), *args, + **kw) + + actions = train_batch[SampleBatch.ACTIONS] + dones = train_batch[SampleBatch.DONES] + rewards = train_batch[SampleBatch.REWARDS] + behaviour_action_logp = train_batch[SampleBatch.ACTION_LOGP] + behaviour_logits = train_batch[SampleBatch.ACTION_DIST_INPUTS] + + invalid_action_mask = train_batch['invalid_action_mask'] + + if 'seq_lens' in train_batch: + max_seq_len = policy.config['rollout_fragment_length'] + mask_orig = sequence_mask(train_batch["seq_lens"], max_seq_len) + mask = torch.reshape(mask_orig, [-1]) + else: + mask = 
torch.ones_like(rewards) + + model_out += torch.log(invalid_action_mask) + action_dist = dist_class(model_out, model) + + if isinstance(output_hidden_shape, (list, tuple, np.ndarray)): + unpacked_behaviour_logits = torch.split( + behaviour_logits, list(output_hidden_shape), dim=1) + unpacked_outputs = torch.split( + model_out, list(output_hidden_shape), dim=1) + else: + unpacked_behaviour_logits = torch.chunk( + behaviour_logits, output_hidden_shape, dim=1) + unpacked_outputs = torch.chunk(model_out, output_hidden_shape, dim=1) + values = model.value_function() + + # Prepare actions for loss. + loss_actions = actions if is_multidiscrete else torch.unsqueeze( + actions, dim=1) + + # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc. + policy.loss = VTraceLoss( + actions=_make_time_major(loss_actions, drop_last=True), + actions_logp=_make_time_major( + action_dist.logp(actions), drop_last=True), + actions_entropy=_make_time_major( + action_dist.entropy(), drop_last=True), + dones=_make_time_major(dones, drop_last=True), + behaviour_action_logp=_make_time_major( + behaviour_action_logp, drop_last=True), + behaviour_logits=_make_time_major( + unpacked_behaviour_logits, drop_last=True), + target_logits=_make_time_major(unpacked_outputs, drop_last=True), + discount=policy.config["gamma"], + rewards=_make_time_major(rewards, drop_last=True), + values=_make_time_major(values, drop_last=True), + bootstrap_value=_make_time_major(values)[-1], + dist_class=TorchCategorical if is_multidiscrete else dist_class, + model=model, + valid_mask=_make_time_major(mask, drop_last=True), + config=policy.config, + vf_loss_coeff=policy.config["vf_loss_coeff"], + entropy_coeff=policy.entropy_coeff, + clip_rho_threshold=policy.config["vtrace_clip_rho_threshold"], + clip_pg_rho_threshold=policy.config["vtrace_clip_pg_rho_threshold"]) + + return policy.loss.total_loss + + def setup_mixins(policy, obs_space, action_space, config): ConditionalActionMixin.__init__(policy) EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"], @@ -14,6 +104,7 @@ def setup_mixins(policy, obs_space, action_space, config): ConditionalActionVTraceTorchPolicy = VTraceTorchPolicy.with_updates( name="ConditionalActionVTraceTorchPolicy", + loss_fn=build_invalid_masking_vtrace_loss, before_init=setup_mixins, mixins=[LearningRateSchedule, EntropyCoeffSchedule, ConditionalActionMixin] ) From 9f88c0f91d9a48e36504389c9d605e82d136079d Mon Sep 17 00:00:00 2001 From: Bam4d Date: Mon, 22 Mar 2021 13:46:45 +0000 Subject: [PATCH 16/45] restrict cpus to same number of workers --- .../conditional_action_spaces/rllib_conditional_actions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py index ffc7e3c28..097d6a5bd 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py @@ -43,7 +43,7 @@ sep = os.pathsep os.environ['PYTHONPATH'] = sep.join(sys.path) - ray.init(include_dashboard=False, num_gpus=args.num_gpus) + ray.init(include_dashboard=False, num_gpus=args.num_gpus, num_cpus=args.num_workers) #ray.init(include_dashboard=False, num_gpus=1, local_mode=True) env_name = "ray-griddly-env" From 2a5784c2bb6f56d337dc803d17dd6bfe17a66a57 Mon Sep 17 00:00:00 2001 From: Bam4d Date: Mon, 22 Mar 2021 14:23:56 +0000 Subject: [PATCH 17/45] num 
cpus has to be independent of num workers as one cpu is used for impala --- .../conditional_action_spaces/rllib_conditional_actions.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py index 097d6a5bd..5d91fdcb8 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py @@ -22,7 +22,8 @@ parser.add_argument('--root-directory', default=os.path.expanduser("~/ray_results"), help='root directory for all data associated with the run') -parser.add_argument('--num-gpus', default=1, type=int, help='Number of GPUs to make available') +parser.add_argument('--num-gpus', default=1, type=int, help='Number of GPUs to make available to ray.') +parser.add_argument('--num-cpus', default=8, type=int, help='Number of CPUs to make available to ray.') parser.add_argument('--num-workers', default=11, type=int, help='Number of workers') parser.add_argument('--num-envs-per-worker', default=5, type=int, help='Number of workers') @@ -43,7 +44,7 @@ sep = os.pathsep os.environ['PYTHONPATH'] = sep.join(sys.path) - ray.init(include_dashboard=False, num_gpus=args.num_gpus, num_cpus=args.num_workers) + ray.init(include_dashboard=False, num_gpus=args.num_gpus, num_cpus=args.num_cpus) #ray.init(include_dashboard=False, num_gpus=1, local_mode=True) env_name = "ray-griddly-env" From eebfdd6b0bee91ef5909b02e7b6ad31bd0ee1235 Mon Sep 17 00:00:00 2001 From: Bam4d Date: Mon, 22 Mar 2021 16:33:03 +0000 Subject: [PATCH 18/45] invalid action mask fixes --- .../conditional_action_spaces/rllib_conditional_actions.py | 6 +++--- .../conditional_action_policy_trainer.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py index 5d91fdcb8..b9ff11e4e 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py @@ -25,7 +25,7 @@ parser.add_argument('--num-gpus', default=1, type=int, help='Number of GPUs to make available to ray.') parser.add_argument('--num-cpus', default=8, type=int, help='Number of CPUs to make available to ray.') -parser.add_argument('--num-workers', default=11, type=int, help='Number of workers') +parser.add_argument('--num-workers', default=7, type=int, help='Number of workers') parser.add_argument('--num-envs-per-worker', default=5, type=int, help='Number of workers') parser.add_argument('--num-gpus-per-worker', default=0, type=float, help='Number of gpus per worker') parser.add_argument('--num-cpus-per-worker', default=1, type=float, help='Number of gpus per worker') @@ -45,7 +45,7 @@ os.environ['PYTHONPATH'] = sep.join(sys.path) ray.init(include_dashboard=False, num_gpus=args.num_gpus, num_cpus=args.num_cpus) - #ray.init(include_dashboard=False, num_gpus=1, local_mode=True) + #ray.init(include_dashboard=False, num_gpus=1, num_cpus=2, local_mode=True) env_name = "ray-griddly-env" @@ -84,7 +84,7 @@ 'generate_valid_action_trees': True, 'random_level_on_reset': True, 'yaml_file': args.yaml_file, - 'global_observer_type': gd.ObserverType.VECTOR, + 'global_observer_type': gd.ObserverType.SPRITE_2D, 'max_steps': 
1000, }, 'entropy_coeff_schedule': [ diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py index 20252f4e9..9bb3ffcd9 100644 --- a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py +++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py @@ -13,7 +13,6 @@ def build_invalid_masking_vtrace_loss(policy, model, dist_class, train_batch): - if not policy.config['env_config'].get('vtrace_masking', False): return build_vtrace_loss(policy, model, dist_class, train_batch) From afcbd48f443636c98143f7ad32ea2fce15fd8d0b Mon Sep 17 00:00:00 2001 From: Bam4d Date: Mon, 22 Mar 2021 19:20:51 +0000 Subject: [PATCH 19/45] video frequency too high --- .../conditional_action_spaces/rllib_conditional_actions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py index b9ff11e4e..aca66deba 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py @@ -33,7 +33,7 @@ parser.add_argument('--capture-video', action='store_true', help='enable video capture') parser.add_argument('--video-directory', default='videos', help='directory of video') -parser.add_argument('--video-frequency', type=int, default=10000, help='Frequency of videos') +parser.add_argument('--video-frequency', type=int, default=1000000, help='Frequency of videos') parser.add_argument('--lr', type=float, default=0.0005, help='learning rate') From 94b3af036691bd6512b4334f1ceb5f4a5143d620 Mon Sep 17 00:00:00 2001 From: Bam4d Date: Tue, 23 Mar 2021 10:39:41 +0000 Subject: [PATCH 20/45] vtrace masking not in grid-search --- .../clusters_po_with_push_units.yaml | 192 ++++++------------ .../rllib_conditional_actions.py | 2 +- 2 files changed, 58 insertions(+), 136 deletions(-) diff --git a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml index 9904e87e7..879a28681 100644 --- a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml +++ b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml @@ -9,15 +9,6 @@ Environment: Variables: - Name: box_count InitialValue: 0 - Player: - Observer: - RotateWithAvatar: true - TrackAvatar: true - Height: 5 - Width: 5 - OffsetX: 0 - OffsetY: 2 - AvatarObject: avatar # The player can only control a single avatar in the game Termination: Win: - eq: [box_count, 0] @@ -26,60 +17,60 @@ Environment: - eq: [avatar:count, 0] Levels: - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . 1 1 . . . 2 . 2 . w - w . . . . 1 . . . . . . w - w . . . a . . . . . 2 . w - w . . . . . . . h . . . w - w . . . . 1 . . . . b . w - w . . . . . . 1 . . . . w - w . . . . . . . . A . . w - w w w w w w w w w w w w w + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . b1 b1 . . . r1 . r1 . w + w . . . . b1 . . . . . . w + w . . . B . . . . . r1 . w + w . . . . . . . x . . . w + w . . . . b1 . . . . R . w + w . . . . . . b1 . . . . w + w . . . . . . . . . . . w + w w w w w w w w w w w w w - | - w w w w w w w w w w w w w - w . 
. . . . . . . . . . w - w . . 1 . . 2 . c 3 . . w - w . . . . h . . h . . . w - w . . . 2 . . 3 . . 1 . w - w . . . . b . . h . . . w - w . . 3 . . . 2 . . 1 . w - w . . h . h . . . a . . w - w . . . . . A . . . . . w - w w w w w w w w w w w w w + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . b1 . . r1 . G g1 . . w + w . . . . x . . x . . . w + w . . . r1 . . g1 . . b1 . w + w . . . . b . . h . . . w + w . . g1 . . . r1 . . b1 . w + w . . x . x . . . B . . w + w . . . . . . . . . . . w + w w w w w w w w w w w w w - | - w w w w w w w w w w w w w - w . . a . . b . . c . . w - w . . . . . . . . . . . w - w . . . . . . . . . . . w - w h h h h h . h h h h h w - w . . . . h . h . . . . w - w . 1 2 . h . h . 1 3 . w - w . 3 . . . . . . . 2 . w - w . . . . . A . . . . . w - w w w w w w w w w w w w w + w w w w w w w w w w w w w + w . . B . . R . . G . . w + w . . . . . . . . . . . w + w . . . . . . . . . . . w + w x x x x x . x x x x x w + w . . . . x . x . . . . w + w . b1 r1 . x . x . b1 g1 . w + w . g1 . . . . . . . r1 . w + w . . . . . . . . . . . w + w w w w w w w w w w w w w - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . . 1 . 2 . . c . . w - w . . . . . 3 . . 3 . . w - w . . a . 2 . . . h . . w - w . . . . h h . 3 . . . w - w . . 1 . . . . . 2 . . w - w . . . . . 1 . . b . . w - w . . . . . A . . . . . w - w w w w w w w w w w w w w + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . . b1 . r1 . . G . . w + w . . . . . g1 . . g1 . . w + w . . B . r1 . . . x . . w + w . . . . x x . g1 . . . w + w . . b1 . . . . . r1 . . w + w . . . . . b1 . . R . . w + w . . . . . . . . . . . w + w w w w w w w w w w w w w - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . . . . . 1 . . . . w - w . . h . . b . . h . . w - w . . . . 1 . . . . . . w - w . . 3 . . . . 2 . . . w - w . . . a . h . . c . . w - w . . . . 3 . . . . 2 . w - w . . . . . A . . . . . w - w w w w w w w w w w w w w + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . . . . . b1 . . . . w + w . . x . . R . . h . . w + w . . . . b1 . . . . . . w + w . . g1 . . . . r1 . . . w + w . . . B . x . . G . . w + w . . . . g1 . . . . r1 . w + w . . . . . . . . . . . 
w + w w w w w w w w w w w w w Actions: @@ -99,69 +90,9 @@ Actions: Dst: Object: [blue_box, red_box, green_box] - # Define the move action - - Name: move - InputMapping: - Inputs: - 1: - Description: Rotate left - OrientationVector: [-1, 0] - 2: - Description: Move forwards - OrientationVector: [0, -1] - VectorToDest: [0, -1] - 3: - Description: Rotate right - OrientationVector: [1, 0] - Relative: true - Behaviours: - - # Avatar rotates - - Src: - Object: avatar - Commands: - - rot: _dir - Dst: - Object: avatar - - # Avatar can move into empty space - - Src: - Object: avatar - Commands: - - mov: _dest - Dst: - Object: _empty - - # Avatar dies if it hits the spikes - - Src: - Object: avatar - Commands: - - remove: true - - reward: -1 - Dst: - Object: spike - - - Name: push - InputMapping: - Inputs: - 1: - Description: Push Forwards - OrientationVector: [ 0, -1 ] - VectorToDest: [ 0, -1 ] - Relative: true Behaviours: - # Boxes can be pushed by the avatar - - Src: - Object: avatar - Commands: - - mov: _dest - Dst: - Object: [blue_box, green_box, red_box] - Commands: - - cascade: _dest - # Boxes break if they hit the spikes - Src: Object: [ blue_box, green_box, red_box ] @@ -207,15 +138,6 @@ Actions: Objects: - - Name: avatar - MapCharacter: A - Observers: - Sprite2D: - - Image: gvgai/oryx/knight1.png - Block2D: - - Shape: triangle - Color: [0.0, 1.0, 0.0] - Scale: 0.8 - Name: wall MapCharacter: w @@ -245,7 +167,7 @@ Objects: Scale: 0.9 - Name: spike - MapCharacter: h + MapCharacter: x Observers: Sprite2D: - Image: gvgai/oryx/spike2.png @@ -255,7 +177,7 @@ Objects: Scale: 0.5 - Name: red_box - MapCharacter: "2" + MapCharacter: r InitialActions: - Action: box_counter ActionId: 1 @@ -267,7 +189,7 @@ Objects: Color: [0.5, 0.2, 0.2] Scale: 0.5 - Name: red_block - MapCharacter: b + MapCharacter: R Observers: Sprite2D: - Image: gvgai/newset/blockR2.png @@ -277,7 +199,7 @@ Objects: Scale: 1.0 - Name: green_box - MapCharacter: "3" + MapCharacter: g InitialActions: - Action: box_counter ActionId: 1 @@ -289,7 +211,7 @@ Objects: Color: [0.2, 0.5, 0.2] Scale: 0.5 - Name: green_block - MapCharacter: c + MapCharacter: G Observers: Sprite2D: - Image: gvgai/newset/blockG2.png @@ -299,7 +221,7 @@ Objects: Scale: 1.0 - Name: blue_box - MapCharacter: "1" + MapCharacter: b InitialActions: - Action: box_counter ActionId: 1 @@ -311,7 +233,7 @@ Objects: Color: [0.2, 0.2, 0.5] Scale: 0.5 - Name: blue_block - MapCharacter: a + MapCharacter: B Observers: Sprite2D: - Image: gvgai/newset/blockB2.png diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py index aca66deba..d96526562 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py @@ -78,7 +78,7 @@ 'allow_nop': tune.grid_search([True, False]), 'invalid_action_masking': tune.grid_search(['none', 'conditional', 'collapsed']), - 'vtrace_masking': tune.grid_search([True, False]), + 'vtrace_masking': False, # 'invalid_action_masking': 'collapsed', # 'allow_nop': False, 'generate_valid_action_trees': True, From d7950ed25f1ce7c07abfa00d26363ab889d4595e Mon Sep 17 00:00:00 2001 From: Bam4d Date: Tue, 23 Mar 2021 10:40:55 +0000 Subject: [PATCH 21/45] allow not not in grid-search --- .../conditional_action_spaces/rllib_conditional_actions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py index d96526562..2e6d37260 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py @@ -76,7 +76,7 @@ 'env': env_name, 'env_config': { - 'allow_nop': tune.grid_search([True, False]), + 'allow_nop': True, 'invalid_action_masking': tune.grid_search(['none', 'conditional', 'collapsed']), 'vtrace_masking': False, # 'invalid_action_masking': 'collapsed', From a10958c2ce76fe70f6f90451278af289ce6662a7 Mon Sep 17 00:00:00 2001 From: Bam4d Date: Tue, 23 Mar 2021 10:49:15 +0000 Subject: [PATCH 22/45] add vtrace masking and no-ops as command flags --- .../conditional_action_spaces/rllib_conditional_actions.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py index 2e6d37260..4cebac626 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py @@ -35,6 +35,9 @@ parser.add_argument('--video-directory', default='videos', help='directory of video') parser.add_argument('--video-frequency', type=int, default=1000000, help='Frequency of videos') +parser.add_argument('--allow-nop', action='store_true', default=False, help='allow NOP actions in action tree') +parser.add_argument('--vtrace-masking', action='store_true', default=False, help='use masks in vtrace calculations') + parser.add_argument('--lr', type=float, default=0.0005, help='learning rate') if __name__ == '__main__': @@ -76,9 +79,9 @@ 'env': env_name, 'env_config': { - 'allow_nop': True, + 'allow_nop': args.allow_nop, 'invalid_action_masking': tune.grid_search(['none', 'conditional', 'collapsed']), - 'vtrace_masking': False, + 'vtrace_masking': args.vtrace_masking, # 'invalid_action_masking': 'collapsed', # 'allow_nop': False, 'generate_valid_action_trees': True, From b3440a0a6ccf4b5d83d4d1c8ffdbe1bebd398d04 Mon Sep 17 00:00:00 2001 From: Bam4d Date: Tue, 23 Mar 2021 11:02:25 +0000 Subject: [PATCH 23/45] add seed for consistency --- .../conditional_action_spaces/rllib_conditional_actions.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py index 4cebac626..aff2108a0 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py @@ -38,6 +38,8 @@ parser.add_argument('--allow-nop', action='store_true', default=False, help='allow NOP actions in action tree') parser.add_argument('--vtrace-masking', action='store_true', default=False, help='use masks in vtrace calculations') +parser.add_argument('--seed', type=int, default=69420, help='seed for experiments') + parser.add_argument('--lr', type=float, default=0.0005, help='learning rate') if __name__ == '__main__': @@ -65,6 +67,7 @@ config = { 'framework': 'torch', + 'seed': args.seed, 'num_workers': args.num_workers, 'num_envs_per_worker': args.num_envs_per_worker, 'num_gpus_per_worker': 
float(args.num_gpus_per_worker), From edf7d771c533c8bce8d69a2ec2f323ec26ac488a Mon Sep 17 00:00:00 2001 From: Bam4d Date: Wed, 24 Mar 2021 20:44:34 +0000 Subject: [PATCH 24/45] allowing the x+y values to be placed in trees, making rts-style clusters --- bindings/wrapper/GameWrapper.cpp | 24 +- ...rs_po_with_push_separate_colors_units.yaml | 304 ++++++++++++++++++ .../clusters_po_with_push_units.yaml | 46 +-- .../rllib_baseline.py | 76 +++-- .../rllib_conditional_actions.py | 10 +- python/griddly/util/rllib/callbacks.py | 2 +- .../conditional_action_exploration.py | 30 +- .../conditional_action_mixin.py | 11 +- .../conditional_action_policy_trainer.py | 2 +- 9 files changed, 425 insertions(+), 80 deletions(-) diff --git a/bindings/wrapper/GameWrapper.cpp b/bindings/wrapper/GameWrapper.cpp index 4a74ca412..de3f0175b 100644 --- a/bindings/wrapper/GameWrapper.cpp +++ b/bindings/wrapper/GameWrapper.cpp @@ -91,21 +91,21 @@ class Py_GameWrapper { auto actionIdsForName = gameProcess_->getAvailableActionIdsAtLocation(locationVec, actionName); if (actionIdsForName.size() > 0) { - // if (gdyFactory_->getAvatarObject().length() == 0) { - // auto py_x = py::cast(locationVec[0]); - // auto py_y = py::cast(locationVec[1]); - // if(!treePtr.contains(py_x)) { - // (*treePtr)[py_x] = py::dict(); - // } + if (gdyFactory_->getAvatarObject().length() == 0) { + auto py_x = locationVec[0]; + auto py_y = locationVec[1]; + if(!treePtr->contains(py_x)) { + treePtr->add(py_x); + } - // treePtr = treePtr[py_x]; + treePtr = treePtr->children[py_x]; - // if(!treePtr.contains(py_y)) { - // treePtr[py_y] = py::dict(); - // } + if(!treePtr->contains(py_y)) { + treePtr->add(py_y); + } - // treePtr = treePtr[py_y]; - // } + treePtr = treePtr->children[py_y]; + } if (externalActionNames.size() > 1) { auto actionTypeId = getActionTypeId(actionName); diff --git a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors_units.yaml b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors_units.yaml index e69de29bb..1fba6490c 100644 --- a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors_units.yaml +++ b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors_units.yaml @@ -0,0 +1,304 @@ +Version: "0.1" +Environment: + Name: Partially Observable Clusters + Description: Cluster the coloured objects together by pushing them against the static coloured blocks. + Observers: + Sprite2D: + TileSize: 24 + BackgroundTile: oryx/oryx_fantasy/floor1-2.png + Variables: + - Name: box_count + InitialValue: 0 + PerPlayer: true + - Name: broken_boxes + InitialValue: 0 + PerPlayer: true + Player: + Count: 1 + Termination: + Win: + - eq: [ box_count, 0 ] + Lose: + - eq: [ broken_boxes, 1 ] + Levels: + - | + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . b1 b1 . . . r1 . r1 . w + w . . . . b1 . . . . . . w + w . . . B . . . . . r1 . w + w . . . . . . . x . . . w + w . . . . b1 . . . . R . w + w . . . . . . b1 . . . . w + w . . . . . . . . . . . w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . b1 . . r1 . G g1 . . w + w . . . . x . . x . . . w + w . . . r1 . . g1 . . b1 . w + w . . . . b . . x . . . w + w . . g1 . . . r1 . . b1 . w + w . . x . x . . . B . . w + w . . . . . . . . . . . w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . . B . . R . . G . . w + w . . . . . . . . . . . w + w . . . . . . . . . . . 
w + w x x x x x . x x x x x w + w . . . . x . x . . . . w + w . b1 r1 . x . x . b1 g1 . w + w . g1 . . . . . . . r1 . w + w . . . . . . . . . . . w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . . b1 . r1 . . G . . w + w . . . . . g1 . . g1 . . w + w . . B . r1 . . . x . . w + w . . . . x x . g1 . . . w + w . . b1 . . . . . r1 . . w + w . . . . . b1 . . R . . w + w . . . . . . . . . . . w + w w w w w w w w w w w w w + - | + w w w w w w w w w w w w w + w . . . . . . . . . . . w + w . . . . . . b1 . . . . w + w . . x . . R . . x . . w + w . . . . b1 . . . . . . w + w . . g1 . . . . r1 . . . w + w . . . B . x . . G . . w + w . . . . g1 . . . . r1 . w + w . . . . . . . . . . . w + w w w w w w w w w w w w w + +Actions: + + # A simple action to count the number of boxes in the game at the start + # Not currently a way to do complex things in termination conditions like combine multiple conditions + - Name: box_counter + InputMapping: + Internal: true + Inputs: + 1: + Description: "The only action here is to increment the box count" + Behaviours: + - Src: + Object: [ blue_box, red_box, green_box ] + Commands: + - incr: box_count + Dst: + Object: [ blue_box, red_box, green_box ] + + - Name: push_blue + Behaviours: + + # Boxes break if they are pushed into the spikes + - Src: + Object: blue_box + Commands: + - incr: broken_boxes + - change_to: broken_box + - reward: -1 + Dst: + Object: spike + + # Boxes can pushed into empty space + - Src: + Object: blue_box + Commands: + - mov: _dest + Dst: + Object: _empty + + # When boxes are pushed against the blocks they change + - Src: + Object: blue_box + Commands: + - change_to: blue_block + - reward: 1 + - decr: box_count + Dst: + Object: blue_block + + - Name: push_red + Behaviours: + + # Boxes break if they are pushed into the spikes + - Src: + Object: red_box + Commands: + - incr: broken_boxes + - change_to: broken_box + - reward: -1 + Dst: + Object: spike + + # Boxes can pushed into empty space + - Src: + Object: red_box + Commands: + - mov: _dest + Dst: + Object: _empty + + # When boxes are pushed against the blocks they change + - Src: + Object: red_box + Commands: + - reward: 1 + - change_to: red_block + - decr: box_count + Dst: + Object: red_block + + - Name: push_green + Behaviours: + + # Boxes break if they are pushed into the spikes + - Src: + Object: green_box + Commands: + - incr: broken_boxes + - change_to: broken_box + - reward: -1 + Dst: + Object: spike + + # Boxes can pushed into empty space + - Src: + Object: green_box + Commands: + - mov: _dest + Dst: + Object: _empty + + # When boxes are pushed against the blocks they change + - Src: + Object: green_box + Commands: + - reward: 1 + - change_to: green_block + - decr: box_count + Dst: + Object: green_block + + +Objects: + + - Name: wall + MapCharacter: w + Observers: + Sprite2D: + - TilingMode: WALL_16 + Image: + - oryx/oryx_fantasy/wall1-0.png + - oryx/oryx_fantasy/wall1-1.png + - oryx/oryx_fantasy/wall1-2.png + - oryx/oryx_fantasy/wall1-3.png + - oryx/oryx_fantasy/wall1-4.png + - oryx/oryx_fantasy/wall1-5.png + - oryx/oryx_fantasy/wall1-6.png + - oryx/oryx_fantasy/wall1-7.png + - oryx/oryx_fantasy/wall1-8.png + - oryx/oryx_fantasy/wall1-9.png + - oryx/oryx_fantasy/wall1-10.png + - oryx/oryx_fantasy/wall1-11.png + - oryx/oryx_fantasy/wall1-12.png + - oryx/oryx_fantasy/wall1-13.png + - oryx/oryx_fantasy/wall1-14.png + - oryx/oryx_fantasy/wall1-15.png + Block2D: + - Shape: square + Color: [ 0.5, 0.5, 0.5 ] + Scale: 0.9 + + - Name: spike + 
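+    # Hazard object: the push_* behaviours above change any box pushed onto a spike
+    # into a broken_box and increment broken_boxes, triggering the Lose condition.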
MapCharacter: x + Observers: + Sprite2D: + - Image: gvgai/oryx/spike2.png + Block2D: + - Shape: triangle + Color: [ 0.9, 0.1, 0.1 ] + Scale: 0.5 + + - Name: red_box + MapCharacter: r + InitialActions: + - Action: box_counter + ActionId: 1 + Observers: + Sprite2D: + - Image: gvgai/newset/blockR.png + Block2D: + - Shape: square + Color: [ 0.5, 0.2, 0.2 ] + Scale: 0.5 + - Name: red_block + MapCharacter: R + Observers: + Sprite2D: + - Image: gvgai/newset/blockR2.png + Block2D: + - Shape: square + Color: [ 1.0, 0.0, 0.0 ] + Scale: 1.0 + + - Name: green_box + MapCharacter: g + InitialActions: + - Action: box_counter + ActionId: 1 + Observers: + Sprite2D: + - Image: gvgai/newset/blockG.png + Block2D: + - Shape: square + Color: [ 0.2, 0.5, 0.2 ] + Scale: 0.5 + - Name: green_block + MapCharacter: G + Observers: + Sprite2D: + - Image: gvgai/newset/blockG2.png + Block2D: + - Shape: square + Color: [ 0.0, 1.0, 0.0 ] + Scale: 1.0 + + - Name: blue_box + MapCharacter: b + InitialActions: + - Action: box_counter + ActionId: 1 + Observers: + Sprite2D: + - Image: gvgai/newset/blockB.png + Block2D: + - Shape: square + Color: [ 0.2, 0.2, 0.5 ] + Scale: 0.5 + - Name: blue_block + MapCharacter: B + Observers: + Sprite2D: + - Image: gvgai/newset/blockB2.png + Block2D: + - Shape: square + Color: [ 0.0, 0.0, 1.0 ] + Scale: 1.0 + + - Name: broken_box + Observers: + Sprite2D: + - Image: gvgai/newset/block3.png + Block2D: + - Shape: triangle + Color: [ 1.0, 0.0, 1.0 ] + Scale: 1.0 diff --git a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml index 879a28681..4a00e3a8c 100644 --- a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml +++ b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml @@ -9,12 +9,17 @@ Environment: Variables: - Name: box_count InitialValue: 0 + PerPlayer: true + - Name: broken_boxes + InitialValue: 0 + PerPlayer: true + Player: + Count: 1 Termination: Win: - - eq: [box_count, 0] + - eq: [ box_count, 0 ] Lose: - - eq: [broken_box:count, 1] - - eq: [avatar:count, 0] + - eq: [ broken_boxes, 1 ] Levels: - | w w w w w w w w w w w w w @@ -33,7 +38,7 @@ Environment: w . . b1 . . r1 . G g1 . . w w . . . . x . . x . . . w w . . . r1 . . g1 . . b1 . w - w . . . . b . . h . . . w + w . . . . b . . x . . . w w . . g1 . . . r1 . . b1 . w w . . x . x . . . B . . w w . . . . . . . . . . . w @@ -64,7 +69,7 @@ Environment: w w w w w w w w w w w w w w . . . . . . . . . . . w w . . . . . . b1 . . . . w - w . . x . . R . . h . . w + w . . x . . R . . x . . w w . . . . b1 . . . . . . w w . . g1 . . . . r1 . . . w w . . . B . x . . G . . 
w @@ -84,11 +89,11 @@ Actions: Description: "The only action here is to increment the box count" Behaviours: - Src: - Object: [blue_box, red_box, green_box] + Object: [ blue_box, red_box, green_box ] Commands: - incr: box_count Dst: - Object: [blue_box, red_box, green_box] + Object: [ blue_box, red_box, green_box ] - Name: push Behaviours: @@ -97,6 +102,7 @@ Actions: - Src: Object: [ blue_box, green_box, red_box ] Commands: + - incr: broken_boxes - change_to: broken_box - reward: -1 Dst: @@ -104,7 +110,7 @@ Actions: # Boxes can pushed into empty space - Src: - Object: [blue_box, green_box, red_box] + Object: [ blue_box, green_box, red_box ] Commands: - mov: _dest Dst: @@ -116,7 +122,7 @@ Actions: Commands: - change_to: blue_block - reward: 1 - - decr: box_count + - decr: box_count Dst: Object: blue_block - Src: @@ -124,7 +130,7 @@ Actions: Commands: - reward: 1 - change_to: red_block - - decr: box_count + - decr: box_count Dst: Object: red_block - Src: @@ -132,7 +138,7 @@ Actions: Commands: - reward: 1 - change_to: green_block - - decr: box_count + - decr: box_count Dst: Object: green_block @@ -163,7 +169,7 @@ Objects: - oryx/oryx_fantasy/wall1-15.png Block2D: - Shape: square - Color: [0.5, 0.5, 0.5] + Color: [ 0.5, 0.5, 0.5 ] Scale: 0.9 - Name: spike @@ -173,7 +179,7 @@ Objects: - Image: gvgai/oryx/spike2.png Block2D: - Shape: triangle - Color: [0.9, 0.1, 0.1] + Color: [ 0.9, 0.1, 0.1 ] Scale: 0.5 - Name: red_box @@ -186,7 +192,7 @@ Objects: - Image: gvgai/newset/blockR.png Block2D: - Shape: square - Color: [0.5, 0.2, 0.2] + Color: [ 0.5, 0.2, 0.2 ] Scale: 0.5 - Name: red_block MapCharacter: R @@ -195,7 +201,7 @@ Objects: - Image: gvgai/newset/blockR2.png Block2D: - Shape: square - Color: [1.0, 0.0, 0.0] + Color: [ 1.0, 0.0, 0.0 ] Scale: 1.0 - Name: green_box @@ -208,7 +214,7 @@ Objects: - Image: gvgai/newset/blockG.png Block2D: - Shape: square - Color: [0.2, 0.5, 0.2] + Color: [ 0.2, 0.5, 0.2 ] Scale: 0.5 - Name: green_block MapCharacter: G @@ -217,7 +223,7 @@ Objects: - Image: gvgai/newset/blockG2.png Block2D: - Shape: square - Color: [0.0, 1.0, 0.0] + Color: [ 0.0, 1.0, 0.0 ] Scale: 1.0 - Name: blue_box @@ -230,7 +236,7 @@ Objects: - Image: gvgai/newset/blockB.png Block2D: - Shape: square - Color: [0.2, 0.2, 0.5] + Color: [ 0.2, 0.2, 0.5 ] Scale: 0.5 - Name: blue_block MapCharacter: B @@ -239,7 +245,7 @@ Objects: - Image: gvgai/newset/blockB2.png Block2D: - Shape: square - Color: [0.0, 0.0, 1.0] + Color: [ 0.0, 0.0, 1.0 ] Scale: 1.0 - Name: broken_box @@ -248,5 +254,5 @@ Objects: - Image: gvgai/newset/block3.png Block2D: - Shape: triangle - Color: [1.0, 0.0, 1.0] + Color: [ 1.0, 0.0, 1.0 ] Scale: 1.0 diff --git a/python/examples/experiments/conditional_action_spaces/rllib_baseline.py b/python/examples/experiments/conditional_action_spaces/rllib_baseline.py index 46a86849f..e95c20c7a 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_baseline.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_baseline.py @@ -1,3 +1,4 @@ +import argparse import os import sys @@ -8,6 +9,7 @@ from ray.tune.registry import register_env from griddly import gd +from griddly.util.rllib.callbacks import GriddlyCallbacks from griddly.util.rllib.environment.core import RLlibEnv from griddly.util.rllib.torch import GAPAgent from griddly.util.rllib.torch.agents.conv_agent import SimpleConvAgent @@ -15,19 +17,38 @@ from griddly.util.rllib.torch.conditional_actions.conditional_action_policy_trainer import \ ConditionalActionImpalaTrainer +parser = 
argparse.ArgumentParser(description='Run experiments')
+
+parser.add_argument('--yaml-file', help='YAML file containing GDY for the game')
+
+parser.add_argument('--root-directory', default=os.path.expanduser("~/ray_results"),
+                    help='root directory for all data associated with the run')
+parser.add_argument('--num-gpus', default=1, type=int, help='Number of GPUs to make available to ray.')
+parser.add_argument('--num-cpus', default=8, type=int, help='Number of CPUs to make available to ray.')
+
+parser.add_argument('--num-workers', default=7, type=int, help='Number of workers')
+parser.add_argument('--num-envs-per-worker', default=5, type=int, help='Number of environments per worker')
+parser.add_argument('--num-gpus-per-worker', default=0, type=float, help='Number of gpus per worker')
+parser.add_argument('--num-cpus-per-worker', default=1, type=float, help='Number of cpus per worker')
+parser.add_argument('--max-training-steps', default=20000000, type=int, help='Maximum number of training steps')
+
+parser.add_argument('--capture-video', action='store_true', help='enable video capture')
+parser.add_argument('--video-directory', default='videos', help='directory for videos')
+parser.add_argument('--video-frequency', type=int, default=1000000, help='Frequency of videos')
+
+parser.add_argument('--seed', type=int, default=69420, help='seed for experiments')
+
+parser.add_argument('--lr', type=float, default=0.0005, help='learning rate')
+
 if __name__ == '__main__':
-    sep = os.pathsep
-    os.environ['PYTHONPATH'] = sep.join(sys.path)

-    yaml_files = [
-        os.path.realpath('clusters_po.yaml'),
-        os.path.realpath('clusters_po_with_push.yaml'),
-        os.path.realpath('clusters_po_with_push_seperate_colors.yaml')
-    ]
+    args = parser.parse_args()

+    sep = os.pathsep
+    os.environ['PYTHONPATH'] = sep.join(sys.path)

-    #ray.init(num_gpus=1)
-    ray.init(num_gpus=1, local_mode=True)
+    ray.init(include_dashboard=False, num_gpus=args.num_gpus, num_cpus=args.num_cpus)
+    #ray.init(include_dashboard=False, num_gpus=1, num_cpus=args.num_cpus, local_mode=True)

     env_name = "ray-griddly-env"

@@ -36,34 +57,32 @@

     wandbLoggerCallback = WandbLoggerCallback(
         project='conditional_actions',
-        group='baseline',
-        api_key_file='~/.wandb_rc'
+        api_key_file='~/.wandb_rc',
+        dir=args.root_directory
     )

-    max_training_steps = 20000000
+    max_training_steps = args.max_training_steps
+
     config = {
         'framework': 'torch',
-        'num_workers': 8,
-        'num_envs_per_worker': 4,
+        'seed': args.seed,
+        'num_workers': args.num_workers,
+        'num_envs_per_worker': args.num_envs_per_worker,
+        'num_gpus_per_worker': float(args.num_gpus_per_worker),
+        'num_cpus_per_worker': args.num_cpus_per_worker,

-        # 'callbacks': GriddlyCallbacks,
+        'callbacks': GriddlyCallbacks,

         'model': {
-            'custom_model': tune.grid_search(['SimpleConv', 'GAP']),
+            'custom_model': 'SimpleConv',
             'custom_model_config': {}
         },
         'env': env_name,
         'env_config': {
-            'record_video_config': {
-                'frequency': 100000,
-                'directory': 'baseline_videos'
-            },
-
-            # Put this here so it shows up in wandb
             'generate_valid_action_trees': False,
             'random_level_on_reset': True,
-            'yaml_file': tune.grid_search(yaml_files),
+            'yaml_file': args.yaml_file,
             'global_observer_type': gd.ObserverType.SPRITE_2D,
             'max_steps': 1000,
         },
@@ -72,14 +91,21 @@
             [max_training_steps, 0.0]
         ],
         'lr_schedule': [
-            [0, 0.0005],
+            [0, args.lr],
             [max_training_steps, 0.0]
         ],
     }

+    if args.capture_video:
+        real_video_frequency = args.video_frequency/(args.num_envs_per_worker*args.num_workers)
+        config['env_config']['record_video_config'] = {
+            'frequency': real_video_frequency,
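+            # The recorder counts steps per environment instance, so the requested global
+            # frequency is divided by (num_envs_per_worker * num_workers): with the defaults
+            # above, 1000000 / (5 * 7) ~= 28571 steps between videos in each env.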
'directory': os.path.join(args.root_directory, args.video_directory) + } stop = { "timesteps_total": max_training_steps, } - result = tune.run(ConditionalActionImpalaTrainer, config=config, stop=stop, callbacks=[wandbLoggerCallback]) + result = tune.run(ConditionalActionImpalaTrainer, local_dir=args.root_directory, config=config, stop=stop, + callbacks=[wandbLoggerCallback]) diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py index aff2108a0..4e1729590 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py @@ -50,7 +50,7 @@ os.environ['PYTHONPATH'] = sep.join(sys.path) ray.init(include_dashboard=False, num_gpus=args.num_gpus, num_cpus=args.num_cpus) - #ray.init(include_dashboard=False, num_gpus=1, num_cpus=2, local_mode=True) + #ray.init(include_dashboard=False, num_gpus=1, num_cpus=args.num_cpus, local_mode=True) env_name = "ray-griddly-env" @@ -83,10 +83,9 @@ 'env_config': { 'allow_nop': args.allow_nop, - 'invalid_action_masking': tune.grid_search(['none', 'conditional', 'collapsed']), + #'invalid_action_masking': tune.grid_search(['none', 'conditional', 'collapsed']), 'vtrace_masking': args.vtrace_masking, - # 'invalid_action_masking': 'collapsed', - # 'allow_nop': False, + 'invalid_action_masking': 'conditional', 'generate_valid_action_trees': True, 'random_level_on_reset': True, 'yaml_file': args.yaml_file, @@ -105,8 +104,9 @@ } if args.capture_video: + real_video_frequency = args.video_frequency / (args.num_envs_per_worker * args.num_workers) config['env_config']['record_video_config'] = { - 'frequency': args.video_frequency, + 'frequency': real_video_frequency, 'directory': os.path.join(args.root_directory, args.video_directory) } diff --git a/python/griddly/util/rllib/callbacks.py b/python/griddly/util/rllib/callbacks.py index 37767b835..c46e85824 100644 --- a/python/griddly/util/rllib/callbacks.py +++ b/python/griddly/util/rllib/callbacks.py @@ -11,7 +11,7 @@ class GriddlyCallbacks(DefaultCallbacks): def on_episode_end(self, *, worker: "RolloutWorker", base_env: BaseEnv, policies: Dict[PolicyID, Policy], episode: MultiAgentEpisode, env_index: Optional[int] = None, **kwargs) -> None: - if worker.worker_index == 0 and env_index == 0: + if env_index == 0: if not worker.multiagent: info = episode.last_info_for() if 'video' in info: diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py index be92f86a1..dfa00e7ac 100644 --- a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py +++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py @@ -101,24 +101,28 @@ def get_actions_and_mask(self): mask_offset = 0 for a in range(self._num_action_parts): - dist_part = self._inputs_split[a] - is_parameters = a==(self._num_action_parts-1) - sampled, masked_part_logits, logp, mask_part = self._mask_and_sample(subtree_options, dist_part[i], is_parameters) + try: + dist_part = self._inputs_split[a] + is_parameters = a==(self._num_action_parts-1) + sampled, masked_part_logits, logp, mask_part = self._mask_and_sample(subtree_options, dist_part[i], is_parameters) - # Set the action and the mask for each part of the action - actions[i, a] = sampled - 
masked_logits[i, mask_offset:mask_offset + self._action_space_shape[a]] = masked_part_logits - mask[i, mask_offset:mask_offset + self._action_space_shape[a]] = mask_part + # Set the action and the mask for each part of the action + actions[i, a] = sampled + masked_logits[i, mask_offset:mask_offset + self._action_space_shape[a]] = masked_part_logits + mask[i, mask_offset:mask_offset + self._action_space_shape[a]] = mask_part - logp_parts[a] = logp + logp_parts[a] = logp - if mask_part.sum() == 0: - raise RuntimeError('mask calculated incorrectly') + if mask_part.sum() == 0: + raise RuntimeError('mask calculated incorrectly') - mask_offset += self._action_space_shape[a] + mask_offset += self._action_space_shape[a] + + subtree = subtree[int(sampled)] + subtree_options = list(subtree.keys()) + except ValueError as e: + print(e) - subtree = subtree[int(sampled)] - subtree_options = list(subtree.keys()) logp_sums[i] = torch.sum(logp_parts) diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py index 9d714fa45..21afd5c8a 100644 --- a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py +++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_mixin.py @@ -43,6 +43,8 @@ def compute_actions_from_input_dict( invalid_action_masking = self.config["env_config"].get("invalid_action_masking", 'none') allow_nop = self.config["env_config"].get("allow_nop", False) + extra_fetches = {} + if generate_valid_action_trees: infos = input_dict[SampleBatch.INFOS] if SampleBatch.INFOS in input_dict else {} @@ -63,6 +65,10 @@ def compute_actions_from_input_dict( ) actions, masked_logits, logp, mask = exploration.get_actions_and_mask() + + extra_fetches.update({ + 'invalid_action_mask': mask + }) else: action_dist = self.dist_class(dist_inputs, self.model) @@ -77,12 +83,11 @@ def compute_actions_from_input_dict( input_dict[SampleBatch.ACTIONS] = actions - extra_fetches = { + extra_fetches.update({ SampleBatch.ACTION_DIST_INPUTS: masked_logits, SampleBatch.ACTION_PROB: torch.exp(logp.float()), SampleBatch.ACTION_LOGP: logp, - 'invalid_action_mask': mask - } + }) # Update our global timestep by the batch size. 
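# A sketch of how the fetches assembled above travel with the rollout (the
# variable names below are illustrative, not part of this patch): anything
# placed in `extra_fetches` is written into the sample batch next to the
# sampled actions, so the mask that shaped each action can be recovered
# later, for example in a postprocessing step:
#
#     mask = sample_batch['invalid_action_mask']    # 0/1 entry per flat logit
#     logp = sample_batch[SampleBatch.ACTION_LOGP]  # summed per-part log-prob
#     assert mask.shape[0] == logp.shape[0]         # one mask row per action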
self.global_timestep += len(input_dict[SampleBatch.CUR_OBS]) diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py index 9bb3ffcd9..c3e4f9407 100644 --- a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py +++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_policy_trainer.py @@ -3,7 +3,7 @@ import torch from ray.rllib import SampleBatch from ray.rllib.agents.impala import ImpalaTrainer -from ray.rllib.agents.impala.vtrace_tf_policy import build_vtrace_loss +from ray.rllib.agents.impala.vtrace_torch_policy import build_vtrace_loss from ray.rllib.agents.impala.vtrace_torch_policy import VTraceTorchPolicy, VTraceLoss, make_time_major from ray.rllib.models.torch.torch_action_dist import TorchCategorical from ray.rllib.policy.torch_policy import LearningRateSchedule, EntropyCoeffSchedule From e06ab42578a87c372f9982840296e6bafda62dc2 Mon Sep 17 00:00:00 2001 From: Bam4d Date: Thu, 25 Mar 2021 11:53:20 +0000 Subject: [PATCH 25/45] frequency needs to be an integer value for modulo to work --- .../experiments/conditional_action_spaces/rllib_baseline.py | 2 +- .../conditional_action_spaces/rllib_conditional_actions.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/examples/experiments/conditional_action_spaces/rllib_baseline.py b/python/examples/experiments/conditional_action_spaces/rllib_baseline.py index e95c20c7a..7ef20d001 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_baseline.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_baseline.py @@ -97,7 +97,7 @@ } if args.capture_video: - real_video_frequency = args.video_frequency/(args.num_envs_per_worker*args.num_workers) + real_video_frequency = int(args.video_frequency / (args.num_envs_per_worker * args.num_workers)) config['env_config']['record_video_config'] = { 'frequency': real_video_frequency, 'directory': os.path.join(args.root_directory, args.video_directory) diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py index 4e1729590..323ee6f4c 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py @@ -104,7 +104,7 @@ } if args.capture_video: - real_video_frequency = args.video_frequency / (args.num_envs_per_worker * args.num_workers) + real_video_frequency = int(args.video_frequency / (args.num_envs_per_worker * args.num_workers)) config['env_config']['record_video_config'] = { 'frequency': real_video_frequency, 'directory': os.path.join(args.root_directory, args.video_directory) From 7264a14f3114ab3c596c72d8fadd5f1acb1e917c Mon Sep 17 00:00:00 2001 From: Bam4d Date: Sat, 27 Mar 2021 11:51:12 +0000 Subject: [PATCH 26/45] some fixes for flat representation --- ...lusters_po_with_push_separate_colors.yaml} | 0 .../rllib_baseline_flat.py | 270 ++++++++++++++++++ python/griddly/util/rllib/environment/core.py | 4 +- 3 files changed, 272 insertions(+), 2 deletions(-) rename python/examples/experiments/conditional_action_spaces/{clusters_po_with_push_seperate_colors.yaml => clusters_po_with_push_separate_colors.yaml} (100%) create mode 100644 python/examples/experiments/conditional_action_spaces/rllib_baseline_flat.py diff --git 
a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_seperate_colors.yaml b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors.yaml
similarity index 100%
rename from python/examples/experiments/conditional_action_spaces/clusters_po_with_push_seperate_colors.yaml
rename to python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors.yaml
diff --git a/python/examples/experiments/conditional_action_spaces/rllib_baseline_flat.py b/python/examples/experiments/conditional_action_spaces/rllib_baseline_flat.py
new file mode 100644
index 000000000..b0ebd7f9c
--- /dev/null
+++ b/python/examples/experiments/conditional_action_spaces/rllib_baseline_flat.py
@@ -0,0 +1,270 @@
+import argparse
+import os
+import sys
+
+import gym
+import numpy as np
+import ray
+import torch
+from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
+from torch import nn
+from gym.spaces import MultiDiscrete, Dict, Box
+from ray import tune
+from ray.rllib.agents.impala import ImpalaTrainer
+from ray.rllib.models import ModelCatalog
+from ray.tune.integration.wandb import WandbLoggerCallback
+from ray.tune.registry import register_env
+
+from griddly import gd
+from griddly.util.rllib.callbacks import GriddlyCallbacks
+from griddly.util.rllib.environment.core import RLlibEnv
+from griddly.util.rllib.torch.agents.common import layer_init
+from griddly.util.rllib.torch.agents.conv_agent import SimpleConvAgent
+
+parser = argparse.ArgumentParser(description='Run experiments')
+
+parser.add_argument('--yaml-file', help='YAML file containing GDY for the game')
+
+parser.add_argument('--root-directory', default=os.path.expanduser("~/ray_results"),
+                    help='root directory for all data associated with the run')
+parser.add_argument('--num-gpus', default=1, type=int, help='Number of GPUs to make available to ray.')
+parser.add_argument('--num-cpus', default=8, type=int, help='Number of CPUs to make available to ray.')
+
+parser.add_argument('--num-workers', default=7, type=int, help='Number of workers')
+parser.add_argument('--num-envs-per-worker', default=5, type=int, help='Number of environments per worker')
+parser.add_argument('--num-gpus-per-worker', default=0, type=float, help='Number of gpus per worker')
+parser.add_argument('--num-cpus-per-worker', default=1, type=float, help='Number of cpus per worker')
+parser.add_argument('--max-training-steps', default=20000000, type=int, help='Maximum number of training steps')
+
+parser.add_argument('--capture-video', action='store_true', help='enable video capture')
+parser.add_argument('--video-directory', default='videos', help='directory for videos')
+parser.add_argument('--video-frequency', type=int, default=1000000, help='Frequency of videos')
+
+parser.add_argument('--seed', type=int, default=69420, help='seed for experiments')
+
+parser.add_argument('--lr', type=float, default=0.0005, help='learning rate')
+
+
+class FlatActionWrapper(gym.Wrapper):
+
+    def __init__(self, env):
+        super().__init__(env)
+
+        self._num_action_parts = 1
+        self._action_params_offset = 0
+        if not self.has_avatar:
+            self._num_action_parts += 1
+            self._action_params_offset = 1
+
+
+        self._action_splits = np.zeros(self._num_action_parts)
+
+        self._total_position_params = 0
+        if not self.has_avatar:
+            self._action_splits[0] = self.width*self.height
+            self._total_position_params += self.width*self.height
+
+        self._action_logit_offsets = {}
+
+        total_action_params = 0
+        for i, action_name in enumerate(self.env.action_names):
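            # Layout sketch for the flat space being built here (the action
            # names are the game's own, used illustratively): with no avatar,
            # the first `width * height` logits pick a source cell, then each
            # action's ids follow back to back, so for a game with `move` and
            # `push` the flat vector looks like
            #
            #     [ cell 0 .. cell W*H-1 | move ids | push ids ]
            #
            # and `_action_logit_offsets[name]` records where each id block
            # starts inside that vector.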
+            self._action_logit_offsets[action_name] = total_action_params + self._total_position_params
+            total_action_params += self.num_action_ids[action_name]
+
+        self._action_splits[self._action_params_offset] = total_action_params
+
+        self._total_actions = int(np.sum(self._action_splits))
+
+        self.action_space = MultiDiscrete(self._action_splits)
+        self.observation_space = Dict({
+            'obs': self.observation_space,
+            'mask': Box(0, 1, shape=(self._total_actions,)),
+        })
+
+    def _get_flat_mask(self):
+        flat_mask = np.zeros(self._total_actions)
+        for location, action_names in self.env.game.get_available_actions(1).items():
+            if not self.has_avatar:
+                flat_location = self.width * location[1] + location[0]
+                flat_mask[flat_location] = 1
+            for action_name, action_ids in self.env.game.get_available_action_ids(location, list(action_names)).items():
+                mask_offset = self._action_logit_offsets[action_name]
+                flat_mask[mask_offset:mask_offset + self.num_action_ids[action_name]][action_ids] = 1
+        return flat_mask
+
+    def _to_griddly_action(self, action):
+        # convert the flat action back to Griddly's tree-based format
+
+        griddly_action = []
+        action_ptr = 0
+        if not self.has_avatar:
+            x = action[action_ptr] % self.width
+            griddly_action.append(x)
+            y = int(action[action_ptr] / self.width)
+            griddly_action.append(y)
+            action_ptr += 1
+
+        if self.action_count > 0:
+            action_type_id = 0
+            action_param_id = 0
+            for action_name in self.action_names:
+                action_offset_after_position = (self._action_logit_offsets[action_name] - self._total_position_params)
+                next_offset = action_offset_after_position + self.num_action_ids[action_name]
+                if next_offset > action[action_ptr]:
+                    action_param_id = action[action_ptr] - action_offset_after_position
+                    break
+                action_type_id += 1
+
+            griddly_action.append(action_type_id)
+            griddly_action.append(action_param_id)
+        else:
+            griddly_action.append(action[action_ptr])
+
+        return griddly_action
+
+    def reset(self, **kwargs):
+
+        obs = super().reset(**kwargs)
+
+        observations = {
+            'obs': obs,
+            'mask': self._get_flat_mask()
+        }
+
+        return observations
+
+    def step(self, action):
+        griddly_action = self._to_griddly_action(action)
+
+        obs, reward, done, info = super().step(griddly_action)
+
+        observations = {
+            'obs': obs,
+            'mask': self._get_flat_mask()
+        }
+
+        return observations, reward, done, info
+
+
+class SimpleConvFlatAgent(TorchModelV2, nn.Module):
+
+    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
+        super().__init__(obs_space, action_space, num_outputs, model_config, name)
+        nn.Module.__init__(self)
+
+        self._num_objects = obs_space.original_space['obs'].shape[2]
+        self._num_actions = num_outputs
+
+        linear_flatten = np.prod(obs_space.original_space['obs'].shape[:2]) * 64
+
+        self.network = nn.Sequential(
+            layer_init(nn.Conv2d(self._num_objects, 32, 3, padding=1)),
+            nn.ReLU(),
+            layer_init(nn.Conv2d(32, 64, 3, padding=1)),
+            nn.ReLU(),
+            nn.Flatten(),
+            layer_init(nn.Linear(linear_flatten, 1024)),
+            nn.ReLU(),
+            layer_init(nn.Linear(1024, 512)),
+            nn.ReLU(),
+        )
+
+        self._actor_head = nn.Sequential(
+            layer_init(nn.Linear(512, 256), std=0.01),
+            nn.ReLU(),
+            layer_init(nn.Linear(256, self._num_actions), std=0.01)
+        )
+
+        self._critic_head = nn.Sequential(
+            layer_init(nn.Linear(512, 1), std=0.01)
+        )
+
+    def forward(self, input_dict, state, seq_lens):
+        obs_transformed = input_dict['obs']['obs'].permute(0, 3, 1, 2)
+        mask = input_dict['obs']['mask']
+        network_output = self.network(obs_transformed)
+        value = self._critic_head(network_output)
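        # Masking note for the logits computed below: log(mask) is 0 where
        # mask == 1 and -inf where mask == 0, so adding it removes invalid
        # actions from the softmax; clamping with torch.finfo().min rather
        # than keeping a true -inf helps avoid NaNs when the distribution
        # and its gradients are evaluated.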
self._value = value.reshape(-1) + logits = self._actor_head(network_output) + + logits += torch.maximum(torch.log(mask), torch.tensor(torch.finfo().min)) + + return logits, state + + def value_function(self): + return self._value + + +if __name__ == '__main__': + + args = parser.parse_args() + + sep = os.pathsep + os.environ['PYTHONPATH'] = sep.join(sys.path) + + ray.init(include_dashboard=False, num_gpus=args.num_gpus, num_cpus=args.num_cpus) + #ray.init(include_dashboard=False, num_gpus=1, num_cpus=args.num_cpus, local_mode=True) + env_name = "ray-griddly-env" + + + def _create_env(env_config): + env = RLlibEnv(env_config) + return FlatActionWrapper(env) + + + register_env(env_name, _create_env) + ModelCatalog.register_custom_model("SimpleConv", SimpleConvFlatAgent) + + wandbLoggerCallback = WandbLoggerCallback( + project='conditional_actions', + api_key_file='~/.wandb_rc', + dir=args.root_directory + ) + + max_training_steps = args.max_training_steps + + config = { + 'framework': 'torch', + 'seed': args.seed, + 'num_workers': args.num_workers, + 'num_envs_per_worker': args.num_envs_per_worker, + 'num_gpus_per_worker': float(args.num_gpus_per_worker), + 'num_cpus_per_worker': args.num_cpus_per_worker, + + 'callbacks': GriddlyCallbacks, + + 'model': { + 'custom_model': 'SimpleConv', + 'custom_model_config': {} + }, + 'env': env_name, + 'env_config': { + 'generate_valid_action_trees': False, + 'random_level_on_reset': True, + 'yaml_file': args.yaml_file, + 'global_observer_type': gd.ObserverType.SPRITE_2D, + 'max_steps': 1000, + }, + 'entropy_coeff_schedule': [ + [0, 0.01], + [max_training_steps, 0.0] + ], + 'lr_schedule': [ + [0, args.lr], + [max_training_steps, 0.0] + ], + + } + if args.capture_video: + real_video_frequency = int(args.video_frequency / (args.num_envs_per_worker * args.num_workers)) + config['env_config']['record_video_config'] = { + 'frequency': real_video_frequency, + 'directory': os.path.join(args.root_directory, args.video_directory) + } + + stop = { + "timesteps_total": max_training_steps, + } + + result = tune.run(ImpalaTrainer, local_dir=args.root_directory, config=config, stop=stop, + callbacks=[wandbLoggerCallback]) diff --git a/python/griddly/util/rllib/environment/core.py b/python/griddly/util/rllib/environment/core.py index 86672c862..fe6723ccf 100644 --- a/python/griddly/util/rllib/environment/core.py +++ b/python/griddly/util/rllib/environment/core.py @@ -132,8 +132,8 @@ def set_transform(self): dtype=np.float, ) - self.height = self.observation_space.shape[0] - self.width = self.observation_space.shape[1] + self.height = self.observation_space.shape[1] + self.width = self.observation_space.shape[0] def _get_valid_action_trees(self): valid_action_trees = self.game.build_valid_action_trees() From f36c3acb606ccb3054262b988332a17ac1fda642 Mon Sep 17 00:00:00 2001 From: Bam4d Date: Mon, 29 Mar 2021 09:15:50 +0100 Subject: [PATCH 27/45] fixing bug with action trees not synchronized with observations during policy rollouts --- .../rllib_baseline_flat.py | 6 ++-- .../rllib_conditional_actions.py | 7 ++-- python/griddly/util/rllib/environment/core.py | 2 +- .../conditional_action_exploration.py | 32 +++++++++---------- src/Griddly/Core/Grid.cpp | 8 ++--- 5 files changed, 27 insertions(+), 28 deletions(-) diff --git a/python/examples/experiments/conditional_action_spaces/rllib_baseline_flat.py b/python/examples/experiments/conditional_action_spaces/rllib_baseline_flat.py index b0ebd7f9c..afda50765 100644 --- 
a/python/examples/experiments/conditional_action_spaces/rllib_baseline_flat.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_baseline_flat.py @@ -56,13 +56,12 @@ def __init__(self, env): self._num_action_parts += 1 self._action_params_offset = 1 - self._action_splits = np.zeros(self._num_action_parts) self._total_position_params = 0 if not self.has_avatar: - self._action_splits[0] = self.width*self.height - self._total_position_params += self.width*self.height + self._action_splits[0] = self.width * self.height + self._total_position_params += self.width * self.height self._action_logit_offsets = {} @@ -202,6 +201,7 @@ def value_function(self): sep = os.pathsep os.environ['PYTHONPATH'] = sep.join(sys.path) + ray.init(include_dashboard=False, num_gpus=args.num_gpus, num_cpus=args.num_cpus) #ray.init(include_dashboard=False, num_gpus=1, num_cpus=args.num_cpus, local_mode=True) env_name = "ray-griddly-env" diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py index 323ee6f4c..80700db63 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py @@ -49,8 +49,8 @@ sep = os.pathsep os.environ['PYTHONPATH'] = sep.join(sys.path) - ray.init(include_dashboard=False, num_gpus=args.num_gpus, num_cpus=args.num_cpus) - #ray.init(include_dashboard=False, num_gpus=1, num_cpus=args.num_cpus, local_mode=True) + #ray.init(include_dashboard=False, num_gpus=args.num_gpus, num_cpus=args.num_cpus) + ray.init(include_dashboard=False, num_gpus=1, num_cpus=args.num_cpus, local_mode=True) env_name = "ray-griddly-env" @@ -87,7 +87,8 @@ 'vtrace_masking': args.vtrace_masking, 'invalid_action_masking': 'conditional', 'generate_valid_action_trees': True, - 'random_level_on_reset': True, + #'random_level_on_reset': True, + 'level': 0, 'yaml_file': args.yaml_file, 'global_observer_type': gd.ObserverType.SPRITE_2D, 'max_steps': 1000, diff --git a/python/griddly/util/rllib/environment/core.py b/python/griddly/util/rllib/environment/core.py index fe6723ccf..602fa5f16 100644 --- a/python/griddly/util/rllib/environment/core.py +++ b/python/griddly/util/rllib/environment/core.py @@ -165,7 +165,7 @@ def step(self, action): if self.generate_valid_action_trees: self.last_valid_action_trees = self._get_valid_action_trees() - info['valid_action_tree'] = self.last_valid_action_trees + info['valid_action_tree'] = dict(self.last_valid_action_trees) return self._transform(observation), reward, done, info diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py index dfa00e7ac..5d092055a 100644 --- a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py +++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py @@ -7,7 +7,7 @@ class TorchConditionalMaskingExploration(): - def __init__(self, model, dist_inputs, valid_action_trees, explore=False, invalid_action_masking='none', allow_nop=False): + def __init__(self, model, dist_inputs, valid_action_trees, explore=False, invalid_action_masking='conditional', allow_nop=False): self._valid_action_trees = valid_action_trees self._num_inputs = dist_inputs.shape[0] @@ -26,32 +26,36 @@ def __init__(self, model, dist_inputs, 
valid_action_trees, explore=False, invali self._inputs_split = dist_inputs.split(tuple(self._action_space_shape), dim=1) + self._full_tree = self._fill_node(self._action_space_shape,0) + def _mask_and_sample(self, options, logits, is_parameters=False): mask = torch.zeros([logits.shape[0]]).to(logits.device) mask[options] = 1 if is_parameters: - if not self._allow_nop and len(options) > 1: + if not self._allow_nop: mask[0] = 0 masked_logits = logits + torch.log(mask) dist = Categorical(logits=masked_logits) sampled = dist.sample() + logp = dist.log_prob(sampled) + out_logits = masked_logits - if self._invalid_action_masking != 'none': - logp = dist.log_prob(sampled) - out_logits = masked_logits - else: - mask = torch.ones([logits.shape[0]]) - dist = Categorical(logits=logits) - logp = dist.log_prob(sampled) - out_logits = logits + if not self._allow_nop and is_parameters: + assert sampled != 0 return sampled, out_logits, logp, mask + def _fill_node(self, keys, pos): + if pos < len(keys): + return {k: self._fill_node(keys, pos + 1) for k in np.arange(keys[pos])} + else: + return {} + def _merge_all_branches(self, tree): all_nodes = {} merged_tree = {} @@ -70,10 +74,7 @@ def _process_valid_action_tree(self, valid_action_tree): # In the case there are no available actions for the player if len(subtree_options) == 0: - build_tree = subtree - for _ in range(self._num_action_parts): - build_tree[0] = {} - build_tree = build_tree[0] + subtree = self._full_tree subtree_options = list(subtree.keys()) # If we want very basic action masking where parameterized masks are superimposed we use this @@ -113,9 +114,6 @@ def get_actions_and_mask(self): logp_parts[a] = logp - if mask_part.sum() == 0: - raise RuntimeError('mask calculated incorrectly') - mask_offset += self._action_space_shape[a] subtree = subtree[int(sampled)] diff --git a/src/Griddly/Core/Grid.cpp b/src/Griddly/Core/Grid.cpp index aed52577b..d482285ea 100644 --- a/src/Griddly/Core/Grid.cpp +++ b/src/Griddly/Core/Grid.cpp @@ -143,19 +143,19 @@ std::unordered_map Grid::executeAction(uint32_t playerId, std } if (sourceObject == nullptr) { - spdlog::debug("Cannot perform action on empty space."); + spdlog::warn("Cannot perform action on empty space. ({0},{1})", action->getSourceLocation()[0], action->getSourceLocation()[1]); return {}; } auto sourceObjectPlayerId = sourceObject->getPlayerId(); if (playerId != 0 && sourceObjectPlayerId != playerId) { - spdlog::debug("Cannot perform action on object not owned by player. Object owner {0}, Player owner {1}", sourceObjectPlayerId, playerId); + spdlog::warn("Cannot perform action on object not owned by player. 
Object owner {0}, Player owner {1}", sourceObjectPlayerId, playerId); return {}; } if (playerId != 0 && sourceObject->isPlayerAvatar() && playerAvatars_.find(playerId) == playerAvatars_.end()) { - spdlog::debug("Avatar for player {0} has been removed, action will be ignored.", playerId); + spdlog::warn("Avatar for player {0} has been removed, action will be ignored.", playerId); return {}; } @@ -176,7 +176,7 @@ std::unordered_map Grid::executeAction(uint32_t playerId, std return rewardAccumulator; } else { - spdlog::debug("Cannot perform action={0} on object={1}", action->getActionName(), sourceObject->getObjectName()); + spdlog::warn("Cannot perform action={0} on object={1}", action->getActionName(), sourceObject->getObjectName()); return {}; } } From 232b286396e5b7c6083b53c3ef3cdb3d05c5b14b Mon Sep 17 00:00:00 2001 From: Bam4d Date: Mon, 29 Mar 2021 10:01:05 +0100 Subject: [PATCH 28/45] automatically use nops if we have no infos (straight after reset etc) --- .../rllib_conditional_actions.py | 14 +++++++------- .../conditional_action_exploration.py | 12 ++++++++---- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py index 80700db63..5f0987735 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py @@ -18,7 +18,7 @@ parser = argparse.ArgumentParser(description='Run experiments') -parser.add_argument('--yaml-file', help='YAML file condining GDY for the game') +parser.add_argument('--yaml-file', help='YAML file containing GDY for the game') parser.add_argument('--root-directory', default=os.path.expanduser("~/ray_results"), help='root directory for all data associated with the run') @@ -49,8 +49,8 @@ sep = os.pathsep os.environ['PYTHONPATH'] = sep.join(sys.path) - #ray.init(include_dashboard=False, num_gpus=args.num_gpus, num_cpus=args.num_cpus) - ray.init(include_dashboard=False, num_gpus=1, num_cpus=args.num_cpus, local_mode=True) + ray.init(include_dashboard=False, num_gpus=args.num_gpus, num_cpus=args.num_cpus) + #ray.init(include_dashboard=False, num_gpus=1, num_cpus=args.num_cpus, local_mode=True) env_name = "ray-griddly-env" @@ -83,12 +83,12 @@ 'env_config': { 'allow_nop': args.allow_nop, - #'invalid_action_masking': tune.grid_search(['none', 'conditional', 'collapsed']), + 'invalid_action_masking': tune.grid_search(['conditional', 'collapsed']), 'vtrace_masking': args.vtrace_masking, - 'invalid_action_masking': 'conditional', + #'invalid_action_masking': 'conditional', 'generate_valid_action_trees': True, - #'random_level_on_reset': True, - 'level': 0, + #'level': 0, + 'random_level_on_reset': True, 'yaml_file': args.yaml_file, 'global_observer_type': gd.ObserverType.SPRITE_2D, 'max_steps': 1000, diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py index 5d092055a..f6231c486 100644 --- a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py +++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py @@ -34,7 +34,7 @@ def _mask_and_sample(self, options, logits, is_parameters=False): mask[options] = 1 if is_parameters: - if not self._allow_nop: + if not self._allow_nop and len(options) 
> 1: mask[0] = 0 masked_logits = logits + torch.log(mask) @@ -44,8 +44,8 @@ def _mask_and_sample(self, options, logits, is_parameters=False): logp = dist.log_prob(sampled) out_logits = masked_logits - if not self._allow_nop and is_parameters: - assert sampled != 0 + # if not self._allow_nop and is_parameters: + # assert sampled != 0 return sampled, out_logits, logp, mask @@ -74,7 +74,11 @@ def _process_valid_action_tree(self, valid_action_tree): # In the case there are no available actions for the player if len(subtree_options) == 0: - subtree = self._full_tree + #subtree = self._full_tree + build_tree = subtree + for _ in range(self._num_action_parts): + build_tree[0] = {} + build_tree = build_tree[0] subtree_options = list(subtree.keys()) # If we want very basic action masking where parameterized masks are superimposed we use this From f4a37db0ed268f74544cc48904459a44d58ef5eb Mon Sep 17 00:00:00 2001 From: Bam4d Date: Mon, 29 Mar 2021 16:32:29 +0100 Subject: [PATCH 29/45] fixing unit clusters games --- .../clusters_po_with_push_separate_colors_units.yaml | 2 +- .../conditional_action_spaces/clusters_po_with_push_units.yaml | 2 +- .../torch/conditional_actions/conditional_action_exploration.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors_units.yaml b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors_units.yaml index 1fba6490c..bb932fc35 100644 --- a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors_units.yaml +++ b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors_units.yaml @@ -38,7 +38,7 @@ Environment: w . . b1 . . r1 . G g1 . . w w . . . . x . . x . . . w w . . . r1 . . g1 . . b1 . w - w . . . . b . . x . . . w + w . . . . R . . x . . . w w . . g1 . . . r1 . . b1 . w w . . x . x . . . B . . w w . . . . . . . . . . . w diff --git a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml index 4a00e3a8c..cae2d9a80 100644 --- a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml +++ b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml @@ -38,7 +38,7 @@ Environment: w . . b1 . . r1 . G g1 . . w w . . . . x . . x . . . w w . . . r1 . . g1 . . b1 . w - w . . . . b . . x . . . w + w . . . . R . . x . . . w w . . g1 . . . r1 . . b1 . w w . . x . x . . . B . . w w . . . . . . . . . . . 
w diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py index f6231c486..dc8836f9c 100644 --- a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py +++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py @@ -26,7 +26,7 @@ def __init__(self, model, dist_inputs, valid_action_trees, explore=False, invali self._inputs_split = dist_inputs.split(tuple(self._action_space_shape), dim=1) - self._full_tree = self._fill_node(self._action_space_shape,0) + #self._full_tree = self._fill_node(self._action_space_shape,0) def _mask_and_sample(self, options, logits, is_parameters=False): From 541ef74bbb2810c1725a82432fe9bb844921deec Mon Sep 17 00:00:00 2001 From: Bam4d Date: Mon, 29 Mar 2021 19:15:47 +0100 Subject: [PATCH 30/45] full tree for reset mask --- .../conditional_action_exploration.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py index dc8836f9c..164442adb 100644 --- a/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py +++ b/python/griddly/util/rllib/torch/conditional_actions/conditional_action_exploration.py @@ -26,7 +26,7 @@ def __init__(self, model, dist_inputs, valid_action_trees, explore=False, invali self._inputs_split = dist_inputs.split(tuple(self._action_space_shape), dim=1) - #self._full_tree = self._fill_node(self._action_space_shape,0) + self._full_tree = self._fill_node(self._action_space_shape,0) def _mask_and_sample(self, options, logits, is_parameters=False): @@ -34,7 +34,7 @@ def _mask_and_sample(self, options, logits, is_parameters=False): mask[options] = 1 if is_parameters: - if not self._allow_nop and len(options) > 1: + if not self._allow_nop: mask[0] = 0 masked_logits = logits + torch.log(mask) @@ -74,11 +74,11 @@ def _process_valid_action_tree(self, valid_action_tree): # In the case there are no available actions for the player if len(subtree_options) == 0: - #subtree = self._full_tree - build_tree = subtree - for _ in range(self._num_action_parts): - build_tree[0] = {} - build_tree = build_tree[0] + subtree = self._full_tree + # build_tree = subtree + # for _ in range(self._num_action_parts): + # build_tree[0] = {} + # build_tree = build_tree[0] subtree_options = list(subtree.keys()) # If we want very basic action masking where parameterized masks are superimposed we use this From f7339e9c721b38809000d507d4ac30480fd9d221 Mon Sep 17 00:00:00 2001 From: Bam4d Date: Tue, 30 Mar 2021 20:57:29 +0100 Subject: [PATCH 31/45] changing wandb naming, removing warn lines in Grid.cpp --- .../rllib_baseline.py | 16 ++++++++++++---- .../rllib_baseline_flat.py | 19 +++++++++++++------ src/Griddly/Core/Grid.cpp | 8 ++++---- 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/python/examples/experiments/conditional_action_spaces/rllib_baseline.py b/python/examples/experiments/conditional_action_spaces/rllib_baseline.py index 7ef20d001..d982350e0 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_baseline.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_baseline.py @@ -11,7 +11,6 @@ from griddly import gd from griddly.util.rllib.callbacks import GriddlyCallbacks from 
griddly.util.rllib.environment.core import RLlibEnv -from griddly.util.rllib.torch import GAPAgent from griddly.util.rllib.torch.agents.conv_agent import SimpleConvAgent # from griddly.util.rllib.callbacks import GriddlyCallbacks from griddly.util.rllib.torch.conditional_actions.conditional_action_policy_trainer import \ @@ -20,6 +19,7 @@ parser = argparse.ArgumentParser(description='Run experiments') parser.add_argument('--yaml-file', help='YAML file condining GDY for the game') +parser.add_argument('--experiment-name', default='unknown', help='name of the experiment') parser.add_argument('--root-directory', default=os.path.expanduser("~/ray_results"), help='root directory for all data associated with the run') @@ -56,7 +56,7 @@ ModelCatalog.register_custom_model("SimpleConv", SimpleConvAgent) wandbLoggerCallback = WandbLoggerCallback( - project='conditional_actions', + project='conditional_action_trees', api_key_file='~/.wandb_rc', dir=args.root_directory ) @@ -107,5 +107,13 @@ "timesteps_total": max_training_steps, } - result = tune.run(ConditionalActionImpalaTrainer, local_dir=args.root_directory, config=config, stop=stop, - callbacks=[wandbLoggerCallback]) + trial_name_creator = lambda trial: f'baseline-{args.experiment_name}' + + result = tune.run( + ConditionalActionImpalaTrainer, + local_dir=args.root_directory, + config=config, + stop=stop, + callbacks=[wandbLoggerCallback], + trial_name_creator=trial_name_creator + ) diff --git a/python/examples/experiments/conditional_action_spaces/rllib_baseline_flat.py b/python/examples/experiments/conditional_action_spaces/rllib_baseline_flat.py index afda50765..b8549d9ec 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_baseline_flat.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_baseline_flat.py @@ -19,11 +19,11 @@ from griddly.util.rllib.callbacks import GriddlyCallbacks from griddly.util.rllib.environment.core import RLlibEnv from griddly.util.rllib.torch.agents.common import layer_init -from griddly.util.rllib.torch.agents.conv_agent import SimpleConvAgent parser = argparse.ArgumentParser(description='Run experiments') parser.add_argument('--yaml-file', help='YAML file condining GDY for the game') +parser.add_argument('--experiment-name', default='unknown', help='name of the experiment') parser.add_argument('--root-directory', default=os.path.expanduser("~/ray_results"), help='root directory for all data associated with the run') @@ -201,9 +201,8 @@ def value_function(self): sep = os.pathsep os.environ['PYTHONPATH'] = sep.join(sys.path) - ray.init(include_dashboard=False, num_gpus=args.num_gpus, num_cpus=args.num_cpus) - #ray.init(include_dashboard=False, num_gpus=1, num_cpus=args.num_cpus, local_mode=True) + # ray.init(include_dashboard=False, num_gpus=1, num_cpus=args.num_cpus, local_mode=True) env_name = "ray-griddly-env" @@ -216,7 +215,7 @@ def _create_env(env_config): ModelCatalog.register_custom_model("SimpleConv", SimpleConvFlatAgent) wandbLoggerCallback = WandbLoggerCallback( - project='conditional_actions', + project='conditional_action_trees', api_key_file='~/.wandb_rc', dir=args.root_directory ) @@ -266,5 +265,13 @@ def _create_env(env_config): "timesteps_total": max_training_steps, } - result = tune.run(ImpalaTrainer, local_dir=args.root_directory, config=config, stop=stop, - callbacks=[wandbLoggerCallback]) + trial_name_creator = lambda trial: f'baseline-flat-{args.experiment_name}' + + result = tune.run( + ImpalaTrainer, + local_dir=args.root_directory, + config=config, + 
stop=stop, + callbacks=[wandbLoggerCallback], + trial_name_creator=trial_name_creator + ) diff --git a/src/Griddly/Core/Grid.cpp b/src/Griddly/Core/Grid.cpp index d482285ea..d207d101e 100644 --- a/src/Griddly/Core/Grid.cpp +++ b/src/Griddly/Core/Grid.cpp @@ -143,19 +143,19 @@ std::unordered_map Grid::executeAction(uint32_t playerId, std } if (sourceObject == nullptr) { - spdlog::warn("Cannot perform action on empty space. ({0},{1})", action->getSourceLocation()[0], action->getSourceLocation()[1]); + spdlog::debug("Cannot perform action on empty space. ({0},{1})", action->getSourceLocation()[0], action->getSourceLocation()[1]); return {}; } auto sourceObjectPlayerId = sourceObject->getPlayerId(); if (playerId != 0 && sourceObjectPlayerId != playerId) { - spdlog::warn("Cannot perform action on object not owned by player. Object owner {0}, Player owner {1}", sourceObjectPlayerId, playerId); + spdlog::debug("Cannot perform action on object not owned by player. Object owner {0}, Player owner {1}", sourceObjectPlayerId, playerId); return {}; } if (playerId != 0 && sourceObject->isPlayerAvatar() && playerAvatars_.find(playerId) == playerAvatars_.end()) { - spdlog::warn("Avatar for player {0} has been removed, action will be ignored.", playerId); + spdlog::debug("Avatar for player {0} has been removed, action will be ignored.", playerId); return {}; } @@ -176,7 +176,7 @@ std::unordered_map Grid::executeAction(uint32_t playerId, std return rewardAccumulator; } else { - spdlog::warn("Cannot perform action={0} on object={1}", action->getActionName(), sourceObject->getObjectName()); + spdlog::debug("Cannot perform action={0} on object={1}", action->getActionName(), sourceObject->getObjectName()); return {}; } } From 90f55261594e0b3e7391b44c37f8be44fce61d8d Mon Sep 17 00:00:00 2001 From: Bam4d Date: Thu, 1 Apr 2021 10:10:06 +0100 Subject: [PATCH 32/45] better naming for conditional action experiments --- .../conditional_action_spaces/rllib_baseline.py | 1 - .../rllib_conditional_actions.py | 15 ++++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/python/examples/experiments/conditional_action_spaces/rllib_baseline.py b/python/examples/experiments/conditional_action_spaces/rllib_baseline.py index d982350e0..d29c7647c 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_baseline.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_baseline.py @@ -12,7 +12,6 @@ from griddly.util.rllib.callbacks import GriddlyCallbacks from griddly.util.rllib.environment.core import RLlibEnv from griddly.util.rllib.torch.agents.conv_agent import SimpleConvAgent -# from griddly.util.rllib.callbacks import GriddlyCallbacks from griddly.util.rllib.torch.conditional_actions.conditional_action_policy_trainer import \ ConditionalActionImpalaTrainer diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py index 5f0987735..dcfcf8de6 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py @@ -19,6 +19,7 @@ parser = argparse.ArgumentParser(description='Run experiments') parser.add_argument('--yaml-file', help='YAML file containing GDY for the game') +parser.add_argument('--experiment-name', default='unknown', help='name of the experiment') parser.add_argument('--root-directory', default=os.path.expanduser("~/ray_results"), 
help='root directory for all data associated with the run') @@ -58,7 +59,7 @@ ModelCatalog.register_custom_model("SimpleConv", SimpleConvAgent) wandbLoggerCallback = WandbLoggerCallback( - project='conditional_actions', + project='conditional_action_trees', api_key_file='~/.wandb_rc', dir=args.root_directory ) @@ -115,5 +116,13 @@ "timesteps_total": max_training_steps, } - result = tune.run(ConditionalActionImpalaTrainer, local_dir=args.root_directory, config=config, stop=stop, - callbacks=[wandbLoggerCallback]) + trial_name_creator = lambda trial: f'CAT-{args.experiment_name}' + + result = tune.run( + ConditionalActionImpalaTrainer, + local_dir=args.root_directory, + config=config, + stop=stop, + callbacks=[wandbLoggerCallback], + trial_name_creator=trial_name_creator + ) From e3290b072ab8c2aa2b55f7baaebc0bdd32211380 Mon Sep 17 00:00:00 2001 From: Bam4d Date: Thu, 1 Apr 2021 12:50:08 +0100 Subject: [PATCH 33/45] add invalid_action_masking choice --- .../conditional_action_spaces/rllib_conditional_actions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py index dcfcf8de6..a6b07b851 100644 --- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py +++ b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py @@ -116,7 +116,7 @@ "timesteps_total": max_training_steps, } - trial_name_creator = lambda trial: f'CAT-{args.experiment_name}' + trial_name_creator = lambda trial: f'CAT-{args.experiment_name}-{trial.config["env_config"]["invalid_action_masking"]}' result = tune.run( ConditionalActionImpalaTrainer, From e030ed02a8c6b7219714668aba0ed148aa1366d0 Mon Sep 17 00:00:00 2001 From: Bam4d Date: Sun, 11 Apr 2021 16:06:14 +0100 Subject: [PATCH 34/45] removing code from here and putting in a submodule so its easy to digest --- .../clusters_po.yaml | 310 ------------- .../clusters_po_with_push.yaml | 330 -------------- ...clusters_po_with_push_separate_colors.yaml | 409 ------------------ ...rs_po_with_push_separate_colors_units.yaml | 304 ------------- .../clusters_po_with_push_units.yaml | 258 ----------- .../rllib_baseline.py | 118 ----- .../rllib_baseline_flat.py | 277 ------------ .../rllib_conditional_actions.py | 128 ------ python/requirements.txt | 3 +- 9 files changed, 2 insertions(+), 2135 deletions(-) delete mode 100644 python/examples/experiments/conditional_action_spaces/clusters_po.yaml delete mode 100644 python/examples/experiments/conditional_action_spaces/clusters_po_with_push.yaml delete mode 100644 python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors.yaml delete mode 100644 python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors_units.yaml delete mode 100644 python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml delete mode 100644 python/examples/experiments/conditional_action_spaces/rllib_baseline.py delete mode 100644 python/examples/experiments/conditional_action_spaces/rllib_baseline_flat.py delete mode 100644 python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py diff --git a/python/examples/experiments/conditional_action_spaces/clusters_po.yaml b/python/examples/experiments/conditional_action_spaces/clusters_po.yaml deleted file mode 100644 index 3d8b70722..000000000 --- 
a/python/examples/experiments/conditional_action_spaces/clusters_po.yaml +++ /dev/null @@ -1,310 +0,0 @@ -Version: "0.1" -Environment: - Name: Partially Observable Clusters - Description: Cluster the coloured objects together by pushing them against the static coloured blocks. - Observers: - Sprite2D: - TileSize: 24 - BackgroundTile: oryx/oryx_fantasy/floor1-2.png - Variables: - - Name: box_count - InitialValue: 0 - Player: - Observer: - RotateWithAvatar: true - TrackAvatar: true - Height: 5 - Width: 5 - OffsetX: 0 - OffsetY: 2 - AvatarObject: avatar # The player can only control a single avatar in the game - Termination: - Win: - - eq: [box_count, 0] - Lose: - - eq: [broken_box:count, 1] - - eq: [avatar:count, 0] - Levels: - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . 1 1 . . . 2 . 2 . w - w . . . . 1 . . . . . . w - w . . . a . . . . . 2 . w - w . . . . . . . h . . . w - w . . . . 1 . . . . b . w - w . . . . . . 1 . . . . w - w . . . . . . . . A . . w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . 1 . . 2 . c 3 . . w - w . . . . h . . h . . . w - w . . . 2 . . 3 . . 1 . w - w . . . . b . . h . . . w - w . . 3 . . . 2 . . 1 . w - w . . h . h . . . a . . w - w . . . . . A . . . . . w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . a . . b . . c . . w - w . . . . . . . . . . . w - w . . . . . . . . . . . w - w h h h h h . h h h h h w - w . . . . h . h . . . . w - w . 1 2 . h . h . 1 3 . w - w . 3 . . . . . . . 2 . w - w . . . . . A . . . . . w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . . 1 . 2 . . c . . w - w . . . . . 3 . . 3 . . w - w . . a . 2 . . . h . . w - w . . . . h h . 3 . . . w - w . . 1 . . . . . 2 . . w - w . . . . . 1 . . b . . w - w . . . . . A . . . . . w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . . . . . 1 . . . . w - w . . h . . b . . h . . w - w . . . . 1 . . . . . . w - w . . 3 . . . . 2 . . . w - w . . . a . h . . c . . w - w . . . . 3 . . . . 2 . w - w . . . . . A . . . . . 
w - w w w w w w w w w w w w w - -Actions: - - # A simple action to count the number of boxes in the game at the start - # Not currently a way to do complex things in termination conditions like combine multiple conditions - - Name: box_counter - InputMapping: - Internal: true - Inputs: - 1: - Description: "The only action here is to increment the box count" - Behaviours: - - Src: - Object: [blue_box, red_box, green_box] - Commands: - - incr: box_count - Dst: - Object: [blue_box, red_box, green_box] - - # Define the move action - - Name: move - InputMapping: - Inputs: - 1: - Description: Rotate left - OrientationVector: [-1, 0] - 2: - Description: Move forwards - OrientationVector: [0, -1] - VectorToDest: [0, -1] - 3: - Description: Rotate right - OrientationVector: [1, 0] - Relative: true - Behaviours: - - # Avatar rotates - - Src: - Object: avatar - Commands: - - rot: _dir - Dst: - Object: avatar - - # Avatar and boxes can move into empty space - - Src: - Object: [avatar, blue_box, green_box, red_box] - Commands: - - mov: _dest - Dst: - Object: _empty - - # Boxes can be pushed by the avatar - - Src: - Object: avatar - Commands: - - mov: _dest - Dst: - Object: [blue_box, green_box, red_box] - Commands: - - cascade: _dest - - # When boxes are pushed against the blocks they change - - Src: - Object: blue_box - Commands: - - change_to: blue_block - - reward: 1 - - decr: box_count - Dst: - Object: blue_block - - Src: - Object: red_box - Commands: - - reward: 1 - - change_to: red_block - - decr: box_count - Dst: - Object: red_block - - Src: - Object: green_box - Commands: - - reward: 1 - - change_to: green_block - - decr: box_count - Dst: - Object: green_block - - # Boxes break if they hit the spikes - - Src: - Object: [blue_box, green_box, red_box] - Commands: - - change_to: broken_box - - reward: -1 - Dst: - Object: spike - - # Avatar dies if it hits the spikes - - Src: - Object: avatar - Commands: - - remove: true - - reward: -1 - Dst: - Object: spike - -Objects: - - Name: avatar - MapCharacter: A - Observers: - Sprite2D: - - Image: gvgai/oryx/knight1.png - Block2D: - - Shape: triangle - Color: [0.0, 1.0, 0.0] - Scale: 0.8 - - - Name: wall - MapCharacter: w - Observers: - Sprite2D: - - TilingMode: WALL_16 - Image: - - oryx/oryx_fantasy/wall1-0.png - - oryx/oryx_fantasy/wall1-1.png - - oryx/oryx_fantasy/wall1-2.png - - oryx/oryx_fantasy/wall1-3.png - - oryx/oryx_fantasy/wall1-4.png - - oryx/oryx_fantasy/wall1-5.png - - oryx/oryx_fantasy/wall1-6.png - - oryx/oryx_fantasy/wall1-7.png - - oryx/oryx_fantasy/wall1-8.png - - oryx/oryx_fantasy/wall1-9.png - - oryx/oryx_fantasy/wall1-10.png - - oryx/oryx_fantasy/wall1-11.png - - oryx/oryx_fantasy/wall1-12.png - - oryx/oryx_fantasy/wall1-13.png - - oryx/oryx_fantasy/wall1-14.png - - oryx/oryx_fantasy/wall1-15.png - Block2D: - - Shape: square - Color: [0.5, 0.5, 0.5] - Scale: 0.9 - - - Name: spike - MapCharacter: h - Observers: - Sprite2D: - - Image: gvgai/oryx/spike2.png - Block2D: - - Shape: triangle - Color: [0.9, 0.1, 0.1] - Scale: 0.5 - - - Name: red_box - MapCharacter: "2" - InitialActions: - - Action: box_counter - ActionId: 1 - Observers: - Sprite2D: - - Image: gvgai/newset/blockR.png - Block2D: - - Shape: square - Color: [0.5, 0.2, 0.2] - Scale: 0.5 - - Name: red_block - MapCharacter: b - Observers: - Sprite2D: - - Image: gvgai/newset/blockR2.png - Block2D: - - Shape: square - Color: [1.0, 0.0, 0.0] - Scale: 1.0 - - - Name: green_box - MapCharacter: "3" - InitialActions: - - Action: box_counter - ActionId: 1 - Observers: - Sprite2D: - - Image: 
gvgai/newset/blockG.png - Block2D: - - Shape: square - Color: [0.2, 0.5, 0.2] - Scale: 0.5 - - Name: green_block - MapCharacter: c - Observers: - Sprite2D: - - Image: gvgai/newset/blockG2.png - Block2D: - - Shape: square - Color: [0.0, 1.0, 0.0] - Scale: 1.0 - - - Name: blue_box - MapCharacter: "1" - InitialActions: - - Action: box_counter - ActionId: 1 - Observers: - Sprite2D: - - Image: gvgai/newset/blockB.png - Block2D: - - Shape: square - Color: [0.2, 0.2, 0.5] - Scale: 0.5 - - Name: blue_block - MapCharacter: a - Observers: - Sprite2D: - - Image: gvgai/newset/blockB2.png - Block2D: - - Shape: square - Color: [0.0, 0.0, 1.0] - Scale: 1.0 - - - Name: broken_box - Observers: - Sprite2D: - - Image: gvgai/newset/block3.png - Block2D: - - Shape: triangle - Color: [1.0, 0.0, 1.0] - Scale: 1.0 diff --git a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push.yaml b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push.yaml deleted file mode 100644 index 9904e87e7..000000000 --- a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push.yaml +++ /dev/null @@ -1,330 +0,0 @@ -Version: "0.1" -Environment: - Name: Partially Observable Clusters - Description: Cluster the coloured objects together by pushing them against the static coloured blocks. - Observers: - Sprite2D: - TileSize: 24 - BackgroundTile: oryx/oryx_fantasy/floor1-2.png - Variables: - - Name: box_count - InitialValue: 0 - Player: - Observer: - RotateWithAvatar: true - TrackAvatar: true - Height: 5 - Width: 5 - OffsetX: 0 - OffsetY: 2 - AvatarObject: avatar # The player can only control a single avatar in the game - Termination: - Win: - - eq: [box_count, 0] - Lose: - - eq: [broken_box:count, 1] - - eq: [avatar:count, 0] - Levels: - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . 1 1 . . . 2 . 2 . w - w . . . . 1 . . . . . . w - w . . . a . . . . . 2 . w - w . . . . . . . h . . . w - w . . . . 1 . . . . b . w - w . . . . . . 1 . . . . w - w . . . . . . . . A . . w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . 1 . . 2 . c 3 . . w - w . . . . h . . h . . . w - w . . . 2 . . 3 . . 1 . w - w . . . . b . . h . . . w - w . . 3 . . . 2 . . 1 . w - w . . h . h . . . a . . w - w . . . . . A . . . . . w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . a . . b . . c . . w - w . . . . . . . . . . . w - w . . . . . . . . . . . w - w h h h h h . h h h h h w - w . . . . h . h . . . . w - w . 1 2 . h . h . 1 3 . w - w . 3 . . . . . . . 2 . w - w . . . . . A . . . . . w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . . 1 . 2 . . c . . w - w . . . . . 3 . . 3 . . w - w . . a . 2 . . . h . . w - w . . . . h h . 3 . . . w - w . . 1 . . . . . 2 . . w - w . . . . . 1 . . b . . w - w . . . . . A . . . . . w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . . . . . 1 . . . . w - w . . h . . b . . h . . w - w . . . . 1 . . . . . . w - w . . 3 . . . . 2 . . . w - w . . . a . h . . c . . w - w . . . . 3 . . . . 2 . w - w . . . . . A . . . . . 
w - w w w w w w w w w w w w w - -Actions: - - # A simple action to count the number of boxes in the game at the start - # Not currently a way to do complex things in termination conditions like combine multiple conditions - - Name: box_counter - InputMapping: - Internal: true - Inputs: - 1: - Description: "The only action here is to increment the box count" - Behaviours: - - Src: - Object: [blue_box, red_box, green_box] - Commands: - - incr: box_count - Dst: - Object: [blue_box, red_box, green_box] - - # Define the move action - - Name: move - InputMapping: - Inputs: - 1: - Description: Rotate left - OrientationVector: [-1, 0] - 2: - Description: Move forwards - OrientationVector: [0, -1] - VectorToDest: [0, -1] - 3: - Description: Rotate right - OrientationVector: [1, 0] - Relative: true - Behaviours: - - # Avatar rotates - - Src: - Object: avatar - Commands: - - rot: _dir - Dst: - Object: avatar - - # Avatar can move into empty space - - Src: - Object: avatar - Commands: - - mov: _dest - Dst: - Object: _empty - - # Avatar dies if it hits the spikes - - Src: - Object: avatar - Commands: - - remove: true - - reward: -1 - Dst: - Object: spike - - - - Name: push - InputMapping: - Inputs: - 1: - Description: Push Forwards - OrientationVector: [ 0, -1 ] - VectorToDest: [ 0, -1 ] - Relative: true - Behaviours: - - # Boxes can be pushed by the avatar - - Src: - Object: avatar - Commands: - - mov: _dest - Dst: - Object: [blue_box, green_box, red_box] - Commands: - - cascade: _dest - - # Boxes break if they hit the spikes - - Src: - Object: [ blue_box, green_box, red_box ] - Commands: - - change_to: broken_box - - reward: -1 - Dst: - Object: spike - - # Boxes can pushed into empty space - - Src: - Object: [blue_box, green_box, red_box] - Commands: - - mov: _dest - Dst: - Object: _empty - - # When boxes are pushed against the blocks they change - - Src: - Object: blue_box - Commands: - - change_to: blue_block - - reward: 1 - - decr: box_count - Dst: - Object: blue_block - - Src: - Object: red_box - Commands: - - reward: 1 - - change_to: red_block - - decr: box_count - Dst: - Object: red_block - - Src: - Object: green_box - Commands: - - reward: 1 - - change_to: green_block - - decr: box_count - Dst: - Object: green_block - - -Objects: - - Name: avatar - MapCharacter: A - Observers: - Sprite2D: - - Image: gvgai/oryx/knight1.png - Block2D: - - Shape: triangle - Color: [0.0, 1.0, 0.0] - Scale: 0.8 - - - Name: wall - MapCharacter: w - Observers: - Sprite2D: - - TilingMode: WALL_16 - Image: - - oryx/oryx_fantasy/wall1-0.png - - oryx/oryx_fantasy/wall1-1.png - - oryx/oryx_fantasy/wall1-2.png - - oryx/oryx_fantasy/wall1-3.png - - oryx/oryx_fantasy/wall1-4.png - - oryx/oryx_fantasy/wall1-5.png - - oryx/oryx_fantasy/wall1-6.png - - oryx/oryx_fantasy/wall1-7.png - - oryx/oryx_fantasy/wall1-8.png - - oryx/oryx_fantasy/wall1-9.png - - oryx/oryx_fantasy/wall1-10.png - - oryx/oryx_fantasy/wall1-11.png - - oryx/oryx_fantasy/wall1-12.png - - oryx/oryx_fantasy/wall1-13.png - - oryx/oryx_fantasy/wall1-14.png - - oryx/oryx_fantasy/wall1-15.png - Block2D: - - Shape: square - Color: [0.5, 0.5, 0.5] - Scale: 0.9 - - - Name: spike - MapCharacter: h - Observers: - Sprite2D: - - Image: gvgai/oryx/spike2.png - Block2D: - - Shape: triangle - Color: [0.9, 0.1, 0.1] - Scale: 0.5 - - - Name: red_box - MapCharacter: "2" - InitialActions: - - Action: box_counter - ActionId: 1 - Observers: - Sprite2D: - - Image: gvgai/newset/blockR.png - Block2D: - - Shape: square - Color: [0.5, 0.2, 0.2] - Scale: 0.5 - - Name: red_block - 
MapCharacter: b - Observers: - Sprite2D: - - Image: gvgai/newset/blockR2.png - Block2D: - - Shape: square - Color: [1.0, 0.0, 0.0] - Scale: 1.0 - - - Name: green_box - MapCharacter: "3" - InitialActions: - - Action: box_counter - ActionId: 1 - Observers: - Sprite2D: - - Image: gvgai/newset/blockG.png - Block2D: - - Shape: square - Color: [0.2, 0.5, 0.2] - Scale: 0.5 - - Name: green_block - MapCharacter: c - Observers: - Sprite2D: - - Image: gvgai/newset/blockG2.png - Block2D: - - Shape: square - Color: [0.0, 1.0, 0.0] - Scale: 1.0 - - - Name: blue_box - MapCharacter: "1" - InitialActions: - - Action: box_counter - ActionId: 1 - Observers: - Sprite2D: - - Image: gvgai/newset/blockB.png - Block2D: - - Shape: square - Color: [0.2, 0.2, 0.5] - Scale: 0.5 - - Name: blue_block - MapCharacter: a - Observers: - Sprite2D: - - Image: gvgai/newset/blockB2.png - Block2D: - - Shape: square - Color: [0.0, 0.0, 1.0] - Scale: 1.0 - - - Name: broken_box - Observers: - Sprite2D: - - Image: gvgai/newset/block3.png - Block2D: - - Shape: triangle - Color: [1.0, 0.0, 1.0] - Scale: 1.0 diff --git a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors.yaml b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors.yaml deleted file mode 100644 index bb173e3bc..000000000 --- a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors.yaml +++ /dev/null @@ -1,409 +0,0 @@ -Version: "0.1" -Environment: - Name: Partially Observable Clusters - Description: Cluster the coloured objects together by pushing them against the static coloured blocks. - Observers: - Sprite2D: - TileSize: 24 - BackgroundTile: oryx/oryx_fantasy/floor1-2.png - Variables: - - Name: box_count - InitialValue: 0 - Player: - Observer: - RotateWithAvatar: true - TrackAvatar: true - Height: 5 - Width: 5 - OffsetX: 0 - OffsetY: 2 - AvatarObject: avatar # The player can only control a single avatar in the game - Termination: - Win: - - eq: [box_count, 0] - Lose: - - eq: [broken_box:count, 1] - - eq: [avatar:count, 0] - Levels: - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . 1 1 . . . 2 . 2 . w - w . . . . 1 . . . . . . w - w . . . a . . . . . 2 . w - w . . . . . . . h . . . w - w . . . . 1 . . . . b . w - w . . . . . . 1 . . . . w - w . . . . . . . . A . . w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . 1 . . 2 . c 3 . . w - w . . . . h . . h . . . w - w . . . 2 . . 3 . . 1 . w - w . . . . b . . h . . . w - w . . 3 . . . 2 . . 1 . w - w . . h . h . . . a . . w - w . . . . . A . . . . . w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . a . . b . . c . . w - w . . . . . . . . . . . w - w . . . . . . . . . . . w - w h h h h h . h h h h h w - w . . . . h . h . . . . w - w . 1 2 . h . h . 1 3 . w - w . 3 . . . . . . . 2 . w - w . . . . . A . . . . . w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . . 1 . 2 . . c . . w - w . . . . . 3 . . 3 . . w - w . . a . 2 . . . h . . w - w . . . . h h . 3 . . . w - w . . 1 . . . . . 2 . . w - w . . . . . 1 . . b . . w - w . . . . . A . . . . . w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . . . . . 1 . . . . w - w . . h . . b . . h . . w - w . . . . 1 . . . . . . w - w . . 3 . . . . 2 . . . w - w . . . a . h . . c . . w - w . . . . 3 . . . . 2 . w - w . . . . . A . . . . . 
w - w w w w w w w w w w w w w - -Actions: - - # A simple action to count the number of boxes in the game at the start - # Not currently a way to do complex things in termination conditions like combine multiple conditions - - Name: box_counter - InputMapping: - Internal: true - Inputs: - 1: - Description: "The only action here is to increment the box count" - Behaviours: - - Src: - Object: [blue_box, red_box, green_box] - Commands: - - incr: box_count - Dst: - Object: [blue_box, red_box, green_box] - - # Define the move action - - Name: move - InputMapping: - Inputs: - 1: - Description: Rotate left - OrientationVector: [-1, 0] - 2: - Description: Move forwards - OrientationVector: [0, -1] - VectorToDest: [0, -1] - 3: - Description: Rotate right - OrientationVector: [1, 0] - Relative: true - Behaviours: - - # Avatar rotates - - Src: - Object: avatar - Commands: - - rot: _dir - Dst: - Object: avatar - - # Avatar can move into empty space - - Src: - Object: avatar - Commands: - - mov: _dest - Dst: - Object: _empty - - - # Avatar dies if it hits the spikes - - Src: - Object: avatar - Commands: - - remove: true - - reward: -1 - Dst: - Object: spike - - - - Name: push_blue - InputMapping: - Inputs: - 1: - Description: Push Blue - OrientationVector: [ 0, -1 ] - VectorToDest: [ 0, -1 ] - Relative: true - Behaviours: - - # Boxes can be pushed by the avatar - - Src: - Object: avatar - Commands: - - mov: _dest - Dst: - Object: blue_box - Commands: - - cascade: _dest - - # Boxes break if they are pushed into the spikes - - Src: - Object: blue_box - Commands: - - change_to: broken_box - - reward: -1 - Dst: - Object: spike - - # Boxes can pushed into empty space - - Src: - Object: blue_box - Commands: - - mov: _dest - Dst: - Object: _empty - - # When boxes are pushed against the blocks they change - - Src: - Object: blue_box - Commands: - - change_to: blue_block - - reward: 1 - - decr: box_count - Dst: - Object: blue_block - - - Name: push_red - InputMapping: - Inputs: - 1: - Description: Push Red - OrientationVector: [ 0, -1 ] - VectorToDest: [ 0, -1 ] - Relative: true - Behaviours: - - # Boxes can be pushed by the avatar - - Src: - Object: avatar - Commands: - - mov: _dest - Dst: - Object: red_box - Commands: - - cascade: _dest - - # Boxes break if they are pushed into the spikes - - Src: - Object: red_box - Commands: - - change_to: broken_box - - reward: -1 - Dst: - Object: spike - - # Boxes can pushed into empty space - - Src: - Object: red_box - Commands: - - mov: _dest - Dst: - Object: _empty - - # When boxes are pushed against the blocks they change - - Src: - Object: red_box - Commands: - - reward: 1 - - change_to: red_block - - decr: box_count - Dst: - Object: red_block - - - Name: push_green - InputMapping: - Inputs: - 1: - Description: Push Green - OrientationVector: [ 0, -1 ] - VectorToDest: [ 0, -1 ] - Relative: true - Behaviours: - - # Boxes can be pushed by the avatar - - Src: - Object: avatar - Commands: - - mov: _dest - Dst: - Object: green_box - Commands: - - cascade: _dest - - # Boxes break if they are pushed into the spikes - - Src: - Object: green_box - Commands: - - change_to: broken_box - - reward: -1 - Dst: - Object: spike - - # Boxes can pushed into empty space - - Src: - Object: green_box - Commands: - - mov: _dest - Dst: - Object: _empty - - # When boxes are pushed against the blocks they change - - Src: - Object: green_box - Commands: - - reward: 1 - - change_to: green_block - - decr: box_count - Dst: - Object: green_block - - -Objects: - - Name: avatar - MapCharacter: A - 
Observers: - Sprite2D: - - Image: gvgai/oryx/knight1.png - Block2D: - - Shape: triangle - Color: [0.0, 1.0, 0.0] - Scale: 0.8 - - - Name: wall - MapCharacter: w - Observers: - Sprite2D: - - TilingMode: WALL_16 - Image: - - oryx/oryx_fantasy/wall1-0.png - - oryx/oryx_fantasy/wall1-1.png - - oryx/oryx_fantasy/wall1-2.png - - oryx/oryx_fantasy/wall1-3.png - - oryx/oryx_fantasy/wall1-4.png - - oryx/oryx_fantasy/wall1-5.png - - oryx/oryx_fantasy/wall1-6.png - - oryx/oryx_fantasy/wall1-7.png - - oryx/oryx_fantasy/wall1-8.png - - oryx/oryx_fantasy/wall1-9.png - - oryx/oryx_fantasy/wall1-10.png - - oryx/oryx_fantasy/wall1-11.png - - oryx/oryx_fantasy/wall1-12.png - - oryx/oryx_fantasy/wall1-13.png - - oryx/oryx_fantasy/wall1-14.png - - oryx/oryx_fantasy/wall1-15.png - Block2D: - - Shape: square - Color: [0.5, 0.5, 0.5] - Scale: 0.9 - - - Name: spike - MapCharacter: h - Observers: - Sprite2D: - - Image: gvgai/oryx/spike2.png - Block2D: - - Shape: triangle - Color: [0.9, 0.1, 0.1] - Scale: 0.5 - - - Name: red_box - MapCharacter: "2" - InitialActions: - - Action: box_counter - ActionId: 1 - Observers: - Sprite2D: - - Image: gvgai/newset/blockR.png - Block2D: - - Shape: square - Color: [0.5, 0.2, 0.2] - Scale: 0.5 - - Name: red_block - MapCharacter: b - Observers: - Sprite2D: - - Image: gvgai/newset/blockR2.png - Block2D: - - Shape: square - Color: [1.0, 0.0, 0.0] - Scale: 1.0 - - - Name: green_box - MapCharacter: "3" - InitialActions: - - Action: box_counter - ActionId: 1 - Observers: - Sprite2D: - - Image: gvgai/newset/blockG.png - Block2D: - - Shape: square - Color: [0.2, 0.5, 0.2] - Scale: 0.5 - - Name: green_block - MapCharacter: c - Observers: - Sprite2D: - - Image: gvgai/newset/blockG2.png - Block2D: - - Shape: square - Color: [0.0, 1.0, 0.0] - Scale: 1.0 - - - Name: blue_box - MapCharacter: "1" - InitialActions: - - Action: box_counter - ActionId: 1 - Observers: - Sprite2D: - - Image: gvgai/newset/blockB.png - Block2D: - - Shape: square - Color: [0.2, 0.2, 0.5] - Scale: 0.5 - - Name: blue_block - MapCharacter: a - Observers: - Sprite2D: - - Image: gvgai/newset/blockB2.png - Block2D: - - Shape: square - Color: [0.0, 0.0, 1.0] - Scale: 1.0 - - - Name: broken_box - Observers: - Sprite2D: - - Image: gvgai/newset/block3.png - Block2D: - - Shape: triangle - Color: [1.0, 0.0, 1.0] - Scale: 1.0 diff --git a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors_units.yaml b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors_units.yaml deleted file mode 100644 index bb932fc35..000000000 --- a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_separate_colors_units.yaml +++ /dev/null @@ -1,304 +0,0 @@ -Version: "0.1" -Environment: - Name: Partially Observable Clusters - Description: Cluster the coloured objects together by pushing them against the static coloured blocks. - Observers: - Sprite2D: - TileSize: 24 - BackgroundTile: oryx/oryx_fantasy/floor1-2.png - Variables: - - Name: box_count - InitialValue: 0 - PerPlayer: true - - Name: broken_boxes - InitialValue: 0 - PerPlayer: true - Player: - Count: 1 - Termination: - Win: - - eq: [ box_count, 0 ] - Lose: - - eq: [ broken_boxes, 1 ] - Levels: - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . b1 b1 . . . r1 . r1 . w - w . . . . b1 . . . . . . w - w . . . B . . . . . r1 . w - w . . . . . . . x . . . w - w . . . . b1 . . . . R . w - w . . . . . . b1 . . . . w - w . . . . . . . . . . . 
w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . b1 . . r1 . G g1 . . w - w . . . . x . . x . . . w - w . . . r1 . . g1 . . b1 . w - w . . . . R . . x . . . w - w . . g1 . . . r1 . . b1 . w - w . . x . x . . . B . . w - w . . . . . . . . . . . w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . B . . R . . G . . w - w . . . . . . . . . . . w - w . . . . . . . . . . . w - w x x x x x . x x x x x w - w . . . . x . x . . . . w - w . b1 r1 . x . x . b1 g1 . w - w . g1 . . . . . . . r1 . w - w . . . . . . . . . . . w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . . b1 . r1 . . G . . w - w . . . . . g1 . . g1 . . w - w . . B . r1 . . . x . . w - w . . . . x x . g1 . . . w - w . . b1 . . . . . r1 . . w - w . . . . . b1 . . R . . w - w . . . . . . . . . . . w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . . . . . b1 . . . . w - w . . x . . R . . x . . w - w . . . . b1 . . . . . . w - w . . g1 . . . . r1 . . . w - w . . . B . x . . G . . w - w . . . . g1 . . . . r1 . w - w . . . . . . . . . . . w - w w w w w w w w w w w w w - -Actions: - - # A simple action to count the number of boxes in the game at the start - # Not currently a way to do complex things in termination conditions like combine multiple conditions - - Name: box_counter - InputMapping: - Internal: true - Inputs: - 1: - Description: "The only action here is to increment the box count" - Behaviours: - - Src: - Object: [ blue_box, red_box, green_box ] - Commands: - - incr: box_count - Dst: - Object: [ blue_box, red_box, green_box ] - - - Name: push_blue - Behaviours: - - # Boxes break if they are pushed into the spikes - - Src: - Object: blue_box - Commands: - - incr: broken_boxes - - change_to: broken_box - - reward: -1 - Dst: - Object: spike - - # Boxes can pushed into empty space - - Src: - Object: blue_box - Commands: - - mov: _dest - Dst: - Object: _empty - - # When boxes are pushed against the blocks they change - - Src: - Object: blue_box - Commands: - - change_to: blue_block - - reward: 1 - - decr: box_count - Dst: - Object: blue_block - - - Name: push_red - Behaviours: - - # Boxes break if they are pushed into the spikes - - Src: - Object: red_box - Commands: - - incr: broken_boxes - - change_to: broken_box - - reward: -1 - Dst: - Object: spike - - # Boxes can pushed into empty space - - Src: - Object: red_box - Commands: - - mov: _dest - Dst: - Object: _empty - - # When boxes are pushed against the blocks they change - - Src: - Object: red_box - Commands: - - reward: 1 - - change_to: red_block - - decr: box_count - Dst: - Object: red_block - - - Name: push_green - Behaviours: - - # Boxes break if they are pushed into the spikes - - Src: - Object: green_box - Commands: - - incr: broken_boxes - - change_to: broken_box - - reward: -1 - Dst: - Object: spike - - # Boxes can pushed into empty space - - Src: - Object: green_box - Commands: - - mov: _dest - Dst: - Object: _empty - - # When boxes are pushed against the blocks they change - - Src: - Object: green_box - Commands: - - reward: 1 - - change_to: green_block - - decr: box_count - Dst: - Object: green_block - - -Objects: - - - Name: wall - MapCharacter: w - Observers: - Sprite2D: - - TilingMode: WALL_16 - Image: - - oryx/oryx_fantasy/wall1-0.png - - oryx/oryx_fantasy/wall1-1.png - - oryx/oryx_fantasy/wall1-2.png - - oryx/oryx_fantasy/wall1-3.png - - oryx/oryx_fantasy/wall1-4.png - - 
oryx/oryx_fantasy/wall1-5.png - - oryx/oryx_fantasy/wall1-6.png - - oryx/oryx_fantasy/wall1-7.png - - oryx/oryx_fantasy/wall1-8.png - - oryx/oryx_fantasy/wall1-9.png - - oryx/oryx_fantasy/wall1-10.png - - oryx/oryx_fantasy/wall1-11.png - - oryx/oryx_fantasy/wall1-12.png - - oryx/oryx_fantasy/wall1-13.png - - oryx/oryx_fantasy/wall1-14.png - - oryx/oryx_fantasy/wall1-15.png - Block2D: - - Shape: square - Color: [ 0.5, 0.5, 0.5 ] - Scale: 0.9 - - - Name: spike - MapCharacter: x - Observers: - Sprite2D: - - Image: gvgai/oryx/spike2.png - Block2D: - - Shape: triangle - Color: [ 0.9, 0.1, 0.1 ] - Scale: 0.5 - - - Name: red_box - MapCharacter: r - InitialActions: - - Action: box_counter - ActionId: 1 - Observers: - Sprite2D: - - Image: gvgai/newset/blockR.png - Block2D: - - Shape: square - Color: [ 0.5, 0.2, 0.2 ] - Scale: 0.5 - - Name: red_block - MapCharacter: R - Observers: - Sprite2D: - - Image: gvgai/newset/blockR2.png - Block2D: - - Shape: square - Color: [ 1.0, 0.0, 0.0 ] - Scale: 1.0 - - - Name: green_box - MapCharacter: g - InitialActions: - - Action: box_counter - ActionId: 1 - Observers: - Sprite2D: - - Image: gvgai/newset/blockG.png - Block2D: - - Shape: square - Color: [ 0.2, 0.5, 0.2 ] - Scale: 0.5 - - Name: green_block - MapCharacter: G - Observers: - Sprite2D: - - Image: gvgai/newset/blockG2.png - Block2D: - - Shape: square - Color: [ 0.0, 1.0, 0.0 ] - Scale: 1.0 - - - Name: blue_box - MapCharacter: b - InitialActions: - - Action: box_counter - ActionId: 1 - Observers: - Sprite2D: - - Image: gvgai/newset/blockB.png - Block2D: - - Shape: square - Color: [ 0.2, 0.2, 0.5 ] - Scale: 0.5 - - Name: blue_block - MapCharacter: B - Observers: - Sprite2D: - - Image: gvgai/newset/blockB2.png - Block2D: - - Shape: square - Color: [ 0.0, 0.0, 1.0 ] - Scale: 1.0 - - - Name: broken_box - Observers: - Sprite2D: - - Image: gvgai/newset/block3.png - Block2D: - - Shape: triangle - Color: [ 1.0, 0.0, 1.0 ] - Scale: 1.0 diff --git a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml b/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml deleted file mode 100644 index cae2d9a80..000000000 --- a/python/examples/experiments/conditional_action_spaces/clusters_po_with_push_units.yaml +++ /dev/null @@ -1,258 +0,0 @@ -Version: "0.1" -Environment: - Name: Partially Observable Clusters - Description: Cluster the coloured objects together by pushing them against the static coloured blocks. - Observers: - Sprite2D: - TileSize: 24 - BackgroundTile: oryx/oryx_fantasy/floor1-2.png - Variables: - - Name: box_count - InitialValue: 0 - PerPlayer: true - - Name: broken_boxes - InitialValue: 0 - PerPlayer: true - Player: - Count: 1 - Termination: - Win: - - eq: [ box_count, 0 ] - Lose: - - eq: [ broken_boxes, 1 ] - Levels: - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . b1 b1 . . . r1 . r1 . w - w . . . . b1 . . . . . . w - w . . . B . . . . . r1 . w - w . . . . . . . x . . . w - w . . . . b1 . . . . R . w - w . . . . . . b1 . . . . w - w . . . . . . . . . . . w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . b1 . . r1 . G g1 . . w - w . . . . x . . x . . . w - w . . . r1 . . g1 . . b1 . w - w . . . . R . . x . . . w - w . . g1 . . . r1 . . b1 . w - w . . x . x . . . B . . w - w . . . . . . . . . . . w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . B . . R . . G . . w - w . . . . . . . . . . . w - w . . . . . . . . . . . w - w x x x x x . x x x x x w - w . 
. . . x . x . . . . w - w . b1 r1 . x . x . b1 g1 . w - w . g1 . . . . . . . r1 . w - w . . . . . . . . . . . w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . . b1 . r1 . . G . . w - w . . . . . g1 . . g1 . . w - w . . B . r1 . . . x . . w - w . . . . x x . g1 . . . w - w . . b1 . . . . . r1 . . w - w . . . . . b1 . . R . . w - w . . . . . . . . . . . w - w w w w w w w w w w w w w - - | - w w w w w w w w w w w w w - w . . . . . . . . . . . w - w . . . . . . b1 . . . . w - w . . x . . R . . x . . w - w . . . . b1 . . . . . . w - w . . g1 . . . . r1 . . . w - w . . . B . x . . G . . w - w . . . . g1 . . . . r1 . w - w . . . . . . . . . . . w - w w w w w w w w w w w w w - -Actions: - - # A simple action to count the number of boxes in the game at the start - # Not currently a way to do complex things in termination conditions like combine multiple conditions - - Name: box_counter - InputMapping: - Internal: true - Inputs: - 1: - Description: "The only action here is to increment the box count" - Behaviours: - - Src: - Object: [ blue_box, red_box, green_box ] - Commands: - - incr: box_count - Dst: - Object: [ blue_box, red_box, green_box ] - - - Name: push - Behaviours: - - # Boxes break if they hit the spikes - - Src: - Object: [ blue_box, green_box, red_box ] - Commands: - - incr: broken_boxes - - change_to: broken_box - - reward: -1 - Dst: - Object: spike - - # Boxes can pushed into empty space - - Src: - Object: [ blue_box, green_box, red_box ] - Commands: - - mov: _dest - Dst: - Object: _empty - - # When boxes are pushed against the blocks they change - - Src: - Object: blue_box - Commands: - - change_to: blue_block - - reward: 1 - - decr: box_count - Dst: - Object: blue_block - - Src: - Object: red_box - Commands: - - reward: 1 - - change_to: red_block - - decr: box_count - Dst: - Object: red_block - - Src: - Object: green_box - Commands: - - reward: 1 - - change_to: green_block - - decr: box_count - Dst: - Object: green_block - - -Objects: - - - Name: wall - MapCharacter: w - Observers: - Sprite2D: - - TilingMode: WALL_16 - Image: - - oryx/oryx_fantasy/wall1-0.png - - oryx/oryx_fantasy/wall1-1.png - - oryx/oryx_fantasy/wall1-2.png - - oryx/oryx_fantasy/wall1-3.png - - oryx/oryx_fantasy/wall1-4.png - - oryx/oryx_fantasy/wall1-5.png - - oryx/oryx_fantasy/wall1-6.png - - oryx/oryx_fantasy/wall1-7.png - - oryx/oryx_fantasy/wall1-8.png - - oryx/oryx_fantasy/wall1-9.png - - oryx/oryx_fantasy/wall1-10.png - - oryx/oryx_fantasy/wall1-11.png - - oryx/oryx_fantasy/wall1-12.png - - oryx/oryx_fantasy/wall1-13.png - - oryx/oryx_fantasy/wall1-14.png - - oryx/oryx_fantasy/wall1-15.png - Block2D: - - Shape: square - Color: [ 0.5, 0.5, 0.5 ] - Scale: 0.9 - - - Name: spike - MapCharacter: x - Observers: - Sprite2D: - - Image: gvgai/oryx/spike2.png - Block2D: - - Shape: triangle - Color: [ 0.9, 0.1, 0.1 ] - Scale: 0.5 - - - Name: red_box - MapCharacter: r - InitialActions: - - Action: box_counter - ActionId: 1 - Observers: - Sprite2D: - - Image: gvgai/newset/blockR.png - Block2D: - - Shape: square - Color: [ 0.5, 0.2, 0.2 ] - Scale: 0.5 - - Name: red_block - MapCharacter: R - Observers: - Sprite2D: - - Image: gvgai/newset/blockR2.png - Block2D: - - Shape: square - Color: [ 1.0, 0.0, 0.0 ] - Scale: 1.0 - - - Name: green_box - MapCharacter: g - InitialActions: - - Action: box_counter - ActionId: 1 - Observers: - Sprite2D: - - Image: gvgai/newset/blockG.png - Block2D: - - Shape: square - Color: [ 0.2, 0.5, 0.2 ] - Scale: 0.5 - - Name: green_block - 
MapCharacter: G - Observers: - Sprite2D: - - Image: gvgai/newset/blockG2.png - Block2D: - - Shape: square - Color: [ 0.0, 1.0, 0.0 ] - Scale: 1.0 - - - Name: blue_box - MapCharacter: b - InitialActions: - - Action: box_counter - ActionId: 1 - Observers: - Sprite2D: - - Image: gvgai/newset/blockB.png - Block2D: - - Shape: square - Color: [ 0.2, 0.2, 0.5 ] - Scale: 0.5 - - Name: blue_block - MapCharacter: B - Observers: - Sprite2D: - - Image: gvgai/newset/blockB2.png - Block2D: - - Shape: square - Color: [ 0.0, 0.0, 1.0 ] - Scale: 1.0 - - - Name: broken_box - Observers: - Sprite2D: - - Image: gvgai/newset/block3.png - Block2D: - - Shape: triangle - Color: [ 1.0, 0.0, 1.0 ] - Scale: 1.0 diff --git a/python/examples/experiments/conditional_action_spaces/rllib_baseline.py b/python/examples/experiments/conditional_action_spaces/rllib_baseline.py deleted file mode 100644 index d29c7647c..000000000 --- a/python/examples/experiments/conditional_action_spaces/rllib_baseline.py +++ /dev/null @@ -1,118 +0,0 @@ -import argparse -import os -import sys - -import ray -from ray import tune -from ray.rllib.models import ModelCatalog -from ray.tune.integration.wandb import WandbLoggerCallback -from ray.tune.registry import register_env - -from griddly import gd -from griddly.util.rllib.callbacks import GriddlyCallbacks -from griddly.util.rllib.environment.core import RLlibEnv -from griddly.util.rllib.torch.agents.conv_agent import SimpleConvAgent -from griddly.util.rllib.torch.conditional_actions.conditional_action_policy_trainer import \ - ConditionalActionImpalaTrainer - -parser = argparse.ArgumentParser(description='Run experiments') - -parser.add_argument('--yaml-file', help='YAML file condining GDY for the game') -parser.add_argument('--experiment-name', default='unknown', help='name of the experiment') - -parser.add_argument('--root-directory', default=os.path.expanduser("~/ray_results"), - help='root directory for all data associated with the run') -parser.add_argument('--num-gpus', default=1, type=int, help='Number of GPUs to make available to ray.') -parser.add_argument('--num-cpus', default=8, type=int, help='Number of CPUs to make available to ray.') - -parser.add_argument('--num-workers', default=7, type=int, help='Number of workers') -parser.add_argument('--num-envs-per-worker', default=5, type=int, help='Number of workers') -parser.add_argument('--num-gpus-per-worker', default=0, type=float, help='Number of gpus per worker') -parser.add_argument('--num-cpus-per-worker', default=1, type=float, help='Number of gpus per worker') -parser.add_argument('--max-training-steps', default=20000000, type=int, help='Number of workers') - -parser.add_argument('--capture-video', action='store_true', help='enable video capture') -parser.add_argument('--video-directory', default='videos', help='directory of video') -parser.add_argument('--video-frequency', type=int, default=1000000, help='Frequency of videos') - -parser.add_argument('--seed', type=int, default=69420, help='seed for experiments') - -parser.add_argument('--lr', type=float, default=0.0005, help='learning rate') - -if __name__ == '__main__': - - args = parser.parse_args() - - sep = os.pathsep - os.environ['PYTHONPATH'] = sep.join(sys.path) - - ray.init(include_dashboard=False, num_gpus=args.num_gpus, num_cpus=args.num_cpus) - #ray.init(include_dashboard=False, num_gpus=1, num_cpus=args.num_cpus, local_mode=True) - - env_name = "ray-griddly-env" - - register_env(env_name, RLlibEnv) - ModelCatalog.register_custom_model("SimpleConv", SimpleConvAgent) 
- - wandbLoggerCallback = WandbLoggerCallback( - project='conditional_action_trees', - api_key_file='~/.wandb_rc', - dir=args.root_directory - ) - - max_training_steps = args.max_training_steps - - - config = { - 'framework': 'torch', - 'seed': args.seed, - 'num_workers': args.num_workers, - 'num_envs_per_worker': args.num_envs_per_worker, - 'num_gpus_per_worker': float(args.num_gpus_per_worker), - 'num_cpus_per_worker': args.num_cpus_per_worker, - - 'callbacks': GriddlyCallbacks, - - 'model': { - 'custom_model': 'SimpleConv', - 'custom_model_config': {} - }, - 'env': env_name, - 'env_config': { - 'generate_valid_action_trees': False, - 'random_level_on_reset': True, - 'yaml_file': args.yaml_file, - 'global_observer_type': gd.ObserverType.SPRITE_2D, - 'max_steps': 1000, - }, - 'entropy_coeff_schedule': [ - [0, 0.01], - [max_training_steps, 0.0] - ], - 'lr_schedule': [ - [0, args.lr], - [max_training_steps, 0.0] - ], - - } - if args.capture_video: - real_video_frequency = int(args.video_frequency / (args.num_envs_per_worker * args.num_workers)) - config['env_config']['record_video_config'] = { - 'frequency': real_video_frequency, - 'directory': os.path.join(args.root_directory, args.video_directory) - } - - stop = { - "timesteps_total": max_training_steps, - } - - trial_name_creator = lambda trial: f'baseline-{args.experiment_name}' - - result = tune.run( - ConditionalActionImpalaTrainer, - local_dir=args.root_directory, - config=config, - stop=stop, - callbacks=[wandbLoggerCallback], - trial_name_creator=trial_name_creator - ) diff --git a/python/examples/experiments/conditional_action_spaces/rllib_baseline_flat.py b/python/examples/experiments/conditional_action_spaces/rllib_baseline_flat.py deleted file mode 100644 index b8549d9ec..000000000 --- a/python/examples/experiments/conditional_action_spaces/rllib_baseline_flat.py +++ /dev/null @@ -1,277 +0,0 @@ -import argparse -import os -import sys - -import gym -import numpy as np -import ray -import torch -from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 -from torch import nn -from gym.spaces import MultiDiscrete, Dict, Box -from ray import tune -from ray.rllib.agents.impala import ImpalaTrainer -from ray.rllib.models import ModelCatalog -from ray.tune.integration.wandb import WandbLoggerCallback -from ray.tune.registry import register_env - -from griddly import gd -from griddly.util.rllib.callbacks import GriddlyCallbacks -from griddly.util.rllib.environment.core import RLlibEnv -from griddly.util.rllib.torch.agents.common import layer_init - -parser = argparse.ArgumentParser(description='Run experiments') - -parser.add_argument('--yaml-file', help='YAML file condining GDY for the game') -parser.add_argument('--experiment-name', default='unknown', help='name of the experiment') - -parser.add_argument('--root-directory', default=os.path.expanduser("~/ray_results"), - help='root directory for all data associated with the run') -parser.add_argument('--num-gpus', default=1, type=int, help='Number of GPUs to make available to ray.') -parser.add_argument('--num-cpus', default=8, type=int, help='Number of CPUs to make available to ray.') - -parser.add_argument('--num-workers', default=7, type=int, help='Number of workers') -parser.add_argument('--num-envs-per-worker', default=5, type=int, help='Number of workers') -parser.add_argument('--num-gpus-per-worker', default=0, type=float, help='Number of gpus per worker') -parser.add_argument('--num-cpus-per-worker', default=1, type=float, help='Number of gpus per worker') 
-parser.add_argument('--max-training-steps', default=20000000, type=int, help='Number of workers') - -parser.add_argument('--capture-video', action='store_true', help='enable video capture') -parser.add_argument('--video-directory', default='videos', help='directory of video') -parser.add_argument('--video-frequency', type=int, default=1000000, help='Frequency of videos') - -parser.add_argument('--seed', type=int, default=69420, help='seed for experiments') - -parser.add_argument('--lr', type=float, default=0.0005, help='learning rate') - - -class FlatActionWrapper(gym.Wrapper): - - def __init__(self, env): - super().__init__(env) - - self._num_action_parts = 1 - self._action_params_offset = 0 - if not self.has_avatar: - self._num_action_parts += 1 - self._action_params_offset = 1 - - self._action_splits = np.zeros(self._num_action_parts) - - self._total_position_params = 0 - if not self.has_avatar: - self._action_splits[0] = self.width * self.height - self._total_position_params += self.width * self.height - - self._action_logit_offsets = {} - - total_action_params = 0 - for i, action_name in enumerate(self.env.action_names): - self._action_logit_offsets[action_name] = total_action_params + self._total_position_params - total_action_params += self.num_action_ids[action_name] - - self._action_splits[self._action_params_offset] = total_action_params - - self._total_actions = int(np.sum(self._action_splits)) - - self.action_space = MultiDiscrete(self._action_splits) - self.observation_space = Dict({ - 'obs': self.observation_space, - 'mask': Box(0, 1, shape=(self._total_actions,)), - }) - - def _get_flat_mask(self): - flat_mask = np.zeros(self._total_actions) - for location, action_names in self.env.game.get_available_actions(1).items(): - if not self.has_avatar: - flat_location = self.width * location[1] + location[0] - flat_mask[flat_location] = 1 - for action_name, action_ids in self.env.game.get_available_action_ids(location, list(action_names)).items(): - mask_offset = self._action_logit_offsets[action_name] - flat_mask[mask_offset:mask_offset + self.num_action_ids[action_name]][action_ids] = 1 - return flat_mask - - def _to_griddly_action(self, action): - # convert the flat action back to Griddly's tree based format - - griddly_action = [] - action_ptr = 0 - if not self.has_avatar: - x = action[action_ptr] % self.width - griddly_action.append(x) - y = int(action[action_ptr] / self.width) - griddly_action.append(y) - action_ptr += 1 - - if self.action_count > 0: - action_type_id = 0 - action_param_id = 0 - for action_name in self.action_names: - action_offset_after_position = (self._action_logit_offsets[action_name] - self._total_position_params) - next_offset = action_offset_after_position + self.num_action_ids[action_name] - if next_offset > action[action_ptr]: - action_param_id = action[action_ptr] - action_offset_after_position - break - action_type_id += 1 - - griddly_action.append(action_type_id) - griddly_action.append(action_param_id) - else: - griddly_action.append(action[action_ptr]) - - return griddly_action - - def reset(self, **kwargs): - - obs = super().reset(**kwargs) - - observations = { - 'obs': obs, - 'mask': self._get_flat_mask() - } - - return observations - - def step(self, action): - griddly_action = self._to_griddly_action(action) - - obs, reward, info, done = super().step(griddly_action) - - observations = { - 'obs': obs, - 'mask': self._get_flat_mask() - } - - return observations, reward, info, done - - -class SimpleConvFlatAgent(TorchModelV2, nn.Module): - - 
def __init__(self, obs_space, action_space, num_outputs, model_config, name): - super().__init__(obs_space, action_space, num_outputs, model_config, name) - nn.Module.__init__(self) - - self._num_objects = obs_space.original_space['obs'].shape[2] - self._num_actions = num_outputs - - linear_flatten = np.prod(obs_space.original_space['obs'].shape[:2]) * 64 - - self.network = nn.Sequential( - layer_init(nn.Conv2d(self._num_objects, 32, 3, padding=1)), - nn.ReLU(), - layer_init(nn.Conv2d(32, 64, 3, padding=1)), - nn.ReLU(), - nn.Flatten(), - layer_init(nn.Linear(linear_flatten, 1024)), - nn.ReLU(), - layer_init(nn.Linear(1024, 512)), - nn.ReLU(), - ) - - self._actor_head = nn.Sequential( - layer_init(nn.Linear(512, 256), std=0.01), - nn.ReLU(), - layer_init(nn.Linear(256, self._num_actions), std=0.01) - ) - - self._critic_head = nn.Sequential( - layer_init(nn.Linear(512, 1), std=0.01) - ) - - def forward(self, input_dict, state, seq_lens): - obs_transformed = input_dict['obs']['obs'].permute(0, 3, 1, 2) - mask = input_dict['obs']['mask'] - network_output = self.network(obs_transformed) - value = self._critic_head(network_output) - self._value = value.reshape(-1) - logits = self._actor_head(network_output) - - logits += torch.maximum(torch.log(mask), torch.tensor(torch.finfo().min)) - - return logits, state - - def value_function(self): - return self._value - - -if __name__ == '__main__': - - args = parser.parse_args() - - sep = os.pathsep - os.environ['PYTHONPATH'] = sep.join(sys.path) - - ray.init(include_dashboard=False, num_gpus=args.num_gpus, num_cpus=args.num_cpus) - # ray.init(include_dashboard=False, num_gpus=1, num_cpus=args.num_cpus, local_mode=True) - env_name = "ray-griddly-env" - - - def _create_env(env_config): - env = RLlibEnv(env_config) - return FlatActionWrapper(env) - - - register_env(env_name, _create_env) - ModelCatalog.register_custom_model("SimpleConv", SimpleConvFlatAgent) - - wandbLoggerCallback = WandbLoggerCallback( - project='conditional_action_trees', - api_key_file='~/.wandb_rc', - dir=args.root_directory - ) - - max_training_steps = args.max_training_steps - - config = { - 'framework': 'torch', - 'seed': args.seed, - 'num_workers': args.num_workers, - 'num_envs_per_worker': args.num_envs_per_worker, - 'num_gpus_per_worker': float(args.num_gpus_per_worker), - 'num_cpus_per_worker': args.num_cpus_per_worker, - - 'callbacks': GriddlyCallbacks, - - 'model': { - 'custom_model': 'SimpleConv', - 'custom_model_config': {} - }, - 'env': env_name, - 'env_config': { - 'generate_valid_action_trees': False, - 'random_level_on_reset': True, - 'yaml_file': args.yaml_file, - 'global_observer_type': gd.ObserverType.SPRITE_2D, - 'max_steps': 1000, - }, - 'entropy_coeff_schedule': [ - [0, 0.01], - [max_training_steps, 0.0] - ], - 'lr_schedule': [ - [0, args.lr], - [max_training_steps, 0.0] - ], - - } - if args.capture_video: - real_video_frequency = int(args.video_frequency / (args.num_envs_per_worker * args.num_workers)) - config['env_config']['record_video_config'] = { - 'frequency': real_video_frequency, - 'directory': os.path.join(args.root_directory, args.video_directory) - } - - stop = { - "timesteps_total": max_training_steps, - } - - trial_name_creator = lambda trial: f'baseline-flat-{args.experiment_name}' - - result = tune.run( - ImpalaTrainer, - local_dir=args.root_directory, - config=config, - stop=stop, - callbacks=[wandbLoggerCallback], - trial_name_creator=trial_name_creator - ) diff --git 
a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py b/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py deleted file mode 100644 index a6b07b851..000000000 --- a/python/examples/experiments/conditional_action_spaces/rllib_conditional_actions.py +++ /dev/null @@ -1,128 +0,0 @@ -import os -import sys - -import ray -from ray import tune -from ray.rllib.models import ModelCatalog -from ray.tune.integration.wandb import WandbLoggerCallback -from ray.tune.registry import register_env - -from griddly import gd -from griddly.util.rllib.callbacks import GriddlyCallbacks -from griddly.util.rllib.environment.core import RLlibEnv -from griddly.util.rllib.torch.agents.conv_agent import SimpleConvAgent -from griddly.util.rllib.torch.conditional_actions.conditional_action_policy_trainer import \ - ConditionalActionImpalaTrainer - -import argparse - -parser = argparse.ArgumentParser(description='Run experiments') - -parser.add_argument('--yaml-file', help='YAML file containing GDY for the game') -parser.add_argument('--experiment-name', default='unknown', help='name of the experiment') - -parser.add_argument('--root-directory', default=os.path.expanduser("~/ray_results"), - help='root directory for all data associated with the run') -parser.add_argument('--num-gpus', default=1, type=int, help='Number of GPUs to make available to ray.') -parser.add_argument('--num-cpus', default=8, type=int, help='Number of CPUs to make available to ray.') - -parser.add_argument('--num-workers', default=7, type=int, help='Number of workers') -parser.add_argument('--num-envs-per-worker', default=5, type=int, help='Number of workers') -parser.add_argument('--num-gpus-per-worker', default=0, type=float, help='Number of gpus per worker') -parser.add_argument('--num-cpus-per-worker', default=1, type=float, help='Number of gpus per worker') -parser.add_argument('--max-training-steps', default=20000000, type=int, help='Number of workers') - -parser.add_argument('--capture-video', action='store_true', help='enable video capture') -parser.add_argument('--video-directory', default='videos', help='directory of video') -parser.add_argument('--video-frequency', type=int, default=1000000, help='Frequency of videos') - -parser.add_argument('--allow-nop', action='store_true', default=False, help='allow NOP actions in action tree') -parser.add_argument('--vtrace-masking', action='store_true', default=False, help='use masks in vtrace calculations') - -parser.add_argument('--seed', type=int, default=69420, help='seed for experiments') - -parser.add_argument('--lr', type=float, default=0.0005, help='learning rate') - -if __name__ == '__main__': - - args = parser.parse_args() - - sep = os.pathsep - os.environ['PYTHONPATH'] = sep.join(sys.path) - - ray.init(include_dashboard=False, num_gpus=args.num_gpus, num_cpus=args.num_cpus) - #ray.init(include_dashboard=False, num_gpus=1, num_cpus=args.num_cpus, local_mode=True) - - env_name = "ray-griddly-env" - - register_env(env_name, RLlibEnv) - ModelCatalog.register_custom_model("SimpleConv", SimpleConvAgent) - - wandbLoggerCallback = WandbLoggerCallback( - project='conditional_action_trees', - api_key_file='~/.wandb_rc', - dir=args.root_directory - ) - - max_training_steps = args.max_training_steps - - config = { - 'framework': 'torch', - 'seed': args.seed, - 'num_workers': args.num_workers, - 'num_envs_per_worker': args.num_envs_per_worker, - 'num_gpus_per_worker': float(args.num_gpus_per_worker), - 'num_cpus_per_worker': 
args.num_cpus_per_worker, - - 'callbacks': GriddlyCallbacks, - - 'model': { - 'custom_model': 'SimpleConv', - 'custom_model_config': {} - }, - 'env': env_name, - 'env_config': { - - 'allow_nop': args.allow_nop, - 'invalid_action_masking': tune.grid_search(['conditional', 'collapsed']), - 'vtrace_masking': args.vtrace_masking, - #'invalid_action_masking': 'conditional', - 'generate_valid_action_trees': True, - #'level': 0, - 'random_level_on_reset': True, - 'yaml_file': args.yaml_file, - 'global_observer_type': gd.ObserverType.SPRITE_2D, - 'max_steps': 1000, - }, - 'entropy_coeff_schedule': [ - [0, 0.01], - [max_training_steps, 0.0] - ], - 'lr_schedule': [ - [0, args.lr], - [max_training_steps, 0.0] - ], - - } - - if args.capture_video: - real_video_frequency = int(args.video_frequency / (args.num_envs_per_worker * args.num_workers)) - config['env_config']['record_video_config'] = { - 'frequency': real_video_frequency, - 'directory': os.path.join(args.root_directory, args.video_directory) - } - - stop = { - "timesteps_total": max_training_steps, - } - - trial_name_creator = lambda trial: f'CAT-{args.experiment_name}-{trial.config["env_config"]["invalid_action_masking"]}' - - result = tune.run( - ConditionalActionImpalaTrainer, - local_dir=args.root_directory, - config=config, - stop=stop, - callbacks=[wandbLoggerCallback], - trial_name_creator=trial_name_creator - ) diff --git a/python/requirements.txt b/python/requirements.txt index bcc842968..1fb1c3aad 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -5,4 +5,5 @@ imageio>=2.9.0 pygame>=2.0.0 matplotlib>=3.3.3 pyglet -pytest>=6.2.1 \ No newline at end of file +pytest>=6.2.1 + From 61d2a992630b350fe33a95d7f968909efd0a34da Mon Sep 17 00:00:00 2001 From: Bam4d Date: Sun, 11 Apr 2021 16:08:08 +0100 Subject: [PATCH 35/45] adding code to submodule to keep this repo cleaner --- .gitmodules | 3 +++ python/examples/experiments/conditional-action-trees | 1 + 2 files changed, 4 insertions(+) create mode 160000 python/examples/experiments/conditional-action-trees diff --git a/.gitmodules b/.gitmodules index d4a3170ab..d5ee31d48 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ path = libs/glm url = https://github.com/g-truc/glm.git ignore = dirty +[submodule "python/examples/experiments/conditional-action-trees"] + path = python/examples/experiments/conditional-action-trees + url = git@github.com:Bam4d/conditional-action-trees.git diff --git a/python/examples/experiments/conditional-action-trees b/python/examples/experiments/conditional-action-trees new file mode 160000 index 000000000..e36bc5144 --- /dev/null +++ b/python/examples/experiments/conditional-action-trees @@ -0,0 +1 @@ +Subproject commit e36bc5144ec7becbe60cd24599a9db65e37de715 From 34f4c596ad180911fdeb4c4ca86aa74af8de6d6f Mon Sep 17 00:00:00 2001 From: Bam4d Date: Sun, 11 Apr 2021 16:09:57 +0100 Subject: [PATCH 36/45] info -> debug --- src/Griddly/Core/Observers/Vulkan/VulkanDevice.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Griddly/Core/Observers/Vulkan/VulkanDevice.cpp b/src/Griddly/Core/Observers/Vulkan/VulkanDevice.cpp index 43a82b50d..a5cda36ac 100644 --- a/src/Griddly/Core/Observers/Vulkan/VulkanDevice.cpp +++ b/src/Griddly/Core/Observers/Vulkan/VulkanDevice.cpp @@ -150,11 +150,11 @@ void VulkanDevice::initRenderMode(RenderMode mode) { switch (mode) { case SHAPES: - spdlog::info("Render mode set to SHAPES. Will only load shape render pipeline."); + spdlog::debug("Render mode set to SHAPES. 
Will only load shape render pipeline."); shapeBuffers_ = createShapeBuffers(); break; case SPRITES: - spdlog::info("Render mode set to SPRITES. Will load both shape and sprite render pipelines."); + spdlog::debug("Render mode set to SPRITES. Will load both shape and sprite render pipelines."); spriteShapeBuffer_ = createSpriteShapeBuffer(); break; } @@ -801,12 +801,12 @@ std::vector VulkanDevice::getSupportedPhysicalDevices( } if (deviceSelection.order == DeviceSelectionOrder::PCI_BUS_ID) { - spdlog::info("Sorting devices by PCI_BUS_ID ascending"); + spdlog::debug("Sorting devices by PCI_BUS_ID ascending"); std::sort(physicalDeviceInfoList.begin(), physicalDeviceInfoList.end(), [](const VulkanPhysicalDeviceInfo& a, const VulkanPhysicalDeviceInfo& b) -> bool { return a.pciBusId < b.pciBusId; }); } for (auto& physicalDeviceInfo : physicalDeviceInfoList) { - spdlog::info("Device {0}, isGpu {1}, PCI bus: {2}, isSupported {3}.", physicalDeviceInfo.deviceName, physicalDeviceInfo.isGpu, physicalDeviceInfo.pciBusId, physicalDeviceInfo.isSupported); + spdlog::debug("Device {0}, isGpu {1}, PCI bus: {2}, isSupported {3}.", physicalDeviceInfo.deviceName, physicalDeviceInfo.isGpu, physicalDeviceInfo.pciBusId, physicalDeviceInfo.isSupported); if (physicalDeviceInfo.isGpu) { physicalDeviceInfo.gpuIdx = gpuIdx++; @@ -815,7 +815,7 @@ std::vector VulkanDevice::getSupportedPhysicalDevices( if (physicalDeviceInfo.isSupported) { if (physicalDeviceInfo.isGpu && limitGpuUsage) { if (allowedGpuIdx.find(physicalDeviceInfo.gpuIdx) != allowedGpuIdx.end()) { - spdlog::info("GPU Device {0}, Id: {1}, PCI bus: {2} -> Visible", physicalDeviceInfo.deviceName, physicalDeviceInfo.gpuIdx, physicalDeviceInfo.pciBusId); + spdlog::debug("GPU Device {0}, Id: {1}, PCI bus: {2} -> Visible", physicalDeviceInfo.deviceName, physicalDeviceInfo.gpuIdx, physicalDeviceInfo.pciBusId); supportedPhysicalDeviceList.push_back(physicalDeviceInfo); } } else { @@ -852,7 +852,7 @@ VulkanPhysicalDeviceInfo VulkanDevice::getPhysicalDeviceInfo(VkPhysicalDevice& p auto deviceName = deviceProperties.deviceName; - spdlog::info("Device found {0}, PCI Bus: {1}. checking for Vulkan support...", deviceName, devicePCIBusInfo.pciBus); + spdlog::debug("Device found {0}, PCI Bus: {1}. 
checking for Vulkan support...", deviceName, devicePCIBusInfo.pciBus); bool isGpu = deviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU; bool isSupported = hasQueueFamilySupport(physicalDevice, queueFamilyIndices); From a13474a25bbb23069536c9eb4877828654ae192b Mon Sep 17 00:00:00 2001 From: Bam4d Date: Sun, 11 Apr 2021 16:15:40 +0100 Subject: [PATCH 37/45] remove example here as it will be in a separate repository --- .../rllib_single_agent_conditional_actions.py | 68 ------------------- 1 file changed, 68 deletions(-) delete mode 100644 python/examples/rllib/rllib_single_agent_conditional_actions.py diff --git a/python/examples/rllib/rllib_single_agent_conditional_actions.py b/python/examples/rllib/rllib_single_agent_conditional_actions.py deleted file mode 100644 index b2ff4efca..000000000 --- a/python/examples/rllib/rllib_single_agent_conditional_actions.py +++ /dev/null @@ -1,68 +0,0 @@ -import os -import sys - -import ray -from ray import tune -from ray.rllib.models import ModelCatalog -from ray.tune.registry import register_env - -from griddly import gd -from griddly.util.rllib.torch import GAPAgent -from griddly.util.rllib.torch.conditional_actions.conditional_action_policy_trainer import ConditionalActionImpalaTrainer -from griddly.util.rllib.environment.core import RLlibEnv - -if __name__ == '__main__': - sep = os.pathsep - os.environ['PYTHONPATH'] = sep.join(sys.path) - - ray.init(num_gpus=1) - #ray.init(num_gpus=1, local_mode=True) - - env_name = "ray-griddly-env" - - register_env(env_name, RLlibEnv) - ModelCatalog.register_custom_model("GAP", GAPAgent) - - max_training_steps = 20000000 - - config = { - 'framework': 'torch', - 'num_workers': 8, - 'num_envs_per_worker': 4, - - 'model': { - 'custom_model': 'GAP', - 'custom_model_config': {} - }, - 'env': env_name, - 'env_config': { - 'record_video_config': { - 'frequency': 100000, - 'directory': 'videos' - }, - - 'allow_nop': tune.grid_search([True, False]), - 'invalid_action_masking': tune.grid_search(['none', 'conditional']), - 'generate_valid_action_trees': True, - 'random_level_on_reset': True, - 'yaml_file': 'Single-Player/GVGAI/clusters_partially_observable.yaml', - 'global_observer_type': gd.ObserverType.SPRITE_2D, - 'max_steps': 1000, - }, - #'entropy_coeff_schedule': [ - # [0, 0.01], - # [max_training_steps, 0.0] - #], - #'lr_schedule': [ - # [0, 0.005], - # [max_training_steps, 0.0] - #], - - - } - - stop = { - "timesteps_total": max_training_steps, - } - - result = tune.run(ConditionalActionImpalaTrainer, config=config, stop=stop) From 11b064829bd896ea87ddb0b121cf8944186af9de Mon Sep 17 00:00:00 2001 From: Bam4d Date: Sun, 11 Apr 2021 17:37:28 +0100 Subject: [PATCH 38/45] try to fix windows/mac compile --- python/examples/experiments/conditional-action-trees | 2 +- src/Griddly/Core/Grid.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/examples/experiments/conditional-action-trees b/python/examples/experiments/conditional-action-trees index e36bc5144..0e8d6ffae 160000 --- a/python/examples/experiments/conditional-action-trees +++ b/python/examples/experiments/conditional-action-trees @@ -1 +1 @@ -Subproject commit e36bc5144ec7becbe60cd24599a9db65e37de715 +Subproject commit 0e8d6ffae5636d2dbbbbd95ed7a0fbe8771a8c54 diff --git a/src/Griddly/Core/Grid.hpp b/src/Griddly/Core/Grid.hpp index 3fb4fb31e..22e36cdbb 100644 --- a/src/Griddly/Core/Grid.hpp +++ b/src/Griddly/Core/Grid.hpp @@ -35,8 +35,8 @@ struct GridEvent { uint32_t sourceObjectPlayerId = 0; uint32_t
destinationObjectPlayerId = 0; - glm::vec2 sourceLocation; - glm::vec2 destLocation; + glm::ivec2 sourceLocation; + glm::ivec2 destLocation; }; struct GlobalVariableDefinition { From bf3407ac1017db1cc71498f09b2ad925f3f26065 Mon Sep 17 00:00:00 2001 From: Bam4d Date: Sun, 11 Apr 2021 17:52:40 +0100 Subject: [PATCH 39/45] try to fix windows/mac compile --- bindings/wrapper/GameWrapper.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/wrapper/GameWrapper.cpp b/bindings/wrapper/GameWrapper.cpp index de3f0175b..77d8d9e46 100644 --- a/bindings/wrapper/GameWrapper.cpp +++ b/bindings/wrapper/GameWrapper.cpp @@ -380,8 +380,8 @@ class Py_GameWrapper { py_event["SourceObjectPlayerId"] = historyEvent.sourceObjectPlayerId; py_event["DestinationObjectPlayerId"] = historyEvent.destinationObjectPlayerId; - py_event["SourceLocation"] = std::array<uint32_t, 2>{historyEvent.sourceLocation.x, historyEvent.sourceLocation.y}; - py_event["DestinationLocation"] = std::array<uint32_t, 2>{historyEvent.destLocation.x, historyEvent.destLocation.y}; + py_event["SourceLocation"] = std::array<uint32_t, 2>{(uint32_t)historyEvent.sourceLocation.x, (uint32_t)historyEvent.sourceLocation.y}; + py_event["DestinationLocation"] = std::array<uint32_t, 2>{(uint32_t)historyEvent.destLocation.x, (uint32_t)historyEvent.destLocation.y}; py_events.push_back(py_event); } From 25f07cd369d07ecf8e1b44bcf7df24da9076e197 Mon Sep 17 00:00:00 2001 From: Bam4d Date: Mon, 12 Apr 2021 08:28:24 +0100 Subject: [PATCH 40/45] fixing submodule --- .gitmodules | 5 +++++ python/examples/experiments/conditional-action-trees | 1 + 2 files changed, 6 insertions(+) create mode 160000 python/examples/experiments/conditional-action-trees diff --git a/.gitmodules b/.gitmodules index d4a3170ab..c7835f09c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,8 @@ path = libs/glm url = https://github.com/g-truc/glm.git ignore = dirty +[submodule "python/examples/experiments/conditional-action-trees"] + path=python/examples/experiments/conditional-action-trees + url=https://github.com/Bam4d/conditional-action-trees + ignore = dirty + diff --git a/python/examples/experiments/conditional-action-trees b/python/examples/experiments/conditional-action-trees new file mode 160000 index 000000000..0e8d6ffae --- /dev/null +++ b/python/examples/experiments/conditional-action-trees @@ -0,0 +1 @@ +Subproject commit 0e8d6ffae5636d2dbbbbd95ed7a0fbe8771a8c54 From 3309a70a298bd97f0ffde24ad58782cc2d5a8a48 Mon Sep 17 00:00:00 2001 From: Bam4d Date: Mon, 12 Apr 2021 09:46:16 +0100 Subject: [PATCH 41/45] added tests for building conditional action trees --- python/tests/cat_test.py | 107 ++++++++++++++++++ python/tests/gdy/test_CAT_depth_1.yaml | 26 +++++ python/tests/gdy/test_CAT_depth_2.yaml | 35 ++++++ python/tests/gdy/test_CAT_depth_3.yaml | 25 ++++ python/tests/gdy/test_CAT_depth_4.yaml | 34 ++++++ .../tests/gdy/test_CAT_depth_4_2_players.yaml | 34 ++++++ 6 files changed, 261 insertions(+) create mode 100644 python/tests/cat_test.py create mode 100644 python/tests/gdy/test_CAT_depth_1.yaml create mode 100644 python/tests/gdy/test_CAT_depth_2.yaml create mode 100644 python/tests/gdy/test_CAT_depth_3.yaml create mode 100644 python/tests/gdy/test_CAT_depth_4.yaml create mode 100644 python/tests/gdy/test_CAT_depth_4_2_players.yaml diff --git a/python/tests/cat_test.py b/python/tests/cat_test.py new file mode 100644 index 000000000..4d3fa4236 --- /dev/null +++ b/python/tests/cat_test.py @@ -0,0 +1,107 @@ +import numpy as np +import gym +import pytest +from griddly import GymWrapperFactory, gd +
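+# A note on the tree shapes asserted in the tests below:
+# env.game.build_valid_action_trees() returns one nested dictionary per player.
+# The levels are keyed in the order [x, y, action_type, action_id]; levels that
+# are fixed for a given game collapse away (the location levels when an
+# AvatarObject is controlled, the action_type level when only one action is
+# defined), and action id 0 (the NOP action) appears alongside the valid ids
+# at the leaves.
+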
+@pytest.fixture +def test_name(request): + return request.node.name + + +def build_test_env(test_name, yaml_file): + wrapper_factory = GymWrapperFactory() + + wrapper_factory.build_gym_from_yaml( + test_name, + yaml_file, + global_observer_type=gd.ObserverType.VECTOR, + player_observer_type=gd.ObserverType.VECTOR, + ) + + env = gym.make(f'GDY-{test_name}-v0') + env.reset() + return env + + +def test_CAT_depth_1(test_name): + + env = build_test_env( + test_name, + "tests/gdy/test_CAT_depth_1.yaml" + ) + + valid_action_trees = env.game.build_valid_action_trees() + + assert len(valid_action_trees) == 1 + assert set(valid_action_trees[0].keys()) == {0, 1, 2, 3} + +def test_CAT_depth_2(test_name): + + env = build_test_env( + test_name, + "tests/gdy/test_CAT_depth_2.yaml" + ) + + valid_action_trees = env.game.build_valid_action_trees() + + assert len(valid_action_trees) == 1 + assert set(valid_action_trees[0].keys()) == {0, 1} + + assert set(valid_action_trees[0][0].keys()) == {0, 1, 2, 3} + assert set(valid_action_trees[0][1].keys()) == {0, 4} + +def test_CAT_depth_3(test_name): + + env = build_test_env( + test_name, + "tests/gdy/test_CAT_depth_3.yaml" + ) + + valid_action_trees = env.game.build_valid_action_trees() + + assert len(valid_action_trees) == 1 + assert set(valid_action_trees[0].keys()) == {1} + assert set(valid_action_trees[0][1].keys()) == {1} + assert set(valid_action_trees[0][1][1].keys()) == {0, 1, 2, 3} + +def test_CAT_depth_4(test_name): + + env = build_test_env( + test_name, + "tests/gdy/test_CAT_depth_4.yaml" + ) + + valid_action_trees = env.game.build_valid_action_trees() + + assert len(valid_action_trees) == 1 + assert set(valid_action_trees[0].keys()) == {1} + assert set(valid_action_trees[0][1].keys()) == {1} + assert set(valid_action_trees[0][1][1].keys()) == {0, 1} + + assert set(valid_action_trees[0][1][1][0].keys()) == {0, 1, 2, 3} + assert set(valid_action_trees[0][1][1][1].keys()) == {0, 4} + + +def test_CAT_depth_4_2_players(test_name): + env = build_test_env( + test_name, + "tests/gdy/test_CAT_depth_4_2_players.yaml" + ) + + valid_action_trees = env.game.build_valid_action_trees() + + assert len(valid_action_trees) == 2 + assert set(valid_action_trees[0].keys()) == {1} + assert set(valid_action_trees[0][1].keys()) == {1} + assert set(valid_action_trees[0][1][1].keys()) == {0, 1} + + assert set(valid_action_trees[0][1][1][0].keys()) == {0, 1, 2, 3} + assert set(valid_action_trees[0][1][1][1].keys()) == {0, 4} + + assert set(valid_action_trees[1].keys()) == {3} + assert set(valid_action_trees[1][3].keys()) == {1} + assert set(valid_action_trees[1][3][1].keys()) == {0, 1} + + assert set(valid_action_trees[1][3][1][0].keys()) == {0, 1, 2, 3} + assert set(valid_action_trees[1][3][1][1].keys()) == {0, 4} \ No newline at end of file diff --git a/python/tests/gdy/test_CAT_depth_1.yaml b/python/tests/gdy/test_CAT_depth_1.yaml new file mode 100644 index 000000000..cf28e7278 --- /dev/null +++ b/python/tests/gdy/test_CAT_depth_1.yaml @@ -0,0 +1,26 @@ +Version: "0.1" +Environment: + Player: + AvatarObject: avatar + Levels: + - | + . . . + . a . + . b . 
+ +Actions: + - Name: move + Behaviours: + - Src: + Object: avatar + Commands: + - mov: _dest + Dst: + Object: _empty + +Objects: + - Name: avatar + MapCharacter: a + - Name: wall + MapCharacter: b + diff --git a/python/tests/gdy/test_CAT_depth_2.yaml b/python/tests/gdy/test_CAT_depth_2.yaml new file mode 100644 index 000000000..68bbdc366 --- /dev/null +++ b/python/tests/gdy/test_CAT_depth_2.yaml @@ -0,0 +1,35 @@ +Version: "0.1" +Environment: + Player: + AvatarObject: avatar + Levels: + - | + . . . + . a . + . b . + +Actions: + - Name: move + Behaviours: + - Src: + Object: avatar + Commands: + - mov: _dest + Dst: + Object: _empty + + - Name: move2 + Behaviours: + - Src: + Object: avatar + Commands: + - reward: 1 + Dst: + Object: wall + +Objects: + - Name: avatar + MapCharacter: a + - Name: wall + MapCharacter: b + diff --git a/python/tests/gdy/test_CAT_depth_3.yaml b/python/tests/gdy/test_CAT_depth_3.yaml new file mode 100644 index 000000000..8ac6f21dc --- /dev/null +++ b/python/tests/gdy/test_CAT_depth_3.yaml @@ -0,0 +1,25 @@ +Version: "0.1" +Environment: + Player: + Count: 1 + Levels: + - | + . . . + . a1 . + . b . + +Actions: + - Name: move + Behaviours: + - Src: + Object: avatar + Commands: + - mov: _dest + Dst: + Object: _empty + +Objects: + - Name: avatar + MapCharacter: a + - Name: wall + MapCharacter: b \ No newline at end of file diff --git a/python/tests/gdy/test_CAT_depth_4.yaml b/python/tests/gdy/test_CAT_depth_4.yaml new file mode 100644 index 000000000..743b3f419 --- /dev/null +++ b/python/tests/gdy/test_CAT_depth_4.yaml @@ -0,0 +1,34 @@ +Version: "0.1" +Environment: + Player: + Count: 1 + Levels: + - | + . . . + . a1 . + . b . + +Actions: + - Name: move + Behaviours: + - Src: + Object: avatar + Commands: + - mov: _dest + Dst: + Object: _empty + + - Name: move2 + Behaviours: + - Src: + Object: avatar + Commands: + - reward: 1 + Dst: + Object: wall + +Objects: + - Name: avatar + MapCharacter: a + - Name: wall + MapCharacter: b \ No newline at end of file diff --git a/python/tests/gdy/test_CAT_depth_4_2_players.yaml b/python/tests/gdy/test_CAT_depth_4_2_players.yaml new file mode 100644 index 000000000..80ddb26c0 --- /dev/null +++ b/python/tests/gdy/test_CAT_depth_4_2_players.yaml @@ -0,0 +1,34 @@ +Version: "0.1" +Environment: + Player: + Count: 2 + Levels: + - | + . . . . . + . a1 . a2 . + . b . b . 
+
+Actions:
+  - Name: move
+    Behaviours:
+      - Src:
+          Object: avatar
+          Commands:
+            - mov: _dest
+        Dst:
+          Object: _empty
+
+  - Name: move2
+    Behaviours:
+      - Src:
+          Object: avatar
+          Commands:
+            - reward: 1
+        Dst:
+          Object: wall
+
+Objects:
+  - Name: avatar
+    MapCharacter: a
+  - Name: wall
+    MapCharacter: b
\ No newline at end of file
From d3eafbf59119b275e6048d1fe11c7577219ea3f9 Mon Sep 17 00:00:00 2001
From: Bam4d <chrisbam4d@gmail.com>
Date: Mon, 12 Apr 2021 10:34:37 +0100
Subject: [PATCH 42/45] adding a bit of docs and some debug lines for future

---
 bindings/wrapper/GameWrapper.cpp             |  8 ++++++++
 docs/getting-started/action spaces/index.rst | 10 +++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/bindings/wrapper/GameWrapper.cpp b/bindings/wrapper/GameWrapper.cpp
index 77d8d9e46..65d0b43a0 100644
--- a/bindings/wrapper/GameWrapper.cpp
+++ b/bindings/wrapper/GameWrapper.cpp
@@ -77,19 +77,27 @@ class Py_GameWrapper {
     std::vector<py::dict> valid_action_trees;
     auto externalActionNames = gdyFactory_->getExternalActionNames();
+    spdlog::debug("Building tree, {0} actions", externalActionNames.size());
     for (int playerId = 1; playerId <= playerCount_; playerId++) {
       std::shared_ptr<ValidActionNode> node = std::shared_ptr<ValidActionNode>(new ValidActionNode());
       for (auto actionNamesAtLocation : gameProcess_->getAvailableActionNames(playerId)) {
         auto location = actionNamesAtLocation.first;
         auto actionNames = actionNamesAtLocation.second;
+
+        for (auto actionName : actionNames) {
+
+          spdlog::debug("[{0}] available at location [{1}, {2}]", actionName, location.x, location.y);
+
           std::shared_ptr<ValidActionNode> treePtr = node;
           auto actionInputsDefinitions = gdyFactory_->getActionInputsDefinitions();
           if (actionInputsDefinitions.find(actionName) != actionInputsDefinitions.end()) {
             auto locationVec = glm::ivec2{location[0], location[1]};
             auto actionIdsForName = gameProcess_->getAvailableActionIdsAtLocation(locationVec, actionName);
 
+            spdlog::debug("{0} action ids available", actionIdsForName.size());
+
             if (actionIdsForName.size() > 0) {
               if (gdyFactory_->getAvatarObject().length() == 0) {
                 auto py_x = locationVec[0];
diff --git a/docs/getting-started/action spaces/index.rst b/docs/getting-started/action spaces/index.rst
index 86ecacae3..6353a4dce 100644
--- a/docs/getting-started/action spaces/index.rst
+++ b/docs/getting-started/action spaces/index.rst
@@ -119,7 +119,15 @@ In order to easily support games with large action spaces such as RTS games, sev
 
 .. seealso:: A Closer Look at Action Masking in Policy Gradient Algorithms: https://arxiv.org/abs/2006.14171
 
-
+Valid Action Trees
+------------------
+
+Valid action trees are the building blocks of Conditional Action Trees: they allow masks to be applied to a complex action space iteratively, with each action component masked conditionally on the components selected before it.
+
+:env.game.build_valid_action_trees():
+    Returns a valid action tree for the current state, one per player.
+
+.. seealso:: You can find several examples of Conditional Action Trees being used with Griddly and RLlib here: https://github.com/Bam4d/conditional-action-trees
 ********
 Examples
 ********
From 2b5b1cab8f67b49491db99685aa3b7e1ad9ac31a Mon Sep 17 00:00:00 2001
From: Bam4d <chrisbam4d@gmail.com>
Date: Mon, 12 Apr 2021 17:15:44 +0100
Subject: [PATCH 43/45] fixing basic network docs

---
 docs/rllib/intro/index.rst | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/docs/rllib/intro/index.rst b/docs/rllib/intro/index.rst
index 698cd381b..b5cae6d73 100644
--- a/docs/rllib/intro/index.rst
+++ b/docs/rllib/intro/index.rst
@@ -117,9 +117,9 @@ SimpleConvAgent
 
   .. code-block::
 
      class SimpleConvAgent(TorchModelV2, nn.Module):
-      """
-      Simple Convolution agent that calculates the required linear output layer
-      """
+          """
+          Simple Convolution agent that calculates the required linear output layer
+          """
 
          def __init__(self, obs_space, action_space, num_outputs, model_config, name):
              super().__init__(obs_space, action_space, num_outputs, model_config, name)
 
              self._num_objects = obs_space.shape[2]
              self._num_actions = num_outputs
 
              linear_flatten = np.prod(obs_space.shape[:2])*64
 
              self.network = nn.Sequential(
                  layer_init(nn.Conv2d(self._num_objects, 32, 3, padding=1)),
                  nn.ReLU(),
                  layer_init(nn.Conv2d(32, 64, 3, padding=1)),
                  nn.ReLU(),
-                 layer_init(nn.Conv2d(64, 64, 3, padding=1)),
-                 nn.ReLU(),
-                 layer_init(nn.Conv2d(64, 64, 3, padding=1)),
-                 nn.ReLU(),
                  nn.Flatten(),
                  layer_init(nn.Linear(linear_flatten, 1024)),
                  nn.ReLU(),
                  layer_init(nn.Linear(1024, 512)),
                  nn.ReLU(),
-                 layer_init(nn.Linear(512, 512))
              )
 
              self._actor_head = nn.Sequential(
-                 layer_init(nn.Linear(512, 512), std=0.01),
+                 layer_init(nn.Linear(512, 256), std=0.01),
                  nn.ReLU(),
-                 layer_init(nn.Linear(512, self._num_actions), std=0.01)
+                 layer_init(nn.Linear(256, self._num_actions), std=0.01)
              )
 
              self._critic_head = nn.Sequential(
@@ -214,6 +209,7 @@ GAPAgent
              nn.Module.__init__(self)
 
              self._num_objects = obs_space.shape[2]
+             self._num_actions = num_outputs
 
              self.network = nn.Sequential(
                  layer_init(nn.Conv2d(self._num_objects, 32, 3, padding=1)),
                  nn.ReLU(),
                  layer_init(nn.Conv2d(32, 64, 3, padding=1)),
                  nn.ReLU(),
-                 layer_init(nn.Conv2d(64, 64, 3, padding=1)),
-                 nn.ReLU(),
-                 layer_init(nn.Conv2d(64, 64, 3, padding=1)),
-                 nn.ReLU(),
                  GlobalAvePool(2048),
                  layer_init(nn.Linear(2048, 1024)),
                  nn.ReLU(),
                  layer_init(nn.Linear(1024, 512)),
                  nn.ReLU(),
-                 layer_init(nn.Linear(512, 512))
              )
 
              self._actor_head = nn.Sequential(
-                 layer_init(nn.Linear(512, 512), std=0.01),
+                 layer_init(nn.Linear(512, 256), std=0.01),
                  nn.ReLU(),
-                 layer_init(nn.Linear(512, self._num_actions), std=0.01)
+                 layer_init(nn.Linear(256, self._num_actions), std=0.01)
              )
 
              self._critic_head = nn.Sequential(
From d64d5700f2efe6da72b3963132b2d2245f56cfea Mon Sep 17 00:00:00 2001
From: Bam4d <chrisbam4d@gmail.com>
Date: Thu, 15 Apr 2021 10:26:39 +0100
Subject: [PATCH 44/45] updated experiment repo

---
 python/examples/experiments/conditional-action-trees | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/examples/experiments/conditional-action-trees b/python/examples/experiments/conditional-action-trees
index 0e8d6ffae..0c1cc7e39 160000
--- a/python/examples/experiments/conditional-action-trees
+++ b/python/examples/experiments/conditional-action-trees
@@ -1 +1 @@
-Subproject commit 0e8d6ffae5636d2dbbbbd95ed7a0fbe8771a8c54
+Subproject commit 0c1cc7e39e3024d0538064c629142abe130b0a3d
From 794cd14c77c73bfb7fc729d5f9ce7f419bb85622 Mon Sep 17 00:00:00 2001
From: Bam4d <chrisbam4d@gmail.com>
Date: Thu, 15 Apr 2021 10:28:13 +0100
Subject: [PATCH 45/45] bumping version numbers

---
 .github/ISSUE_TEMPLATE/bug_report.md | 2 +-
 CMakeLists.txt                       | 2 +-
 bindings/python.cpp                  | 2 +-
 docs/conf.py                         | 2 +-
 python/setup.py                      | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index 0cd31c1d4..1cfb6bf66 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -24,7 +24,7 @@ If applicable, add screenshots to help explain your problem.
 
 **Desktop (please complete the following information):**
  - OS: [e.g. mac/linux/windows]
- - Version [e.g. 1.0.0]
+ - Version [e.g. 1.0.1]
 
 **Additional context**
 Add any other context about the problem here.
diff --git a/CMakeLists.txt b/CMakeLists.txt index 270aa3a82..140f1ca16 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.10.0) -project(Griddly VERSION 1.0.0) +project(Griddly VERSION 1.0.1) set(BINARY ${CMAKE_PROJECT_NAME}) diff --git a/bindings/python.cpp b/bindings/python.cpp index a5fde9813..a6677cd8f 100644 --- a/bindings/python.cpp +++ b/bindings/python.cpp @@ -12,7 +12,7 @@ namespace griddly { PYBIND11_MODULE(python_griddly, m) { m.doc() = "Griddly python bindings"; - m.attr("version") = "1.0.0"; + m.attr("version") = "1.0.1"; #ifndef NDEBUG spdlog::set_level(spdlog::level::debug); diff --git a/docs/conf.py b/docs/conf.py index 47a2bb244..074e63f9a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,7 +22,7 @@ author = 'Chris Bamford' # The full version, including alpha/beta/rc tags -release = '1.0.0' +release = '1.0.1' # -- General configuration --------------------------------------------------- diff --git a/python/setup.py b/python/setup.py index 430fb88d1..4f2178df1 100644 --- a/python/setup.py +++ b/python/setup.py @@ -71,7 +71,7 @@ def griddly_package_data(config='Debug'): setup( name='griddly', - version="1.0.0", + version="1.0.1", author_email="chrisbam4d@gmail.com", description="Griddly Python Libraries", long_description=long_description,
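
A short, self-contained sketch of how the valid action trees asserted in the unit tests above can be consumed. It builds one of the test environments, fetches the per-player trees with env.game.build_valid_action_trees(), and samples a valid action by walking a tree one component at a time: the iterative masking behaviour that the Valid Action Trees docs added in PATCH 42 describe. The nested-dict node layout is inferred from the test assertions; the environment name, the uniform random choice at each level, and the leaf test are illustrative assumptions, not Griddly's actual sampling code.

import random

import gym

from griddly import GymWrapperFactory, gd

if __name__ == '__main__':
    wrapper_factory = GymWrapperFactory()

    # Any of the test GDY files above will work here
    wrapper_factory.build_gym_from_yaml(
        'ValidActionTreeSketch',
        'tests/gdy/test_CAT_depth_4.yaml',
        global_observer_type=gd.ObserverType.VECTOR,
        player_observer_type=gd.ObserverType.VECTOR,
    )

    env = gym.make('GDY-ValidActionTreeSketch-v0')
    env.reset()

    # One tree per player; each node maps a valid value for the current action
    # component to the sub-tree of values that remain valid after choosing it.
    valid_action_trees = env.game.build_valid_action_trees()

    for player_index, tree in enumerate(valid_action_trees):
        action = []
        node = tree
        while isinstance(node, dict) and len(node) > 0:
            # The keys of the current node are exactly the mask of valid values
            # for this component, conditioned on the components chosen so far.
            component = random.choice(list(node.keys()))
            action.append(component)
            node = node[component]
        print(f'player {player_index + 1}: sampled valid action {action}')

Replacing the uniform choice with a masked sample over the policy logits at each level yields the conditional sampling the docs describe: each level of the tree masks exactly one component of the composite action.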