From 4cd1b845e986a777b279ad7b72f7bf900767474e Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Mon, 1 Apr 2024 19:29:49 +0200 Subject: [PATCH 01/14] Multi-policy training support added --- examples/rllib_config.yaml | 52 ++++++++ examples/rllib_example.py | 105 ++++++++++++++++ godot_rl/core/godot_env.py | 123 ++++++++++++------- godot_rl/wrappers/petting_zoo_wrapper.py | 147 +++++++++++++++++++++++ godot_rl/wrappers/ray_wrapper.py | 8 +- 5 files changed, 390 insertions(+), 45 deletions(-) create mode 100644 examples/rllib_config.yaml create mode 100644 examples/rllib_example.py create mode 100644 godot_rl/wrappers/petting_zoo_wrapper.py diff --git a/examples/rllib_config.yaml b/examples/rllib_config.yaml new file mode 100644 index 00000000..152cc077 --- /dev/null +++ b/examples/rllib_config.yaml @@ -0,0 +1,52 @@ +algorithm: PPO + +# Multi-agent-env setting: +# If true: +# - Any AIController with done = true will receive zeroes as action values until all AIControllers are done, an episode ends at that point. +# - ai_controller.needs_reset will also be set to true every time a new episode begins (but you can ignore it in your env if needed). +# If false: +# - AIControllers auto-reset in Godot and will receive actions after setting done = true. +# - Each AIController has its own episodes that can end/reset at any point. +# Set to false if you have a single policy name for all agents set in AIControllers +env_is_multiagent: false + +checkpoint_frequency: 10 + +# You can set one or more stopping criteria +stop: + #episode_reward_mean: 0 + #training_iteration: 1000 + #timesteps_total: 10000 + time_total_s: 1200 + +config: + env: godot + env_config: + env_path: null # Set your env path here (exported executable from Godot) - e.g. 'env_path.exe' on Windows + action_repeat: null # Doesn't need to be set here, you can set this in sync node in Godot editor as well + show_window: true # Displays game window while training. Might be faster when false in some cases, turning off also reduces GPU usage if you don't need rendering. + speedup: 20 # Speeds up Godot physics + + framework: torch # ONNX models exported with torch are compatible with the current Godot RL Agents Plugin + lr: 0.0003 + lambda: 0.95 + gamma: 0.99 + + vf_clip_param: 0.5 + clip_param: 0.2 + entropy_coeff: 0.001 + entropy_coeff_schedule: null + + rollout_fragment_length: 64 + sgd_minibatch_size: 64 + num_workers: 1 + num_envs_per_worker: 1 + train_batch_size: 512 + + num_sgd_iter: 4 + batch_mode: truncate_episodes + + num_gpus: 0 + model: + vf_share_layers: False + fcnet_hiddens: [64, 64] \ No newline at end of file diff --git a/examples/rllib_example.py b/examples/rllib_example.py new file mode 100644 index 00000000..8fcca985 --- /dev/null +++ b/examples/rllib_example.py @@ -0,0 +1,105 @@ +# Rllib Example for single and multi-agent training for GodotRL with onnx export, +# needs rllib_config.yaml in the same folder or --config_file argument specified to work. 
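+#
+# Typical invocations (assuming the argparse flags defined below; paths are placeholders):
+#   python rllib_example.py --config_file rllib_config.yaml --experiment_dir logs/rllib
+#   python rllib_example.py --restore <path printed by Ray Tune when a run stops>
+#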
+ +import argparse +import os +import pathlib + +import ray +import yaml +from ray import train, tune +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv +from ray.rllib.policy.policy import PolicySpec + +from godot_rl.core.godot_env import GodotEnv +from godot_rl.wrappers.petting_zoo_wrapper import GDRLPettingZooEnv +from godot_rl.wrappers.ray_wrapper import RayVectorGodotEnv + +if __name__ == "__main__": + parser = argparse.ArgumentParser(allow_abbrev=False) + parser.add_argument("--config_file", default="rllib_config.yaml", type=str, help="The yaml config file") + parser.add_argument("--restore", default=None, type=str, help="the location of a checkpoint to restore from") + parser.add_argument( + "--experiment_dir", + default="logs/rllib", + type=str, + help="The name of the the experiment directory, used to store logs.", + ) + args, extras = parser.parse_known_args() + + # Get config from file + with open(args.config_file) as f: + exp = yaml.safe_load(f) + + is_multiagent = exp["env_is_multiagent"] + + # Register env + env_name = "godot" + env_wrapper = None + + def env_creator(env_config): + index = env_config.worker_index * exp["config"]["num_envs_per_worker"] + env_config.vector_index + port = index + GodotEnv.DEFAULT_PORT + seed = index + if is_multiagent: + return ParallelPettingZooEnv(GDRLPettingZooEnv(config=env_config, port=port, seed=seed)) + else: + return RayVectorGodotEnv(config=env_config, port=port, seed=seed) + + tune.register_env(env_name, env_creator) + + # Make temp env to get info needed for multi-agent training config + if is_multiagent: + print("Starting a temporary multi-agent env to get the policy names") + tmp_env = GDRLPettingZooEnv(config=exp["config"]["env_config"], show_window=False) + policy_names = tmp_env.agent_policy_names + print("Policy names for each Agent (AIController) set in the Godot Environment", policy_names) + tmp_env.close() + + def policy_mapping_fn(agent_id: int, episode, worker, **kwargs) -> str: + return policy_names[agent_id] + + ray.init(_temp_dir=os.path.abspath(args.experiment_dir)) + + if is_multiagent: + exp["config"]["multiagent"] = { + "policies": {policy_name: PolicySpec() for policy_name in policy_names}, + "policy_mapping_fn": policy_mapping_fn, + } + + tuner = None + if not args.restore: + tuner = tune.Tuner( + trainable=exp["algorithm"], + param_space=exp["config"], + run_config=train.RunConfig( + storage_path=os.path.abspath(args.experiment_dir), + stop=exp["stop"], + checkpoint_config=train.CheckpointConfig(checkpoint_frequency=exp["checkpoint_frequency"]), + ), + ) + else: + tuner = tune.Tuner.restore( + trainable=exp["algorithm"], + path=args.restore, + resume_unfinished=True, + ) + result = tuner.fit() + + # Onnx export after training if a checkpoint was saved + checkpoint = result.get_best_result().checkpoint + if checkpoint: + result_path = result.get_best_result().path + ppo = Algorithm.from_checkpoint(checkpoint) + if is_multiagent: + for policy_name in set(policy_names): + ppo.get_policy(policy_name).export_model(f"{result_path}/onnx_export/{policy_name}_onnx", onnx=12) + print( + f"Saving onnx policy to {pathlib.Path(f'{result_path}/onnx_export/{policy_name}_onnx').resolve()}" + ) + else: + ppo.get_policy().export_model(f"{result_path}/onnx_export/single_agent_policy_onnx", onnx=12) + print( + f"Saving onnx policy to {pathlib.Path(f'{result_path}/onnx_export/single_agent_policy_onnx').resolve()}" + ) diff --git a/godot_rl/core/godot_env.py 
b/godot_rl/core/godot_env.py index c5bc4ac7..9fc2bb9f 100644 --- a/godot_rl/core/godot_env.py +++ b/godot_rl/core/godot_env.py @@ -59,10 +59,27 @@ def __init__( self.connection = self._start_server() self.num_envs = None self._handshake() + + # Action and observation spaces for each in-game agent/env/AIController (used only for multi-agent case with Rllib for now) + self.action_spaces = [] + self.observation_spaces = [] + self._get_env_info() + + # Single-agent observation space + self.observation_space = self.observation_spaces[0] + # sf2 requires a tuple action space - self._tuple_action_space = spaces.Tuple([v for _, v in self._action_space.items()]) - self.action_space_processor = ActionSpaceProcessor(self._tuple_action_space, convert_action_space) + # Multiple agents' action space(s) + self.tuple_action_spaces = [ + spaces.Tuple([v for _, v in action_space.items()]) for action_space in self.action_spaces + ] + # Single agent action space processor using the action space(s) of the first agent + self.action_space_processor = ActionSpaceProcessor(self.tuple_action_spaces[0], convert_action_space) + + # For multi-policy envs: The name of each agent's policy set in the env itself (any training_mode + # AIController instance is treated as an agent) + self.agent_policy_names atexit.register(self._close) @@ -128,14 +145,13 @@ def from_numpy(self, action, order_ij=False): """ result = [] - for i in range(self.num_envs): + for agent_idx in range(self.num_envs): env_action = {} - - for j, k in enumerate(self._action_space.keys()): + for j, k in enumerate(self.action_spaces[agent_idx].keys()): if order_ij is True: - v = action[i][j] + v = action[agent_idx][j] else: - v = action[j][i] + v = action[j][agent_idx] if isinstance(v, np.ndarray): env_action[k] = v.tolist() @@ -168,6 +184,7 @@ def step_send(self, action, order_ij=False): order_ij (bool): Order flag. """ action = self.action_space_processor.to_original_dist(action) + message = { "type": "action", "action": self.from_numpy(action, order_ij=order_ij), @@ -251,6 +268,9 @@ def close(self): @property def action_space(self): + """ + Returns a single action space. 
+ """ return self.action_space_processor.action_space def _close(self): @@ -314,48 +334,69 @@ def _get_env_info(self): self._send_as_json(message) json_dict = self._get_json_dict() + assert json_dict["type"] == "env_info" + # Number of AIController instances in a single Godot env/process + self.num_envs = json_dict["n_agents"] + # actions can be "single" for a single action head # or "multi" for several outputeads - action_spaces = OrderedDict() + print("action space", json_dict["action_space"]) - for k, v in json_dict["action_space"].items(): - if v["action_type"] == "discrete": - action_spaces[k] = spaces.Discrete(v["size"]) - elif v["action_type"] == "continuous": - action_spaces[k] = spaces.Box(low=-1.0, high=1.0, shape=(v["size"],)) - else: - print(f"action space {v['action_type']} is not supported") - assert 0, f"action space {v['action_type']} is not supported" - self._action_space = spaces.Dict(action_spaces) - - observation_spaces = {} + # Compatibility with previous versions of Godot plugin: + # A single action space will be received as a dict in previous versions, + # A list of dicts will be received from the newer version, defining the action_space for each agent (AIController) + if isinstance(json_dict["action_space"], dict): + json_dict["action_space"] = [json_dict["action_space"]] * self.num_envs + + for agent_action_space in json_dict["action_space"]: + tmp_action_spaces = OrderedDict() + for k, v in agent_action_space.items(): + if v["action_type"] == "discrete": + tmp_action_spaces[k] = spaces.Discrete(v["size"]) + elif v["action_type"] == "continuous": + tmp_action_spaces[k] = spaces.Box(low=-1.0, high=1.0, shape=(v["size"],)) + else: + print(f"action space {v['action_type']} is not supported") + assert 0, f"action space {v['action_type']} is not supported" + self.action_spaces.append(spaces.Dict(tmp_action_spaces)) + print("observation space", json_dict["observation_space"]) - for k, v in json_dict["observation_space"].items(): - if v["space"] == "box": - if "2d" in k: - observation_spaces[k] = spaces.Box( - low=0, - high=255, - shape=v["size"], - dtype=np.uint8, - ) + # Compatibility with older versions of Godot plugin: + # A single observation space will be received as a dict in previous versions, + # A list of dicts will be received from newer version, defining the observation_space for each agent (AIController) + if isinstance(json_dict["observation_space"], dict): + json_dict["observation_space"] = [json_dict["observation_space"]] * self.num_envs + + for agent_obs_space in json_dict["observation_space"]: + observation_spaces = {} + for k, v in agent_obs_space.items(): + if v["space"] == "box": + if "2d" in k: + observation_spaces[k] = spaces.Box( + low=0, + high=255, + shape=v["size"], + dtype=np.uint8, + ) + else: + observation_spaces[k] = spaces.Box( + low=-1.0, + high=1.0, + shape=v["size"], + dtype=np.float32, + ) + elif v["space"] == "discrete": + observation_spaces[k] = spaces.Discrete(v["size"]) else: - observation_spaces[k] = spaces.Box( - low=-1.0, - high=1.0, - shape=v["size"], - dtype=np.float32, - ) - elif v["space"] == "discrete": - observation_spaces[k] = spaces.Discrete(v["size"]) - else: - print(f"observation space {v['space']} is not supported") - assert 0, f"observation space {v['space']} is not supported" - self.observation_space = spaces.Dict(observation_spaces) + print(f"observation space {v['space']} is not supported") + assert 0, f"observation space {v['space']} is not supported" + 
self.observation_spaces.append(spaces.Dict(observation_spaces)) - self.num_envs = json_dict["n_agents"] + # Gets policy names defined in AIControllers in Godot. If an older version of the plugin is used and no policy + # names are sent, "shared_policy" will be set for compatibility. + self.agent_policy_names = json_dict.get("agent_policy_names", ["shared_policy"] * self.num_envs) @staticmethod def _decode_2d_obs_from_string( diff --git a/godot_rl/wrappers/petting_zoo_wrapper.py b/godot_rl/wrappers/petting_zoo_wrapper.py new file mode 100644 index 00000000..2160d4e0 --- /dev/null +++ b/godot_rl/wrappers/petting_zoo_wrapper.py @@ -0,0 +1,147 @@ +# PettingZoo wrapper for GDRL +# Multi-agent, where 1 agent corresponds to one AIController instance in Godot +# Based on https://pettingzoo.farama.org/content/environment_creation/#example-custom-parallel-environment +# https://github.com/Farama-Foundation/PettingZoo/?tab=License-1-ov-file#readme +# and adjusted to work with GodotRL and Rllib (made for and tested only with rllib for now) + +import functools +from typing import Dict + +import numpy as np +from pettingzoo import ParallelEnv + +from godot_rl.core.godot_env import GodotEnv + + +def env(render_mode=None): + """ + The env function often wraps the environment in wrappers by default. + You can find full documentation for these methods + elsewhere in the developer documentation. + """ + # Not implemented + return env + + +class GDRLPettingZooEnv(ParallelEnv): + metadata = {"render_modes": ["human"], "name": "GDRLPettingZooEnv"} + + def __init__(self, port=GodotEnv.DEFAULT_PORT, show_window=True, seed=0, config: Dict = {}): + """ + The init method takes in environment arguments and should define the following attributes: + - possible_agents + - render_mode + + Note: as of v1.18.1, the action_spaces and observation_spaces attributes are deprecated. + Spaces should be defined in the action_space() and observation_space() methods. + If these methods are not overridden, spaces will be inferred from self.observation_spaces/action_spaces, raising a warning. + + These attributes should not be changed after initialization. + """ + # Initialize the Godot Env which we will wrap + self.godot_env = GodotEnv( + env_path=config.get("env_path"), + show_window=config.get("show_window"), + action_repeat=config.get("action_repeat"), + speedup=config.get("speedup"), + convert_action_space=False, + seed=seed, + port=port, + ) + + self.render_mode = None # Controlled by the env + + self.possible_agents = [agent_idx for agent_idx in range(self.godot_env.num_envs)] + self.agents = self.possible_agents[:] + + # The policy names here are set on each AIController in Godot editor, + # used to map agents to policies for multi-policy training. + self.agent_policy_names = self.godot_env.agent_policy_names + + # optional: a mapping between agent name and ID + self.agent_name_mapping = dict(zip(self.possible_agents, list(range(len(self.possible_agents))))) + + self.observation_spaces = { + agent: self.godot_env.observation_spaces[agent_idx] for agent_idx, agent in enumerate(self.agents) + } + + self.action_spaces = { + agent: self.godot_env.tuple_action_spaces[agent_idx] for agent_idx, agent in enumerate(self.agents) + } + + # Observation space should be defined here. + # lru_cache allows observation and action spaces to be memoized, reducing clock cycles required to get each agent's space. + # If your spaces change over time, remove this line (disable caching). 
+ @functools.lru_cache(maxsize=None) + def observation_space(self, agent): + return self.observation_spaces[agent] + + # Action space should be defined here. + # If your spaces change over time, remove this line (disable caching). + @functools.lru_cache(maxsize=None) + def action_space(self, agent): + return self.action_spaces[agent] + + def render(self): + """ + Renders the environment. In human mode, it can print to terminal, open + up a graphical window, or open up some other display that a human can see and understand. + """ + # Not implemented + + def close(self): + """ + Close should release any graphical displays, subprocesses, network connections + or any other environment data which should not be kept around after the + user is no longer using the environment. + """ + self.godot_env.close() + + def reset(self, seed=None, options=None): + """ + Reset needs to initialize the `agents` attribute and must set up the + environment so that render(), and step() can be called without issues. + Returns the observations for each agent + """ + godot_obs, godot_infos = self.godot_env.reset() + + observations = {agent: godot_obs[agent_idx] for agent_idx, agent in enumerate(self.agents)} + infos = {agent: godot_infos[agent_idx] for agent_idx, agent in enumerate(self.agents)} + + return observations, infos + + def step(self, actions): + """ + step(action) takes in an action for each agent and should return the + - observations + - rewards + - terminations + - truncations + - infos + dicts where each dict looks like {agent_1: item_1, agent_2: item_2} + """ + + # Once an agent (AIController) has done = true, it will not receive any more actions until all agents in the + # Godot env have done = true. For agents that received no actions, we will set zeros instead for + # compatibility. 
+ godot_actions = [ + actions[agent] if agent in actions else np.zeros_like(self.action_spaces[agent_idx].sample()) + for agent_idx, agent in enumerate(self.agents) + ] + + godot_obs, godot_rewards, godot_dones, godot_truncations, godot_infos = self.godot_env.step( + godot_actions, order_ij=True + ) + observations = {agent: godot_obs[agent] for agent in actions} + rewards = {agent: godot_rewards[agent] for agent in actions} + + terminations = {agent: godot_dones[agent] for agent in actions} + + # Truncations are not yet implemented in GDRL API + truncations = {agent: False for agent in actions} + + # typically there won't be any information in the infos, but there must + # still be an entry for each agent + infos = {agent: godot_infos[agent] for agent in actions} + + return observations, rewards, terminations, truncations, infos diff --git a/godot_rl/wrappers/ray_wrapper.py b/godot_rl/wrappers/ray_wrapper.py index f1c875f2..c212204e 100644 --- a/godot_rl/wrappers/ray_wrapper.py +++ b/godot_rl/wrappers/ray_wrapper.py @@ -26,13 +26,13 @@ def __init__( config=None, ) -> None: self._env = GodotEnv( - env_path=env_path, + env_path=config["env_path"], port=port, seed=seed, - show_window=show_window, + show_window=config["show_window"], framerate=framerate, - action_repeat=action_repeat, - speedup=speedup, + action_repeat=config["action_repeat"], + speedup=config["speedup"], ) super().__init__( observation_space=self._env.observation_space, From e5a5d50ce39d7e37ea3122e239a2433dceaf8896 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Mon, 1 Apr 2024 21:18:03 +0200 Subject: [PATCH 02/14] Create TRAINING_MULTIPLE_POLICIES.md --- docs/TRAINING_MULTIPLE_POLICIES.md | 53 ++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 docs/TRAINING_MULTIPLE_POLICIES.md diff --git a/docs/TRAINING_MULTIPLE_POLICIES.md b/docs/TRAINING_MULTIPLE_POLICIES.md new file mode 100644 index 00000000..bf9be3c0 --- /dev/null +++ b/docs/TRAINING_MULTIPLE_POLICIES.md @@ -0,0 +1,53 @@ +This is a brief guide on training multiple policies focusing on Rllib specifically. If you don’t require agents with different action/obs spaces, you might also consider using Sample Factory (it’s fully supported on Linux), or for simpler multi-agent envs, SB3 might work using a single shared policy for all agents. + +## Installation and configuration: + +### Install dependencies: + +`pip install https://github.com/edbeeching/godot_rl_agents/archive/refs/heads/main.zip` (to get the latest version) + +`pip install ray[rllib]` + +`pip install PettingZoo` + +### Download the examples file and config file: + +From https://github.com/edbeeching/godot_rl_agents/tree/main/examples, you will need `rllib_example.py` and `rllib_config.yaml.` + +### Open the config file: + +If your env has multiple different policies you wish to train (explained below), set `env_is_multiagent: true`, otherwise keep it `false`. + +Change `env_path: None *# Set your env path here (exported executable from Godot) - e.g. 'env_path.exe' on Windows`* to point to your exported env from Godot. In-editor training with this script is not recommended as it will launch the env multiple times, to get info about different policy names, to train, and to export to onnx after training, so while possible, you would need to press `Play` in Godot editor multiple times during the process. + +You can also adjust the stop criteria (set to 1200 seconds by default), and other settings. 
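If you'd rather script these edits than make them by hand, below is a minimal sketch (a hypothetical helper, not part of this PR) that rewrites the same keys discussed above; the env path is a placeholder:

```python
# Hypothetical helper: adjust rllib_config.yaml before launching rllib_example.py.
# The keys below are the ones discussed in this guide.
import yaml

with open("rllib_config.yaml") as f:
    exp = yaml.safe_load(f)

exp["env_is_multiagent"] = True  # true only if you train multiple named policies
exp["config"]["env_config"]["env_path"] = "path/to/exported_env.exe"  # placeholder path to the exported Godot executable
exp["stop"]["time_total_s"] = 1200  # stopping criterion in seconds

with open("rllib_config.yaml", "w") as f:
    yaml.safe_dump(exp, f, sort_keys=False)
```

Running `python rllib_example.py` afterwards picks up the updated values, since `rllib_config.yaml` is its default `--config_file`.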
## Configuring and exporting the Godot Env:

### Multi-policy env design differences:

When you set `env_is_multiagent` to `true`, any agent (AIController) that has set `done = true` will receive actions with zeros as values until all agents have set `done = true` at least once during that episode. At that point Rllib considers the episode done for all agents, sends a reset signal (this sets `needs_reset = true` in each AIController), and displays the episode rewards in its stats.

If you notice individual agents standing still or behaving oddly (depending on what zero-valued actions do in the game), it's possible that some agents had `done = true` set earlier in the episode while others are still active.

In the example env, we have a training manager script that sets all agents `done` to true at the same time after a fixed number of steps, and we're ignoring the `needs_reset = true` signal as we're manually resetting all agents once the episode is done. You could also handle resetting agents when `needs_reset` is set to `true` in your env instead (keep in mind that AIControllers also automatically set it to `true` after `reset_after` steps; you can override that behavior if needed).

**The behavior described above is different from setting `env_is_multiagent` to `false`, or e.g. using the [SB3 example to train](https://github.com/edbeeching/godot_rl_agents/blob/main/docs/ADV_STABLE_BASELINES_3.md)**. In that case a single policy is trained as a vectorized environment: each agent can have its own episode length and will continue to receive actions even after setting `done = true`, as the agents are considered to auto-reset in the env itself (the reset needs to be implemented in Godot as in the example envs).

### Setting policy names:
For each AIController, you can set a different policy name in Godot. Policies will be assigned to agents based on this name. E.g. if you have 10 agents assigned to `policy1`, they will all use policy 1, and if you have one agent assigned to `policy2`, it will use policy 2. A short sketch of how these names map to RLlib policies follows at the end of this guide.

![setting-policy-names](https://github.com/edbeeching/godot_rl_agents/assets/61947090/13eb9b46-f7fb-467c-ad16-8609cda9f292)
Screenshot from [MultiAgent Simple env](https://github.com/edbeeching/godot_rl_agents_examples/tree/main/examples/MultiAgentSimple).

## Training:
After installing the prerequisites and adjusting the config, you can start training by using `python rllib_example.py` in your conda env/venv (if you are in the same folder).
Rllib will print out useful info in the console, such as the command to start `Tensorboard` to see the training logs for the session.
Onnx files will automatically be exported once training is done and their paths will be printed near the bottom of the console log (you can also stop mid-training with `CTRL+C`, but if you press it twice in a row, saving/exporting will not be done).

For an example of a multi-policy env with 2 policies, check out the [MultiAgent Simple env](https://github.com/edbeeching/godot_rl_agents_examples/tree/main/examples/MultiAgentSimple).

Additional arguments:
- You can change the folder for logging, checkpoints, and onnx files by using `--experiment_dir [experiment_path]`,
- You can resume stopped sessions by using the `--restore [resume_path]` argument (rllib will print out the path to resume in the console if you stop training),
- You can set the config file location using `--config_file [path_to_config.yaml]` (default is set to `rllib_config.yaml`).
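As referenced in the policy-names section above, here is a short sketch of how those names become RLlib policies, condensed from the `rllib_example.py` script added in this PR (`policy1`/`policy2` are placeholder names; at runtime the list comes from `GDRLPettingZooEnv.agent_policy_names`):

```python
from ray.rllib.policy.policy import PolicySpec

# In rllib_example.py this list comes from GDRLPettingZooEnv.agent_policy_names,
# i.e. the policy name set on each AIController in the Godot editor.
policy_names = ["policy1", "policy1", "policy2"]  # placeholder: two agents on policy1, one on policy2


def policy_mapping_fn(agent_id: int, episode, worker, **kwargs) -> str:
    # agent_id is the AIController index, so each agent is routed to the policy named in Godot
    return policy_names[agent_id]


multiagent_config = {
    "policies": {policy_name: PolicySpec() for policy_name in policy_names},
    "policy_mapping_fn": policy_mapping_fn,
}
```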
From 3e080b2d42cf7f3a645a0b1bf32b2ef1e37d0873 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Mon, 1 Apr 2024 21:19:56 +0200 Subject: [PATCH 03/14] Update TRAINING_MULTIPLE_POLICIES.md --- docs/TRAINING_MULTIPLE_POLICIES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/TRAINING_MULTIPLE_POLICIES.md b/docs/TRAINING_MULTIPLE_POLICIES.md index bf9be3c0..600c7ec4 100644 --- a/docs/TRAINING_MULTIPLE_POLICIES.md +++ b/docs/TRAINING_MULTIPLE_POLICIES.md @@ -38,6 +38,7 @@ In the example env, we have a training manager script that sets all agents `done For each AIController, you can set a different policy name in Godot. Policies will be assigned to agents based on this name. E.g. if you have 10 agents assigned to `policy1`, they will all use policy 1, and if you have one agent assigned to `policy2`, it will use policy 2. ![setting-policy-names](https://github.com/edbeeching/godot_rl_agents/assets/61947090/13eb9b46-f7fb-467c-ad16-8609cda9f292) + Screenshot from [MultiAgent Simple env](https://github.com/edbeeching/godot_rl_agents_examples/tree/main/examples/MultiAgentSimple). ## Training: From 8a6a8d8fd8fd4619d5cde2471bfc4d4ab49e64f4 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Mon, 1 Apr 2024 21:27:18 +0200 Subject: [PATCH 04/14] Update TRAINING_MULTIPLE_POLICIES.md --- docs/TRAINING_MULTIPLE_POLICIES.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/TRAINING_MULTIPLE_POLICIES.md b/docs/TRAINING_MULTIPLE_POLICIES.md index 600c7ec4..ea8746c9 100644 --- a/docs/TRAINING_MULTIPLE_POLICIES.md +++ b/docs/TRAINING_MULTIPLE_POLICIES.md @@ -41,6 +41,9 @@ For each AIController, you can set a different policy name in Godot. Policies wi Screenshot from [MultiAgent Simple env](https://github.com/edbeeching/godot_rl_agents_examples/tree/main/examples/MultiAgentSimple). +> [!IMPORTANT] +> All agents that have the same policy name must have the same observation and action space. + ## Training: After installing the prerequisites and adjusting the config, you can start training by using `python rllib_example.py` in your conda env/venv (if you are in the same folder). Rllib will print out useful info in the console, such as the command to start `Tensorboard` to see the training logs for the session. 
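A minimal sketch (not part of this PR) of one way to check that constraint before starting a long run, using the `GDRLPettingZooEnv` attributes added earlier in this series; the env path and speedup values are placeholders:

```python
from collections import defaultdict

from godot_rl.wrappers.petting_zoo_wrapper import GDRLPettingZooEnv

# Placeholder config; env_path should point to your exported Godot executable.
env = GDRLPettingZooEnv(
    config={"env_path": "path/to/exported_env.exe", "show_window": False, "action_repeat": None, "speedup": 30}
)

spaces_per_policy = defaultdict(set)
for agent, policy_name in zip(env.agents, env.agent_policy_names):
    # repr() is used only to make the space definitions hashable for the set comparison
    spaces_per_policy[policy_name].add((repr(env.observation_spaces[agent]), repr(env.action_spaces[agent])))
env.close()

for policy_name, space_defs in spaces_per_policy.items():
    assert len(space_defs) == 1, f"Agents assigned to '{policy_name}' have different observation/action spaces"
```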
From f4b1d88d8a1c6591ce01cf24d4f08330020a5a09 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Wed, 3 Apr 2024 19:51:36 +0200 Subject: [PATCH 05/14] Update hyperparameters in rllib_config.yaml --- examples/rllib_config.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/rllib_config.yaml b/examples/rllib_config.yaml index 152cc077..4d03f569 100644 --- a/examples/rllib_config.yaml +++ b/examples/rllib_config.yaml @@ -32,9 +32,10 @@ config: lambda: 0.95 gamma: 0.99 - vf_clip_param: 0.5 + vf_clip_param: 10 + vf_loss_coeff: 0.5 clip_param: 0.2 - entropy_coeff: 0.001 + entropy_coeff: 0.0001 entropy_coeff_schedule: null rollout_fragment_length: 64 @@ -49,4 +50,4 @@ config: num_gpus: 0 model: vf_share_layers: False - fcnet_hiddens: [64, 64] \ No newline at end of file + fcnet_hiddens: [64, 64] From beb120363f06f0218b1e7057c2bfbb5bd3fa63f5 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Fri, 5 Apr 2024 20:34:54 +0200 Subject: [PATCH 06/14] Update rllib_config.yaml hyperparameters --- examples/rllib_config.yaml | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/examples/rllib_config.yaml b/examples/rllib_config.yaml index 4d03f569..16d969d8 100644 --- a/examples/rllib_config.yaml +++ b/examples/rllib_config.yaml @@ -10,39 +10,44 @@ algorithm: PPO # Set to false if you have a single policy name for all agents set in AIControllers env_is_multiagent: false -checkpoint_frequency: 10 +checkpoint_frequency: 20 # You can set one or more stopping criteria stop: #episode_reward_mean: 0 #training_iteration: 1000 #timesteps_total: 10000 - time_total_s: 1200 + time_total_s: 10000000 config: env: godot env_config: - env_path: null # Set your env path here (exported executable from Godot) - e.g. 'env_path.exe' on Windows + env_path: null # Set your env path here (exported executable from Godot) - e.g. env_path: 'env_path.exe' on Windows action_repeat: null # Doesn't need to be set here, you can set this in sync node in Godot editor as well show_window: true # Displays game window while training. Might be faster when false in some cases, turning off also reduces GPU usage if you don't need rendering. 
- speedup: 20 # Speeds up Godot physics + speedup: 30 # Speeds up Godot physics framework: torch # ONNX models exported with torch are compatible with the current Godot RL Agents Plugin + lr: 0.0003 lambda: 0.95 gamma: 0.99 - vf_clip_param: 10 vf_loss_coeff: 0.5 - clip_param: 0.2 + vf_clip_param: .inf + #clip_param: 0.2 entropy_coeff: 0.0001 entropy_coeff_schedule: null + #grad_clip: 0.5 + + normalize_actions: False + clip_actions: True # During onnx inference we simply clip the actions to [-1.0, 1.0] range, set here to match - rollout_fragment_length: 64 - sgd_minibatch_size: 64 - num_workers: 1 - num_envs_per_worker: 1 - train_batch_size: 512 + rollout_fragment_length: 32 + sgd_minibatch_size: 128 + num_workers: 4 + num_envs_per_worker: 16 + train_batch_size: 2048 num_sgd_iter: 4 batch_mode: truncate_episodes From fe60a8ba6b4ebd9ac2ae0e58bfda11b4bae6539a Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Fri, 5 Apr 2024 22:10:48 +0200 Subject: [PATCH 07/14] Auto-set num_envs_per_worker in rllib_example.py --- examples/rllib_example.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/examples/rllib_example.py b/examples/rllib_example.py index 8fcca985..0907422d 100644 --- a/examples/rllib_example.py +++ b/examples/rllib_example.py @@ -49,13 +49,21 @@ def env_creator(env_config): tune.register_env(env_name, env_creator) - # Make temp env to get info needed for multi-agent training config - if is_multiagent: + policy_names = None + num_envs = None + tmp_env = None + + if is_multiagent: # Make temp env to get info needed for multi-agent training config print("Starting a temporary multi-agent env to get the policy names") tmp_env = GDRLPettingZooEnv(config=exp["config"]["env_config"], show_window=False) policy_names = tmp_env.agent_policy_names print("Policy names for each Agent (AIController) set in the Godot Environment", policy_names) - tmp_env.close() + else: # Make temp env to get info needed for setting num_workers training config + print("Starting a temporary env to get the number of envs and auto-set the num_envs_per_worker config value") + tmp_env = GodotEnv(env_path=exp["config"]["env_config"]["env_path"], show_window=False) + num_envs = tmp_env.num_envs + + tmp_env.close() def policy_mapping_fn(agent_id: int, episode, worker, **kwargs) -> str: return policy_names[agent_id] @@ -67,6 +75,8 @@ def policy_mapping_fn(agent_id: int, episode, worker, **kwargs) -> str: "policies": {policy_name: PolicySpec() for policy_name in policy_names}, "policy_mapping_fn": policy_mapping_fn, } + else: + exp["config"]["num_envs_per_worker"] = num_envs tuner = None if not args.restore: @@ -89,6 +99,7 @@ def policy_mapping_fn(agent_id: int, episode, worker, **kwargs) -> str: # Onnx export after training if a checkpoint was saved checkpoint = result.get_best_result().checkpoint + if checkpoint: result_path = result.get_best_result().path ppo = Algorithm.from_checkpoint(checkpoint) From 668b88d14eaeda68ffa6fbd808469adcf40cd3f9 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Fri, 5 Apr 2024 22:11:51 +0200 Subject: [PATCH 08/14] Update rllib_config.yaml --- examples/rllib_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/rllib_config.yaml b/examples/rllib_config.yaml index 16d969d8..c23d69e2 100644 --- a/examples/rllib_config.yaml +++ b/examples/rllib_config.yaml @@ -46,7 +46,7 @@ config: rollout_fragment_length: 32 sgd_minibatch_size: 128 num_workers: 4 
- num_envs_per_worker: 16 + num_envs_per_worker: 1 # This will be set automatically if not multi-agent. If multi-agent, changing this changes how many envs to launch per worker. train_batch_size: 2048 num_sgd_iter: 4 From 7ade3b059e879b650c184c52e9f5189e1b764d86 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Fri, 5 Apr 2024 22:19:43 +0200 Subject: [PATCH 09/14] Added basic calculation for train_batch_size to rllib_config.yaml --- examples/rllib_config.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/rllib_config.yaml b/examples/rllib_config.yaml index c23d69e2..652f1547 100644 --- a/examples/rllib_config.yaml +++ b/examples/rllib_config.yaml @@ -47,7 +47,9 @@ config: sgd_minibatch_size: 128 num_workers: 4 num_envs_per_worker: 1 # This will be set automatically if not multi-agent. If multi-agent, changing this changes how many envs to launch per worker. - train_batch_size: 2048 + # The value below needs changing per env + # Basic calculation for this value can be rollout_fragment_length * num_workers * num_envs_per_worker (how many AIControllers you have if not multi_agent, otherwise the value you set) + train_batch_size: 2048 num_sgd_iter: 4 batch_mode: truncate_episodes From 0fd70c2c698e323e8397a45318418cdd9d81d775 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Sun, 5 May 2024 17:57:51 +0200 Subject: [PATCH 10/14] Multiple observation spaces fix --- godot_rl/core/godot_env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/godot_rl/core/godot_env.py b/godot_rl/core/godot_env.py index 9fc2bb9f..53996896 100644 --- a/godot_rl/core/godot_env.py +++ b/godot_rl/core/godot_env.py @@ -392,7 +392,7 @@ def _get_env_info(self): else: print(f"observation space {v['space']} is not supported") assert 0, f"observation space {v['space']} is not supported" - self.observation_spaces.append(spaces.Dict(observation_spaces)) + self.observation_spaces.append(spaces.Dict(observation_spaces)) # Gets policy names defined in AIControllers in Godot. If an older version of the plugin is used and no policy # names are sent, "shared_policy" will be set for compatibility. From 39c5d91cfe037cb86235b51b5032f73c21ad8bc2 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Thu, 9 May 2024 21:46:28 +0200 Subject: [PATCH 11/14] Adds support for multidiscrete actions with sb3 - Adds supports for exporting envs with multidiscrete actions with sb3 - Multiple obs spaces onnx export support (for sb3) still needs to be worked on in the future --- godot_rl/core/utils.py | 30 +++++++++++++++---- .../wrappers/onnx/stable_baselines_export.py | 10 +++++-- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/godot_rl/core/utils.py b/godot_rl/core/utils.py index ed8d5a1f..81b43f24 100644 --- a/godot_rl/core/utils.py +++ b/godot_rl/core/utils.py @@ -56,15 +56,17 @@ def __init__(self, action_space: gym.spaces.Tuple, convert) -> None: elif isinstance(space, gym.spaces.Discrete): if space.n > 2: # for now only binary actions are supported if you mix different spaces - # need to add support for the n>2 case - raise NotImplementedError + raise NotImplementedError( + "Discrete actions with size larger than 2 " + "are currently not supported if used together with continuous actions." 
+ ) space_size += 1 else: raise NotImplementedError elif isinstance(action_space, gym.spaces.Dict): raise NotImplementedError else: - assert isinstance(space, [gym.spaces.Box, gym.spaces.Discrete]) + assert isinstance(action_space, (gym.spaces.Box, gym.spaces.Discrete)) return if use_multi_discrete_spaces: @@ -86,6 +88,12 @@ def to_original_dist(self, action): original_action = [] counter = 0 + # If only discrete actions are used in the environment: + # - SB3 will send int actions containing the discrete action, + # - CleanRL example script (continuous PPO) will only send float actions, which we convert to binary discrete, + # - If mixed actions are used, both will send float actions. + integer_actions: bool = action.dtype == np.int64 + for space in self._original_action_space.spaces: if isinstance(space, gym.spaces.Box): assert len(space.shape) == 1 @@ -93,8 +101,20 @@ def to_original_dist(self, action): counter += space.shape[0] elif isinstance(space, gym.spaces.Discrete): - discrete_actions = np.greater(action[:, counter], 0.0) - discrete_actions = discrete_actions.astype(np.float32) + discrete_actions = None + + if integer_actions: + discrete_actions = action[:, counter] + else: + if space.n > 2: + raise NotImplementedError( + "Discrete actions with size larger than " + "2 are currently not implemented for this algorithm." + ) + # If the action is not an integer, convert it to a binary discrete action + discrete_actions = np.greater(action[:, counter], 0.0) + discrete_actions = discrete_actions.astype(np.float32) + original_action.append(discrete_actions) counter += 1 diff --git a/godot_rl/wrappers/onnx/stable_baselines_export.py b/godot_rl/wrappers/onnx/stable_baselines_export.py index 67803ce3..c7225d04 100644 --- a/godot_rl/wrappers/onnx/stable_baselines_export.py +++ b/godot_rl/wrappers/onnx/stable_baselines_export.py @@ -1,4 +1,5 @@ import torch +from gymnasium.vector.utils import spaces from stable_baselines3 import PPO @@ -52,7 +53,7 @@ def export_ppo_model_as_onnx(ppo: PPO, onnx_model_path: str, use_obs_array: bool dummy_input = dict(ppo.observation_space.sample()) for k, v in dummy_input.items(): dummy_input[k] = torch.from_numpy(v).unsqueeze(0) - dummy_input = [v for v in dummy_input.values()] + dummy_input = [v for v in dummy_input.values()] torch.onnx.export( onnxable_model, @@ -68,7 +69,12 @@ def export_ppo_model_as_onnx(ppo: PPO, onnx_model_path: str, use_obs_array: bool "state_outs": {0: "batch_size"}, }, ) - verify_onnx_export(ppo, onnx_model_path, use_obs_array=use_obs_array) + + # If the space is MultiDiscrete, we skip verifying as action output will have an expected mismatch + # (the output from onnx will be the action logits for each discrete action, + # while the output from sb3 will be a single int) + if not isinstance(ppo.action_space, spaces.MultiDiscrete): + verify_onnx_export(ppo, onnx_model_path, use_obs_array=use_obs_array) def verify_onnx_export(ppo: PPO, onnx_model_path: str, num_tests=10, use_obs_array: bool = False): From 322b3988a40ecb7d7ce9dc711e53ade1f1dcb5f6 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Thu, 9 May 2024 22:48:57 +0200 Subject: [PATCH 12/14] Removes init variables in ray_wrapper.py --- godot_rl/wrappers/ray_wrapper.py | 52 ++------------------------------ 1 file changed, 2 insertions(+), 50 deletions(-) diff --git a/godot_rl/wrappers/ray_wrapper.py b/godot_rl/wrappers/ray_wrapper.py index c212204e..fac4acec 100644 --- a/godot_rl/wrappers/ray_wrapper.py +++ b/godot_rl/wrappers/ray_wrapper.py 
@@ -15,14 +15,8 @@ class RayVectorGodotEnv(VectorEnv): def __init__( self, - env_path=None, port=10008, seed=0, - show_window=False, - framerate=None, - action_repeat=None, - speedup=None, - timeout_wait=60, config=None, ) -> None: self._env = GodotEnv( @@ -30,7 +24,6 @@ def __init__( port=port, seed=seed, show_window=config["show_window"], - framerate=framerate, action_repeat=config["action_repeat"], speedup=config["speedup"], ) @@ -83,48 +76,7 @@ def register_env(): ) -# TODO: fix this implementation -# def rllib_export(model_path): -# # get path from the config file and remove the file name -# path = model_path # full path with file name -# path = path.split("/") # split the path into a list -# path = path[:-1] # remove the file name from the list -# # duplicate the path for the export -# export_path = path.copy() -# export_path.append("onnx") -# export_path = "/".join(export_path) # join the list into a string -# # duplicate the last element of the list -# path.append(path[-1]) -# # change format from checkpoint_000500 to checkpoint-500 -# temp = path[-1].split("_") -# temp = temp[-1] -# # parse the number -# temp = int(temp) -# # back to string -# temp = str(temp) -# # join the string with the new format -# path[-1] = "checkpoint-" + temp -# path = "/".join(path) # join the list into a string -# # best_checkpoint = results.get_best_checkpoint(results.trials[0], mode="max") -# # print(f".. best checkpoint was: {best_checkpoint}") - -# # From here on, the relevant part to exporting the model -# new_trainer = PPOTrainer(config=exp["config"]) -# new_trainer.restore(path) -# # policy = new_trainer.get_policy() -# new_trainer.export_policy_model(export_dir=export_path, onnx=9) # This works for version 1.11.X - - -# Running with: gdrl --env_path envs/builds/JumperHard/jumper_hard.exe --export \ -# --restore envs/checkpoints/jumper_hard/checkpoint_000500/checkpoint-500 -# model = policy.model -# export the model to onnx using torch.onnx.export -# dummy_input = torch.randn(1, 3, 84, 84) -# input is dictionary with key "obs" and value is a tensor of shape [...,8] -# tensor = torch.randn([1, 2, 4, 6, 8, 10, 12, 14]) -# dummy_input = {"obs": tensor} -# torch.onnx.export(model, dummy_input, "model.onnx", verbose=True, -# dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}}) +# Refactored section: Commented onnx section was removed as it was re-implemented in rllib_example.py def rllib_training(args, extras): @@ -186,7 +138,7 @@ def rllib_training(args, extras): ), ) if args.export: - raise NotImplementedError("Exporting is not (re)implemented yet") + raise NotImplementedError("Use examples/rllib_example.py to export to onnx.") # rllib_export(args.restore) ray.shutdown() From e7489ad29a763a46b66755b5da469f4d5ada35a8 Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Thu, 9 May 2024 22:54:56 +0200 Subject: [PATCH 13/14] Removes register_env arguments - ray_wrapper.py Also removes the previously removed init variables from `tune.register_env()` --- godot_rl/wrappers/ray_wrapper.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/godot_rl/wrappers/ray_wrapper.py b/godot_rl/wrappers/ray_wrapper.py index fac4acec..ab86cd1a 100644 --- a/godot_rl/wrappers/ray_wrapper.py +++ b/godot_rl/wrappers/ray_wrapper.py @@ -64,14 +64,9 @@ def register_env(): tune.register_env( "godot", lambda c: RayVectorGodotEnv( - env_path=c["env_path"], config=c, port=c.worker_index + GodotEnv.DEFAULT_PORT + 10, - show_window=c["show_window"], - 
framerate=c["framerate"], seed=c.worker_index + c["seed"], - action_repeat=c["framerate"], - speedup=c["speedup"], ), ) From 26532d41ef8dca85fbd19157b34ec9701d6511ca Mon Sep 17 00:00:00 2001 From: Ivan-267 <61947090+Ivan-267@users.noreply.github.com> Date: Wed, 15 May 2024 06:01:28 +0200 Subject: [PATCH 14/14] Update ADV_RLLIB.md Updates rllib doc to include the new process. --- docs/ADV_RLLIB.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/ADV_RLLIB.md b/docs/ADV_RLLIB.md index 0ae95ee5..2706ece7 100644 --- a/docs/ADV_RLLIB.md +++ b/docs/ADV_RLLIB.md @@ -2,7 +2,13 @@ [RLlib](https://docs.ray.io/en/latest/rllib/index.html) is an open-source library for reinforcement learning (RL), offering support for production-level, highly distributed RL workloads while maintaining unified and simple APIs for a large variety of industry applications. Whether you would like to train your agents in a multi-agent setup, purely from offline (historic) datasets, or using externally connected simulators, RLlib offers a simple solution for each of your decision making needs. +## Usage with Rllib example (Recommended) + +The updated [Rllib example](https://github.com/edbeeching/godot_rl_agents/blob/main/examples/rllib_example.py) script allows training environments with single and multiple different policies. +To use the new example, installation process is a bit different, you can find it described in the [training multiple policies](https://github.com/edbeeching/godot_rl_agents/blob/main/docs/TRAINING_MULTIPLE_POLICIES.md) guide. + ## Installation +**Below is the older usage process, please refer to the previous section for recommended usage.** If you want to train with rllib, create a new environment e.g.: `python -m venv venv.rllib` as rllib's dependencies can conflict with those of sb3 and other libraries. Due to a version clash with gymnasium, stable-baselines3 must be uninstalled before installing rllib.
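As a closing note on the SB3/multidiscrete change in `godot_rl/core/utils.py` earlier in this series, here is a small standalone illustration (not repository code) of the discrete-action handling it adds: integer actions, as SB3 sends for purely discrete action spaces, pass through unchanged, while float actions, as the continuous-PPO CleanRL example sends, are thresholded into binary discrete actions:

```python
import numpy as np


def to_discrete(column: np.ndarray) -> np.ndarray:
    """Mirrors the binary Discrete branch of ActionSpaceProcessor.to_original_dist for one action column."""
    if column.dtype == np.int64:
        # Integer discrete actions (e.g. from SB3) are used directly
        return column
    # Float actions (e.g. from the continuous CleanRL PPO example) become binary discrete actions
    return np.greater(column, 0.0).astype(np.float32)


print(to_discrete(np.array([1, 0, 1], dtype=np.int64)))           # -> [1 0 1]
print(to_discrete(np.array([0.3, -0.7, 2.1], dtype=np.float32)))  # -> [1. 0. 1.]
```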