diff --git a/CITATION.bib b/CITATION.bib
index acdab6f3..85bae8e6 100644
--- a/CITATION.bib
+++ b/CITATION.bib
@@ -1,6 +1,6 @@
-@inproceedings{Alegre+2022bnaic,
- author = {Lucas N. Alegre and Florian Felten and El-Ghazali Talbi and Gr{\'e}goire Danoy and Ann Now{\'e} and Ana L. C. Bazzan and Bruno C. da Silva},
- title = {{MO-Gym}: A Library of Multi-Objective Reinforcement Learning Environments},
- booktitle = {Proceedings of the 34th Benelux Conference on Artificial Intelligence BNAIC/Benelearn 2022},
- year = {2022}
+@inproceedings{felten_toolkit_2023,
+ author = {Felten, Florian and Alegre, Lucas N. and Now{\'e}, Ann and Bazzan, Ana L. C. and Talbi, El Ghazali and Danoy, Gr{\'e}goire and Silva, Bruno C. {\relax da}},
+ title = {A Toolkit for Reliable Benchmarking and Research in Multi-Objective Reinforcement Learning},
+ booktitle = {Proceedings of the 37th Conference on Neural Information Processing Systems ({NeurIPS} 2023)},
+ year = {2023}
}
diff --git a/README.md b/README.md
index aaf999e4..fb5f7885 100644
--- a/README.md
+++ b/README.md
@@ -82,11 +82,11 @@ Maintenance for this project is also contributed by the broader Farama team: [fa
If you use this repository in your research, please cite:
```bibtex
-@inproceedings{Alegre+2022bnaic,
- author = {Lucas N. Alegre and Florian Felten and El-Ghazali Talbi and Gr{\'e}goire Danoy and Ann Now{\'e} and Ana L. C. Bazzan and Bruno C. da Silva},
- title = {{MO-Gym}: A Library of Multi-Objective Reinforcement Learning Environments},
- booktitle = {Proceedings of the 34th Benelux Conference on Artificial Intelligence BNAIC/Benelearn 2022},
- year = {2022}
+@inproceedings{felten_toolkit_2023,
+ author = {Felten, Florian and Alegre, Lucas N. and Now{\'e}, Ann and Bazzan, Ana L. C. and Talbi, El Ghazali and Danoy, Gr{\'e}goire and Silva, Bruno C. {\relax da}},
+ title = {A Toolkit for Reliable Benchmarking and Research in Multi-Objective Reinforcement Learning},
+ booktitle = {Proceedings of the 37th Conference on Neural Information Processing Systems ({NeurIPS} 2023)},
+ year = {2023}
}
```
diff --git a/docs/_static/videos/minecart-rgb.gif b/docs/_static/videos/minecart-rgb.gif
new file mode 100644
index 00000000..39ad4172
Binary files /dev/null and b/docs/_static/videos/minecart-rgb.gif differ
diff --git a/docs/_static/videos/mo-ant.gif b/docs/_static/videos/mo-ant.gif
new file mode 100644
index 00000000..9397b4ff
Binary files /dev/null and b/docs/_static/videos/mo-ant.gif differ
diff --git a/docs/_static/videos/mo-humanoid.gif b/docs/_static/videos/mo-humanoid.gif
new file mode 100644
index 00000000..625a40f8
Binary files /dev/null and b/docs/_static/videos/mo-humanoid.gif differ
diff --git a/docs/_static/videos/mo-lunar-lander-continuous.gif b/docs/_static/videos/mo-lunar-lander-continuous.gif
new file mode 100644
index 00000000..2051d754
Binary files /dev/null and b/docs/_static/videos/mo-lunar-lander-continuous.gif differ
diff --git a/docs/_static/videos/mo-swimmer.gif b/docs/_static/videos/mo-swimmer.gif
new file mode 100644
index 00000000..f1dffd63
Binary files /dev/null and b/docs/_static/videos/mo-swimmer.gif differ
diff --git a/docs/_static/videos/mo-walker2d.gif b/docs/_static/videos/mo-walker2d.gif
new file mode 100644
index 00000000..6a2a2e1e
Binary files /dev/null and b/docs/_static/videos/mo-walker2d.gif differ
diff --git a/docs/citing/citing.md b/docs/citing/citing.md
index b90bd5d1..64b21646 100644
--- a/docs/citing/citing.md
+++ b/docs/citing/citing.md
@@ -7,6 +7,17 @@ title: "Citing"
:end-before:
```
+MO-Gymnasium (formerly MO-Gym) first appeared in the following workshop publication:
+
+```bibtex
+@inproceedings{Alegre+2022bnaic,
+ author = {Lucas N. Alegre and Florian Felten and El-Ghazali Talbi and Gr{\'e}goire Danoy and Ann Now{\'e} and Ana L. C. Bazzan and Bruno C. {\relax da} Silva},
+ title = {{MO-Gym}: A Library of Multi-Objective Reinforcement Learning Environments},
+ booktitle = {Proceedings of the 34th Benelux Conference on Artificial Intelligence BNAIC/Benelearn 2022},
+ year = {2022}
+}
+```
+
```{toctree}
:hidden:
:glob:
diff --git a/docs/environments/all-environments.md b/docs/environments/all-environments.md
deleted file mode 100644
index 0975216a..00000000
--- a/docs/environments/all-environments.md
+++ /dev/null
@@ -1,39 +0,0 @@
----
-title: "Environments"
----
-
-# Available environments
-
-
-MO-Gymnasium includes environments taken from the MORL literature, as well as multi-objective version of classical environments, such as Mujoco.
-
-| Env | Obs/Action spaces | Objectives | Description |
-|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------|---------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| [`deep-sea-treasure-v0`](https://mo-gymnasium.farama.org/environments/deep-sea-treasure/) | Discrete / Discrete | `[treasure, time_penalty]` | Agent is a submarine that must collect a treasure while taking into account a time penalty. Treasures values taken from [Yang et al. 2019](https://arxiv.org/pdf/1908.08342.pdf). |
-| [`deep-sea-treasure-concave-v0`](https://mo-gymnasium.farama.org/environments/deep-sea-treasure-concave/) | Discrete / Discrete | `[treasure, time_penalty]` | Agent is a submarine that must collect a treasure while taking into account a time penalty. Treasures values taken from [Vamplew et al. 2010](https://link.springer.com/article/10.1007/s10994-010-5232-5). |
-| [`deep-sea-treasure-mirrored-v0`](https://mo-gymnasium.farama.org/environments/deep-sea-treasure-mirrored/) | Discrete / Discrete | `[treasure, time_penalty]` | Harder version of the concave DST [Felten et al. 2022](https://www.scitepress.org/Papers/2022/109891/109891.pdf). |
-| [`resource-gathering-v0`](https://mo-gymnasium.farama.org/environments/resource-gathering/) | Discrete / Discrete | `[enemy, gold, gem]` | Agent must collect gold or gem. Enemies have a 10% chance of killing the agent. From [Barret & Narayanan 2008](https://dl.acm.org/doi/10.1145/1390156.1390162). |
-| [`fishwood-v0`](https://mo-gymnasium.farama.org/environments/fishwood/) | Discrete / Discrete | `[fish_amount, wood_amount]` | ESR environment, the agent must collect fish and wood to light a fire and eat. From [Roijers et al. 2018](https://www.researchgate.net/publication/328718263_Multi-objective_Reinforcement_Learning_for_the_Expected_Utility_of_the_Return). |
-| [`breakable-bottles-v0`](https://mo-gymnasium.farama.org/environments/breakable-bottles/) | Discrete (Dictionary) / Discrete | `[time_penalty, bottles_delivered, potential]` | Gridworld with 5 cells. The agents must collect bottles from the source location and deliver to the destination. From [Vamplew et al. 2021](https://www.sciencedirect.com/science/article/pii/S0952197621000336). |
-| [`fruit-tree-v0`](https://mo-gymnasium.farama.org/environments/fruit-tree/) | Discrete / Discrete | `[nutri1, ..., nutri6]` | Full binary tree of depth d=5,6 or 7. Every leaf contains a fruit with a value for the nutrients Protein, Carbs, Fats, Vitamins, Minerals and Water. From [Yang et al. 2019](https://arxiv.org/pdf/1908.08342.pdf). |
-| [`water-reservoir-v0`](https://mo-gymnasium.farama.org/environments/water-reservoir/) | Continuous / Continuous | `[cost_flooding, deficit_water]` | A Water reservoir environment. The agent executes a continuous action, corresponding to the amount of water released by the dam. From [Pianosi et al. 2013](https://iwaponline.com/jh/article/15/2/258/3425/Tree-based-fitted-Q-iteration-for-multi-objective). |
-| [`four-room-v0`](https://mo-gymnasium.farama.org/environments/four-room/) | Discrete / Discrete | `[item1, item2, item3]` | Agent must collect three different types of items in the map and reach the goal. From [Alegre et al. 2022](https://proceedings.mlr.press/v162/alegre22a.html). |
-| [`mo-mountaincar-v0`](https://mo-gymnasium.farama.org/environments/mo-mountaincar/) | Continuous / Discrete | `[time_penalty, reverse_penalty, forward_penalty]` | Classic Mountain Car env, but with extra penalties for the forward and reverse actions. From [Vamplew et al. 2011](https://www.researchgate.net/publication/220343783_Empirical_evaluation_methods_for_multiobjective_reinforcement_learning_algorithms). |
-| [`mo-mountaincarcontinuous-v0`](https://mo-gymnasium.farama.org/environments/mo-mountaincarcontinuous/) | Continuous / Continuous | `[time_penalty, fuel_consumption_penalty]` | Continuous Mountain Car env, but with penalties for fuel consumption. |
-| [`mo-lunar-lander-v2`](https://mo-gymnasium.farama.org/environments/mo-lunar-lander/) | Continuous / Discrete or Continuous | `[landed, shaped_reward, main_engine_fuel, side_engine_fuel]` | MO version of the `LunarLander-v2` [environment](https://gymnasium.farama.org/environments/box2d/lunar_lander/). Objectives defined similarly as in [Hung et al. 2022](https://openreview.net/forum?id=AwWaBXLIJE). |
-| [`minecart-v0`](https://mo-gymnasium.farama.org/environments/minecart/) | Continuous or Image / Discrete | `[ore1, ore2, fuel]` | Agent must collect two types of ores and minimize fuel consumption. From [Abels et al. 2019](https://arxiv.org/abs/1809.07803v2). |
-| [`mo-highway-v0`](https://mo-gymnasium.farama.org/environments/mo-highway/) and `mo-highway-fast-v0` | Continuous / Discrete | `[speed, right_lane, collision]` | The agent's objective is to reach a high speed while avoiding collisions with neighbouring vehicles and staying on the rightest lane. From [highway-env](https://github.com/eleurent/highway-env). |
-| [`mo-supermario-v0`](https://mo-gymnasium.farama.org/environments/mo-supermario/) | Image / Discrete | `[x_pos, time, death, coin, enemy]` | [:warning: SuperMarioBrosEnv support is limited.] Multi-objective version of [SuperMarioBrosEnv](https://github.com/Kautenja/gym-super-mario-bros). Objectives are defined similarly as in [Yang et al. 2019](https://arxiv.org/pdf/1908.08342.pdf). |
-| [`mo-reacher-v4`](https://mo-gymnasium.farama.org/environments/mo-reacher/) | Continuous / Discrete | `[target_1, target_2, target_3, target_4]` | Mujoco version of `mo-reacher-v0`, based on `Reacher-v4` [environment](https://gymnasium.farama.org/environments/mujoco/reacher/). |
-| [`mo-hopper-v4`](https://mo-gymnasium.farama.org/environments/mo-hopper/) | Continuous / Continuous | `[velocity, height, energy]` | Multi-objective version of [Hopper-v4](https://gymnasium.farama.org/environments/mujoco/hopper/) env. |
-| [`mo-halfcheetah-v4`](https://mo-gymnasium.farama.org/environments/mo-halfcheetah/) | Continuous / Continuous | `[velocity, energy]` | Multi-objective version of [HalfCheetah-v4](https://gymnasium.farama.org/environments/mujoco/half_cheetah/) env. Similar to [Xu et al. 2020](https://github.com/mit-gfx/PGMORL). |
-
-
-```{toctree}
-:hidden:
-:glob:
-:caption: MO-Gymnasium Environments
-
-./*
-
-```
diff --git a/docs/environments/classical.md b/docs/environments/classical.md
new file mode 100644
index 00000000..3f80ce85
--- /dev/null
+++ b/docs/environments/classical.md
@@ -0,0 +1,25 @@
+---
+title: "Classic Control"
+---
+
+# Classic Control
+
+Multi-objective versions of classic Gymnasium environments.
+
+| Env | Obs/Action spaces | Objectives | Description |
+|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------|---------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| [`mo-mountaincar-v0`](https://mo-gymnasium.farama.org/environments/mo-mountaincar/) | Continuous / Discrete | `[time_penalty, reverse_penalty, forward_penalty]` | Classic Mountain Car env, but with extra penalties for the forward and reverse actions. From [Vamplew et al. 2011](https://www.researchgate.net/publication/220343783_Empirical_evaluation_methods_for_multiobjective_reinforcement_learning_algorithms). |
+| [`mo-mountaincarcontinuous-v0`](https://mo-gymnasium.farama.org/environments/mo-mountaincarcontinuous/) | Continuous / Continuous | `[time_penalty, fuel_consumption_penalty]` | Continuous Mountain Car env, but with penalties for fuel consumption. |
+| [`mo-lunar-lander-v2`](https://mo-gymnasium.farama.org/environments/mo-lunar-lander/) | Continuous / Discrete or Continuous | `[landed, shaped_reward, main_engine_fuel, side_engine_fuel]` | MO version of the `LunarLander-v2` [environment](https://gymnasium.farama.org/environments/box2d/lunar_lander/). Objectives defined similarly as in [Hung et al. 2022](https://openreview.net/forum?id=AwWaBXLIJE). |
+
+```{toctree}
+:hidden:
+:glob:
+:caption: MO-Gymnasium Environments
+
+./mo-mountaincar
+./mo-mountaincarcontinuous
+./mo-lunar-lander
+./mo-lunar-lander-continuous
+
+```
diff --git a/docs/environments/grid-world.md b/docs/environments/grid-world.md
new file mode 100644
index 00000000..e9d25520
--- /dev/null
+++ b/docs/environments/grid-world.md
@@ -0,0 +1,36 @@
+---
+title: "Grid-World"
+---
+
+# Grid-World
+
+Environments with discrete observation spaces, e.g., grid-worlds.
+
+| Env | Obs/Action spaces | Objectives | Description |
+|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------|---------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| [`deep-sea-treasure-v0`](https://mo-gymnasium.farama.org/environments/deep-sea-treasure/) | Discrete / Discrete | `[treasure, time_penalty]` | Agent is a submarine that must collect a treasure while taking into account a time penalty. Treasure values taken from [Yang et al. 2019](https://arxiv.org/pdf/1908.08342.pdf). |
+| [`deep-sea-treasure-concave-v0`](https://mo-gymnasium.farama.org/environments/deep-sea-treasure-concave/) | Discrete / Discrete | `[treasure, time_penalty]` | Agent is a submarine that must collect a treasure while taking into account a time penalty. Treasure values taken from [Vamplew et al. 2010](https://link.springer.com/article/10.1007/s10994-010-5232-5). |
+| [`deep-sea-treasure-mirrored-v0`](https://mo-gymnasium.farama.org/environments/deep-sea-treasure-mirrored/) | Discrete / Discrete | `[treasure, time_penalty]` | Harder version of the concave DST [Felten et al. 2022](https://www.scitepress.org/Papers/2022/109891/109891.pdf). |
+| [`resource-gathering-v0`](https://mo-gymnasium.farama.org/environments/resource-gathering/) | Discrete / Discrete | `[enemy, gold, gem]` | Agent must collect gold or gem. Enemies have a 10% chance of killing the agent. From [Barrett & Narayanan 2008](https://dl.acm.org/doi/10.1145/1390156.1390162). |
+| [`fishwood-v0`](https://mo-gymnasium.farama.org/environments/fishwood/) | Discrete / Discrete | `[fish_amount, wood_amount]` | ESR environment: the agent must collect fish and wood to light a fire and eat. From [Roijers et al. 2018](https://www.researchgate.net/publication/328718263_Multi-objective_Reinforcement_Learning_for_the_Expected_Utility_of_the_Return). |
+| [`breakable-bottles-v0`](https://mo-gymnasium.farama.org/environments/breakable-bottles/) | Discrete (Dictionary) / Discrete | `[time_penalty, bottles_delivered, potential]` | Gridworld with 5 cells. The agent must collect bottles from the source location and deliver them to the destination. From [Vamplew et al. 2021](https://www.sciencedirect.com/science/article/pii/S0952197621000336). |
+| [`fruit-tree-v0`](https://mo-gymnasium.farama.org/environments/fruit-tree/) | Discrete / Discrete | `[nutri1, ..., nutri6]` | Full binary tree of depth d=5, 6 or 7. Every leaf contains a fruit with a value for the nutrients Protein, Carbs, Fats, Vitamins, Minerals and Water. From [Yang et al. 2019](https://arxiv.org/pdf/1908.08342.pdf). |
+| [`four-room-v0`](https://mo-gymnasium.farama.org/environments/four-room/) | Discrete / Discrete | `[item1, item2, item3]` | Agent must collect three different types of items in the map and reach the goal. From [Alegre et al. 2022](https://proceedings.mlr.press/v162/alegre22a.html). |
+
+
+```{toctree}
+:hidden:
+:glob:
+:caption: MO-Gymnasium Environments
+
+./deep-sea-treasure
+./deep-sea-treasure-concave
+./deep-sea-treasure-mirrored
+./resource-gathering
+./four-room
+./fruit-tree
+./breakable-bottles
+./fishwood
+
+
+```
diff --git a/docs/environments/misc.md b/docs/environments/misc.md
new file mode 100644
index 00000000..d9cc2295
--- /dev/null
+++ b/docs/environments/misc.md
@@ -0,0 +1,28 @@
+---
+title: "Misc"
+---
+
+# Miscellaneous
+
+MO-Gymnasium also includes other miscellaneous multi-objective environments:
+
+| Env | Obs/Action spaces | Objectives | Description |
+|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------|---------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| [`water-reservoir-v0`](https://mo-gymnasium.farama.org/environments/water-reservoir/) | Continuous / Continuous | `[cost_flooding, deficit_water]` | A water reservoir environment. The agent executes a continuous action, corresponding to the amount of water released by the dam. From [Pianosi et al. 2013](https://iwaponline.com/jh/article/15/2/258/3425/Tree-based-fitted-Q-iteration-for-multi-objective). |
+| [`minecart-v0`](https://mo-gymnasium.farama.org/environments/minecart/) | Continuous or Image / Discrete | `[ore1, ore2, fuel]` | Agent must collect two types of ores and minimize fuel consumption. From [Abels et al. 2019](https://arxiv.org/abs/1809.07803v2). |
+| [`mo-highway-v0`](https://mo-gymnasium.farama.org/environments/mo-highway/) and `mo-highway-fast-v0` | Continuous / Discrete | `[speed, right_lane, collision]` | The agent's objective is to reach a high speed while avoiding collisions with neighbouring vehicles and staying on the rightmost lane. From [highway-env](https://github.com/eleurent/highway-env). |
+| [`mo-supermario-v0`](https://mo-gymnasium.farama.org/environments/mo-supermario/) | Image / Discrete | `[x_pos, time, death, coin, enemy]` | [:warning: SuperMarioBrosEnv support is limited.] Multi-objective version of [SuperMarioBrosEnv](https://github.com/Kautenja/gym-super-mario-bros). Objectives are defined similarly as in [Yang et al. 2019](https://arxiv.org/pdf/1908.08342.pdf). |
+
+
+```{toctree}
+:hidden:
+:glob:
+:caption: MO-Gymnasium Environments
+
+./water-reservoir
+./minecart
+./minecart-deterministic
+./minecart-rgb
+./mo-highway
+./mo-supermario
+```
diff --git a/docs/environments/mujoco.md b/docs/environments/mujoco.md
new file mode 100644
index 00000000..70272d24
--- /dev/null
+++ b/docs/environments/mujoco.md
@@ -0,0 +1,32 @@
+---
+title: "MuJoCo"
+---
+
+# MuJoCo
+
+Multi-objective versions of MuJoCo environments.
+
+| Env | Obs/Action spaces | Objectives | Description |
+|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------|---------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| [`mo-reacher-v4`](https://mo-gymnasium.farama.org/environments/mo-reacher/) | Continuous / Discrete | `[target_1, target_2, target_3, target_4]` | Mujoco version of `mo-reacher-v0`, based on `Reacher-v4` [environment](https://gymnasium.farama.org/environments/mujoco/reacher/). |
+| [`mo-hopper-v4`](https://mo-gymnasium.farama.org/environments/mo-hopper/) | Continuous / Continuous | `[velocity, height, energy]` | Multi-objective version of [Hopper-v4](https://gymnasium.farama.org/environments/mujoco/hopper/) env. |
+| [`mo-halfcheetah-v4`](https://mo-gymnasium.farama.org/environments/mo-halfcheetah/) | Continuous / Continuous | `[velocity, energy]` | Multi-objective version of [HalfCheetah-v4](https://gymnasium.farama.org/environments/mujoco/half_cheetah/) env. Similar to [Xu et al. 2020](https://github.com/mit-gfx/PGMORL). |
+| [`mo-walker2d-v4`](https://mo-gymnasium.farama.org/environments/mo-walker2d/) | Continuous / Continuous | `[velocity, energy]` | Multi-objective version of [Walker2d-v4](https://gymnasium.farama.org/environments/mujoco/walker2d/) env. |
+| [`mo-ant-v4`](https://mo-gymnasium.farama.org/environments/mo-ant/) | Continuous / Continuous | `[x_velocity, y_velocity, energy]` | Multi-objective version of [Ant-v4](https://gymnasium.farama.org/environments/mujoco/ant/) env. |
+| [`mo-swimmer-v4`](https://mo-gymnasium.farama.org/environments/mo-swimmer/) | Continuous / Continuous | `[velocity, energy]` | Multi-objective version of [Swimmer-v4](https://gymnasium.farama.org/environments/mujoco/swimmer/) env. |
+| [`mo-humanoid-v4`](https://mo-gymnasium.farama.org/environments/mo-humanoid/) | Continuous / Continuous | `[velocity, energy]` | Multi-objective version of [Humanoid-v4](https://gymnasium.farama.org/environments/mujoco/humanoid/) env. |
+
+
+```{toctree}
+:hidden:
+:glob:
+:caption: MO-Gymnasium Environments
+
+./mo-reacher
+./mo-hopper
+./mo-halfcheetah
+./mo-walker2d
+./mo-ant
+./mo-swimmer
+./mo-humanoid
+```
diff --git a/docs/examples/publications.md b/docs/examples/publications.md
index c2739d12..23176746 100644
--- a/docs/examples/publications.md
+++ b/docs/examples/publications.md
@@ -10,7 +10,14 @@ MO-Gymnasium (formerly MO-Gym) was first published in:
List of publications & submissions using MO-Gymnasium (please open a pull request to add missing entries):
+
- [Sample-Efficient Multi-Objective Learning via Generalized Policy Improvement Prioritization](https://arxiv.org/abs/2301.07784) (Alegre et al., AAMAS 2023)
- [Hyperparameter Optimization for Multi-Objective Reinforcement Learning](https://arxiv.org/abs/2310.16487v1) (Felten et al., MODeM Workshop 2023)
- [Multi-Step Generalized Policy Improvement by Leveraging Approximate Models](https://openreview.net/forum?id=KFj0Q1EXvU) (Alegre et al., NeurIPS 2023)
- [A Toolkit for Reliable Benchmarking and Research in Multi-Objective Reinforcement Learning](https://openreview.net/forum?id=jfwRLudQyj) (Felten et al., NeurIPS 2023)
+- [Distributional Pareto-Optimal Multi-Objective Reinforcement Learning](https://proceedings.neurips.cc/paper_files/paper/2023/hash/32285dd184dbfc33cb2d1f0db53c23c5-Abstract-Conference.html) (Cai et al., NeurIPS 2023)
+- [Welfare and Fairness in Multi-objective Reinforcement Learning](https://arxiv.org/abs/2212.01382) (Fan et al., AAMAS 2023)
+- [Personalized Reinforcement Learning with a Budget of Policies](https://arxiv.org/abs/2401.06514) (Ivanov et al., 2024)
+- [Multi-Objective Reinforcement Learning Based on Decomposition: A Taxonomy and Framework](https://arxiv.org/abs/2311.12495) (Felten et al., 2024)
+- [Multi-objective reinforcement learning for guaranteeing alignment with multiple values](https://alaworkshop2023.github.io/papers/ALA2023_paper_15.pdf) (Rodriguez-Soto et al., 2023)
+- [MOFL/D: A Federated Multi-objective Learning Framework with Decomposition](https://neurips.cc/virtual/2023/79018) (Hartmann et al., 2023)
diff --git a/docs/index.md b/docs/index.md
index a5069a60..fb6d56ff 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -10,11 +10,21 @@ lastpage:
introduction/install
introduction/api
-environments/all-environments
wrappers/wrappers
examples/morl_baselines
```
+```{toctree}
+:hidden:
+:caption: Environments
+
+environments/grid-world
+environments/classical
+environments/misc
+environments/mujoco
+```
+
+
```{toctree}
:hidden:
:caption: Tutorials
diff --git a/mo_gymnasium/__init__.py b/mo_gymnasium/__init__.py
index bb596e25..23201d0c 100644
--- a/mo_gymnasium/__init__.py
+++ b/mo_gymnasium/__init__.py
@@ -14,4 +14,4 @@
)
-__version__ = "1.0.1"
+__version__ = "1.1.0"
diff --git a/mo_gymnasium/envs/breakable_bottles/breakable_bottles.py b/mo_gymnasium/envs/breakable_bottles/breakable_bottles.py
index 0a6cbe72..93c33821 100644
--- a/mo_gymnasium/envs/breakable_bottles/breakable_bottles.py
+++ b/mo_gymnasium/envs/breakable_bottles/breakable_bottles.py
@@ -25,9 +25,13 @@ class BreakableBottles(Env, EzPickle):
The observation space is a dictionary with 4 keys:
- location: the current location of the agent
- bottles_carrying: the number of bottles the agent is currently carrying (0, 1 or 2)
- - bottles_delivered: the number of bottles the agent has delivered (0 or 1)
+ - bottles_delivered: the number of bottles the agent has delivered (0, 1 or 2)
- bottles_dropped: for each location, a boolean flag indicating if that location currently contains a bottle
+ Note that this observation space is different from that listed in the paper above. In the paper, bottles_delivered's possible values are listed as (0 or 1),
+ rather than (0, 1 or 2). This is because the paper did not take the terminal state, in which 2 bottles have been delivered, into account when calculating
+ the observation space. As such, the observation space of this implementation is larger than specified in the paper, having 360 possible states instead of 240.
+
## Reward Space
The reward space has 3 dimensions:
- time penalty: -1 for each time step
@@ -96,11 +100,11 @@ def __init__(
{
"location": Discrete(self.size),
"bottles_carrying": Discrete(3),
- "bottles_delivered": Discrete(2),
+ "bottles_delivered": Discrete(3),
"bottles_dropped": MultiBinary(self.size - 2),
}
)
- self.num_observations = 240
+ self.num_observations = 360
self.action_space = Discrete(3) # LEFT, RIGHT, PICKUP
self.num_actions = 3
@@ -220,7 +224,7 @@ def get_obs_idx(self, obs):
*[[bd > 0] for bd in obs["bottles_dropped"]],
]
)
- return np.ravel_multi_index(multi_index, tuple([self.size, 3, 2, *([2] * (self.size - 2))]))
+ return np.ravel_multi_index(multi_index, tuple([self.size, 3, 3, *([2] * (self.size - 2))]))
def _get_obs(self):
return {
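For reference, the enlarged observation space can be sanity-checked with a short NumPy sketch that mirrors `get_obs_idx` above; the observation values below are made up purely for illustration:

```python
import numpy as np

size = 5  # default number of cells
dims = (size, 3, 3, *([2] * (size - 2)))  # location, carrying, delivered, dropped flags
print(np.prod(dims))  # 360 joint states (with the old Discrete(2) it was 5 * 3 * 2 * 2**3 = 240)

# Flatten one (made-up) observation into a unique index, the same way get_obs_idx does.
obs = {
    "location": 2,
    "bottles_carrying": 1,
    "bottles_delivered": 0,
    "bottles_dropped": np.array([1, 0, 0]),
}
multi_index = np.array(
    [obs["location"], obs["bottles_carrying"], obs["bottles_delivered"], *[int(b > 0) for b in obs["bottles_dropped"]]]
)
print(np.ravel_multi_index(multi_index, dims))  # index in [0, 360)
```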
diff --git a/mo_gymnasium/envs/lunar_lander/lunar_lander.py b/mo_gymnasium/envs/lunar_lander/lunar_lander.py
index 091190a9..c8d94d6a 100644
--- a/mo_gymnasium/envs/lunar_lander/lunar_lander.py
+++ b/mo_gymnasium/envs/lunar_lander/lunar_lander.py
@@ -6,6 +6,7 @@
FPS,
LEG_DOWN,
MAIN_ENGINE_POWER,
+ MAIN_ENGINE_Y_LOCATION,
SCALE,
SIDE_ENGINE_AWAY,
SIDE_ENGINE_HEIGHT,
@@ -46,7 +47,7 @@ def __init__(self, *args, **kwargs):
def step(self, action):
assert self.lander is not None
- # Update wind
+ # Update wind and apply to the lander
assert self.lander is not None, "You forgot to call reset()"
if self.enable_wind and not (self.legs[0].ground_contact or self.legs[1].ground_contact):
# the function used for wind is tanh(sin(2 k x) + sin(pi k x)),
@@ -60,12 +61,13 @@ def step(self, action):
# the function used for torque is tanh(sin(2 k x) + sin(pi k x)),
# which is proven to never be periodic, k = 0.01
- torque_mag = math.tanh(math.sin(0.02 * self.torque_idx) + (math.sin(math.pi * 0.01 * self.torque_idx))) * (
- self.turbulence_power
+ torque_mag = (
+ math.tanh(math.sin(0.02 * self.torque_idx) + (math.sin(math.pi * 0.01 * self.torque_idx)))
+ * self.turbulence_power
)
self.torque_idx += 1
self.lander.ApplyTorque(
- (torque_mag),
+ torque_mag,
True,
)
@@ -74,9 +76,15 @@ def step(self, action):
else:
assert self.action_space.contains(action), f"{action!r} ({type(action)}) invalid "
- # Engines
+ # Apply Engine Impulses
+
+ # Tip is the (X and Y) components of the rotation of the lander.
tip = (math.sin(self.lander.angle), math.cos(self.lander.angle))
+
+ # Side is the (-Y and X) components of the rotation of the lander.
side = (-tip[1], tip[0])
+
+ # Generate two random numbers between -1/SCALE and 1/SCALE.
dispersion = [self.np_random.uniform(-1.0, +1.0) / SCALE for _ in range(2)]
m_power = 0.0
@@ -87,21 +95,29 @@ def step(self, action):
assert m_power >= 0.5 and m_power <= 1.0
else:
m_power = 1.0
+
# 4 is move a bit downwards, +-2 for randomness
- ox = tip[0] * (4 / SCALE + 2 * dispersion[0]) + side[0] * dispersion[1]
- oy = -tip[1] * (4 / SCALE + 2 * dispersion[0]) - side[1] * dispersion[1]
+ # The components of the impulse to be applied by the main engine.
+ ox = tip[0] * (MAIN_ENGINE_Y_LOCATION / SCALE + 2 * dispersion[0]) + side[0] * dispersion[1]
+ oy = -tip[1] * (MAIN_ENGINE_Y_LOCATION / SCALE + 2 * dispersion[0]) - side[1] * dispersion[1]
+
impulse_pos = (self.lander.position[0] + ox, self.lander.position[1] + oy)
- p = self._create_particle(
- 3.5, # 3.5 is here to make particle speed adequate
- impulse_pos[0],
- impulse_pos[1],
- m_power,
- ) # particles are just a decoration
- p.ApplyLinearImpulse(
- (ox * MAIN_ENGINE_POWER * m_power, oy * MAIN_ENGINE_POWER * m_power),
- impulse_pos,
- True,
- )
+ if self.render_mode is not None:
+ # particles are just a decoration, with no impact on the physics, so don't add them when not rendering
+ p = self._create_particle(
+ 3.5, # 3.5 is here to make particle speed adequate
+ impulse_pos[0],
+ impulse_pos[1],
+ m_power,
+ )
+ p.ApplyLinearImpulse(
+ (
+ ox * MAIN_ENGINE_POWER * m_power,
+ oy * MAIN_ENGINE_POWER * m_power,
+ ),
+ impulse_pos,
+ True,
+ )
self.lander.ApplyLinearImpulse(
(-ox * MAIN_ENGINE_POWER * m_power, -oy * MAIN_ENGINE_POWER * m_power),
impulse_pos,
@@ -110,26 +126,39 @@ def step(self, action):
s_power = 0.0
if (self.continuous and np.abs(action[1]) > 0.5) or (not self.continuous and action in [1, 3]):
- # Orientation engines
+ # Orientation/Side engines
if self.continuous:
direction = np.sign(action[1])
s_power = np.clip(np.abs(action[1]), 0.5, 1.0)
assert s_power >= 0.5 and s_power <= 1.0
else:
+ # action = 1 is left, action = 3 is right
direction = action - 2
s_power = 1.0
+
+ # The components of the impulse to be applied by the side engines.
ox = tip[0] * dispersion[0] + side[0] * (3 * dispersion[1] + direction * SIDE_ENGINE_AWAY / SCALE)
oy = -tip[1] * dispersion[0] - side[1] * (3 * dispersion[1] + direction * SIDE_ENGINE_AWAY / SCALE)
+
+            # The constant 17 is presumably meant to be SIDE_ENGINE_HEIGHT.
+            # However, SIDE_ENGINE_HEIGHT is defined as 14.
+ # This causes the position of the thrust on the body of the lander to change, depending on the orientation of the lander.
+ # This in turn results in an orientation dependent torque being applied to the lander.
impulse_pos = (
self.lander.position[0] + ox - tip[0] * 17 / SCALE,
self.lander.position[1] + oy + tip[1] * SIDE_ENGINE_HEIGHT / SCALE,
)
- p = self._create_particle(0.7, impulse_pos[0], impulse_pos[1], s_power)
- p.ApplyLinearImpulse(
- (ox * SIDE_ENGINE_POWER * s_power, oy * SIDE_ENGINE_POWER * s_power),
- impulse_pos,
- True,
- )
+ if self.render_mode is not None:
+ # particles are just a decoration, with no impact on the physics, so don't add them when not rendering
+ p = self._create_particle(0.7, impulse_pos[0], impulse_pos[1], s_power)
+ p.ApplyLinearImpulse(
+ (
+ ox * SIDE_ENGINE_POWER * s_power,
+ oy * SIDE_ENGINE_POWER * s_power,
+ ),
+ impulse_pos,
+ True,
+ )
self.lander.ApplyLinearImpulse(
(-ox * SIDE_ENGINE_POWER * s_power, -oy * SIDE_ENGINE_POWER * s_power),
impulse_pos,
@@ -140,6 +169,7 @@ def step(self, action):
pos = self.lander.position
vel = self.lander.linearVelocity
+
state = [
(pos.x - VIEWPORT_W / SCALE / 2) / (VIEWPORT_W / SCALE / 2),
(pos.y - (self.helipad_y + LEG_DOWN / SCALE)) / (VIEWPORT_H / SCALE / 2),
diff --git a/mo_gymnasium/envs/mujoco/__init__.py b/mo_gymnasium/envs/mujoco/__init__.py
index 5b605ff5..4415d577 100644
--- a/mo_gymnasium/envs/mujoco/__init__.py
+++ b/mo_gymnasium/envs/mujoco/__init__.py
@@ -39,6 +39,37 @@
kwargs={"cost_objective": False},
)
+register(
+ id="mo-walker2d-v4",
+ entry_point="mo_gymnasium.envs.mujoco.walker2d:MOWalker2dEnv",
+ max_episode_steps=1000,
+)
+
+register(
+ id="mo-ant-v4",
+ entry_point="mo_gymnasium.envs.mujoco.ant:MOAntEnv",
+ max_episode_steps=1000,
+)
+
+register(
+ id="mo-ant-2d-v4",
+ entry_point="mo_gymnasium.envs.mujoco.ant:MOAntEnv",
+ max_episode_steps=1000,
+ kwargs={"cost_objective": False},
+)
+
+register(
+ id="mo-swimmer-v4",
+ entry_point="mo_gymnasium.envs.mujoco.swimmer:MOSwimmerEnv",
+ max_episode_steps=1000,
+)
+
+register(
+ id="mo-humanoid-v4",
+ entry_point="mo_gymnasium.envs.mujoco.humanoid:MOHumanoidEnv",
+ max_episode_steps=1000,
+)
+
register(
id="mo-reacher-v4",
entry_point="mo_gymnasium.envs.mujoco.reacher_v4:MOReacherEnv",
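A minimal usage sketch for the newly registered environments, assuming the top-level `mo_gymnasium.make` helper used throughout this repository; printed values are illustrative only:

```python
import mo_gymnasium as mo_gym

# Instantiate one of the newly registered multi-objective MuJoCo environments.
env = mo_gym.make("mo-walker2d-v4")
print(env.unwrapped.reward_dim)          # 2
print(env.unwrapped.reward_space.shape)  # (2,) -> [velocity, energy]

obs, info = env.reset(seed=42)
obs, vec_reward, terminated, truncated, info = env.step(env.action_space.sample())
print(vec_reward)  # a length-2 numpy array, one entry per objective
```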
diff --git a/mo_gymnasium/envs/mujoco/ant.py b/mo_gymnasium/envs/mujoco/ant.py
new file mode 100644
index 00000000..cc2ba7ed
--- /dev/null
+++ b/mo_gymnasium/envs/mujoco/ant.py
@@ -0,0 +1,51 @@
+import numpy as np
+from gymnasium.envs.mujoco.ant_v4 import AntEnv
+from gymnasium.spaces import Box
+from gymnasium.utils import EzPickle
+
+
+class MOAntEnv(AntEnv, EzPickle):
+ """
+ ## Description
+ Multi-objective version of the AntEnv environment.
+
+ See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/ant/) for more information.
+
+    The original Gymnasium 'Ant-v4' reward can be recovered with the following linear scalarization:
+
+ env = mo_gym.make('mo-ant-v4', cost_objective=False)
+ LinearReward(env, weight=np.array([1.0, 0.0]))
+
+ ## Reward Space
+ The reward is 2- or 3-dimensional:
+ - 0: x-velocity
+ - 1: y-velocity
+ - 2: Control cost of the action
+    If the cost_objective flag is set to False, the reward is 2-dimensional, and the cost is added to the other objectives.
+ A healthy reward is added to all objectives.
+ """
+
+ def __init__(self, cost_objective=True, **kwargs):
+ super().__init__(**kwargs)
+ EzPickle.__init__(self, cost_objective, **kwargs)
+        self.cost_objective = cost_objective
+ self.reward_dim = 3 if cost_objective else 2
+ self.reward_space = Box(low=-np.inf, high=np.inf, shape=(self.reward_dim,))
+
+ def step(self, action):
+ observation, reward, terminated, truncated, info = super().step(action)
+ x_velocity = info["x_velocity"]
+ y_velocity = info["y_velocity"]
+ cost = info["reward_ctrl"]
+ healthy_reward = info["reward_survive"]
+
+        if self.cost_objective:
+ cost /= self._ctrl_cost_weight # Ignore the weight in the original AntEnv
+ vec_reward = np.array([x_velocity, y_velocity, cost], dtype=np.float32)
+ else:
+ vec_reward = np.array([x_velocity, y_velocity], dtype=np.float32)
+ vec_reward += cost
+
+ vec_reward += healthy_reward
+
+ return observation, vec_reward, terminated, truncated, info
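A hedged, runnable version of the scalarization snippet from the docstring above, assuming the top-level `LinearReward` wrapper also used in the tests below:

```python
import numpy as np
import mo_gymnasium as mo_gym

# Recover a scalar, Ant-v4-like reward from the 2-objective variant,
# as suggested by the docstring (control cost folded into both objectives).
env = mo_gym.make("mo-ant-v4", cost_objective=False)
env = mo_gym.LinearReward(env, weight=np.array([1.0, 0.0]))

obs, info = env.reset(seed=0)
obs, scalar_reward, terminated, truncated, info = env.step(env.action_space.sample())
print(scalar_reward)  # scalarized reward, comparable to the original Ant-v4 reward
```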
diff --git a/mo_gymnasium/envs/mujoco/half_cheetah_v4.py b/mo_gymnasium/envs/mujoco/half_cheetah_v4.py
index 8427cc52..0dac5308 100644
--- a/mo_gymnasium/envs/mujoco/half_cheetah_v4.py
+++ b/mo_gymnasium/envs/mujoco/half_cheetah_v4.py
@@ -11,6 +11,11 @@ class MOHalfCheehtahEnv(HalfCheetahEnv, EzPickle):
See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/half_cheetah/) for more information.
+    The original Gymnasium 'HalfCheetah-v4' reward can be recovered with the following linear scalarization:
+
+ env = mo_gym.make('mo-halfcheetah-v4')
+ LinearReward(env, weight=np.array([1.0, 1.0]))
+
## Reward Space
The reward is 2-dimensional:
- 0: Reward for running forward
diff --git a/mo_gymnasium/envs/mujoco/hopper_v4.py b/mo_gymnasium/envs/mujoco/hopper_v4.py
index 7d35b408..6fe0ed3c 100644
--- a/mo_gymnasium/envs/mujoco/hopper_v4.py
+++ b/mo_gymnasium/envs/mujoco/hopper_v4.py
@@ -11,6 +11,11 @@ class MOHopperEnv(HopperEnv, EzPickle):
See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/hopper/) for more information.
+    The original Gymnasium 'Hopper-v4' reward can be recovered with the following linear scalarization:
+
+ env = mo_gym.make('mo-hopper-v4', cost_objective=False)
+ LinearReward(env, weight=np.array([1.0, 0.0]))
+
## Reward Space
The reward is 3-dimensional:
- 0: Reward for going forward on the x-axis
diff --git a/mo_gymnasium/envs/mujoco/humanoid.py b/mo_gymnasium/envs/mujoco/humanoid.py
new file mode 100644
index 00000000..12518cd8
--- /dev/null
+++ b/mo_gymnasium/envs/mujoco/humanoid.py
@@ -0,0 +1,34 @@
+import numpy as np
+from gymnasium.envs.mujoco.humanoid_v4 import HumanoidEnv
+from gymnasium.spaces import Box
+from gymnasium.utils import EzPickle
+
+
+class MOHumanoidEnv(HumanoidEnv, EzPickle):
+ """
+ ## Description
+ Multi-objective version of the HumanoidEnv environment.
+
+ See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/humanoid/) for more information.
+
+ ## Reward Space
+ The reward is 2-dimensional:
+ - 0: Reward for running forward (x-velocity)
+ - 1: Control cost of the action
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ EzPickle.__init__(self, **kwargs)
+ self.reward_space = Box(low=-np.inf, high=np.inf, shape=(2,))
+ self.reward_dim = 2
+
+ def step(self, action):
+ observation, reward, terminated, truncated, info = super().step(action)
+ velocity = info["x_velocity"]
+ negative_cost = 10 * info["reward_quadctrl"]
+ vec_reward = np.array([velocity, negative_cost], dtype=np.float32)
+
+        vec_reward += self.healthy_reward  # All objectives are penalized when the agent falls
+
+ return observation, vec_reward, terminated, truncated, info
diff --git a/mo_gymnasium/envs/mujoco/swimmer.py b/mo_gymnasium/envs/mujoco/swimmer.py
new file mode 100644
index 00000000..72e5b59e
--- /dev/null
+++ b/mo_gymnasium/envs/mujoco/swimmer.py
@@ -0,0 +1,38 @@
+import numpy as np
+from gymnasium.envs.mujoco.swimmer_v4 import SwimmerEnv
+from gymnasium.spaces import Box
+from gymnasium.utils import EzPickle
+
+
+class MOSwimmerEnv(SwimmerEnv, EzPickle):
+ """
+ ## Description
+ Multi-objective version of the SwimmerEnv environment.
+
+ See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/swimmer/) for more information.
+
+    The original Gymnasium 'Swimmer-v4' reward can be recovered with the following linear scalarization:
+
+ env = mo_gym.make('mo-swimmer-v4')
+ LinearReward(env, weight=np.array([1.0, 1e-4]))
+
+ ## Reward Space
+ The reward is 2-dimensional:
+ - 0: Reward for moving forward (x-velocity)
+ - 1: Control cost of the action
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ EzPickle.__init__(self, **kwargs)
+ self.reward_space = Box(low=-np.inf, high=np.inf, shape=(2,))
+ self.reward_dim = 2
+
+ def step(self, action):
+ observation, reward, terminated, truncated, info = super().step(action)
+ velocity = info["x_velocity"]
+ energy = -np.sum(np.square(action))
+
+ vec_reward = np.array([velocity, energy], dtype=np.float32)
+
+ return observation, vec_reward, terminated, truncated, info
diff --git a/mo_gymnasium/envs/mujoco/walker2d.py b/mo_gymnasium/envs/mujoco/walker2d.py
new file mode 100644
index 00000000..e3806810
--- /dev/null
+++ b/mo_gymnasium/envs/mujoco/walker2d.py
@@ -0,0 +1,35 @@
+import numpy as np
+from gymnasium.envs.mujoco.walker2d_v4 import Walker2dEnv
+from gymnasium.spaces import Box
+from gymnasium.utils import EzPickle
+
+
+class MOWalker2dEnv(Walker2dEnv, EzPickle):
+ """
+ ## Description
+ Multi-objective version of the Walker2dEnv environment.
+
+ See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/walker2d/) for more information.
+
+ ## Reward Space
+ The reward is 2-dimensional:
+ - 0: Reward for running forward (x-velocity)
+ - 1: Control cost of the action
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ EzPickle.__init__(self, **kwargs)
+ self.reward_space = Box(low=-np.inf, high=np.inf, shape=(2,))
+ self.reward_dim = 2
+
+ def step(self, action):
+ observation, reward, terminated, truncated, info = super().step(action)
+ velocity = info["x_velocity"]
+ energy = -np.sum(np.square(action))
+
+ vec_reward = np.array([velocity, energy], dtype=np.float32)
+
+    vec_reward += self.healthy_reward  # All objectives are penalized when the agent falls
+
+ return observation, vec_reward, terminated, truncated, info
diff --git a/tests/test_envs.py b/tests/test_envs.py
index 7e338be4..28af4b0c 100644
--- a/tests/test_envs.py
+++ b/tests/test_envs.py
@@ -5,6 +5,7 @@
import pytest
from gymnasium.envs.registration import EnvSpec
from gymnasium.utils.env_checker import check_env, data_equivalence
+from gymnasium.utils.env_match import check_environments_match
import mo_gymnasium as mo_gym
@@ -40,6 +41,32 @@ def test_all_env_passive_env_checker(spec):
env.close()
+@pytest.mark.parametrize(
+ "gym_id, mo_gym_id",
+ [
+ ("MountainCar-v0", "mo-mountaincar-v0"),
+ ("MountainCarContinuous-v0", "mo-mountaincarcontinuous-v0"),
+ ("LunarLander-v2", "mo-lunar-lander-v2"),
+ # ("Reacher-v4", "mo-reacher-v4"), # use a different model and action space
+ ("Hopper-v4", "mo-hopper-v4"),
+ ("HalfCheetah-v4", "mo-halfcheetah-v4"),
+ ("Walker2d-v4", "mo-walker2d-v4"),
+ ("Ant-v4", "mo-ant-v4"),
+ ("Swimmer-v4", "mo-swimmer-v4"),
+ ("Humanoid-v4", "mo-humanoid-v4"),
+ ],
+)
+def test_gymnasium_equivalence(gym_id, mo_gym_id, num_steps=100, seed=123):
+ env = gym.make(gym_id)
+ mo_env = mo_gym.LinearReward(mo_gym.make(mo_gym_id))
+
+ # for float rewards, then precision becomes an issue
+    # with float rewards, precision becomes an issue, so round them before comparing
+ mo_env = gym.wrappers.TransformReward(mo_env, lambda reward: round(reward, 4))
+
+ check_environments_match(env, mo_env, num_steps=num_steps, seed=seed, skip_rew=True, info_comparison="keys-superset")
+
+
# Note that this precludes running this test in multiple threads.
# However, we probably already can't do multithreading due to some environments.
SEED = 0