Skip to content

Commit

Permalink
Merge branch 'gymnasium-v5' into mujoco-v5
Browse files Browse the repository at this point in the history
  • Loading branch information
Lucas Alegre committed Aug 14, 2024
2 parents cf081b0 + dbddf3a commit 57870fe
Show file tree
Hide file tree
Showing 22 changed files with 787 additions and 621 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ obs, info = env.reset()
next_obs, vector_reward, terminated, truncated, info = env.step(your_agent.act(obs))

# Optionally, you can scalarize the reward function with the LinearReward wrapper
env = mo_gym.LinearReward(env, weight=np.array([0.8, 0.2, 0.2]))
env = mo_gym.wrappers.LinearReward(env, weight=np.array([0.8, 0.2, 0.2]))
```
For details on multi-objective MDPs (MOMDPs) and other MORL definitions, see [A practical guide to multi-objective reinforcement learning and planning](https://link.springer.com/article/10.1007/s10458-022-09552-y).

Expand Down
1 change: 1 addition & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ lastpage:
introduction/install
introduction/api
wrappers/wrappers
wrappers/vector_wrappers
examples/morl_baselines
```

Expand Down
20 changes: 20 additions & 0 deletions docs/wrappers/vector_wrappers.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
---
title: "Vector Wrappers"
---

# Vector Wrappers

Similar to the normal wrappers, MO-Gymnasium provides a few wrappers that are specifically designed to work with vectorized environments. They are all available directly from the `mo_gymnasium.wrappers.vector` module.


## `MOSyncVectorEnv`

```{eval-rst}
.. autoclass:: mo_gymnasium.wrappers.vector.MOSyncVectorEnv
```

## `MORecordEpisodeStatistics`

```{eval-rst}
.. autoclass:: mo_gymnasium.wrappers.vector.MORecordEpisodeStatistics
```
16 changes: 8 additions & 8 deletions docs/wrappers/wrappers.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,36 +4,36 @@ title: "Wrappers"

# Wrappers

A few wrappers inspired from Gymnasium's wrappers are available in MO-Gymnasium. They are all available directly from the `mo_gymnasium` module.
A few wrappers inspired by Gymnasium's wrappers are available in MO-Gymnasium. They are all available directly from the `mo_gymnasium.wrappers` module.


## `LinearReward`


```{eval-rst}
.. autoclass:: mo_gymnasium.LinearReward
.. autoclass:: mo_gymnasium.wrappers.LinearReward
```

## `MONormalizeReward`

```{eval-rst}
.. autoclass:: mo_gymnasium.MONormalizeReward
.. autoclass:: mo_gymnasium.wrappers.MONormalizeReward
```

## `MOClipReward`

```{eval-rst}
.. autoclass:: mo_gymnasium.MOClipReward
.. autoclass:: mo_gymnasium.wrappers.MOClipReward
```

## `MOSyncVectorEnv`
## `MORecordEpisodeStatistics`

```{eval-rst}
.. autoclass:: mo_gymnasium.MOSyncVectorEnv
.. autoclass:: mo_gymnasium.wrappers.MORecordEpisodeStatistics
```

## `MORecordEpisodeStatistics`
## `MOMaxAndSkipObservation`

```{eval-rst}
.. autoclass:: mo_gymnasium.MORecordEpisodeStatistics
.. autoclass:: mo_gymnasium.wrappers.MOMaxAndSkipObservation
```
12 changes: 3 additions & 9 deletions mo_gymnasium/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,10 @@

# Envs
import mo_gymnasium.envs
from mo_gymnasium import wrappers

# Utils
from mo_gymnasium.utils import (
LinearReward,
MOClipReward,
MONormalizeReward,
MORecordEpisodeStatistics,
MOSyncVectorEnv,
make,
)
from mo_gymnasium.utils import make


__version__ = "1.1.0"
__version__ = "1.2.0"
1 change: 0 additions & 1 deletion mo_gymnasium/envs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,5 @@
import mo_gymnasium.envs.minecart
import mo_gymnasium.envs.mountain_car
import mo_gymnasium.envs.mujoco
import mo_gymnasium.envs.reacher
import mo_gymnasium.envs.resource_gathering
import mo_gymnasium.envs.water_reservoir
4 changes: 2 additions & 2 deletions mo_gymnasium/envs/lunar_lander/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@


register(
id="mo-lunar-lander-v2",
id="mo-lunar-lander-v3",
entry_point="mo_gymnasium.envs.lunar_lander.lunar_lander:MOLunarLander",
max_episode_steps=1000,
)

register(
id="mo-lunar-lander-continuous-v2",
id="mo-lunar-lander-continuous-v3",
entry_point="mo_gymnasium.envs.lunar_lander.lunar_lander:MOLunarLander",
max_episode_steps=1000,
kwargs={"continuous": True},
Expand Down
8 changes: 5 additions & 3 deletions mo_gymnasium/envs/mario/mario.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from gymnasium.utils import EzPickle, seeding

# from stable_baselines3.common.atari_wrappers import MaxAndSkipEnv
from gymnasium.wrappers import GrayScaleObservation, ResizeObservation
from nes_py.nes_env import SCREEN_SHAPE_24_BIT

import mo_gymnasium as mo_gym
Expand All @@ -16,7 +15,7 @@
from mo_gymnasium.envs.mario.joypad_space import JoypadSpace


class MOSuperMarioBros(SuperMarioBrosEnv, EzPickle):
class MOSuperMarioBros(SuperMarioBrosEnv, gym.Env, EzPickle):
"""
## Description
Multi-objective version of the SuperMarioBros environment.
Expand Down Expand Up @@ -202,11 +201,14 @@ def step(self, action):


if __name__ == "__main__":
from gymnasium.wrappers import ResizeObservation
from gymnasium.wrappers.transform_observation import GrayscaleObservation

env = MOSuperMarioBros()
env = JoypadSpace(env, SIMPLE_MOVEMENT)
# env = MaxAndSkipEnv(env, 4)
env = ResizeObservation(env, (84, 84))
env = GrayScaleObservation(env)
env = GrayscaleObservation(env)
# env = FrameStack(env, 4)
env = mo_gym.LinearReward(env)

Expand Down
21 changes: 21 additions & 0 deletions mo_gymnasium/envs/mountain_car/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,24 @@
entry_point="mo_gymnasium.envs.mountain_car.mountain_car:MOMountainCar",
max_episode_steps=200,
)

register(
id="mo-mountaincar-3d-v0",
entry_point="mo_gymnasium.envs.mountain_car.mountain_car:MOMountainCar",
max_episode_steps=200,
kwargs={"add_speed_objective": True, "merge_move_penalty": True},
)

register(
id="mo-mountaincar-timemove-v0",
entry_point="mo_gymnasium.envs.mountain_car.mountain_car:MOMountainCar",
max_episode_steps=200,
kwargs={"merge_move_penalty": True},
)

register(
id="mo-mountaincar-timespeed-v0",
entry_point="mo_gymnasium.envs.mountain_car.mountain_car:MOMountainCar",
max_episode_steps=200,
kwargs={"remove_move_penalty": True, "add_speed_objective": True},
)
56 changes: 48 additions & 8 deletions mo_gymnasium/envs/mountain_car/mountain_car.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,50 @@ class MOMountainCar(MountainCarEnv, EzPickle):
See [Gymnasium's env](https://gymnasium.farama.org/environments/classic_control/mountain_car_continuous/) for more information.
## Reward space:
The reward space is a 3D vector containing the time penalty, and penalties for reversing and going forward.
By default, the reward space is a 3D vector containing the time penalty, and penalties for reversing and going forward.
- time penalty: -1.0 for each time step
- reverse penalty: -1.0 for each time step the action is 0 (reverse)
- forward penalty: -1.0 for each time step the action is 2 (forward)
Alternatively, the reward can be changed with the following options:
- add_speed_objective: Add an extra objective corresponding to the speed of the car.
- remove_move_penalty: Remove the reverse and forward objectives.
- merge_move_penalty: Merge reverse and forward penalties into a single penalty.
"""

def __init__(self, render_mode: Optional[str] = None, goal_velocity=0):
def __init__(
self,
render_mode: Optional[str] = None,
add_speed_objective: bool = False,
remove_move_penalty: bool = False,
merge_move_penalty: bool = False,
goal_velocity=0,
):
super().__init__(render_mode, goal_velocity)
EzPickle.__init__(self, render_mode, goal_velocity)
EzPickle.__init__(self, render_mode, add_speed_objective, remove_move_penalty, merge_move_penalty, goal_velocity)
self.add_speed_objective = add_speed_objective
self.remove_move_penalty = remove_move_penalty
self.merge_move_penalty = merge_move_penalty

self.reward_space = spaces.Box(low=np.array([-1, -1, -1]), high=np.array([-1, 0, 0]), shape=(3,), dtype=np.float32)
self.reward_dim = 3

if self.add_speed_objective:
self.reward_dim += 1

if self.remove_move_penalty:
self.reward_dim -= 2
elif self.merge_move_penalty:
self.reward_dim -= 1

low = np.array([-1] * self.reward_dim)
high = np.zeros(self.reward_dim)
high[0] = -1 # Time penalty is always -1
if self.add_speed_objective:
low[-1] = 0.0
high[-1] = 1.1

self.reward_space = spaces.Box(low=low, high=high, shape=(self.reward_dim,), dtype=np.float32)

def step(self, action: int):
assert self.action_space.contains(action), f"{action!r} ({type(action)}) invalid"

Expand All @@ -39,11 +70,20 @@ def step(self, action: int):
velocity = 0

terminated = bool(position >= self.goal_position and velocity >= self.goal_velocity)
# reward = -1.0
reward = np.zeros(3, dtype=np.float32)

reward = np.zeros(self.reward_dim, dtype=np.float32)

reward[0] = 0.0 if terminated else -1.0 # time penalty
reward[1] = 0.0 if action != 0 else -1.0 # reverse penalty
reward[2] = 0.0 if action != 2 else -1.0 # forward penalty

if not self.remove_move_penalty:
if self.merge_move_penalty:
reward[1] = 0.0 if action == 1 else -1.0
else:
reward[1] = 0.0 if action != 0 else -1.0 # reverse penalty
reward[2] = 0.0 if action != 2 else -1.0 # forward penalty

if self.add_speed_objective:
reward[-1] = 15 * abs(velocity)

self.state = (position, velocity)
if self.render_mode == "human":
Expand Down
2 changes: 1 addition & 1 deletion mo_gymnasium/envs/mujoco/reacher_v4.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
class MOReacherEnv(ReacherEnv):
"""
## Description
Mujoco version of `mo-reacher-v0`, based on [`Reacher-v4` environment](https://gymnasium.farama.org/environments/mujoco/reacher/).
Multi-objective version of the [`Reacher-v4` environment](https://gymnasium.farama.org/environments/mujoco/reacher/).
## Observation Space
The observation is 6-dimensional and contains:
Expand Down
9 changes: 0 additions & 9 deletions mo_gymnasium/envs/reacher/__init__.py

This file was deleted.

Loading

0 comments on commit 57870fe

Please sign in to comment.