Add Mujoco v5 environments (#85)
Co-authored-by: Lucas Alegre <[email protected]>
Co-authored-by: Mark Towers <[email protected]>
Co-authored-by: Florian Felten <[email protected]>
3 people authored Oct 28, 2024
1 parent 5099244 commit 5b248ed
Showing 15 changed files with 456 additions and 15 deletions.
75 changes: 66 additions & 9 deletions mo_gymnasium/envs/mujoco/__init__.py
@@ -3,56 +3,113 @@

register(
id="mo-halfcheetah-v4",
entry_point="mo_gymnasium.envs.mujoco.half_cheetah:MOHalfCheehtahEnv",
entry_point="mo_gymnasium.envs.mujoco.half_cheetah_v4:MOHalfCheehtahEnv",
max_episode_steps=1000,
)

register(
id="mo-halfcheetah-v5",
entry_point="mo_gymnasium.envs.mujoco.half_cheetah_v5:MOHalfCheehtahEnv",
max_episode_steps=1000,
)

register(
id="mo-hopper-v4",
entry_point="mo_gymnasium.envs.mujoco.hopper:MOHopperEnv",
entry_point="mo_gymnasium.envs.mujoco.hopper_v4:MOHopperEnv",
max_episode_steps=1000,
)

register(
id="mo-hopper-v5",
entry_point="mo_gymnasium.envs.mujoco.hopper_v5:MOHopperEnv",
max_episode_steps=1000,
)

register(
id="mo-hopper-2d-v4",
entry_point="mo_gymnasium.envs.mujoco.hopper:MOHopperEnv",
entry_point="mo_gymnasium.envs.mujoco.hopper_v4:MOHopperEnv",
max_episode_steps=1000,
kwargs={"cost_objective": False},
)

register(
id="mo-hopper-2obj-v5",
entry_point="mo_gymnasium.envs.mujoco.hopper_v5:MOHopperEnv",
max_episode_steps=1000,
kwargs={"cost_objective": False},
)

register(
id="mo-walker2d-v4",
entry_point="mo_gymnasium.envs.mujoco.walker2d:MOWalker2dEnv",
entry_point="mo_gymnasium.envs.mujoco.walker2d_v4:MOWalker2dEnv",
max_episode_steps=1000,
)

register(
id="mo-walker2d-v5",
entry_point="mo_gymnasium.envs.mujoco.walker2d_v5:MOWalker2dEnv",
max_episode_steps=1000,
)

register(
id="mo-ant-v4",
entry_point="mo_gymnasium.envs.mujoco.ant:MOAntEnv",
entry_point="mo_gymnasium.envs.mujoco.ant_v4:MOAntEnv",
max_episode_steps=1000,
)

register(
id="mo-ant-2d-v4",
entry_point="mo_gymnasium.envs.mujoco.ant:MOAntEnv",
entry_point="mo_gymnasium.envs.mujoco.ant_v4:MOAntEnv",
max_episode_steps=1000,
kwargs={"cost_objective": False},
)


register(
id="mo-ant-v5",
entry_point="mo_gymnasium.envs.mujoco.ant_v5:MOAntEnv",
max_episode_steps=1000,
)

register(
id="mo-ant-2obj-v5",
entry_point="mo_gymnasium.envs.mujoco.ant_v5:MOAntEnv",
max_episode_steps=1000,
kwargs={"cost_objective": False},
)

register(
id="mo-swimmer-v4",
entry_point="mo_gymnasium.envs.mujoco.swimmer:MOSwimmerEnv",
entry_point="mo_gymnasium.envs.mujoco.swimmer_v4:MOSwimmerEnv",
max_episode_steps=1000,
)

register(
id="mo-swimmer-v5",
entry_point="mo_gymnasium.envs.mujoco.swimmer_v5:MOSwimmerEnv",
max_episode_steps=1000,
)

register(
id="mo-humanoid-v4",
entry_point="mo_gymnasium.envs.mujoco.humanoid:MOHumanoidEnv",
entry_point="mo_gymnasium.envs.mujoco.humanoid_v4:MOHumanoidEnv",
max_episode_steps=1000,
)

register(
id="mo-humanoid-v5",
entry_point="mo_gymnasium.envs.mujoco.humanoid_v5:MOHumanoidEnv",
max_episode_steps=1000,
)

register(
id="mo-reacher-v4",
entry_point="mo_gymnasium.envs.mujoco.reacher:MOReacherEnv",
entry_point="mo_gymnasium.envs.mujoco.reacher_v4:MOReacherEnv",
max_episode_steps=50,
)

register(
id="mo-reacher-v5",
entry_point="mo_gymnasium.envs.mujoco.reacher_v5:MOReacherEnv",
max_episode_steps=50,
)
mo_gymnasium/envs/mujoco/ant_v4.py
@@ -28,7 +28,7 @@ class MOAntEnv(AntEnv, EzPickle):
def __init__(self, cost_objective=True, **kwargs):
super().__init__(**kwargs)
EzPickle.__init__(self, cost_objective, **kwargs)
self.cost_objetive = cost_objective
self._cost_objective = cost_objective
self.reward_dim = 3 if cost_objective else 2
self.reward_space = Box(low=-np.inf, high=np.inf, shape=(self.reward_dim,))

@@ -39,7 +39,7 @@ def step(self, action):
cost = info["reward_ctrl"]
healthy_reward = info["reward_survive"]

if self.cost_objetive:
if self._cost_objective:
cost /= self._ctrl_cost_weight # Ignore the weight in the original AntEnv
vec_reward = np.array([x_velocity, y_velocity, cost], dtype=np.float32)
else:
60 changes: 60 additions & 0 deletions mo_gymnasium/envs/mujoco/ant_v5.py
@@ -0,0 +1,60 @@
import numpy as np
from gymnasium.envs.mujoco.ant_v5 import AntEnv
from gymnasium.spaces import Box
from gymnasium.utils import EzPickle


class MOAntEnv(AntEnv, EzPickle):
"""
## Description
Multi-objective version of the AntEnv environment.
See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/ant/) for more information.
The original Gymnasium's 'Ant-v5' is recovered by the following linear scalarization:
env = mo_gym.make('mo-ant-v5', cost_objective=False)
LinearReward(env, weight=np.array([1.0, 0.0]))
## Reward Space
The reward is 2- or 3-dimensional:
- 0: x-velocity
- 1: y-velocity
- 2: Control cost of the action
If the cost_objective flag is set to False, the reward is 2-dimensional, and the cost is added to the other objectives.
A healthy reward and a cost for contact forces are added to all objectives.
A 2-objective version (without the cost objective as a separate objective) can be instantiated via:
env = mo_gym.make('mo-ant-2obj-v5')
## Version History
- v5: Now includes contact forces in the reward and observation.
The 2-objective version now has the id 'mo-ant-2obj-v5' instead of 'mo-ant-2d-v4'.
See https://gymnasium.farama.org/environments/mujoco/ant/#version-history
"""

def __init__(self, cost_objective=True, **kwargs):
super().__init__(**kwargs)
EzPickle.__init__(self, cost_objective, **kwargs)
self._cost_objective = cost_objective
self.reward_dim = 3 if cost_objective else 2
self.reward_space = Box(low=-np.inf, high=np.inf, shape=(self.reward_dim,))

def step(self, action):
observation, reward, terminated, truncated, info = super().step(action)
x_velocity = info["x_velocity"]
y_velocity = info["y_velocity"]
cost = info["reward_ctrl"]
healthy_reward = info["reward_survive"]

if self._cost_objective:
cost /= self._ctrl_cost_weight # Ignore the weight in the original AntEnv
vec_reward = np.array([x_velocity, y_velocity, cost], dtype=np.float32)
else:
vec_reward = np.array([x_velocity, y_velocity], dtype=np.float32)
vec_reward += cost

vec_reward += healthy_reward
vec_reward += info["reward_contact"] # Do not treat contact forces as a separate objective

return observation, vec_reward, terminated, truncated, info
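
To illustrate the scalarization mentioned in the docstring, a minimal sketch, assuming the LinearReward wrapper is available from mo_gymnasium.wrappers in this release:

    import numpy as np
    import mo_gymnasium as mo_gym
    from mo_gymnasium.wrappers import LinearReward  # assumed import path for this release

    # With the cost folded into both objectives (the 2-objective variant), weighting only
    # the x-velocity component reproduces the scalar reward of Gymnasium's Ant-v5.
    env = LinearReward(mo_gym.make("mo-ant-2obj-v5"), weight=np.array([1.0, 0.0]))
    obs, info = env.reset(seed=0)
    obs, scalar_reward, terminated, truncated, info = env.step(env.action_space.sample())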
File renamed without changes.
40 changes: 40 additions & 0 deletions mo_gymnasium/envs/mujoco/half_cheetah_v5.py
@@ -0,0 +1,40 @@
import numpy as np
from gymnasium.envs.mujoco.half_cheetah_v5 import HalfCheetahEnv
from gymnasium.spaces import Box
from gymnasium.utils import EzPickle


class MOHalfCheehtahEnv(HalfCheetahEnv, EzPickle):
"""
## Description
Multi-objective version of the HalfCheetahEnv environment.
See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/half_cheetah/) for more information.
The original Gymnasium's 'HalfCheetah-v5' is recovered by the following linear scalarization:
env = mo_gym.make('mo-halfcheetah-v5')
LinearReward(env, weight=np.array([1.0, 0.1]))
## Reward Space
The reward is 2-dimensional:
- 0: Reward for running forward
- 1: Control cost of the action
## Version History
- v5: The scale of the control cost has changed from v4.
See https://gymnasium.farama.org/environments/mujoco/half_cheetah/#version-history for other changes.
"""

def __init__(self, **kwargs):
super().__init__(**kwargs)
EzPickle.__init__(self, **kwargs)
self.reward_space = Box(low=-np.inf, high=np.inf, shape=(2,))
self.reward_dim = 2

def step(self, action):
observation, reward, terminated, truncated, info = super().step(action)
x_velocity = info["x_velocity"]
neg_energy_cost = info["reward_ctrl"] / self._ctrl_cost_weight # Revert the scale applied in the original environment
vec_reward = np.array([x_velocity, neg_energy_cost], dtype=np.float32)
return observation, vec_reward, terminated, truncated, info
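
As a sanity check of the weights quoted in the docstring, a small sketch assuming the default ctrl_cost_weight of 0.1 and forward_reward_weight of 1.0:

    import numpy as np
    import mo_gymnasium as mo_gym

    env = mo_gym.make("mo-halfcheetah-v5")
    obs, info = env.reset(seed=1)
    obs, vec_reward, terminated, truncated, info = env.step(env.action_space.sample())
    # vec_reward = [x_velocity, reward_ctrl / ctrl_cost_weight]; with the default weight
    # of 0.1 the dot product below matches Gymnasium's scalar HalfCheetah-v5 reward.
    scalarized = float(np.dot(np.array([1.0, 0.1]), vec_reward))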
mo_gymnasium/envs/mujoco/hopper_v4.py
@@ -27,7 +27,7 @@ class MOHopperEnv(HopperEnv, EzPickle):
def __init__(self, cost_objective=True, **kwargs):
super().__init__(**kwargs)
EzPickle.__init__(self, cost_objective, **kwargs)
self.cost_objetive = cost_objective
self._cost_objective = cost_objective
self.reward_dim = 3 if cost_objective else 2
self.reward_space = Box(low=-np.inf, high=np.inf, shape=(self.reward_dim,))

@@ -53,7 +53,7 @@ def step(self, action):
height = 10 * (z - self.init_qpos[1])
energy_cost = np.sum(np.square(action))

if self.cost_objetive:
if self._cost_objective:
vec_reward = np.array([x_velocity, height, -energy_cost], dtype=np.float32)
else:
vec_reward = np.array([x_velocity, height], dtype=np.float32)
55 changes: 55 additions & 0 deletions mo_gymnasium/envs/mujoco/hopper_v5.py
@@ -0,0 +1,55 @@
import numpy as np
from gymnasium.envs.mujoco.hopper_v5 import HopperEnv
from gymnasium.spaces import Box
from gymnasium.utils import EzPickle


class MOHopperEnv(HopperEnv, EzPickle):
"""
## Description
Multi-objective version of the HopperEnv environment.
See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/hopper/) for more information.
The original Gymnasium's 'Hopper-v5' is recovered by the following linear scalarization:
env = mo_gym.make('mo-hopper-v5')
LinearReward(env, weight=np.array([1.0, 0.0, 1e-3]))
## Reward Space
The reward is 3-dimensional:
- 0: Reward for going forward on the x-axis
- 1: Reward for jumping high on the z-axis
- 2: Control cost of the action
If the cost_objective flag is set to False, the reward is 2-dimensional, and the cost is added to the other objectives.
A 2-objective version (without the cost objective as a separate objective) can be instantiated via:
env = mo_gym.make('mo-hopper-2obj-v5')
## Version History
- v5: The 2-objective version now has the id 'mo-hopper-2obj-v5' instead of 'mo-hopper-2d-v4'.
See https://gymnasium.farama.org/environments/mujoco/hopper/#version-history
"""

def __init__(self, cost_objective=True, **kwargs):
super().__init__(**kwargs)
EzPickle.__init__(self, cost_objective, **kwargs)
self._cost_objective = cost_objective
self.reward_dim = 3 if cost_objective else 2
self.reward_space = Box(low=-np.inf, high=np.inf, shape=(self.reward_dim,))

def step(self, action):
observation, reward, terminated, truncated, info = super().step(action)
x_velocity = info["x_velocity"]
height = 10 * info["z_distance_from_origin"]
neg_energy_cost = info["reward_ctrl"]
if self._cost_objective:
neg_energy_cost /= self._ctrl_cost_weight # Revert the scale applied in the original environment
vec_reward = np.array([x_velocity, height, neg_energy_cost], dtype=np.float32)
else:
vec_reward = np.array([x_velocity, height], dtype=np.float32)
vec_reward += neg_energy_cost

vec_reward += info["reward_survive"]

return observation, vec_reward, terminated, truncated, info
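
For reference, a short sketch contrasting the default three-objective id with the two-objective registration mentioned above (reward_dim is set in __init__):

    import mo_gymnasium as mo_gym

    env3 = mo_gym.make("mo-hopper-v5")       # objectives: x-velocity, height, control cost
    env2 = mo_gym.make("mo-hopper-2obj-v5")  # cost folded into the other objectives
    assert env3.unwrapped.reward_dim == 3
    assert env2.unwrapped.reward_dim == 2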
File renamed without changes.
44 changes: 44 additions & 0 deletions mo_gymnasium/envs/mujoco/humanoid_v5.py
@@ -0,0 +1,44 @@
import numpy as np
from gymnasium.envs.mujoco.humanoid_v5 import HumanoidEnv
from gymnasium.spaces import Box
from gymnasium.utils import EzPickle


class MOHumanoidEnv(HumanoidEnv, EzPickle):
"""
## Description
Multi-objective version of the HumanoidEnv environment.
See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/humanoid/) for more information.
The original Gymnasium's 'Humanoid-v5' is recovered by the following linear scalarization:
env = mo_gym.make('mo-humanoid-v5')
LinearReward(env, weight=np.array([1.25, 0.1]))
## Reward Space
The reward is 2-dimensional:
- 0: Reward for running forward (x-velocity)
- 1: Control cost of the action
## Version History
- v5: Now includes contact forces. The scale of the control cost has changed from v4.
See https://gymnasium.farama.org/environments/mujoco/humanoid/#version-history for other changes.
"""

def __init__(self, **kwargs):
super().__init__(**kwargs)
EzPickle.__init__(self, **kwargs)
self.reward_space = Box(low=-np.inf, high=np.inf, shape=(2,))
self.reward_dim = 2

def step(self, action):
observation, reward, terminated, truncated, info = super().step(action)
velocity = info["x_velocity"]
neg_energy_cost = info["reward_ctrl"] / self._ctrl_cost_weight # Revert the scale applied in the original environment
vec_reward = np.array([velocity, neg_energy_cost], dtype=np.float32)

vec_reward += self.healthy_reward # All objectives are penalized when the agent falls
vec_reward += info["reward_contact"] # Do not treat contact forces as a separate objective

return observation, vec_reward, terminated, truncated, info
File renamed without changes.
