diff --git a/mo_gymnasium/envs/mujoco/__init__.py b/mo_gymnasium/envs/mujoco/__init__.py index 12b77130..92245429 100644 --- a/mo_gymnasium/envs/mujoco/__init__.py +++ b/mo_gymnasium/envs/mujoco/__init__.py @@ -3,56 +3,113 @@ register( id="mo-halfcheetah-v4", - entry_point="mo_gymnasium.envs.mujoco.half_cheetah:MOHalfCheehtahEnv", + entry_point="mo_gymnasium.envs.mujoco.half_cheetah_v4:MOHalfCheehtahEnv", + max_episode_steps=1000, +) + +register( + id="mo-halfcheetah-v5", + entry_point="mo_gymnasium.envs.mujoco.half_cheetah_v5:MOHalfCheehtahEnv", max_episode_steps=1000, ) register( id="mo-hopper-v4", - entry_point="mo_gymnasium.envs.mujoco.hopper:MOHopperEnv", + entry_point="mo_gymnasium.envs.mujoco.hopper_v4:MOHopperEnv", + max_episode_steps=1000, +) + +register( + id="mo-hopper-v5", + entry_point="mo_gymnasium.envs.mujoco.hopper_v5:MOHopperEnv", max_episode_steps=1000, ) register( id="mo-hopper-2d-v4", - entry_point="mo_gymnasium.envs.mujoco.hopper:MOHopperEnv", + entry_point="mo_gymnasium.envs.mujoco.hopper_v4:MOHopperEnv", + max_episode_steps=1000, + kwargs={"cost_objective": False}, +) + +register( + id="mo-hopper-2obj-v5", + entry_point="mo_gymnasium.envs.mujoco.hopper_v5:MOHopperEnv", max_episode_steps=1000, kwargs={"cost_objective": False}, ) register( id="mo-walker2d-v4", - entry_point="mo_gymnasium.envs.mujoco.walker2d:MOWalker2dEnv", + entry_point="mo_gymnasium.envs.mujoco.walker2d_v4:MOWalker2dEnv", + max_episode_steps=1000, +) + +register( + id="mo-walker2d-v5", + entry_point="mo_gymnasium.envs.mujoco.walker2d_v5:MOWalker2dEnv", max_episode_steps=1000, ) register( id="mo-ant-v4", - entry_point="mo_gymnasium.envs.mujoco.ant:MOAntEnv", + entry_point="mo_gymnasium.envs.mujoco.ant_v4:MOAntEnv", max_episode_steps=1000, ) register( id="mo-ant-2d-v4", - entry_point="mo_gymnasium.envs.mujoco.ant:MOAntEnv", + entry_point="mo_gymnasium.envs.mujoco.ant_v4:MOAntEnv", + max_episode_steps=1000, + kwargs={"cost_objective": False}, +) + + +register( + id="mo-ant-v5", + entry_point="mo_gymnasium.envs.mujoco.ant_v5:MOAntEnv", + max_episode_steps=1000, +) + +register( + id="mo-ant-2obj-v5", + entry_point="mo_gymnasium.envs.mujoco.ant_v5:MOAntEnv", max_episode_steps=1000, kwargs={"cost_objective": False}, ) register( id="mo-swimmer-v4", - entry_point="mo_gymnasium.envs.mujoco.swimmer:MOSwimmerEnv", + entry_point="mo_gymnasium.envs.mujoco.swimmer_v4:MOSwimmerEnv", + max_episode_steps=1000, +) + +register( + id="mo-swimmer-v5", + entry_point="mo_gymnasium.envs.mujoco.swimmer_v5:MOSwimmerEnv", max_episode_steps=1000, ) register( id="mo-humanoid-v4", - entry_point="mo_gymnasium.envs.mujoco.humanoid:MOHumanoidEnv", + entry_point="mo_gymnasium.envs.mujoco.humanoid_v4:MOHumanoidEnv", + max_episode_steps=1000, +) + +register( + id="mo-humanoid-v5", + entry_point="mo_gymnasium.envs.mujoco.humanoid_v5:MOHumanoidEnv", max_episode_steps=1000, ) register( id="mo-reacher-v4", - entry_point="mo_gymnasium.envs.mujoco.reacher:MOReacherEnv", + entry_point="mo_gymnasium.envs.mujoco.reacher_v4:MOReacherEnv", + max_episode_steps=50, +) + +register( + id="mo-reacher-v5", + entry_point="mo_gymnasium.envs.mujoco.reacher_v5:MOReacherEnv", max_episode_steps=50, ) diff --git a/mo_gymnasium/envs/mujoco/ant.py b/mo_gymnasium/envs/mujoco/ant_v4.py similarity index 95% rename from mo_gymnasium/envs/mujoco/ant.py rename to mo_gymnasium/envs/mujoco/ant_v4.py index cc2ba7ed..695b6d57 100644 --- a/mo_gymnasium/envs/mujoco/ant.py +++ b/mo_gymnasium/envs/mujoco/ant_v4.py @@ -28,7 +28,7 @@ class MOAntEnv(AntEnv, EzPickle): def 
__init__(self, cost_objective=True, **kwargs): super().__init__(**kwargs) EzPickle.__init__(self, cost_objective, **kwargs) - self.cost_objetive = cost_objective + self._cost_objective = cost_objective self.reward_dim = 3 if cost_objective else 2 self.reward_space = Box(low=-np.inf, high=np.inf, shape=(self.reward_dim,)) @@ -39,7 +39,7 @@ def step(self, action): cost = info["reward_ctrl"] healthy_reward = info["reward_survive"] - if self.cost_objetive: + if self._cost_objective: cost /= self._ctrl_cost_weight # Ignore the weight in the original AntEnv vec_reward = np.array([x_velocity, y_velocity, cost], dtype=np.float32) else: diff --git a/mo_gymnasium/envs/mujoco/ant_v5.py b/mo_gymnasium/envs/mujoco/ant_v5.py new file mode 100644 index 00000000..8d4d9ae7 --- /dev/null +++ b/mo_gymnasium/envs/mujoco/ant_v5.py @@ -0,0 +1,60 @@ +import numpy as np +from gymnasium.envs.mujoco.ant_v5 import AntEnv +from gymnasium.spaces import Box +from gymnasium.utils import EzPickle + + +class MOAntEnv(AntEnv, EzPickle): + """ + ## Description + Multi-objective version of the AntEnv environment. + + See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/ant/) for more information. + + The original Gymnasium's 'Ant-v5' is recovered by the following linear scalarization: + + env = mo_gym.make('mo-ant-v5', cost_objective=False) + LinearReward(env, weight=np.array([1.0, 0.0])) + + ## Reward Space + The reward is 2- or 3-dimensional: + - 0: x-velocity + - 1: y-velocity + - 2: Control cost of the action + If the cost_objective flag is set to False, the reward is 2-dimensional, and the cost is added to the other objectives. + A healthy reward and a cost for contact forces are added to all objectives. + + A 2-objective version (without the cost objective as a separate objective) can be instantiated via: + env = mo_gym.make('mo-ant-2obj-v5') + + ## Version History + - v5: Now includes contact forces in the reward and observation. + The 2-objective version now has the id 'mo-ant-2obj-v5' instead of 'mo-ant-2d-v4'.
+ See https://gymnasium.farama.org/environments/mujoco/ant/#version-history + """ + + def __init__(self, cost_objective=True, **kwargs): + super().__init__(**kwargs) + EzPickle.__init__(self, cost_objective, **kwargs) + self._cost_objective = cost_objective + self.reward_dim = 3 if cost_objective else 2 + self.reward_space = Box(low=-np.inf, high=np.inf, shape=(self.reward_dim,)) + + def step(self, action): + observation, reward, terminated, truncated, info = super().step(action) + x_velocity = info["x_velocity"] + y_velocity = info["y_velocity"] + cost = info["reward_ctrl"] + healthy_reward = info["reward_survive"] + + if self._cost_objective: + cost /= self._ctrl_cost_weight # Ignore the weight in the original AntEnv + vec_reward = np.array([x_velocity, y_velocity, cost], dtype=np.float32) + else: + vec_reward = np.array([x_velocity, y_velocity], dtype=np.float32) + vec_reward += cost + + vec_reward += healthy_reward + vec_reward += info["reward_contact"] # Do not treat contact forces as a separate objective + + return observation, vec_reward, terminated, truncated, info diff --git a/mo_gymnasium/envs/mujoco/half_cheetah.py b/mo_gymnasium/envs/mujoco/half_cheetah_v4.py similarity index 100% rename from mo_gymnasium/envs/mujoco/half_cheetah.py rename to mo_gymnasium/envs/mujoco/half_cheetah_v4.py diff --git a/mo_gymnasium/envs/mujoco/half_cheetah_v5.py b/mo_gymnasium/envs/mujoco/half_cheetah_v5.py new file mode 100644 index 00000000..3cd89431 --- /dev/null +++ b/mo_gymnasium/envs/mujoco/half_cheetah_v5.py @@ -0,0 +1,40 @@ +import numpy as np +from gymnasium.envs.mujoco.half_cheetah_v5 import HalfCheetahEnv +from gymnasium.spaces import Box +from gymnasium.utils import EzPickle + + +class MOHalfCheehtahEnv(HalfCheetahEnv, EzPickle): + """ + ## Description + Multi-objective version of the HalfCheetahEnv environment. + + See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/half_cheetah/) for more information. + + The original Gymnasium's 'HalfCheetah-v5' is recovered by the following linear scalarization: + + env = mo_gym.make('mo-halfcheetah-v5') + LinearReward(env, weight=np.array([1.0, 0.1])) + + ## Reward Space + The reward is 2-dimensional: + - 0: Reward for running forward + - 1: Control cost of the action + + ## Version History + - v5: The scale of the control cost has changed from v4. + See https://gymnasium.farama.org/environments/mujoco/half_cheetah/#version-history for other changes.
+ """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + EzPickle.__init__(self, **kwargs) + self.reward_space = Box(low=-np.inf, high=np.inf, shape=(2,)) + self.reward_dim = 2 + + def step(self, action): + observation, reward, terminated, truncated, info = super().step(action) + x_velocity = info["x_velocity"] + neg_energy_cost = info["reward_ctrl"] / self._ctrl_cost_weight # Revert the scale applied in the original environment + vec_reward = np.array([x_velocity, neg_energy_cost], dtype=np.float32) + return observation, vec_reward, terminated, truncated, info diff --git a/mo_gymnasium/envs/mujoco/hopper.py b/mo_gymnasium/envs/mujoco/hopper_v4.py similarity index 96% rename from mo_gymnasium/envs/mujoco/hopper.py rename to mo_gymnasium/envs/mujoco/hopper_v4.py index 6fe0ed3c..6da2bd32 100644 --- a/mo_gymnasium/envs/mujoco/hopper.py +++ b/mo_gymnasium/envs/mujoco/hopper_v4.py @@ -27,7 +27,7 @@ class MOHopperEnv(HopperEnv, EzPickle): def __init__(self, cost_objective=True, **kwargs): super().__init__(**kwargs) EzPickle.__init__(self, cost_objective, **kwargs) - self.cost_objetive = cost_objective + self._cost_objective = cost_objective self.reward_dim = 3 if cost_objective else 2 self.reward_space = Box(low=-np.inf, high=np.inf, shape=(self.reward_dim,)) @@ -53,7 +53,7 @@ def step(self, action): height = 10 * (z - self.init_qpos[1]) energy_cost = np.sum(np.square(action)) - if self.cost_objetive: + if self._cost_objective: vec_reward = np.array([x_velocity, height, -energy_cost], dtype=np.float32) else: vec_reward = np.array([x_velocity, height], dtype=np.float32) diff --git a/mo_gymnasium/envs/mujoco/hopper_v5.py b/mo_gymnasium/envs/mujoco/hopper_v5.py new file mode 100644 index 00000000..9496652c --- /dev/null +++ b/mo_gymnasium/envs/mujoco/hopper_v5.py @@ -0,0 +1,55 @@ +import numpy as np +from gymnasium.envs.mujoco.hopper_v5 import HopperEnv +from gymnasium.spaces import Box +from gymnasium.utils import EzPickle + + +class MOHopperEnv(HopperEnv, EzPickle): + """ + ## Description + Multi-objective version of the HopperEnv environment. + + See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/hopper/) for more information. + + The original Gymnasium's 'Hopper-v5' is recovered by the following linear scalarization: + + env = mo_gym.make('mo-hopper-v5') + LinearReward(env, weight=np.array([1.0, 0.0, 1e-3])) + + ## Reward Space + The reward is 3-dimensional: + - 0: Reward for going forward on the x-axis + - 1: Reward for jumping high on the z-axis + - 2: Control cost of the action + If the cost_objective flag is set to False, the reward is 2-dimensional, and the cost is added to the other objectives. + + A 2-objective version (without the cost objective as a separate objective) can be instantiated via: + env = mo_gym.make('mo-hopper-2obj-v5') + + ## Version History + - v5: The 2-objective version now has the id 'mo-hopper-2obj-v5' instead of 'mo-hopper-2d-v4'.
+ See https://gymnasium.farama.org/environments/mujoco/hopper/#version-history + """ + + def __init__(self, cost_objective=True, **kwargs): + super().__init__(**kwargs) + EzPickle.__init__(self, cost_objective, **kwargs) + self._cost_objective = cost_objective + self.reward_dim = 3 if cost_objective else 2 + self.reward_space = Box(low=-np.inf, high=np.inf, shape=(self.reward_dim,)) + + def step(self, action): + observation, reward, terminated, truncated, info = super().step(action) + x_velocity = info["x_velocity"] + height = 10 * info["z_distance_from_origin"] + neg_energy_cost = info["reward_ctrl"] + if self._cost_objective: + neg_energy_cost /= self._ctrl_cost_weight # Revert the scale applied in the original environment + vec_reward = np.array([x_velocity, height, neg_energy_cost], dtype=np.float32) + else: + vec_reward = np.array([x_velocity, height], dtype=np.float32) + vec_reward += neg_energy_cost + + vec_reward += info["reward_survive"] + + return observation, vec_reward, terminated, truncated, info diff --git a/mo_gymnasium/envs/mujoco/humanoid.py b/mo_gymnasium/envs/mujoco/humanoid_v4.py similarity index 100% rename from mo_gymnasium/envs/mujoco/humanoid.py rename to mo_gymnasium/envs/mujoco/humanoid_v4.py diff --git a/mo_gymnasium/envs/mujoco/humanoid_v5.py b/mo_gymnasium/envs/mujoco/humanoid_v5.py new file mode 100644 index 00000000..f7506bc1 --- /dev/null +++ b/mo_gymnasium/envs/mujoco/humanoid_v5.py @@ -0,0 +1,44 @@ +import numpy as np +from gymnasium.envs.mujoco.humanoid_v5 import HumanoidEnv +from gymnasium.spaces import Box +from gymnasium.utils import EzPickle + + +class MOHumanoidEnv(HumanoidEnv, EzPickle): + """ + ## Description + Multi-objective version of the HumanoidEnv environment. + + See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/humanoid/) for more information. + + The original Gymnasium's 'Humanoid-v5' is recovered by the following linear scalarization: + + env = mo_gym.make('mo-humanoid-v5') + LinearReward(env, weight=np.array([1.25, 0.1])) + + ## Reward Space + The reward is 2-dimensional: + - 0: Reward for running forward (x-velocity) + - 1: Control cost of the action + + ## Version History: + - v5: Now includes contact forces. See: https://gymnasium.farama.org/environments/mujoco/humanoid/#version-history + The scale of the control cost has changed from v4.
+ """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + EzPickle.__init__(self, **kwargs) + self.reward_space = Box(low=-np.inf, high=np.inf, shape=(2,)) + self.reward_dim = 2 + + def step(self, action): + observation, reward, terminated, truncated, info = super().step(action) + velocity = info["x_velocity"] + neg_energy_cost = info["reward_ctrl"] / self._ctrl_cost_weight # Revert the scale applied in the original environment + vec_reward = np.array([velocity, neg_energy_cost], dtype=np.float32) + + vec_reward += self.healthy_reward # All objectives are penalized when the agent falls + vec_reward += info["reward_contact"] # Do not treat contact forces as a separate objective + + return observation, vec_reward, terminated, truncated, info diff --git a/mo_gymnasium/envs/mujoco/reacher.py b/mo_gymnasium/envs/mujoco/reacher_v4.py similarity index 100% rename from mo_gymnasium/envs/mujoco/reacher.py rename to mo_gymnasium/envs/mujoco/reacher_v4.py diff --git a/mo_gymnasium/envs/mujoco/reacher_v5.py b/mo_gymnasium/envs/mujoco/reacher_v5.py new file mode 100644 index 00000000..20b9668d --- /dev/null +++ b/mo_gymnasium/envs/mujoco/reacher_v5.py @@ -0,0 +1,101 @@ +from os import path + +import numpy as np +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.envs.mujoco.reacher_v5 import ReacherEnv +from gymnasium.spaces import Box, Discrete + + +DEFAULT_CAMERA_CONFIG = {"trackbodyid": 0} + + +class MOReacherEnv(ReacherEnv): + """ + ## Description + Multi-objective version of the [`Reacher-v5` environment](https://gymnasium.farama.org/environments/mujoco/reacher/). + + ## Observation Space + The observation is 6-dimensional and contains: + - sin and cos of the angles of the central and elbow joints + - angular velocity of the central and elbow joints + + ## Action Space + The action space is discrete and contains the 3^2=9 possible actions based on applying positive (+1), negative (-1) or zero (0) torque to each of the two joints. + + ## Reward Space + The reward is 4-dimensional and is defined based on the distance between the tip of the arm and each of the four target locations. + For each i in {1, 2, 3, 4}, it is computed as: + ```math + r_i = 1 - 4 * || finger_tip_coord - target_i || + ``` + + ## Version History: + See https://gymnasium.farama.org/environments/mujoco/reacher/#version-history + """ + + def __init__(self, **kwargs): + utils.EzPickle.__init__(self, **kwargs) + self.observation_space = Box(low=-np.inf, high=np.inf, shape=(6,), dtype=np.float64) + MujocoEnv.__init__( + self, + path.join(path.dirname(__file__), "assets", "mo_reacher.xml"), + 2, + observation_space=self.observation_space, + default_camera_config=DEFAULT_CAMERA_CONFIG, + **kwargs, + ) + actions = [-1.0, 0.0, 1.0] + self.action_dict = dict() + for a1 in actions: + for a2 in actions: + self.action_dict[len(self.action_dict)] = (a1, a2) + self.action_space = Discrete(9) + # Target goals: x1, y1, x2, y2, ...
x4, y4 + self.goal = np.array([0.14, 0.0, -0.14, 0.0, 0.0, 0.14, 0.0, -0.14]) + self.reward_space = Box(low=-1.0, high=1.0, shape=(4,)) + self.reward_dim = 4 + + def step(self, a): + real_action = self.action_dict[int(a)] + vec_reward = np.array( + [ + 1 - 4 * np.linalg.norm(self.get_body_com("fingertip")[:2] - self.get_body_com("target1")[:2]), + 1 - 4 * np.linalg.norm(self.get_body_com("fingertip")[:2] - self.get_body_com("target2")[:2]), + 1 - 4 * np.linalg.norm(self.get_body_com("fingertip")[:2] - self.get_body_com("target3")[:2]), + 1 - 4 * np.linalg.norm(self.get_body_com("fingertip")[:2] - self.get_body_com("target4")[:2]), + ], + dtype=np.float32, + ) + + self._step_mujoco_simulation(real_action, self.frame_skip) + if self.render_mode == "human": + self.render() + + ob = self._get_obs() + return ( + ob, + vec_reward, + False, + False, + {}, + ) + + def reset_model(self): + qpos = self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq) + self.init_qpos + qpos[:2] = np.array([0, 3.1415 / 2]) # init position + qpos[-len(self.goal) :] = self.goal + qvel = self.init_qvel + self.np_random.uniform(low=-0.005, high=0.005, size=self.model.nv) + qvel[-len(self.goal) :] = 0 + self.set_state(qpos, qvel) + return self._get_obs() + + def _get_obs(self): + theta = self.data.qpos.flatten()[:2] + return np.concatenate( + [ + np.cos(theta), + np.sin(theta), + self.data.qvel.flatten()[:2] * 0.1, + ] + ) diff --git a/mo_gymnasium/envs/mujoco/swimmer.py b/mo_gymnasium/envs/mujoco/swimmer_v4.py similarity index 100% rename from mo_gymnasium/envs/mujoco/swimmer.py rename to mo_gymnasium/envs/mujoco/swimmer_v4.py diff --git a/mo_gymnasium/envs/mujoco/swimmer_v5.py b/mo_gymnasium/envs/mujoco/swimmer_v5.py new file mode 100644 index 00000000..3e0de496 --- /dev/null +++ b/mo_gymnasium/envs/mujoco/swimmer_v5.py @@ -0,0 +1,41 @@ +import numpy as np +from gymnasium.envs.mujoco.swimmer_v5 import SwimmerEnv +from gymnasium.spaces import Box +from gymnasium.utils import EzPickle + + +class MOSwimmerEnv(SwimmerEnv, EzPickle): + """ + ## Description + Multi-objective version of the SwimmerEnv environment. + + See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/swimmer/) for more information. 
+ + The original Gymnasium's 'Swimmer-v5' is recovered by the following linear scalarization: + + env = mo_gym.make('mo-swimmer-v5') + LinearReward(env, weight=np.array([1.0, 1e-4])) + + ## Reward Space + The reward is 2-dimensional: + - 0: Reward for moving forward (x-velocity) + - 1: Control cost of the action + + ## Version History: + See https://gymnasium.farama.org/main/environments/mujoco/swimmer/#version-history + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + EzPickle.__init__(self, **kwargs) + self.reward_space = Box(low=-np.inf, high=np.inf, shape=(2,)) + self.reward_dim = 2 + + def step(self, action): + observation, reward, terminated, truncated, info = super().step(action) + velocity = info["x_velocity"] + neg_energy_cost = info["reward_ctrl"] / self._ctrl_cost_weight # Revert the scale applied in the original environment + + vec_reward = np.array([velocity, neg_energy_cost], dtype=np.float32) + + return observation, vec_reward, terminated, truncated, info diff --git a/mo_gymnasium/envs/mujoco/walker2d.py b/mo_gymnasium/envs/mujoco/walker2d_v4.py similarity index 89% rename from mo_gymnasium/envs/mujoco/walker2d.py rename to mo_gymnasium/envs/mujoco/walker2d_v4.py index e3806810..e4a7493b 100644 --- a/mo_gymnasium/envs/mujoco/walker2d.py +++ b/mo_gymnasium/envs/mujoco/walker2d_v4.py @@ -26,9 +26,9 @@ def __init__(self, **kwargs): def step(self, action): observation, reward, terminated, truncated, info = super().step(action) velocity = info["x_velocity"] - energy = -np.sum(np.square(action)) + neg_energy_cost = -np.sum(np.square(action)) - vec_reward = np.array([velocity, energy], dtype=np.float32) + vec_reward = np.array([velocity, neg_energy_cost], dtype=np.float32) vec_reward += self.healthy_reward # All objectives are penalyzed when the agent falls diff --git a/mo_gymnasium/envs/mujoco/walker2d_v5.py b/mo_gymnasium/envs/mujoco/walker2d_v5.py new file mode 100644 index 00000000..a3446b66 --- /dev/null +++ b/mo_gymnasium/envs/mujoco/walker2d_v5.py @@ -0,0 +1,43 @@ +import numpy as np +from gymnasium.envs.mujoco.walker2d_v5 import Walker2dEnv +from gymnasium.spaces import Box +from gymnasium.utils import EzPickle + + +class MOWalker2dEnv(Walker2dEnv, EzPickle): + """ + ## Description + Multi-objective version of the Walker2dEnv environment. + + See [Gymnasium's env](https://gymnasium.farama.org/environments/mujoco/walker2d/) for more information. + + The original Gymnasium's 'Walker2d-v5' is recovered by the following linear scalarization: + + env = mo_gym.make('mo-walker2d-v5') + LinearReward(env, weight=np.array([1.0, 1e-3])) + + ## Reward Space + The reward is 2-dimensional: + - 0: Reward for running forward (x-velocity) + - 1: Control cost of the action + + ## Version History + - See https://gymnasium.farama.org/main/environments/mujoco/walker2d/#version-history + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + EzPickle.__init__(self, **kwargs) + self.reward_space = Box(low=-np.inf, high=np.inf, shape=(2,)) + self.reward_dim = 2 + + def step(self, action): + observation, reward, terminated, truncated, info = super().step(action) + velocity = info["x_velocity"] + neg_energy_cost = info["reward_ctrl"] / self._ctrl_cost_weight + + vec_reward = np.array([velocity, neg_energy_cost], dtype=np.float32) + + vec_reward += self.healthy_reward # All objectives are penalized when the agent falls + + return observation, vec_reward, terminated, truncated, info
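The new v5 docstrings above all describe how the original scalar reward is recovered through a linear scalarization. Below is a minimal usage sketch of the new registrations; it is not part of the diff itself, it assumes `mo_gymnasium` is installed and that the `LinearReward` wrapper is importable from `mo_gymnasium.wrappers` (older releases exposed it at the package root), and it reuses the weight vector quoted in the Ant-v5 docstring.

```python
# Minimal sketch (assumptions: mo_gymnasium installed, LinearReward lives under mo_gymnasium.wrappers).
import numpy as np
import mo_gymnasium as mo_gym
from mo_gymnasium.wrappers import LinearReward

# 3-objective Ant: x-velocity, y-velocity, control cost.
env = mo_gym.make("mo-ant-v5")
print(env.unwrapped.reward_dim)  # 3

# 2-objective variant registered in this diff: the control cost is folded into the other objectives.
env_2obj = mo_gym.make("mo-ant-2obj-v5")
print(env_2obj.unwrapped.reward_dim)  # 2

# The linear scalarization quoted in the Ant-v5 docstring recovers the original scalar reward.
scalar_env = LinearReward(env_2obj, weight=np.array([1.0, 0.0]))
obs, info = scalar_env.reset(seed=0)
obs, reward, terminated, truncated, info = scalar_env.step(scalar_env.action_space.sample())
print(reward)  # single float, as in Gymnasium's 'Ant-v5'
```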
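`MOReacherEnv` is the only new environment with a discrete action space, so a short sketch of how it is driven may also help. Again, this is an illustration rather than part of the diff; it relies only on the `action_dict` and reward definition shown above.

```python
# Sketch: stepping the discrete multi-objective Reacher (assumes mo_gymnasium is installed).
import mo_gymnasium as mo_gym

env = mo_gym.make("mo-reacher-v5")
obs, info = env.reset(seed=0)

# Discrete(9): index k encodes a (torque_joint1, torque_joint2) pair with each torque in {-1, 0, +1}.
# Following the nested loops in __init__, index 4 maps to (0.0, 0.0), i.e. no torque on either joint.
obs, vec_reward, terminated, truncated, info = env.step(4)
print(vec_reward.shape)  # (4,) -- one component per target, r_i = 1 - 4 * ||fingertip - target_i||
```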