diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5abd9df64..0507d9cf7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,21 @@
+## Release 2.3.0a1 (WIP)
+
+### Breaking Changes
+- Updated default hyperparameters for TD3/DDPG to be more consistent with SAC
+- Upgraded MuJoCo env hyperparameters to v4 (pre-trained agents need to be updated)
+- Upgraded to SB3 >= 2.3.0
+
+### New Features
+
+
+### Bug fixes
+
+### Documentation
+
+### Other
+
+
 ## Release 2.2.1 (2023-11-17)
 
 ### Breaking Changes
diff --git a/hyperparams/a2c.yml b/hyperparams/a2c.yml
index ba9416e85..02e8c780b 100644
--- a/hyperparams/a2c.yml
+++ b/hyperparams/a2c.yml
@@ -165,24 +165,24 @@ ReacherBulletEnv-v0:
 
 # === Mujoco Envs ===
 
-HalfCheetah-v3: &mujoco-defaults
+HalfCheetah-v4: &mujoco-defaults
   normalize: true
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
 
-Ant-v3:
+Ant-v4:
   <<: *mujoco-defaults
 
-Hopper-v3:
+Hopper-v4:
   <<: *mujoco-defaults
 
-Walker2d-v3:
+Walker2d-v4:
   <<: *mujoco-defaults
 
-Humanoid-v3:
+Humanoid-v4:
   <<: *mujoco-defaults
   n_timesteps: !!float 2e6
 
-Swimmer-v3:
+Swimmer-v4:
   <<: *mujoco-defaults
   gamma: 0.9999
diff --git a/hyperparams/ars.yml b/hyperparams/ars.yml
index e58d4fa3c..9e365cf35 100644
--- a/hyperparams/ars.yml
+++ b/hyperparams/ars.yml
@@ -108,7 +108,7 @@ ReacherBulletEnv-v0:
 # === Mujoco Envs ===
 
 # Params closest to original paper
-Swimmer-v3:
+Swimmer-v4:
   n_envs: 1
   policy: 'LinearPolicy'
   n_timesteps: !!float 2e6
@@ -119,7 +119,7 @@ Swimmer-v3:
   alive_bonus_offset: 0
   # normalize: "dict(norm_obs=True, norm_reward=False)"
 
-Hopper-v3:
+Hopper-v4:
   n_envs: 1
   policy: 'LinearPolicy'
   n_timesteps: !!float 7e6
@@ -130,7 +130,7 @@ Hopper-v3:
   alive_bonus_offset: -1
   normalize: "dict(norm_obs=True, norm_reward=False)"
 
-HalfCheetah-v3:
+HalfCheetah-v4:
   n_envs: 1
   policy: 'LinearPolicy'
   n_timesteps: !!float 1.25e7
@@ -141,7 +141,7 @@
   alive_bonus_offset: 0
   normalize: "dict(norm_obs=True, norm_reward=False)"
 
-Walker2d-v3:
+Walker2d-v4:
   n_envs: 1
   policy: 'LinearPolicy'
   n_timesteps: !!float 7.5e7
@@ -152,7 +152,7 @@
   alive_bonus_offset: -1
   normalize: "dict(norm_obs=True, norm_reward=False)"
 
-Ant-v3:
+Ant-v4:
   n_envs: 1
   policy: 'LinearPolicy'
   n_timesteps: !!float 7.5e7
@@ -164,7 +164,7 @@
   normalize: "dict(norm_obs=True, norm_reward=False)"
 
 
-Humanoid-v3:
+Humanoid-v4:
   n_envs: 1
   policy: 'LinearPolicy'
   n_timesteps: !!float 2.5e8
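A note on the YAML layout used throughout these files: "&mujoco-defaults" declares an anchor on the first entry, and "<<: *mujoco-defaults" merges it into the later entries, so each env inherits the shared settings and overrides only what differs. A minimal sketch of how the merge keys resolve, using PyYAML (illustrative snippet, not part of this PR):

import yaml  # PyYAML resolves YAML 1.1 merge keys ("<<:") in safe_load

doc = """
HalfCheetah-v4: &mujoco-defaults
  normalize: true
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'

Humanoid-v4:
  <<: *mujoco-defaults
  n_timesteps: !!float 2e6
"""

params = yaml.safe_load(doc)
# Humanoid-v4 inherits normalize and policy, and overrides n_timesteps:
assert params["Humanoid-v4"]["policy"] == "MlpPolicy"
assert params["Humanoid-v4"]["n_timesteps"] == 2e6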
diff --git a/hyperparams/ddpg.yml b/hyperparams/ddpg.yml
index 14a53cfca..bb78fdae1 100644
--- a/hyperparams/ddpg.yml
+++ b/hyperparams/ddpg.yml
@@ -4,6 +4,11 @@ MountainCarContinuous-v0:
   policy: 'MlpPolicy'
   noise_type: 'ornstein-uhlenbeck'
   noise_std: 0.5
+  gradient_steps: 1
+  train_freq: 1
+  learning_rate: !!float 1e-3
+  batch_size: 256
+  policy_kwargs: "dict(net_arch=[400, 300])"
 
 Pendulum-v1:
   n_timesteps: 20000
@@ -13,8 +18,8 @@ Pendulum-v1:
   learning_starts: 10000
   noise_type: 'normal'
   noise_std: 0.1
-  gradient_steps: -1
-  train_freq: [1, "episode"]
+  gradient_steps: 1
+  train_freq: 1
   learning_rate: !!float 1e-3
   policy_kwargs: "dict(net_arch=[400, 300])"
@@ -26,8 +31,8 @@ LunarLanderContinuous-v2:
   learning_starts: 10000
   noise_type: 'normal'
   noise_std: 0.1
-  gradient_steps: -1
-  train_freq: [1, "episode"]
+  gradient_steps: 1
+  train_freq: 1
   learning_rate: !!float 1e-3
   policy_kwargs: "dict(net_arch=[400, 300])"
@@ -39,8 +44,8 @@ BipedalWalker-v3:
   learning_starts: 10000
   noise_type: 'normal'
   noise_std: 0.1
-  gradient_steps: -1
-  train_freq: [1, "episode"]
+  gradient_steps: 1
+  train_freq: 1
   learning_rate: !!float 1e-3
   policy_kwargs: "dict(net_arch=[400, 300])"
@@ -48,14 +53,14 @@ BipedalWalker-v3:
 BipedalWalkerHardcore-v3:
   n_timesteps: !!float 1e7
   policy: 'MlpPolicy'
-  gamma: 0.98
-  buffer_size: 200000
+  gamma: 0.99
+  buffer_size: 1000000
   learning_starts: 10000
   noise_type: 'normal'
   noise_std: 0.1
-  gradient_steps: -1
-  train_freq: [1, "episode"]
-  learning_rate: !!float 1e-3
+  batch_size: 256
+  train_freq: 1
+  learning_rate: lin_7e-4
   policy_kwargs: "dict(net_arch=[400, 300])"
@@ -69,28 +74,21 @@ HalfCheetahBulletEnv-v0: &pybullet-defaults
   noise_std: 0.1
   gradient_steps: 1
   train_freq: 1
-  learning_rate: !!float 1e-3
+  batch_size: 256
+  learning_rate: !!float 7e-4
   policy_kwargs: "dict(net_arch=[400, 300])"
 
 # Tuned
 AntBulletEnv-v0:
   <<: *pybullet-defaults
-  learning_rate: !!float 7e-4
-  policy_kwargs: "dict(net_arch=[400, 300])"
 
 # Tuned
 HopperBulletEnv-v0:
   <<: *pybullet-defaults
-  train_freq: 64
-  gradient_steps: 64
-  batch_size: 256
-  learning_rate: !!float 7e-4
 
 # Tuned
 Walker2DBulletEnv-v0:
   <<: *pybullet-defaults
-  batch_size: 256
-  learning_rate: !!float 7e-4
 
 # TO BE tested
 HumanoidBulletEnv-v0:
@@ -123,29 +121,31 @@ InvertedPendulumSwingupBulletEnv-v0:
   n_timesteps: !!float 3e5
 
 # === Mujoco Envs ===
-
-HalfCheetah-v3: &mujoco-defaults
+HalfCheetah-v4: &mujoco-defaults
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   learning_starts: 10000
   noise_type: 'normal'
   noise_std: 0.1
+  train_freq: 1
+  gradient_steps: 1
+  learning_rate: !!float 1e-3
+  batch_size: 256
+  policy_kwargs: "dict(net_arch=[400, 300])"
 
-Ant-v3:
+Ant-v4:
   <<: *mujoco-defaults
 
-Hopper-v3:
+Hopper-v4:
   <<: *mujoco-defaults
 
-Walker2d-v3:
+Walker2d-v4:
   <<: *mujoco-defaults
 
-Humanoid-v3:
+Humanoid-v4:
   <<: *mujoco-defaults
   n_timesteps: !!float 2e6
 
-Swimmer-v3:
+Swimmer-v4:
   <<: *mujoco-defaults
   gamma: 0.9999
-  train_freq: 1
-  gradient_steps: 1
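The change repeated above (gradient_steps: -1 with train_freq: [1, "episode"] becoming 1 and 1) is what the changelog means by making DDPG/TD3 defaults more consistent with SAC. A hedged sketch of the two schedules expressed directly against the SB3 API (standalone illustration, not zoo code):

from stable_baselines3 import DDPG

# Old zoo defaults: collect one full episode, then run as many gradient
# steps as environment steps were collected (updates happen in bursts
# at episode boundaries).
old_style = DDPG(
    "MlpPolicy",
    "Pendulum-v1",
    train_freq=(1, "episode"),
    gradient_steps=-1,
)

# New zoo defaults, matching SAC's schedule: one gradient step after
# every environment step, with the larger batch size set above.
new_style = DDPG(
    "MlpPolicy",
    "Pendulum-v1",
    train_freq=1,
    gradient_steps=1,
    learning_rate=1e-3,
    batch_size=256,
    policy_kwargs=dict(net_arch=[400, 300]),
)

Interleaving one update per environment step generally gives smoother learning curves than episode-boundary bursts at the same total compute, which is why SAC has used it as its default.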
diff --git a/hyperparams/ppo.yml b/hyperparams/ppo.yml
index 27a1c0d1d..fd664dbed 100644
--- a/hyperparams/ppo.yml
+++ b/hyperparams/ppo.yml
@@ -380,28 +380,28 @@ CarRacing-v2:
 
 # === Mujoco Envs ===
 
-# HalfCheetah-v3: &mujoco-defaults
+# HalfCheetah-v4: &mujoco-defaults
 #   normalize: true
 #   n_timesteps: !!float 1e6
 #   policy: 'MlpPolicy'
 
-Ant-v3: &mujoco-defaults
+Ant-v4: &mujoco-defaults
   normalize: true
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
 
-# Hopper-v3:
+# Hopper-v4:
 #   <<: *mujoco-defaults
 #
-# Walker2d-v3:
+# Walker2d-v4:
 #   <<: *mujoco-defaults
 #
-# Humanoid-v3:
+# Humanoid-v4:
 #   <<: *mujoco-defaults
 #   n_timesteps: !!float 2e6
 #
 # tuned
-Swimmer-v3:
+Swimmer-v4:
   <<: *mujoco-defaults
   gamma: 0.9999
   n_envs: 4
@@ -413,7 +413,7 @@ Swimmer-v3:
 
 # Tuned
 # 10 mujoco envs
-HalfCheetah-v3:
+HalfCheetah-v4:
   normalize: true
   n_envs: 1
   policy: 'MlpPolicy'
@@ -435,7 +435,7 @@ HalfCheetah-v3:
       net_arch=dict(pi=[256, 256], vf=[256, 256])
     )"
 
-# Ant-v3:
+# Ant-v4:
 #   normalize: true
 #   n_envs: 1
 #   policy: 'MlpPolicy'
@@ -451,7 +451,7 @@ HalfCheetah-v3:
 #   max_grad_norm: 0.6
 #   vf_coef: 0.677239
 
-Hopper-v3:
+Hopper-v4:
   normalize: true
   n_envs: 1
   policy: 'MlpPolicy'
@@ -495,7 +495,7 @@ HumanoidStandup-v2:
       net_arch=dict(pi=[256, 256], vf=[256, 256])
    )"
 
-Humanoid-v3:
+Humanoid-v4:
   normalize: true
   n_envs: 1
   policy: 'MlpPolicy'
@@ -565,7 +565,7 @@ Reacher-v2:
   max_grad_norm: 0.9
   vf_coef: 0.950368
 
-Walker2d-v3:
+Walker2d-v4:
   normalize: true
   n_envs: 1
   policy: 'MlpPolicy'
diff --git a/hyperparams/ppo_lstm.yml b/hyperparams/ppo_lstm.yml
index 3948a781f..a8a301b23 100644
--- a/hyperparams/ppo_lstm.yml
+++ b/hyperparams/ppo_lstm.yml
@@ -316,27 +316,27 @@ CarRacing-v2:
     )"
 
 # === Mujoco Envs ===
-# HalfCheetah-v3: &mujoco-defaults
+# HalfCheetah-v4: &mujoco-defaults
 #   normalize: true
 #   n_timesteps: !!float 1e6
 #   policy: 'MlpLstmPolicy'
 
-Ant-v3: &mujoco-defaults
+Ant-v4: &mujoco-defaults
   normalize: true
   n_timesteps: !!float 1e6
   policy: 'MlpLstmPolicy'
 
-# Hopper-v3:
+# Hopper-v4:
 #   <<: *mujoco-defaults
 #
-# Walker2d-v3:
+# Walker2d-v4:
 #   <<: *mujoco-defaults
 #
-# Humanoid-v3:
+# Humanoid-v4:
 #   <<: *mujoco-defaults
 #   n_timesteps: !!float 2e6
 #
-Swimmer-v3:
+Swimmer-v4:
   <<: *mujoco-defaults
   gamma: 0.9999
   n_envs: 4
@@ -347,7 +347,7 @@ Swimmer-v3:
 
 
 # 10 mujoco envs
-HalfCheetah-v3:
+HalfCheetah-v4:
   normalize: true
   n_envs: 1
   policy: 'MlpLstmPolicy'
@@ -369,7 +369,7 @@ HalfCheetah-v3:
       net_arch=dict(pi=[256, 256], vf=[256, 256])
     )"
 
-# Ant-v3:
+# Ant-v4:
 #   normalize: true
 #   n_envs: 1
 #   policy: 'MlpLstmPolicy'
@@ -385,7 +385,7 @@ HalfCheetah-v3:
 #   max_grad_norm: 0.6
 #   vf_coef: 0.677239
 
-Hopper-v3:
+Hopper-v4:
   normalize: true
   n_envs: 1
   policy: 'MlpLstmPolicy'
@@ -429,7 +429,7 @@ HumanoidStandup-v2:
       net_arch=dict(pi=[256, 256], vf=[256, 256])
     )"
 
-Humanoid-v3:
+Humanoid-v4:
   normalize: true
   n_envs: 1
   policy: 'MlpLstmPolicy'
@@ -499,7 +499,7 @@ Reacher-v2:
   max_grad_norm: 0.9
   vf_coef: 0.950368
 
-# Swimmer-v3:
+# Swimmer-v4:
 #   normalize: true
 #   n_envs: 1
 #   policy: 'MlpLstmPolicy'
@@ -521,7 +521,7 @@ Reacher-v2:
 #     net_arch=dict(pi=[256, 256], vf=[256, 256])
 #   )"
 
-Walker2d-v3:
+Walker2d-v4:
   normalize: true
   n_envs: 1
   policy: 'MlpLstmPolicy'
diff --git a/hyperparams/qrdqn.yml b/hyperparams/qrdqn.yml
index d1da3e5fd..df70c8580 100644
--- a/hyperparams/qrdqn.yml
+++ b/hyperparams/qrdqn.yml
@@ -4,6 +4,7 @@ atari:
   frame_stack: 4
   policy: 'CnnPolicy'
   n_timesteps: !!float 1e7
+  learning_starts: 50000
   exploration_fraction: 0.025  # explore 250k steps = 10M * 0.025
   # If True, you need to deactivate handle_timeout_termination
   # in the replay_buffer_kwargs
diff --git a/hyperparams/sac.yml b/hyperparams/sac.yml
index 3266baeb8..00fdb86b6 100644
--- a/hyperparams/sac.yml
+++ b/hyperparams/sac.yml
@@ -189,25 +189,25 @@ CarRacing-v2:
 
 # === Mujoco Envs ===
 
-HalfCheetah-v3: &mujoco-defaults
+HalfCheetah-v4: &mujoco-defaults
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   learning_starts: 10000
 
-Ant-v3:
+Ant-v4:
   <<: *mujoco-defaults
 
-Hopper-v3:
+Hopper-v4:
   <<: *mujoco-defaults
 
-Walker2d-v3:
+Walker2d-v4:
   <<: *mujoco-defaults
 
-Humanoid-v3:
+Humanoid-v4:
   <<: *mujoco-defaults
   n_timesteps: !!float 2e6
 
-Swimmer-v3:
+Swimmer-v4:
   <<: *mujoco-defaults
   gamma: 0.9999
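For the qrdqn.yml hunk above, learning_starts is the number of transitions collected under the initial policy before any gradient update begins; 50000 matches the classic Atari replay warm-up. A minimal sketch with sb3_contrib (hypothetical standalone usage, assuming the Atari extras and ROMs are installed):

from sb3_contrib import QRDQN

# learning_starts: warm-up steps before updates begin.
# exploration_fraction: epsilon is annealed over the first
# 10M * 0.025 = 250k steps, as the comment in the YAML notes.
model = QRDQN(
    "CnnPolicy",
    "BreakoutNoFrameskip-v4",
    learning_starts=50_000,
    exploration_fraction=0.025,
)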
diff --git a/hyperparams/td3.yml b/hyperparams/td3.yml
index 9b941516b..068a6ce4c 100644
--- a/hyperparams/td3.yml
+++ b/hyperparams/td3.yml
@@ -4,6 +4,11 @@ MountainCarContinuous-v0:
   policy: 'MlpPolicy'
   noise_type: 'ornstein-uhlenbeck'
   noise_std: 0.5
+  gradient_steps: 1
+  train_freq: 1
+  learning_rate: !!float 1e-3
+  batch_size: 256
+  policy_kwargs: "dict(net_arch=[400, 300])"
 
 Pendulum-v1:
   n_timesteps: 20000
@@ -13,8 +18,8 @@ Pendulum-v1:
   learning_starts: 10000
   noise_type: 'normal'
   noise_std: 0.1
-  gradient_steps: -1
-  train_freq: [1, "episode"]
+  gradient_steps: 1
+  train_freq: 1
   learning_rate: !!float 1e-3
   policy_kwargs: "dict(net_arch=[400, 300])"
@@ -26,8 +31,8 @@ LunarLanderContinuous-v2:
   learning_starts: 10000
   noise_type: 'normal'
   noise_std: 0.1
-  gradient_steps: -1
-  train_freq: [1, "episode"]
+  gradient_steps: 1
+  train_freq: 1
   learning_rate: !!float 1e-3
   policy_kwargs: "dict(net_arch=[400, 300])"
@@ -39,8 +44,8 @@ BipedalWalker-v3:
   learning_starts: 10000
   noise_type: 'normal'
   noise_std: 0.1
-  gradient_steps: -1
-  train_freq: [1, "episode"]
+  gradient_steps: 1
+  train_freq: 1
   learning_rate: !!float 1e-3
   policy_kwargs: "dict(net_arch=[400, 300])"
@@ -67,9 +72,10 @@ HalfCheetahBulletEnv-v0: &pybullet-defaults
   learning_starts: 10000
   noise_type: 'normal'
   noise_std: 0.1
-  gradient_steps: -1
-  train_freq: [1, "episode"]
-  learning_rate: !!float 1e-3
+  gradient_steps: 1
+  train_freq: 1
+  batch_size: 256
+  learning_rate: !!float 7e-4
   policy_kwargs: "dict(net_arch=[400, 300])"
 
 AntBulletEnv-v0:
@@ -118,47 +124,39 @@ MinitaurBulletEnv-v0:
   noise_type: 'normal'
   noise_std: 0.1
   learning_starts: 10000
-  batch_size: 100
+  batch_size: 256
   learning_rate: !!float 1e-3
   train_freq: 1
   gradient_steps: 1
   policy_kwargs: "dict(net_arch=[400, 300])"
 
 # === Mujoco Envs ===
-
-HalfCheetah-v3: &mujoco-defaults
+HalfCheetah-v4: &mujoco-defaults
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   learning_starts: 10000
   noise_type: 'normal'
   noise_std: 0.1
+  train_freq: 1
+  gradient_steps: 1
+  learning_rate: !!float 1e-3
+  batch_size: 256
+  policy_kwargs: "dict(net_arch=[400, 300])"
 
-Ant-v3:
+Ant-v4:
   <<: *mujoco-defaults
 
-Hopper-v3:
+Hopper-v4:
   <<: *mujoco-defaults
-  # SAC Hyperparams
-  train_freq: 1
-  gradient_steps: 1
-  learning_rate: !!float 3e-4
-  batch_size: 256
 
-Walker2d-v3:
+Walker2d-v4:
   <<: *mujoco-defaults
 
-Humanoid-v3:
+Humanoid-v4:
   <<: *mujoco-defaults
   n_timesteps: !!float 2e6
-  # SAC Hyperparams
-  train_freq: 1
-  gradient_steps: 1
-  learning_rate: !!float 3e-4
-  batch_size: 256
 
 # Tuned
-Swimmer-v3:
+Swimmer-v4:
   <<: *mujoco-defaults
   gamma: 0.9999
-  train_freq: 1
-  gradient_steps: 1
diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml
index 89b925806..df38ccf35 100644
--- a/hyperparams/tqc.yml
+++ b/hyperparams/tqc.yml
@@ -140,26 +140,26 @@ MinitaurBulletEnv-v0:
 
 # === Mujoco Envs ===
 
-HalfCheetah-v3: &mujoco-defaults
+HalfCheetah-v4: &mujoco-defaults
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   learning_starts: 10000
 
-Ant-v3:
+Ant-v4:
   <<: *mujoco-defaults
 
-Hopper-v3:
+Hopper-v4:
   <<: *mujoco-defaults
   top_quantiles_to_drop_per_net: 5
 
-Walker2d-v3:
+Walker2d-v4:
   <<: *mujoco-defaults
 
-Humanoid-v3:
+Humanoid-v4:
   <<: *mujoco-defaults
   n_timesteps: !!float 2e6
 
-Swimmer-v3:
+Swimmer-v4:
   <<: *mujoco-defaults
   gamma: 0.9999
diff --git a/hyperparams/trpo.yml b/hyperparams/trpo.yml
index bf6866f40..dbf49b89b 100644
--- a/hyperparams/trpo.yml
+++ b/hyperparams/trpo.yml
@@ -118,26 +118,26 @@ ReacherBulletEnv-v0:
 # === Mujoco Envs ===
 
 # Tuned
-Ant-v3: &mujoco-defaults
+Ant-v4: &mujoco-defaults
   <<: *pybullet-defaults
   n_timesteps: !!float 1e6
 
 # Tuned
-HalfCheetah-v3:
+HalfCheetah-v4:
   <<: *mujoco-defaults
   target_kl: 0.04
 
 # Tuned
-Hopper-v3:
+Hopper-v4:
   <<: *mujoco-defaults
 
 # Tuned
-Walker2d-v3:
+Walker2d-v4:
   <<: *mujoco-defaults
 
-Humanoid-v3:
+Humanoid-v4:
   <<: *mujoco-defaults
   n_timesteps: !!float 2e6
 
 # Tuned
-Swimmer-v3:
+Swimmer-v4:
   <<: *mujoco-defaults
   gamma: 0.9999
diff --git a/requirements.txt b/requirements.txt
index 85fa79507..b380b7a45 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,13 +1,12 @@
 gym==0.26.2
-stable-baselines3[extra_no_roms,tests,docs]>=2.2.1,<3.0
-sb3-contrib>=2.2.1,<3.0
+stable-baselines3[extra_no_roms,tests,docs]>=2.3.0a1,<3.0
+sb3-contrib>=2.3.0a1,<3.0
 box2d-py==2.3.8
 pybullet
 pybullet_envs_gymnasium>=0.4.0
 # minigrid
 # scikit-optimize
 optuna~=3.0
-pytablewriter~=0.64
 pyyaml>=5.1
 cloudpickle>=2.2.1
 plotly
diff --git a/rl_zoo3/version.txt b/rl_zoo3/version.txt
index c043eea77..4d04ad95c 100644
--- a/rl_zoo3/version.txt
+++ b/rl_zoo3/version.txt
@@ -1 +1 @@
-2.2.1
+2.3.0a1
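The pins in requirements.txt, rl_zoo3/version.txt, and setup.py (below) have to move together. A quick runtime check that an environment actually picked up the alpha (illustrative, not part of this PR):

import sb3_contrib
import stable_baselines3

import rl_zoo3

print(stable_baselines3.__version__)  # expected >= 2.3.0a1 per requirements.txt
print(sb3_contrib.__version__)        # expected >= 2.3.0a1
print(rl_zoo3.__version__)            # expected 2.3.0a1, read from version.txt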
diff --git a/setup.py b/setup.py
index 8105579ba..90d177b7b 100644
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,7 @@
     },
     entry_points={"console_scripts": ["rl_zoo3=rl_zoo3.cli:main"]},
     install_requires=[
-        "sb3_contrib>=2.2.1,<3.0",
+        "sb3_contrib>=2.3.0a1,<3.0",
         "gymnasium~=0.29.1",
         "huggingface_sb3>=3.0,<4.0",
         "tqdm",
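As the changelog notes, pre-trained MuJoCo agents need to be retrained against the v4 envs once this lands, e.g. through the console script registered by the entry point above:

rl_zoo3 train --algo td3 --env HalfCheetah-v4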