diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst
index 9ed1cfc09..842db827c 100644
--- a/docs/misc/changelog.rst
+++ b/docs/misc/changelog.rst
@@ -3,7 +3,7 @@
 Changelog
 ==========
 
-Release 2.3.0a4 (WIP)
+Release 2.3.0a5 (WIP)
 --------------------------
 
 Breaking Changes:
@@ -41,9 +41,11 @@ Breaking Changes:
 
 New Features:
 ^^^^^^^^^^^^^
+- Log success rate ``rollout/success_rate`` when available for on-policy algorithms (@corentinlger)
 
 Bug Fixes:
 ^^^^^^^^^^
+- Fixed ``monitor_wrapper`` argument that was not passed to the parent class, and ``dones`` argument that wasn't passed to ``_update_info_buffer`` (@corentinlger)
 
 `SB3-Contrib`_
 ^^^^^^^^^^^^^^
diff --git a/stable_baselines3/common/on_policy_algorithm.py b/stable_baselines3/common/on_policy_algorithm.py
index ddd0f8de2..1ba36d5f0 100644
--- a/stable_baselines3/common/on_policy_algorithm.py
+++ b/stable_baselines3/common/on_policy_algorithm.py
@@ -92,6 +92,7 @@ def __init__(
             use_sde=use_sde,
             sde_sample_freq=sde_sample_freq,
             support_multi_env=True,
+            monitor_wrapper=monitor_wrapper,
             seed=seed,
             stats_window_size=stats_window_size,
             tensorboard_log=tensorboard_log,
@@ -200,7 +201,7 @@ def collect_rollouts(
             if not callback.on_step():
                 return False
 
-            self._update_info_buffer(infos)
+            self._update_info_buffer(infos, dones)
             n_steps += 1
 
             if isinstance(self.action_space, spaces.Discrete):
@@ -250,6 +251,28 @@ def train(self) -> None:
         """
         raise NotImplementedError
 
+    def _dump_logs(self, iteration: int) -> None:
+        """
+        Write log.
+
+        :param iteration: Current logging iteration
+        """
+        assert self.ep_info_buffer is not None
+        assert self.ep_success_buffer is not None
+
+        time_elapsed = max((time.time_ns() - self.start_time) / 1e9, sys.float_info.epsilon)
+        fps = int((self.num_timesteps - self._num_timesteps_at_start) / time_elapsed)
+        self.logger.record("time/iterations", iteration, exclude="tensorboard")
+        if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
+            self.logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
+            self.logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
+        self.logger.record("time/fps", fps)
+        self.logger.record("time/time_elapsed", int(time_elapsed), exclude="tensorboard")
+        self.logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
+        if len(self.ep_success_buffer) > 0:
+            self.logger.record("rollout/success_rate", safe_mean(self.ep_success_buffer))
+        self.logger.dump(step=self.num_timesteps)
+
     def learn(
         self: SelfOnPolicyAlgorithm,
         total_timesteps: int,
@@ -285,16 +308,7 @@ def learn(
 
             # Display training infos
             if log_interval is not None and iteration % log_interval == 0:
                 assert self.ep_info_buffer is not None
-                time_elapsed = max((time.time_ns() - self.start_time) / 1e9, sys.float_info.epsilon)
-                fps = int((self.num_timesteps - self._num_timesteps_at_start) / time_elapsed)
-                self.logger.record("time/iterations", iteration, exclude="tensorboard")
-                if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
-                    self.logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
-                    self.logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
-                self.logger.record("time/fps", fps)
-                self.logger.record("time/time_elapsed", int(time_elapsed), exclude="tensorboard")
-                self.logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
-                self.logger.dump(step=self.num_timesteps)
+                self._dump_logs(iteration)
 
             self.train()
diff --git a/stable_baselines3/version.txt b/stable_baselines3/version.txt
index 87ced0fe9..a3b489b55 100644
--- a/stable_baselines3/version.txt
+++ b/stable_baselines3/version.txt
@@ -1 +1 @@
-2.3.0a4
+2.3.0a5
diff --git a/tests/test_logger.py b/tests/test_logger.py
index 05bf196a3..dfd9e5567 100644
--- a/tests/test_logger.py
+++ b/tests/test_logger.py
@@ -14,7 +14,7 @@
 from matplotlib import pyplot as plt
 from pandas.errors import EmptyDataError
 
-from stable_baselines3 import A2C, DQN
+from stable_baselines3 import A2C, DQN, PPO
 from stable_baselines3.common.env_checker import check_env
 from stable_baselines3.common.logger import (
     DEBUG,
@@ -33,6 +33,7 @@
     read_csv,
     read_json,
 )
+from stable_baselines3.common.monitor import Monitor
 
 KEY_VALUES = {
     "test": 1,
@@ -474,3 +475,92 @@ def get_printed(self) -> str:
 """
 
     assert printed == desired_printed
+
+
+class DummySuccessEnv(gym.Env):
+    """
+    Create a dummy success environment that returns either True or False for info['is_success']
+    at the end of an episode, according to its list of dummy successes
+    """
+
+    def __init__(self, dummy_successes, ep_steps):
+        """Init the dummy success env
+
+        :param dummy_successes: list of size (n_log_iterations, n_episodes_per_log) that specifies
+            the success value of log iteration i at episode j
+        :param ep_steps: number of steps per episode (after which the episode is truncated)
+        """
+        self.n_steps = 0
+        self.log_id = 0
+        self.ep_id = 0
+
+        self.ep_steps = ep_steps
+
+        self.dummy_success = dummy_successes
+        self.num_logs = len(dummy_successes)
+        self.ep_per_log = len(dummy_successes[0])
+        self.steps_per_log = self.ep_per_log * self.ep_steps
+
+        self.action_space = spaces.Discrete(2)
+        self.observation_space = spaces.Discrete(2)
+
+    def reset(self, seed=None, options=None):
+        """
+        Reset the env and advance to the next episode id to get the next dummy success
+        """
+        self.n_steps = 0
+
+        if self.ep_id == self.ep_per_log:
+            self.ep_id = 0
+            self.log_id = (self.log_id + 1) % self.num_logs
+
+        return self.observation_space.sample(), {}
+
+    def step(self, action):
+        """
+        Step and return a dummy success when the episode is truncated
+        """
+        self.n_steps += 1
+        truncated = self.n_steps >= self.ep_steps
+
+        info = {}
+        if truncated:
+            maybe_success = self.dummy_success[self.log_id][self.ep_id]
+            info["is_success"] = maybe_success
+            self.ep_id += 1
+        return self.observation_space.sample(), 0.0, False, truncated, info
+
+
+def test_rollout_success_rate_on_policy_algorithm(tmp_path):
+    """
+    Test if the rollout/success_rate information is correctly logged with on-policy algorithms
+
+    To do so, create a dummy environment that takes as argument dummy successes (i.e. whether an episode
+    is going to be successful or not).
+    """
+
+    STATS_WINDOW_SIZE = 10
+    # Add dummy successes with 0.3, 0.5 and 0.8 success rates, each of length STATS_WINDOW_SIZE
+    dummy_successes = [
+        [True] * 3 + [False] * 7,
+        [True] * 5 + [False] * 5,
+        [True] * 8 + [False] * 2,
+    ]
+    ep_steps = 64
+
+    # Monitor the env to track the success info
+    monitor_file = str(tmp_path / "monitor.csv")
+    env = Monitor(DummySuccessEnv(dummy_successes, ep_steps), filename=monitor_file, info_keywords=("is_success",))
+
+    # Equip the model with a custom logger to check the success_rate info
+    model = PPO("MlpPolicy", env=env, stats_window_size=STATS_WINDOW_SIZE, n_steps=env.steps_per_log, verbose=1)
+    logger = InMemoryLogger()
+    model.set_logger(logger)
+
+    # Make the model learn and check that the success rate corresponds to the ratio of dummy successes
+    model.learn(total_timesteps=env.ep_per_log * ep_steps, log_interval=1)
+    assert logger.name_to_value["rollout/success_rate"] == 0.3
+    model.learn(total_timesteps=env.ep_per_log * ep_steps, log_interval=1)
+    assert logger.name_to_value["rollout/success_rate"] == 0.5
+    model.learn(total_timesteps=env.ep_per_log * ep_steps, log_interval=1)
+    assert logger.name_to_value["rollout/success_rate"] == 0.8
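
Usage note (illustrative only, not part of the patch): below is a minimal sketch of how the new ``rollout/success_rate`` metric would surface for a user once this change is in, assuming a Gymnasium environment that sets ``info["is_success"]`` when an episode ends and is wrapped in ``Monitor``. The ``ToySuccessEnv`` class, its episode length, and the training hyperparameters are hypothetical and chosen only for demonstration.

import gymnasium as gym
from gymnasium import spaces

from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor


class ToySuccessEnv(gym.Env):
    """Hypothetical env: each episode lasts 8 steps and succeeds every other episode."""

    def __init__(self):
        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Discrete(2)
        self.n_steps = 0
        self.episode_id = 0

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.n_steps = 0
        self.episode_id += 1
        return 0, {}

    def step(self, action):
        self.n_steps += 1
        truncated = self.n_steps >= 8
        # Report success only at episode end, which is when _update_info_buffer reads it
        info = {"is_success": self.episode_id % 2 == 0} if truncated else {}
        return 0, 0.0, False, truncated, info


# Monitor forwards the episode info; with this patch, on-policy algorithms now record
# rollout/success_rate (mean of the recent ep_success_buffer) next to ep_rew_mean.
env = Monitor(ToySuccessEnv(), info_keywords=("is_success",))
model = PPO("MlpPolicy", env, n_steps=64, verbose=1)
model.learn(total_timesteps=256, log_interval=1)

Here success is tied to truncation because the success flag is only read when an episode finishes; before this patch the flag was silently dropped for on-policy algorithms since ``dones`` was not passed to ``_update_info_buffer``.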