From e8ea1e76327f164e08a5b9b01d48733b1d45fb9b Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Tue, 4 Jun 2024 16:55:49 +0200 Subject: [PATCH] libpod: do not leak systemd hc startup unit timer This fixes a regression added in commit 4fd84190b8, because the name was overwritten by the createTimer() timer call the removeTransientFiles() call removed the new timer and not the startup healthcheck. And then when the container was stopped we leaked it as the wrong unit name was in the state. A new test has been added to ensure the logic works and we never leak the system timers. Fixes #22884 Signed-off-by: Paul Holzinger --- libpod/container_internal.go | 12 +++++++++--- libpod/healthcheck.go | 3 ++- libpod/healthcheck_linux.go | 3 +-- libpod/healthcheck_nosystemd_linux.go | 2 +- libpod/healthcheck_unsupported.go | 2 +- test/system/220-healthcheck.bats | 12 ++++++++++++ 6 files changed, 26 insertions(+), 8 deletions(-) diff --git a/libpod/container_internal.go b/libpod/container_internal.go index d3302a5978..d1eeb7f851 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -272,7 +272,9 @@ func (c *Container) handleRestartPolicy(ctx context.Context) (_ bool, retErr err } if c.config.HealthCheckConfig != nil { - if err := c.removeTransientFiles(ctx, c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil { + if err := c.removeTransientFiles(ctx, + c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed, + c.state.HCUnitName); err != nil { return false, err } } @@ -1590,7 +1592,9 @@ func (c *Container) restartWithTimeout(ctx context.Context, timeout uint) (retEr } if c.config.HealthCheckConfig != nil { - if err := c.removeTransientFiles(context.Background(), c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil { + if err := c.removeTransientFiles(context.Background(), + c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed, + c.state.HCUnitName); err != nil { logrus.Error(err.Error()) } } @@ -2047,7 +2051,9 @@ func (c *Container) cleanup(ctx context.Context) error { // Remove healthcheck unit/timer file if it execs if c.config.HealthCheckConfig != nil { - if err := c.removeTransientFiles(ctx, c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil { + if err := c.removeTransientFiles(ctx, + c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed, + c.state.HCUnitName); err != nil { logrus.Errorf("Removing timer for container %s healthcheck: %v", c.ID(), err) } } diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go index 8f0253a17f..3b027e63ec 100644 --- a/libpod/healthcheck.go +++ b/libpod/healthcheck.go @@ -278,6 +278,7 @@ func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) { if recreateTimer { logrus.Infof("Startup healthcheck for container %s passed, recreating timer", c.ID()) + oldUnit := c.state.HCUnitName // Create the new, standard healthcheck timer first. if err := c.createTimer(c.HealthCheckConfig().Interval.String(), false); err != nil { logrus.Errorf("Error recreating container %s healthcheck: %v", c.ID(), err) @@ -291,7 +292,7 @@ func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) { // Which happens to be us. // So this has to be last - after this, systemd serves us a // SIGTERM and we exit. - if err := c.removeTransientFiles(ctx, true); err != nil { + if err := c.removeTransientFiles(ctx, true, oldUnit); err != nil { logrus.Errorf("Error removing container %s healthcheck: %v", c.ID(), err) return } diff --git a/libpod/healthcheck_linux.go b/libpod/healthcheck_linux.go index 52a4561868..344c4a2029 100644 --- a/libpod/healthcheck_linux.go +++ b/libpod/healthcheck_linux.go @@ -108,7 +108,7 @@ func (c *Container) startTimer(isStartup bool) error { // removeTransientFiles removes the systemd timer and unit files // for the container -func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) error { +func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, unitName string) error { if c.disableHealthCheckSystemd(isStartup) { return nil } @@ -122,7 +122,6 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) er // clean up as much as possible. stopErrors := []error{} - unitName := c.state.HCUnitName if unitName == "" { unitName = c.hcUnitName(isStartup, true) } diff --git a/libpod/healthcheck_nosystemd_linux.go b/libpod/healthcheck_nosystemd_linux.go index cd8503f82a..c338caf1cd 100644 --- a/libpod/healthcheck_nosystemd_linux.go +++ b/libpod/healthcheck_nosystemd_linux.go @@ -18,6 +18,6 @@ func (c *Container) startTimer(isStartup bool) error { // removeTransientFiles removes the systemd timer and unit files // for the container -func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) error { +func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, unitName string) error { return nil } diff --git a/libpod/healthcheck_unsupported.go b/libpod/healthcheck_unsupported.go index 0517465dbc..8d733698b8 100644 --- a/libpod/healthcheck_unsupported.go +++ b/libpod/healthcheck_unsupported.go @@ -18,6 +18,6 @@ func (c *Container) startTimer(isStartup bool) error { // removeTransientFiles removes the systemd timer and unit files // for the container -func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) error { +func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, unitName string) error { return nil } diff --git a/test/system/220-healthcheck.bats b/test/system/220-healthcheck.bats index c5f0f0ac2c..95c4e4646c 100644 --- a/test/system/220-healthcheck.bats +++ b/test/system/220-healthcheck.bats @@ -5,6 +5,7 @@ # load helpers +load helpers.systemd # Helper function: run 'podman inspect' and check various given fields @@ -38,7 +39,10 @@ function _check_health { --health-interval 1s \ --health-retries 3 \ --health-on-failure=kill \ + --health-startup-cmd /home/podman/healthcheck \ + --health-startup-interval 1s \ $IMAGE /home/podman/pause + cid="$output" run_podman inspect healthcheck_c --format "{{.Config.HealthcheckOnFailureAction}}" is "$output" "kill" "on-failure action is set to kill" @@ -70,6 +74,10 @@ Log[-1].ExitCode | 1 Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\" " "$current_time" "healthy" + # Check that we now we do have valid podman units with this + # name so that the leak check below does not turn into a NOP without noticing. + assert "$(systemctl list-units --type timer | grep $cid)" =~ "podman" "Healthcheck systemd unit exists" + current_time=$(date --iso-8601=seconds) # After three successive failures, container should no longer be healthy sleep 5 @@ -85,6 +93,10 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\" # Clean up run_podman rm -t 0 -f healthcheck_c + + # Important check for https://github.com/containers/podman/issues/22884 + # We never should leak the unit files, healthcheck uses the cid in name so just grep that. + assert "$(systemctl list-units --type timer | grep $cid)" == "" "Healthcheck systemd unit cleanup" } @test "podman healthcheck - restart cleans up old state" {