Skip to content

Commit

Permalink
Abort the agent if a single shutdown step takes more than 60 seconds. (
Browse files Browse the repository at this point in the history
…netdata#17060)

* Add timed-wait for completion.

* Abort if any shutdown step takes more than 60 seconds to complete.

* Timeout only on sentry builds.
  • Loading branch information
vkalintiris authored Feb 27, 2024
1 parent a9ca70c commit 700d77b
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 14 deletions.
42 changes: 28 additions & 14 deletions src/daemon/watcher.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-3.0-or-later

#include "daemon/common.h"
#include "watcher.h"

watcher_step_t *watcher_steps;
Expand Down Expand Up @@ -30,24 +29,39 @@ void *watcher_main(void *arg)
completion_wait_for(&shutdown_begin_completion);
usec_t shutdown_start_time = now_monotonic_usec();

// TODO:
// - add a version of completion_wait_for with timeout
// - check the step's duration and abort when the timeout has expired.
netdata_log_error("Shutdown process started");

unsigned timeout = 60;

for (int step_id = 0; step_id != WATCHER_STEP_ID_MAX; step_id++) {
usec_t step_start_time = now_monotonic_usec();

#ifdef ENABLE_SENTRY
// Wait with a timeout
bool ok = completion_timedwait_for(&watcher_steps[step_id].p, timeout);
#else
// Wait indefinitely
bool ok = true;
completion_wait_for(&watcher_steps[step_id].p);
usec_t step_end_time = now_monotonic_usec();

usec_t step_duration = step_end_time - step_start_time;
netdata_log_info("shutdown step: [%d/%d] - '%s' finished in %llu milliseconds",
step_id + 1,
WATCHER_STEP_ID_MAX,
watcher_steps[step_id].msg,
step_duration / USEC_PER_MS);
#endif

usec_t step_duration = now_monotonic_usec() - step_start_time;

if (ok) {
netdata_log_info("shutdown step: [%d/%d] - '%s' finished in %llu milliseconds",
step_id + 1, WATCHER_STEP_ID_MAX,
watcher_steps[step_id].msg, step_duration / USEC_PER_MS);
} else {
// Do not call fatal() because it will try to execute the exit
// sequence twice.
netdata_log_error("shutdown step: [%d/%d] - '%s' took more than %u seconds (ie. %llu milliseconds)",
step_id + 1, WATCHER_STEP_ID_MAX, watcher_steps[step_id].msg,
timeout, step_duration / USEC_PER_MS);

abort();
}
}

netdata_log_error("Shutdown process started");

completion_wait_for(&shutdown_end_completion);
usec_t shutdown_end_time = now_monotonic_usec();

Expand Down
35 changes: 35 additions & 0 deletions src/libnetdata/completion/completion.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,41 @@ void completion_wait_for(struct completion *p)
uv_mutex_unlock(&p->mutex);
}

bool completion_timedwait_for(struct completion *p, uint64_t timeout)
{
timeout *= NSEC_PER_SEC;

uint64_t start_time = uv_hrtime();
bool result = true;

uv_mutex_lock(&p->mutex);
while (!p->completed) {
int rc = uv_cond_timedwait(&p->cond, &p->mutex, timeout);

if (rc == 0) {
result = true;
break;
} else if (rc == UV_ETIMEDOUT) {
result = false;
break;
}

/*
* handle spurious wakeups
*/

uint64_t elapsed = uv_hrtime() - start_time;
if (elapsed >= timeout) {
result = false;
break;
}
timeout -= elapsed;
}
uv_mutex_unlock(&p->mutex);

return result;
}

void completion_mark_complete(struct completion *p)
{
uv_mutex_lock(&p->mutex);
Expand Down
4 changes: 4 additions & 0 deletions src/libnetdata/completion/completion.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ void completion_destroy(struct completion *p);

void completion_wait_for(struct completion *p);

// Wait for at most `timeout` seconds. Return true on success, false on
// error or timeout.
bool completion_timedwait_for(struct completion *p, uint64_t timeout);

void completion_mark_complete(struct completion *p);

unsigned completion_wait_for_a_job(struct completion *p, unsigned completed_jobs);
Expand Down

0 comments on commit 700d77b

Please sign in to comment.