diff --git a/src/daemon/watcher.c b/src/daemon/watcher.c index 2a3bcf544b27ab..70f71f8e4386bb 100644 --- a/src/daemon/watcher.c +++ b/src/daemon/watcher.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-3.0-or-later -#include "daemon/common.h" #include "watcher.h" watcher_step_t *watcher_steps; @@ -30,24 +29,39 @@ void *watcher_main(void *arg) completion_wait_for(&shutdown_begin_completion); usec_t shutdown_start_time = now_monotonic_usec(); - // TODO: - // - add a version of completion_wait_for with timeout - // - check the step's duration and abort when the timeout has expired. + netdata_log_error("Shutdown process started"); + + unsigned timeout = 60; + for (int step_id = 0; step_id != WATCHER_STEP_ID_MAX; step_id++) { usec_t step_start_time = now_monotonic_usec(); + +#ifdef ENABLE_SENTRY + // Wait with a timeout + bool ok = completion_timedwait_for(&watcher_steps[step_id].p, timeout); +#else + // Wait indefinitely + bool ok = true; completion_wait_for(&watcher_steps[step_id].p); - usec_t step_end_time = now_monotonic_usec(); - - usec_t step_duration = step_end_time - step_start_time; - netdata_log_info("shutdown step: [%d/%d] - '%s' finished in %llu milliseconds", - step_id + 1, - WATCHER_STEP_ID_MAX, - watcher_steps[step_id].msg, - step_duration / USEC_PER_MS); +#endif + + usec_t step_duration = now_monotonic_usec() - step_start_time; + + if (ok) { + netdata_log_info("shutdown step: [%d/%d] - '%s' finished in %llu milliseconds", + step_id + 1, WATCHER_STEP_ID_MAX, + watcher_steps[step_id].msg, step_duration / USEC_PER_MS); + } else { + // Do not call fatal() because it will try to execute the exit + // sequence twice. + netdata_log_error("shutdown step: [%d/%d] - '%s' took more than %u seconds (ie. %llu milliseconds)", + step_id + 1, WATCHER_STEP_ID_MAX, watcher_steps[step_id].msg, + timeout, step_duration / USEC_PER_MS); + + abort(); + } } - netdata_log_error("Shutdown process started"); - completion_wait_for(&shutdown_end_completion); usec_t shutdown_end_time = now_monotonic_usec(); diff --git a/src/libnetdata/completion/completion.c b/src/libnetdata/completion/completion.c index 6257e02998236d..113423835a889e 100644 --- a/src/libnetdata/completion/completion.c +++ b/src/libnetdata/completion/completion.c @@ -26,6 +26,41 @@ void completion_wait_for(struct completion *p) uv_mutex_unlock(&p->mutex); } +bool completion_timedwait_for(struct completion *p, uint64_t timeout) +{ + timeout *= NSEC_PER_SEC; + + uint64_t start_time = uv_hrtime(); + bool result = true; + + uv_mutex_lock(&p->mutex); + while (!p->completed) { + int rc = uv_cond_timedwait(&p->cond, &p->mutex, timeout); + + if (rc == 0) { + result = true; + break; + } else if (rc == UV_ETIMEDOUT) { + result = false; + break; + } + + /* + * handle spurious wakeups + */ + + uint64_t elapsed = uv_hrtime() - start_time; + if (elapsed >= timeout) { + result = false; + break; + } + timeout -= elapsed; + } + uv_mutex_unlock(&p->mutex); + + return result; +} + void completion_mark_complete(struct completion *p) { uv_mutex_lock(&p->mutex); diff --git a/src/libnetdata/completion/completion.h b/src/libnetdata/completion/completion.h index 723f736889666b..908ccfaf6419ff 100644 --- a/src/libnetdata/completion/completion.h +++ b/src/libnetdata/completion/completion.h @@ -18,6 +18,10 @@ void completion_destroy(struct completion *p); void completion_wait_for(struct completion *p); +// Wait for at most `timeout` seconds. Return true on success, false on +// error or timeout. +bool completion_timedwait_for(struct completion *p, uint64_t timeout); + void completion_mark_complete(struct completion *p); unsigned completion_wait_for_a_job(struct completion *p, unsigned completed_jobs);