diff --git a/core/omcmd/node_events.go b/core/omcmd/node_events.go index 06b0824e4..28f36f252 100644 --- a/core/omcmd/node_events.go +++ b/core/omcmd/node_events.go @@ -296,7 +296,9 @@ func (t *CmdNodeEvents) nodeEventLoop(ctx context.Context, nodename string) { _, _ = fmt.Fprintf(os.Stderr, "event read failed for node %s: '%s'\n", nodename, err) _, _ = fmt.Fprintln(os.Stderr, "press ctrl+c to interrupt retries") } - time.Sleep(1 * time.Second) + // wait time before reconnect, it is short to avoid loosing daemon + // restart events. + time.Sleep(100 * time.Millisecond) select { case <-ctx.Done(): t.errC <- ctx.Err() diff --git a/daemon/daemon/main.go b/daemon/daemon/main.go index f32ec6c02..f263d821b 100644 --- a/daemon/daemon/main.go +++ b/daemon/daemon/main.go @@ -99,8 +99,14 @@ func (t *T) Start(ctx context.Context) error { t.ctx = pubsub.ContextWithBus(t.ctx, bus) t.wg.Add(1) bus.Start(t.ctx) + bus.EnableBufferPublication(2000) + bus.Pub(&msgbus.DaemonStatusUpdated{Node: localhost, Version: version.Version(), Status: "starting"}, labelLocalhost) + t.bus = bus t.stopFuncs = append(t.stopFuncs, func() error { + bus.Pub(&msgbus.DaemonStatusUpdated{Node: localhost, Version: version.Version(), Status: "stopped"}, labelLocalhost) + // give chance for DaemonStatusUpdated message to reach peers + time.Sleep(300 * time.Millisecond) defer t.wg.Done() t.log.Infof("stop pubsub bus") t.bus.Stop() @@ -108,6 +114,12 @@ func (t *T) Start(ctx context.Context) error { return nil }) + defer func() { + // give a chance for event client to reconnect to the daemon + time.Sleep(200 * time.Millisecond) + bus.DisableBufferPublication() + }() + defer t.stopWatcher() go t.notifyWatchDogSys(t.ctx) @@ -187,6 +199,9 @@ func (t *T) Stop() error { var errs error // stop goroutines without cancel context t.logTransition("stopping 🟡") + localhost := hostname.Hostname() + t.bus.Pub(&msgbus.DaemonStatusUpdated{Node: localhost, Version: version.Version(), Status: "stopping"}, pubsub.Label{"node", localhost}) + time.Sleep(300 * time.Millisecond) defer t.logTransition("stopped 🟡") for i := len(t.stopFuncs) - 1; i >= 0; i-- { if err := t.stopFuncs[i](); err != nil {