diff --git a/manifests/machineconfigdaemon/daemonset.yaml b/manifests/machineconfigdaemon/daemonset.yaml index 560921ce1f..5337913236 100644 --- a/manifests/machineconfigdaemon/daemonset.yaml +++ b/manifests/machineconfigdaemon/daemonset.yaml @@ -79,7 +79,7 @@ spec: hostNetwork: true hostPID: true serviceAccountName: machine-config-daemon - terminationGracePeriodSeconds: 600 + terminationGracePeriodSeconds: 3600 nodeSelector: kubernetes.io/os: linux priorityClassName: "system-node-critical" diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 69d1280a77..619a4c6fdb 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -91,7 +91,7 @@ type Daemon struct { kubeletHealthzEnabled bool kubeletHealthzEndpoint string - updateActive bool + // updateActiveLock is used to signal that an update operation is in progress updateActiveLock sync.Mutex nodeWriter NodeWriter @@ -351,15 +351,19 @@ func (dn *Daemon) worker() { } func (dn *Daemon) processNextWorkItem() bool { - key, quit := dn.queue.Get() - if quit { + select { + case <-dn.stopCh: return false - } - defer dn.queue.Done(key) - - err := dn.syncHandler(key.(string)) - dn.handleErr(err, key) + default: + key, quit := dn.queue.Get() + if quit { + return false + } + defer dn.queue.Done(key) + err := dn.syncHandler(key.(string)) + dn.handleErr(err, key) + } return true } @@ -580,29 +584,28 @@ func (dn *Daemon) RunFirstbootCompleteMachineconfig() error { return dn.reboot(fmt.Sprintf("Completing firstboot provisioning to %s", mc.GetName())) } -// InstallSignalHandler installs the handler for the signals the daemon should act on -func (dn *Daemon) InstallSignalHandler(signaled chan struct{}) { - termChan := make(chan os.Signal, 2048) - signal.Notify(termChan, syscall.SIGTERM) +// InterruptHandler ensures that shutdown operations are blocked until an +// an update operation is finished. +func (dn *Daemon) InterruptHandler(signaled chan int) { + glog.Info("Guarding against sigterm signal") // Catch SIGTERM - if we're actively updating, we should avoid // having the process be killed. // https://github.com/openshift/machine-config-operator/issues/407 go func() { - for sig := range termChan { - //nolint:gocritic - switch sig { - case syscall.SIGTERM: - dn.updateActiveLock.Lock() - updateActive := dn.updateActive - dn.updateActiveLock.Unlock() - if updateActive { - glog.Info("Got SIGTERM, but actively updating") - } else { - close(signaled) - return - } - } + sig := make(chan os.Signal, 1) + signal.Notify(sig, os.Interrupt, syscall.SIGTERM) + + select { + case <-dn.stopCh: + glog.Info("sigterm handler function stopped") + dn.updateActiveLock.Lock() + return + case <-sig: + glog.Warningf("sigterm received waiting to claim update lock") + dn.updateActiveLock.Lock() + glog.Info("lock obtained, proceeding to shutdown") + signaled <- 15 } }() } @@ -617,8 +620,8 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error { glog.Info("Starting MachineConfigDaemon") defer glog.Info("Shutting down MachineConfigDaemon") - signaled := make(chan struct{}) - dn.InstallSignalHandler(signaled) + signaled := make(chan int) + dn.InterruptHandler(signaled) if dn.kubeletHealthzEnabled { glog.Info("Enabling Kubelet Healthz Monitor") @@ -634,16 +637,16 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error { go wait.Until(dn.worker, time.Second, stopCh) - for { - select { - case <-stopCh: - return nil - case <-signaled: - return nil - case err := <-exitCh: - // This channel gets errors from auxiliary goroutines like loginmonitor and kubehealth - glog.Warningf("Got an error from auxiliary tools: %v", err) - } + select { + case <-stopCh: + return nil + case sig := <-signaled: + glog.Infof("shutdown of machine-config-daemon triggered via signal %d", sig) + return nil + case err := <-exitCh: + // This channel gets errors from auxiliary goroutines like loginmonitor and kubehealth + glog.Warningf("Got an error from auxiliary tools: %v", err) + return err } } diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go index 4778d9d016..ec3559ed04 100644 --- a/pkg/daemon/update.go +++ b/pkg/daemon/update.go @@ -573,6 +573,10 @@ func calculatePostConfigChangeAction(oldConfig, newConfig *mcfgv1.MachineConfig) func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig) (retErr error) { oldConfig = canonicalizeEmptyMC(oldConfig) + // signal that an update is active + dn.updateActiveLock.Lock() + defer dn.updateActiveLock.Unlock() + if dn.nodeWriter != nil { state, err := getNodeAnnotationExt(dn.node, constants.MachineConfigDaemonStateAnnotationKey, true) if err != nil { @@ -585,13 +589,6 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig) (retErr err } } - dn.catchIgnoreSIGTERM() - defer func() { - if retErr != nil { - dn.cancelSIGTERM() - } - }() - oldConfigName := oldConfig.GetName() newConfigName := newConfig.GetName() @@ -1914,29 +1911,11 @@ func (dn *Daemon) logSystem(format string, a ...interface{}) { } } -func (dn *Daemon) catchIgnoreSIGTERM() { - dn.updateActiveLock.Lock() - defer dn.updateActiveLock.Unlock() - if dn.updateActive { - return - } - dn.updateActive = true -} - -func (dn *Daemon) cancelSIGTERM() { - dn.updateActiveLock.Lock() - defer dn.updateActiveLock.Unlock() - if dn.updateActive { - dn.updateActive = false - } -} - // reboot is the final step. it tells systemd-logind to reboot the machine, // cleans up the agent's connections, and then sleeps for 7 days. if it wakes up // and manages to return, it returns a scary error message. func (dn *Daemon) reboot(rationale string) error { // Now that everything is done, avoid delaying shutdown. - dn.cancelSIGTERM() dn.Close() if dn.skipReboot { diff --git a/pkg/operator/assets/bindata.go b/pkg/operator/assets/bindata.go index a6850235a9..5501ae7969 100644 --- a/pkg/operator/assets/bindata.go +++ b/pkg/operator/assets/bindata.go @@ -1158,7 +1158,7 @@ spec: hostNetwork: true hostPID: true serviceAccountName: machine-config-daemon - terminationGracePeriodSeconds: 600 + terminationGracePeriodSeconds: 3600 nodeSelector: kubernetes.io/os: linux priorityClassName: "system-node-critical"