Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug 1927041: daemon: safer signal handling for shutdown #2395

Merged
merged 2 commits into from
Feb 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion manifests/machineconfigdaemon/daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ spec:
hostNetwork: true
hostPID: true
serviceAccountName: machine-config-daemon
terminationGracePeriodSeconds: 600
terminationGracePeriodSeconds: 3600
nodeSelector:
kubernetes.io/os: linux
priorityClassName: "system-node-critical"
Expand Down
79 changes: 41 additions & 38 deletions pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ type Daemon struct {
kubeletHealthzEnabled bool
kubeletHealthzEndpoint string

updateActive bool
// updateActiveLock is used to signal that an update operation is in progress
updateActiveLock sync.Mutex

nodeWriter NodeWriter
Expand Down Expand Up @@ -351,15 +351,19 @@ func (dn *Daemon) worker() {
}

func (dn *Daemon) processNextWorkItem() bool {
key, quit := dn.queue.Get()
if quit {
select {
case <-dn.stopCh:
return false
}
defer dn.queue.Done(key)

err := dn.syncHandler(key.(string))
dn.handleErr(err, key)
default:
key, quit := dn.queue.Get()
if quit {
return false
}
defer dn.queue.Done(key)

err := dn.syncHandler(key.(string))
dn.handleErr(err, key)
}
return true
}

Expand Down Expand Up @@ -580,29 +584,28 @@ func (dn *Daemon) RunFirstbootCompleteMachineconfig() error {
return dn.reboot(fmt.Sprintf("Completing firstboot provisioning to %s", mc.GetName()))
}

// InstallSignalHandler installs the handler for the signals the daemon should act on
func (dn *Daemon) InstallSignalHandler(signaled chan struct{}) {
termChan := make(chan os.Signal, 2048)
signal.Notify(termChan, syscall.SIGTERM)
// InterruptHandler ensures that shutdown operations are blocked until an
// an update operation is finished.
darkmuggle marked this conversation as resolved.
Show resolved Hide resolved
func (dn *Daemon) InterruptHandler(signaled chan int) {
glog.Info("Guarding against sigterm signal")

// Catch SIGTERM - if we're actively updating, we should avoid
// having the process be killed.
// https://github.com/openshift/machine-config-operator/issues/407
go func() {
for sig := range termChan {
//nolint:gocritic
switch sig {
case syscall.SIGTERM:
dn.updateActiveLock.Lock()
updateActive := dn.updateActive
dn.updateActiveLock.Unlock()
if updateActive {
glog.Info("Got SIGTERM, but actively updating")
} else {
close(signaled)
return
}
}
sig := make(chan os.Signal, 1)
signal.Notify(sig, os.Interrupt, syscall.SIGTERM)

select {
case <-dn.stopCh:
glog.Info("sigterm handler function stopped")
dn.updateActiveLock.Lock()
return
case <-sig:
glog.Warningf("sigterm received waiting to claim update lock")
dn.updateActiveLock.Lock()
glog.Info("lock obtained, proceeding to shutdown")
signaled <- 15
}
}()
}
Expand All @@ -617,8 +620,8 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error {
glog.Info("Starting MachineConfigDaemon")
defer glog.Info("Shutting down MachineConfigDaemon")

signaled := make(chan struct{})
dn.InstallSignalHandler(signaled)
signaled := make(chan int)
dn.InterruptHandler(signaled)

if dn.kubeletHealthzEnabled {
glog.Info("Enabling Kubelet Healthz Monitor")
Expand All @@ -634,16 +637,16 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error {

go wait.Until(dn.worker, time.Second, stopCh)

for {
Copy link
Contributor Author

@darkmuggle darkmuggle Feb 17, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The for loop is superfluous since once we get the sigterm we should finish the work and then get out of the way. A sigkill could/likely follow and there's literally nothing we can do about that.

select {
case <-stopCh:
return nil
case <-signaled:
return nil
case err := <-exitCh:
// This channel gets errors from auxiliary goroutines like loginmonitor and kubehealth
glog.Warningf("Got an error from auxiliary tools: %v", err)
}
select {
case <-stopCh:
return nil
case sig := <-signaled:
glog.Infof("shutdown of machine-config-daemon triggered via signal %d", sig)
return nil
case err := <-exitCh:
// This channel gets errors from auxiliary goroutines like loginmonitor and kubehealth
glog.Warningf("Got an error from auxiliary tools: %v", err)
return err
}
}

Expand Down
29 changes: 4 additions & 25 deletions pkg/daemon/update.go
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,10 @@ func calculatePostConfigChangeAction(oldConfig, newConfig *mcfgv1.MachineConfig)
func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig) (retErr error) {
oldConfig = canonicalizeEmptyMC(oldConfig)

// signal that an update is active
dn.updateActiveLock.Lock()
defer dn.updateActiveLock.Unlock()

if dn.nodeWriter != nil {
state, err := getNodeAnnotationExt(dn.node, constants.MachineConfigDaemonStateAnnotationKey, true)
if err != nil {
Expand All @@ -585,13 +589,6 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig) (retErr err
}
}

dn.catchIgnoreSIGTERM()
defer func() {
if retErr != nil {
dn.cancelSIGTERM()
}
}()

oldConfigName := oldConfig.GetName()
newConfigName := newConfig.GetName()

Expand Down Expand Up @@ -1914,29 +1911,11 @@ func (dn *Daemon) logSystem(format string, a ...interface{}) {
}
}

func (dn *Daemon) catchIgnoreSIGTERM() {
dn.updateActiveLock.Lock()
defer dn.updateActiveLock.Unlock()
if dn.updateActive {
return
}
dn.updateActive = true
}

func (dn *Daemon) cancelSIGTERM() {
dn.updateActiveLock.Lock()
defer dn.updateActiveLock.Unlock()
if dn.updateActive {
dn.updateActive = false
}
}

// reboot is the final step. it tells systemd-logind to reboot the machine,
// cleans up the agent's connections, and then sleeps for 7 days. if it wakes up
// and manages to return, it returns a scary error message.
func (dn *Daemon) reboot(rationale string) error {
// Now that everything is done, avoid delaying shutdown.
dn.cancelSIGTERM()
dn.Close()

if dn.skipReboot {
Expand Down
2 changes: 1 addition & 1 deletion pkg/operator/assets/bindata.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.