Skip to content

Commit

Permalink
Merge pull request #1194 from runcom/drain-first
Browse files (browse the repository at this point in the history)
Bug 1763695: [release-4.2] pkg/daemon: drain before applying changes
  • Loading branch information
openshift-merge-robot authored Jan 9, 2020
2 parents d780d19 + 124c441 commit 4f0e309
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 34 deletions.
5 changes: 4 additions & 1 deletion pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -830,7 +830,10 @@ func (dn *Daemon) checkStateOnFirstRun() error {
// take a stab at that and re-run the drain+reboot routine
if state.pendingConfig != nil && bootID == dn.bootID {
dn.logSystem("drain interrupted, retrying")
return dn.drainAndReboot(state.pendingConfig)
if err := dn.drain(); err != nil {
return err
}
return dn.finalizeAndReboot(state.pendingConfig)
}

if err := dn.detectEarlySSHAccessesFromBoot(); err != nil {
Expand Down
75 changes: 42 additions & 33 deletions pkg/daemon/update.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,10 @@ func (dn *Daemon) updateOSAndReboot(newConfig *mcfgv1.MachineConfig) (retErr err
if err := dn.updateOS(newConfig); err != nil {
return err
}
return dn.drainAndReboot(newConfig)
return dn.finalizeAndReboot(newConfig)
}

func (dn *Daemon) drainAndReboot(newConfig *mcfgv1.MachineConfig) (retErr error) {
func (dn *Daemon) finalizeAndReboot(newConfig *mcfgv1.MachineConfig) (retErr error) {
if out, err := dn.storePendingState(newConfig, 1); err != nil {
return errors.Wrapf(err, "failed to log pending config: %s", string(out))
}
Expand All @@ -121,43 +121,48 @@ func (dn *Daemon) drainAndReboot(newConfig *mcfgv1.MachineConfig) (retErr error)
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "PendingConfig", fmt.Sprintf("Written pending config %s", newConfig.GetName()))
}

// reboot. this function shouldn't actually return.
return dn.reboot(fmt.Sprintf("Node will reboot into config %v", newConfig.GetName()))
}

func (dn *Daemon) drain() error {
// Skip draining of the node when we're not cluster driven
if dn.kubeClient != nil {
dn.logSystem("Update prepared; beginning drain")
if dn.kubeClient == nil {
return nil
}

dn.logSystem("Update prepared; beginning drain")

dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "Drain", "Draining node to update config.")
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "Drain", "Draining node to update config.")

backoff := wait.Backoff{
Steps: 5,
Duration: 10 * time.Second,
Factor: 2,
backoff := wait.Backoff{
Steps: 5,
Duration: 10 * time.Second,
Factor: 2,
}
var lastErr error
if err := wait.ExponentialBackoff(backoff, func() (bool, error) {
err := drain.Drain(dn.kubeClient, []*corev1.Node{dn.node}, &drain.DrainOptions{
DeleteLocalData: true,
Force: true,
GracePeriodSeconds: -1,
IgnoreDaemonsets: true,
Logger: &drainLogger{},
})
if err == nil {
return true, nil
}
var lastErr error
if err := wait.ExponentialBackoff(backoff, func() (bool, error) {
err := drain.Drain(dn.kubeClient, []*corev1.Node{dn.node}, &drain.DrainOptions{
DeleteLocalData: true,
Force: true,
GracePeriodSeconds: -1,
IgnoreDaemonsets: true,
Logger: &drainLogger{},
})
if err == nil {
return true, nil
}
lastErr = err
glog.Infof("Draining failed with: %v, retrying", err)
return false, nil
}); err != nil {
if err == wait.ErrWaitTimeout {
return errors.Wrapf(lastErr, "failed to drain node (%d tries): %v", backoff.Steps, err)
}
return errors.Wrap(err, "failed to drain node")
lastErr = err
glog.Infof("Draining failed with: %v, retrying", err)
return false, nil
}); err != nil {
if err == wait.ErrWaitTimeout {
return errors.Wrapf(lastErr, "failed to drain node (%d tries): %v", backoff.Steps, err)
}
dn.logSystem("drain complete")
return errors.Wrap(err, "failed to drain node")
}

// reboot. this function shouldn't actually return.
return dn.reboot(fmt.Sprintf("Node will reboot into config %v", newConfig.GetName()))
dn.logSystem("drain complete")
return nil
}

var errUnreconcilable = errors.New("unreconcilable")
Expand Down Expand Up @@ -213,6 +218,10 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig) (retErr err
dn.logSystem("Starting update from %s to %s: %+v", oldConfigName, newConfigName, diff)
}

if err := dn.drain(); err != nil {
return err
}

// update files on disk that need updating
if err := dn.updateFiles(oldConfig, newConfig); err != nil {
return err
Expand Down

0 comments on commit 4f0e309

Please sign in to comment.