Skip to content

Commit

Permalink
SKS-2387: Requeue ElfMachine when NodeHealthy condition is unknown to…
Browse files Browse the repository at this point in the history
… ensure the VM can be powered on ASAP (#171)
  • Loading branch information
haijianyang authored Feb 26, 2024
1 parent 66ba3c9 commit 6e0b432
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 1 deletion.
18 changes: 17 additions & 1 deletion controllers/elfmachine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ func AddMachineControllerToManager(ctx *context.ControllerManagerContext, mgr ct
}

// Reconcile ensures the back-end state reflects the Kubernetes resource state intent.
func (r *ElfMachineReconciler) Reconcile(ctx goctx.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
func (r *ElfMachineReconciler) Reconcile(ctx goctx.Context, req ctrl.Request) (result ctrl.Result, reterr error) {
// Get the ElfMachine resource for this request.
var elfMachine infrav1.ElfMachine
if err := r.Client.Get(r, req.NamespacedName, &elfMachine); err != nil {
Expand Down Expand Up @@ -223,6 +223,22 @@ func (r *ElfMachineReconciler) Reconcile(ctx goctx.Context, req ctrl.Request) (_

machineContext.Logger.Error(err, "patch failed", "elfMachine", machineContext.String())
}

// If the node's healthy condition is unknown, the virtual machine may
// have been shut down through Tower or directly on the virtual machine.
// We need to try to reconcile to ensure that the virtual machine is powered on.
if err == nil && result.IsZero() &&
!machineutil.IsMachineFailed(machineContext.Machine) &&
machineContext.Machine.DeletionTimestamp.IsZero() &&
machineContext.ElfMachine.DeletionTimestamp.IsZero() &&
machineutil.IsNodeHealthyConditionUnknown(machineContext.Machine) {
lastTransitionTime := conditions.GetLastTransitionTime(machineContext.Machine, clusterv1.MachineNodeHealthyCondition)
if lastTransitionTime != nil && time.Now().Before(lastTransitionTime.Add(config.VMPowerStatusCheckingDuration)) {
result.RequeueAfter = config.DefaultRequeueTimeout

machineContext.Logger.Info(fmt.Sprintf("The node's healthy condition is unknown, virtual machine may have been shut down, will reconcile after %s", result.RequeueAfter), "nodeConditionUnknownTime", lastTransitionTime)
}
}
}()

// Handle deleted machines
Expand Down
73 changes: 73 additions & 0 deletions controllers/elfmachine_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,79 @@ var _ = Describe("ElfMachineReconciler", func() {
Expect(reconciler.Client.Get(reconciler, elfMachineKey, elfMachine)).To(Succeed())
expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.WaitingForBootstrapDataReason}})
})

// Drives the same ElfMachine through a series of reconciles to check when the
// NodeHealthy=Unknown requeue is applied and when it is suppressed. Every step
// mutates the shared fixtures, then rebuilds the controller context and the
// reconciler before calling Reconcile again.
It("should requeue when node's healthy condition is unknown", func() {
ctrlutil.AddFinalizer(elfMachine, infrav1.MachineFinalizer)
ctrlutil.AddFinalizer(machine, infrav1.MachineFinalizer)
cluster.Status.InfrastructureReady = false

// Case 1 (baseline): reconcile before this test marks the NodeHealthy
// condition; the result is expected to be zero, i.e. no requeue.
ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md)
fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine)
reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService}
elfMachineKey := capiutil.ObjectKey(elfMachine)
result, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey})
Expect(err).ToNot(HaveOccurred())
Expect(result.IsZero()).To(BeTrue())

// Case 2: NodeHealthy is marked Unknown with NodeConditionsFailedReason.
// The reconcile is expected to request a requeue after
// config.DefaultRequeueTimeout and to log the message below. The log buffer
// is reset first so only output from this reconcile is matched.
logBuffer.Reset()
message := "The node's healthy condition is unknown, virtual machine may have been shut down, will reconcile"
conditions.MarkUnknown(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeConditionsFailedReason, "test")
ctrlContext = newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md)
fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine)
reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService}
result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey})
Expect(err).ToNot(HaveOccurred())
Expect(result.RequeueAfter).To(Equal(config.DefaultRequeueTimeout))
Expect(logBuffer.String()).To(ContainSubstring(message))

// Case 3: backdate the condition's transition time by
// config.VMPowerStatusCheckingDuration so the checking window has elapsed;
// no requeue is expected.
// NOTE(review): assumes the NodeHealthy condition is at index 0 of
// machine.Status.Conditions — TODO confirm.
machine.Status.Conditions[0].LastTransitionTime = metav1.NewTime(time.Now().Add(-config.VMPowerStatusCheckingDuration))
ctrlContext = newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md)
fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine)
reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService}
result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey})
Expect(err).ToNot(HaveOccurred())
Expect(result.IsZero()).To(BeTrue())

// Case 4: the Machine carries a FailureMessage; no requeue is expected.
// NOTE(review): if conditions.MarkUnknown keeps the existing
// LastTransitionTime when the condition state is unchanged, this case and the
// ones below still run with the backdated (expired) window from case 3 and
// would pass even without the failed/deleting guards — TODO confirm.
conditions.MarkUnknown(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeConditionsFailedReason, "test")
machine.Status.FailureMessage = pointer.String("error")
ctrlContext = newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md)
fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine)
reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService}
result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey})
Expect(err).ToNot(HaveOccurred())
Expect(result.IsZero()).To(BeTrue())

// Case 5: failure cleared, but the Machine has a DeletionTimestamp; no
// requeue is expected.
conditions.MarkUnknown(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeConditionsFailedReason, "test")
machine.Status.FailureMessage = nil
machine.DeletionTimestamp = &metav1.Time{Time: time.Now().UTC()}
ctrlContext = newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md)
fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine)
reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService}
result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey})
Expect(err).ToNot(HaveOccurred())
Expect(result.IsZero()).To(BeTrue())

// Case 6: only the ElfMachine has a DeletionTimestamp. The VM service mock
// is set up to answer GetByName with a VMNotFound error, and an extra
// "no-gc" finalizer is added to the ElfMachine. No requeue is expected.
mockVMService.EXPECT().GetByName(elfMachine.Name).Return(nil, errors.New(service.VMNotFound))
machine.DeletionTimestamp = nil
elfMachine.DeletionTimestamp = &metav1.Time{Time: time.Now().UTC()}
ctrlutil.AddFinalizer(elfMachine, "no-gc")
ctrlContext = newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md)
fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine)
reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService}
result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey})
Expect(err).ToNot(HaveOccurred())
Expect(result.IsZero()).To(BeTrue())

// Case 7: only the Machine has a DeletionTimestamp (with an extra "no-gc"
// finalizer added to it); the ElfMachine's is cleared. No requeue is expected.
machine.DeletionTimestamp = &metav1.Time{Time: time.Now().UTC()}
ctrlutil.AddFinalizer(machine, "no-gc")
elfMachine.DeletionTimestamp = nil
ctrlContext = newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md)
fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine)
reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService}
result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey})
Expect(err).ToNot(HaveOccurred())
Expect(result.IsZero()).To(BeTrue())
})
})

Context("Reconcile ElfMachine VM", func() {
Expand Down
4 changes: 4 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,8 @@ var (

// WaitTaskTimeoutForPlacementGroupOperation is the timeout for waiting for placement group creating/updating/deleting task to complete.
WaitTaskTimeoutForPlacementGroupOperation = 10 * time.Second

// VMPowerStatusCheckingDuration is the time duration for checking if the VM is powered off
// after the Machine's NodeHealthy condition status is set to Unknown.
VMPowerStatusCheckingDuration = 2 * time.Minute
)
15 changes: 15 additions & 0 deletions pkg/util/machine/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"github.com/pkg/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/controller-runtime/pkg/client"

infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1"
Expand Down Expand Up @@ -114,6 +115,20 @@ func GetNodeGroupName(machine *clusterv1.Machine) string {
return strings.ReplaceAll(nodeGroupName, fmt.Sprintf("%s-", clusterName), "")
}

// IsNodeHealthyConditionUnknown reports whether the Machine's NodeHealthy
// condition is Unknown and its reason is clusterv1.NodeConditionsFailedReason.
// Both checks must hold; an Unknown condition with any other reason yields false.
func IsNodeHealthyConditionUnknown(machine *clusterv1.Machine) bool {
	// Bail out early so the reason is only looked up for an Unknown condition.
	if !conditions.IsUnknown(machine, clusterv1.MachineNodeHealthyCondition) {
		return false
	}

	reason := conditions.GetReason(machine, clusterv1.MachineNodeHealthyCondition)

	return reason == clusterv1.NodeConditionsFailedReason
}

// IsMachineFailed reports whether the Machine's status has a FailureReason or
// a FailureMessage set (either one being non-nil is enough).
func IsMachineFailed(machine *clusterv1.Machine) bool {
	if machine.Status.FailureReason != nil {
		return true
	}

	return machine.Status.FailureMessage != nil
}

func ConvertProviderIDToUUID(providerID *string) string {
if providerID == nil || *providerID == "" {
return ""
Expand Down

0 comments on commit 6e0b432

Please sign in to comment.