diff --git a/controllers/elfmachine_controller.go b/controllers/elfmachine_controller.go index 29bfc762..5ce67a28 100644 --- a/controllers/elfmachine_controller.go +++ b/controllers/elfmachine_controller.go @@ -112,7 +112,7 @@ func AddMachineControllerToManager(ctx *context.ControllerManagerContext, mgr ct } // Reconcile ensures the back-end state reflects the Kubernetes resource state intent. -func (r *ElfMachineReconciler) Reconcile(ctx goctx.Context, req ctrl.Request) (_ ctrl.Result, reterr error) { +func (r *ElfMachineReconciler) Reconcile(ctx goctx.Context, req ctrl.Request) (result ctrl.Result, reterr error) { // Get the ElfMachine resource for this request. var elfMachine infrav1.ElfMachine if err := r.Client.Get(r, req.NamespacedName, &elfMachine); err != nil { @@ -223,6 +223,22 @@ func (r *ElfMachineReconciler) Reconcile(ctx goctx.Context, req ctrl.Request) (_ machineContext.Logger.Error(err, "patch failed", "elfMachine", machineContext.String()) } + + // If the node's healthy condition is unknown, the virtual machine may + // have been shut down through Tower or directly on the virtual machine. + // We need to try to reconcile to ensure that the virtual machine is powered on. 
+ if err == nil && result.IsZero() && + !machineutil.IsMachineFailed(machineContext.Machine) && + machineContext.Machine.DeletionTimestamp.IsZero() && + machineContext.ElfMachine.DeletionTimestamp.IsZero() && + machineutil.IsNodeHealthyConditionUnknown(machineContext.Machine) { + lastTransitionTime := conditions.GetLastTransitionTime(machineContext.Machine, clusterv1.MachineNodeHealthyCondition) + if lastTransitionTime != nil && time.Now().Before(lastTransitionTime.Add(config.VMPowerStatusCheckingDuration)) { + result.RequeueAfter = config.DefaultRequeueTimeout + + machineContext.Logger.Info(fmt.Sprintf("The node's healthy condition is unknown, virtual machine may have been shut down, will reconcile after %s", result.RequeueAfter), "nodeConditionUnknownTime", lastTransitionTime) + } + } }() // Handle deleted machines diff --git a/controllers/elfmachine_controller_test.go b/controllers/elfmachine_controller_test.go index cd40d9fc..21bd6326 100644 --- a/controllers/elfmachine_controller_test.go +++ b/controllers/elfmachine_controller_test.go @@ -229,6 +229,79 @@ var _ = Describe("ElfMachineReconciler", func() { Expect(reconciler.Client.Get(reconciler, elfMachineKey, elfMachine)).To(Succeed()) expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.WaitingForBootstrapDataReason}}) }) + + It("should requeue when node's healthy condition is unknown", func() { + ctrlutil.AddFinalizer(elfMachine, infrav1.MachineFinalizer) + ctrlutil.AddFinalizer(machine, infrav1.MachineFinalizer) + cluster.Status.InfrastructureReady = false + + ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + elfMachineKey := capiutil.ObjectKey(elfMachine) + result, err := reconciler.Reconcile(ctx, 
ctrl.Request{NamespacedName: elfMachineKey}) + Expect(err).ToNot(HaveOccurred()) + Expect(result.IsZero()).To(BeTrue()) + + logBuffer.Reset() + message := "The node's healthy condition is unknown, virtual machine may have been shut down, will reconcile" + conditions.MarkUnknown(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeConditionsFailedReason, "test") + ctrlContext = newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey}) + Expect(err).ToNot(HaveOccurred()) + Expect(result.RequeueAfter).To(Equal(config.DefaultRequeueTimeout)) + Expect(logBuffer.String()).To(ContainSubstring(message)) + + machine.Status.Conditions[0].LastTransitionTime = metav1.NewTime(time.Now().Add(-config.VMPowerStatusCheckingDuration)) + ctrlContext = newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey}) + Expect(err).ToNot(HaveOccurred()) + Expect(result.IsZero()).To(BeTrue()) + + conditions.MarkUnknown(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeConditionsFailedReason, "test") + machine.Status.FailureMessage = pointer.String("error") + ctrlContext = newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey}) + 
Expect(err).ToNot(HaveOccurred()) + Expect(result.IsZero()).To(BeTrue()) + + conditions.MarkUnknown(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeConditionsFailedReason, "test") + machine.Status.FailureMessage = nil + machine.DeletionTimestamp = &metav1.Time{Time: time.Now().UTC()} + ctrlContext = newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey}) + Expect(err).ToNot(HaveOccurred()) + Expect(result.IsZero()).To(BeTrue()) + + mockVMService.EXPECT().GetByName(elfMachine.Name).Return(nil, errors.New(service.VMNotFound)) + machine.DeletionTimestamp = nil + elfMachine.DeletionTimestamp = &metav1.Time{Time: time.Now().UTC()} + ctrlutil.AddFinalizer(elfMachine, "no-gc") + ctrlContext = newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey}) + Expect(err).ToNot(HaveOccurred()) + Expect(result.IsZero()).To(BeTrue()) + + machine.DeletionTimestamp = &metav1.Time{Time: time.Now().UTC()} + ctrlutil.AddFinalizer(machine, "no-gc") + elfMachine.DeletionTimestamp = nil + ctrlContext = newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey}) + Expect(err).ToNot(HaveOccurred()) + Expect(result.IsZero()).To(BeTrue()) + }) }) 
Context("Reconcile ElfMachine VM", func() { diff --git a/pkg/config/config.go b/pkg/config/config.go index cf965608..bdfcab39 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -33,4 +33,8 @@ var ( // WaitTaskTimeoutForPlacementGroupOperation is the timeout for waiting for placement group creating/updating/deleting task to complete. WaitTaskTimeoutForPlacementGroupOperation = 10 * time.Second + + // VMPowerStatusCheckingDuration is the time duration for checking if the VM is powered off + // after the Machine's NodeHealthy condition status is set to Unknown. + VMPowerStatusCheckingDuration = 2 * time.Minute ) diff --git a/pkg/util/machine/machine.go b/pkg/util/machine/machine.go index e66ac4e0..e4f380d4 100644 --- a/pkg/util/machine/machine.go +++ b/pkg/util/machine/machine.go @@ -25,6 +25,7 @@ import ( "github.com/pkg/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/controller-runtime/pkg/client" infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1" @@ -114,6 +115,20 @@ func GetNodeGroupName(machine *clusterv1.Machine) string { return strings.ReplaceAll(nodeGroupName, fmt.Sprintf("%s-", clusterName), "") } +// IsNodeHealthyConditionUnknown returns whether the node's healthy condition is unknown. +func IsNodeHealthyConditionUnknown(machine *clusterv1.Machine) bool { + if conditions.IsUnknown(machine, clusterv1.MachineNodeHealthyCondition) && + conditions.GetReason(machine, clusterv1.MachineNodeHealthyCondition) == clusterv1.NodeConditionsFailedReason { + return true + } + + return false +} + +func IsMachineFailed(machine *clusterv1.Machine) bool { + return machine.Status.FailureReason != nil || machine.Status.FailureMessage != nil +} + func ConvertProviderIDToUUID(providerID *string) string { if providerID == nil || *providerID == "" { return ""