From bdc587499649df9476d3d7abc7c0b69f7a10e9fd Mon Sep 17 00:00:00 2001
From: Levi080513
Date: Fri, 1 Dec 2023 20:01:40 +0800
Subject: [PATCH] SKS-2174: Gracefully shutdown VM with vGPU to ensure
 successful release of vGPU license lease (#159)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

[SKS-2174] When a vGPU cluster is switched to a GPU cluster while the cluster
still has vGPU nodes, vmGracefulShutdown is set to false.

- Jira: http://jira.smartx.com/browse/SKS-2174

## Fix

- When the ElfMachine is configured with vGPU and the shutdown has not timed
  out, still perform a graceful shutdown even if
  ElfCluster.spec.vmGracefulShutdown is set to false.

---
 controllers/elfmachine_controller.go      |  7 ++-
 controllers/elfmachine_controller_test.go | 59 +++++++++++++++++++++++
 2 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/controllers/elfmachine_controller.go b/controllers/elfmachine_controller.go
index d17f8d94..c7a10f7a 100644
--- a/controllers/elfmachine_controller.go
+++ b/controllers/elfmachine_controller.go
@@ -255,9 +255,12 @@ func (r *ElfMachineReconciler) reconcileDeleteVM(ctx *context.MachineContext) er
 	if *vm.Status == models.VMStatusRUNNING {
 		var task *models.Task
 		var err error
-		// If VM shutdown timed out or VMGracefulShutdown is set to false, simply power off the VM.
+		// The vGPU license release logic requires the VM to be shut down gracefully, so if the ElfMachine is configured with vGPU,
+		// we should perform a graceful shutdown to ensure that the vGPU license can be released.
+		// Therefore, if the ElfMachine is configured with vGPU or ElfCluster.Spec.VMGracefulShutdown is true, the virtual machine will be shut down normally.
+		// But if the VM shutdown timed out, simply power off the VM.
 		if service.IsShutDownTimeout(conditions.GetMessage(ctx.ElfMachine, infrav1.VMProvisionedCondition)) ||
-			!ctx.ElfCluster.Spec.VMGracefulShutdown {
+			!(ctx.ElfMachine.RequiresVGPUDevices() || ctx.ElfCluster.Spec.VMGracefulShutdown) {
 			task, err = ctx.VMService.PowerOff(ctx.ElfMachine.Status.VMRef)
 		} else {
 			task, err = ctx.VMService.ShutDown(ctx.ElfMachine.Status.VMRef)
diff --git a/controllers/elfmachine_controller_test.go b/controllers/elfmachine_controller_test.go
index af1bfbed..5631af65 100644
--- a/controllers/elfmachine_controller_test.go
+++ b/controllers/elfmachine_controller_test.go
@@ -2211,6 +2211,7 @@ var _ = Describe("ElfMachineReconciler", func() {
 			machine.Spec.Bootstrap = clusterv1.Bootstrap{DataSecretName: &secret.Name}
 			ctrlutil.AddFinalizer(elfMachine, infrav1.MachineFinalizer)
 			elfMachine.DeletionTimestamp = &metav1.Time{Time: time.Now().UTC()}
+			elfMachine.Spec.VGPUDevices = nil
 			elfCluster.Spec.VMGracefulShutdown = true
 		})
 
@@ -2425,6 +2426,37 @@ var _ = Describe("ElfMachineReconciler", func() {
 			Expect(conditions.GetMessage(elfMachine, infrav1.VMProvisionedCondition)).To(Equal("JOB_VM_SHUTDOWN_TIMEOUT"))
 		})
 
+		It("should power off when the VM which required vGPU devices is powered on and shut down failed", func() {
+			vm := fake.NewTowerVM()
+			vm.EntityAsyncStatus = nil
+			task := fake.NewTowerTask()
+			status := models.TaskStatusFAILED
+			task.Status = &status
+			task.ErrorMessage = pointer.String("JOB_VM_SHUTDOWN_TIMEOUT")
+			elfMachine.Status.VMRef = *vm.LocalID
+			elfMachine.Status.TaskRef = *task.ID
+			ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md)
+			fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine)
+
+			mockVMService.EXPECT().Get(elfMachine.Status.VMRef).Return(vm, nil)
+			mockVMService.EXPECT().GetTask(elfMachine.Status.TaskRef).Return(task, nil)
+			mockVMService.EXPECT().PowerOff(elfMachine.Status.VMRef).Return(task, nil)
+			mockVMService.EXPECT().FindVMsByName(elfMachine.Name).Return(nil, nil)
+
+			reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService}
+			elfMachineKey := capiutil.ObjectKey(elfMachine)
+			result, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey})
+			Expect(result.RequeueAfter).NotTo(BeZero())
+			Expect(err).To(BeZero())
+			Expect(logBuffer.String()).To(ContainSubstring("VM task failed"))
+			Expect(logBuffer.String()).To(ContainSubstring("Waiting for VM shut down"))
+			elfMachine = &infrav1.ElfMachine{}
+			Expect(reconciler.Client.Get(reconciler, elfMachineKey, elfMachine)).To(Succeed())
+			Expect(elfMachine.Status.VMRef).To(Equal(*vm.LocalID))
+			expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.TaskFailureReason}})
+			Expect(conditions.GetMessage(elfMachine, infrav1.VMProvisionedCondition)).To(Equal("JOB_VM_SHUTDOWN_TIMEOUT"))
+		})
+
 		It("should handle task - done", func() {
 			vm := fake.NewTowerVM()
 			vm.EntityAsyncStatus = nil
@@ -2477,6 +2509,33 @@ var _ = Describe("ElfMachineReconciler", func() {
 			expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, clusterv1.DeletingReason}})
 		})
 
+		It("should shutdown VM when the VM which required vGPU devices is powered on and cluster VMGracefulShutdown is false", func() {
+			vm := fake.NewTowerVM()
+			vm.EntityAsyncStatus = nil
+			task := fake.NewTowerTask()
+			elfMachine.Status.VMRef = *vm.LocalID
+			elfMachine.Spec.VGPUDevices = []infrav1.VGPUDeviceSpec{{}}
+			elfCluster.Spec.VMGracefulShutdown = false
+			ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md)
+			fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine)
+
+			mockVMService.EXPECT().Get(elfMachine.Status.VMRef).Return(vm, nil)
+			mockVMService.EXPECT().ShutDown(elfMachine.Status.VMRef).Return(task, nil)
+			mockVMService.EXPECT().FindVMsByName(elfMachine.Name).Return(nil, nil)
+
+			reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService}
+			elfMachineKey := capiutil.ObjectKey(elfMachine)
+			result, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey})
+			Expect(result.RequeueAfter).NotTo(BeZero())
+			Expect(err).To(BeZero())
+			Expect(logBuffer.String()).To(ContainSubstring("Waiting for VM shut down"))
+			elfMachine = &infrav1.ElfMachine{}
+			Expect(reconciler.Client.Get(reconciler, elfMachineKey, elfMachine)).To(Succeed())
+			Expect(elfMachine.Status.VMRef).To(Equal(*vm.LocalID))
+			Expect(elfMachine.Status.TaskRef).To(Equal(*task.ID))
+			expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, clusterv1.DeletingReason}})
+		})
+
 		It("should handle delete error", func() {
 			vm := fake.NewTowerVM()
 			vm.Name = &elfMachine.Name