Skip to content

Commit

Permalink
SKS-2174: Gracefully shutdown VM with vGPU to ensure successful relea…
Browse files Browse the repository at this point in the history
…se of vGPU license lease (#159)

## 问题

[SKS-2174] vGPU 集群切换为 GPU 集群时,如果集群中仍然存在 vGPU 节点,vmGracefulShutdown 会被设置为 false。
Jira: http://jira.smartx.com/browse/SKS-2174

## 修复

- 当 ElfMachine 配置了 vGPU 时,只要关机尚未超时,即使 ElfCluster.spec.vmGracefulShutdown 被设置为 false,
  仍然执行正常关机(graceful shutdown),以确保 vGPU license 能够被成功释放。
  • Loading branch information
Levi080513 authored Dec 1, 2023
1 parent 2fe52eb commit bdc5874
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 2 deletions.
7 changes: 5 additions & 2 deletions controllers/elfmachine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -255,9 +255,12 @@ func (r *ElfMachineReconciler) reconcileDeleteVM(ctx *context.MachineContext) er
if *vm.Status == models.VMStatusRUNNING {
var task *models.Task
var err error
// If VM shutdown timed out or VMGracefulShutdown is set to false, simply power off the VM.
// The vGPU license release logic requires the VM to be shutdown gracefully, so if ElfMachine is configured with vGPU,
// we should perform a graceful shutdown to ensure that the vGPU license can be released.
// Therefore, if the ElfMachine is configured with vGPU or ElfCluster.Spec.VMGracefulShutdown is true, the VM is shut down gracefully.
// However, if a previous graceful shutdown has timed out, the VM is simply powered off.
if service.IsShutDownTimeout(conditions.GetMessage(ctx.ElfMachine, infrav1.VMProvisionedCondition)) ||
!ctx.ElfCluster.Spec.VMGracefulShutdown {
!(ctx.ElfMachine.RequiresVGPUDevices() || ctx.ElfCluster.Spec.VMGracefulShutdown) {
task, err = ctx.VMService.PowerOff(ctx.ElfMachine.Status.VMRef)
} else {
task, err = ctx.VMService.ShutDown(ctx.ElfMachine.Status.VMRef)
Expand Down
59 changes: 59 additions & 0 deletions controllers/elfmachine_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2211,6 +2211,7 @@ var _ = Describe("ElfMachineReconciler", func() {
machine.Spec.Bootstrap = clusterv1.Bootstrap{DataSecretName: &secret.Name}
ctrlutil.AddFinalizer(elfMachine, infrav1.MachineFinalizer)
elfMachine.DeletionTimestamp = &metav1.Time{Time: time.Now().UTC()}
elfMachine.Spec.VGPUDevices = nil
elfCluster.Spec.VMGracefulShutdown = true
})

Expand Down Expand Up @@ -2425,6 +2426,37 @@ var _ = Describe("ElfMachineReconciler", func() {
Expect(conditions.GetMessage(elfMachine, infrav1.VMProvisionedCondition)).To(Equal("JOB_VM_SHUTDOWN_TIMEOUT"))
})

It("should power off when the VM which required vGPU devices is powered on and shut down failed", func() {
	// Tower VM with no pending async operation, and a task that has already
	// failed with the shutdown-timeout error message.
	vm := fake.NewTowerVM()
	vm.EntityAsyncStatus = nil
	task := fake.NewTowerTask()
	status := models.TaskStatusFAILED
	task.Status = &status
	task.ErrorMessage = pointer.String("JOB_VM_SHUTDOWN_TIMEOUT")
	elfMachine.Status.VMRef = *vm.LocalID
	elfMachine.Status.TaskRef = *task.ID
	// The spec is about a machine that requires vGPU devices, so configure one
	// explicitly; otherwise this case only repeats the plain shutdown-timeout
	// scenario and never reaches the vGPU-specific branch of the shutdown decision.
	elfMachine.Spec.VGPUDevices = []infrav1.VGPUDeviceSpec{{}}
	ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md)
	fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine)

	// PowerOff is the power operation expected here; no ShutDown expectation
	// is registered for this spec.
	mockVMService.EXPECT().Get(elfMachine.Status.VMRef).Return(vm, nil)
	mockVMService.EXPECT().GetTask(elfMachine.Status.TaskRef).Return(task, nil)
	mockVMService.EXPECT().PowerOff(elfMachine.Status.VMRef).Return(task, nil)
	mockVMService.EXPECT().FindVMsByName(elfMachine.Name).Return(nil, nil)

	reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService}
	elfMachineKey := capiutil.ObjectKey(elfMachine)
	result, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey})
	Expect(result.RequeueAfter).NotTo(BeZero())
	Expect(err).To(BeZero())
	Expect(logBuffer.String()).To(ContainSubstring("VM task failed"))
	Expect(logBuffer.String()).To(ContainSubstring("Waiting for VM shut down"))

	// Re-read the ElfMachine from the client to verify what was persisted:
	// the VM reference is kept and the condition carries the task failure message.
	elfMachine = &infrav1.ElfMachine{}
	Expect(reconciler.Client.Get(reconciler, elfMachineKey, elfMachine)).To(Succeed())
	Expect(elfMachine.Status.VMRef).To(Equal(*vm.LocalID))
	expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.TaskFailureReason}})
	Expect(conditions.GetMessage(elfMachine, infrav1.VMProvisionedCondition)).To(Equal("JOB_VM_SHUTDOWN_TIMEOUT"))
})

It("should handle task - done", func() {
vm := fake.NewTowerVM()
vm.EntityAsyncStatus = nil
Expand Down Expand Up @@ -2477,6 +2509,33 @@ var _ = Describe("ElfMachineReconciler", func() {
expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, clusterv1.DeletingReason}})
})

It("should shutdown VM when the VM which required vGPU devices is powered on and cluster VMGracefulShutdown is false", func() {
	// Fixtures: a Tower VM with no pending async operation and the task the
	// mocked ShutDown call will return.
	towerVM := fake.NewTowerVM()
	towerVM.EntityAsyncStatus = nil
	shutDownTask := fake.NewTowerTask()

	// The cluster disables graceful shutdown while the machine asks for a vGPU device.
	elfCluster.Spec.VMGracefulShutdown = false
	elfMachine.Spec.VGPUDevices = []infrav1.VGPUDeviceSpec{{}}
	elfMachine.Status.VMRef = *towerVM.LocalID

	ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md)
	fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine)

	// Only ShutDown is registered as a power operation; no PowerOff expectation exists.
	vmRef := elfMachine.Status.VMRef
	mockVMService.EXPECT().Get(vmRef).Return(towerVM, nil)
	mockVMService.EXPECT().ShutDown(vmRef).Return(shutDownTask, nil)
	mockVMService.EXPECT().FindVMsByName(elfMachine.Name).Return(nil, nil)

	machineKey := capiutil.ObjectKey(elfMachine)
	r := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService}
	result, err := r.Reconcile(ctx, ctrl.Request{NamespacedName: machineKey})
	Expect(result.RequeueAfter).NotTo(BeZero())
	Expect(err).To(BeZero())
	Expect(logBuffer.String()).To(ContainSubstring("Waiting for VM shut down"))

	// Fetch the stored ElfMachine again and check that the VM reference is kept,
	// the returned task is recorded, and the condition reports deletion in progress.
	elfMachine = &infrav1.ElfMachine{}
	Expect(r.Client.Get(r, machineKey, elfMachine)).To(Succeed())
	Expect(elfMachine.Status.VMRef).To(Equal(*towerVM.LocalID))
	Expect(elfMachine.Status.TaskRef).To(Equal(*shutDownTask.ID))
	expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, clusterv1.DeletingReason}})
})

It("should handle delete error", func() {
vm := fake.NewTowerVM()
vm.Name = &elfMachine.Name
Expand Down

0 comments on commit bdc5874

Please sign in to comment.