From 1ed0e13e4c07b8a5e809a50568841dc3c9041a36 Mon Sep 17 00:00:00 2001 From: "Weber.Yang" Date: Tue, 27 Feb 2024 15:01:43 +0800 Subject: [PATCH] SKS-1390: Optimize the creation of virtual machines when insufficient storage detected for ELF cluster (#170) --- api/v1beta1/conditions_consts.go | 4 ++ controllers/elfmachine_controller.go | 16 +++++-- controllers/elfmachine_controller_test.go | 47 +++++++++++++++---- controllers/tower_cache.go | 25 ++++++++++- controllers/tower_cache_test.go | 55 +++++++++++++++-------- pkg/service/errors.go | 5 +++ 6 files changed, 119 insertions(+), 33 deletions(-) diff --git a/api/v1beta1/conditions_consts.go b/api/v1beta1/conditions_consts.go index ef75b629..c54f149b 100644 --- a/api/v1beta1/conditions_consts.go +++ b/api/v1beta1/conditions_consts.go @@ -110,6 +110,10 @@ const ( // waiting for ELF cluster with sufficient memory to create or power on VM. WaitingForELFClusterWithSufficientMemoryReason = "WaitingForELFClusterWithSufficientMemory" + // WaitingForELFClusterWithSufficientStorageReason (Severity=Info) documents an ElfMachine + // waiting for ELF cluster with sufficient storage to create or power on VM. + WaitingForELFClusterWithSufficientStorageReason = "WaitingForELFClusterWithSufficientStorage" + // WaitingForAvailableHostRequiredByPlacementGroupReason (Severity=Info) documents an ElfMachine // waiting for an available host required by placement group to create VM. WaitingForAvailableHostRequiredByPlacementGroupReason = "WaitingForAvailableHostRequiredByPlacementGroup" diff --git a/controllers/elfmachine_controller.go b/controllers/elfmachine_controller.go index 5ce67a28..b537a894 100644 --- a/controllers/elfmachine_controller.go +++ b/controllers/elfmachine_controller.go @@ -938,6 +938,7 @@ func (r *ElfMachineReconciler) reconcileVMTask(ctx *context.MachineContext, vm * if service.IsCloneVMTask(task) || service.IsPowerOnVMTask(task) { releaseTicketForCreateVM(ctx.ElfMachine.Name) + recordElfClusterStorageInsufficient(ctx, false) recordElfClusterMemoryInsufficient(ctx, false) if err := recordPlacementGroupPolicyNotSatisfied(ctx, false); err != nil { @@ -974,10 +975,6 @@ func (r *ElfMachineReconciler) reconcileVMFailedTask(ctx *context.MachineContext case service.IsCloneVMTask(task): releaseTicketForCreateVM(ctx.ElfMachine.Name) - if service.IsVMDuplicateError(errorMessage) { - setVMDuplicate(ctx.ElfMachine.Name) - } - if ctx.ElfMachine.RequiresGPUDevices() { unlockGPUDevicesLockedByVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name) } @@ -985,6 +982,17 @@ func (r *ElfMachineReconciler) reconcileVMFailedTask(ctx *context.MachineContext if ctx.ElfMachine.RequiresGPUDevices() { unlockGPUDevicesLockedByVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name) } + } + + switch { + case service.IsVMDuplicateError(errorMessage): + setVMDuplicate(ctx.ElfMachine.Name) + case service.IsStorageInsufficientError(errorMessage): + recordElfClusterStorageInsufficient(ctx, true) + message := fmt.Sprintf("Insufficient storage detected for the ELF cluster %s", ctx.ElfCluster.Spec.Cluster) + ctx.Logger.Info(message) + + return errors.New(message) case service.IsMemoryInsufficientError(errorMessage): recordElfClusterMemoryInsufficient(ctx, true) message := fmt.Sprintf("Insufficient memory detected for the ELF cluster %s", ctx.ElfCluster.Spec.Cluster) diff --git a/controllers/elfmachine_controller_test.go b/controllers/elfmachine_controller_test.go index 21bd6326..6eabb116 100644 --- a/controllers/elfmachine_controller_test.go +++ b/controllers/elfmachine_controller_test.go @@ -331,11 +331,11 @@ var _ = Describe("ElfMachineReconciler", func() { expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.CloningFailedReason}}) }) - It("should create a new VM if none exists", func() { + It("should create a new VM if not exists", func() { resetMemoryCache() vm := fake.NewTowerVM() vm.Name = &elfMachine.Name - elfCluster.Spec.Cluster = clusterKey + elfCluster.Spec.Cluster = clusterInsufficientStorageKey task := fake.NewTowerTask() withTaskVM := fake.NewWithTaskVM(vm, task) ctrlutil.AddFinalizer(elfMachine, infrav1.MachineFinalizer) @@ -344,12 +344,26 @@ var _ = Describe("ElfMachineReconciler", func() { machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) machineContext.VMService = mockVMService - recordIsUnmet(machineContext, clusterKey, true) - reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + recordOrClearError(machineContext, clusterInsufficientStorageKey, true) + mockVMService.EXPECT().GetVMPlacementGroup(gomock.Any()).Return(placementGroup, nil) elfMachineKey := capiutil.ObjectKey(elfMachine) + reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} result, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey}) Expect(result.RequeueAfter).NotTo(BeZero()) Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Insufficient storage detected for the ELF cluster")) + expireELFScheduleVMError(machineContext, clusterInsufficientStorageKey) + + logBuffer.Reset() + elfCluster.Spec.Cluster = clusterInsufficientMemoryKey + ctrlContext = newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + machineContext = newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) + recordOrClearError(machineContext, clusterInsufficientMemoryKey, true) + reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey}) + Expect(result.RequeueAfter).NotTo(BeZero()) + Expect(err).NotTo(HaveOccurred()) Expect(logBuffer.String()).To(ContainSubstring("Insufficient memory detected for the ELF cluster")) logBuffer = new(bytes.Buffer) @@ -358,7 +372,7 @@ var _ = Describe("ElfMachineReconciler", func() { mockVMService.EXPECT().Get(*vm.ID).Return(vm, nil) mockVMService.EXPECT().GetTask(*task.ID).Return(task, nil) mockVMService.EXPECT().GetVMPlacementGroup(gomock.Any()).Return(placementGroup, nil) - expireELFScheduleVMError(machineContext, clusterKey) + expireELFScheduleVMError(machineContext, clusterInsufficientMemoryKey) reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey}) @@ -899,12 +913,12 @@ var _ = Describe("ElfMachineReconciler", func() { vm := fake.NewTowerVM() vm.Host = &models.NestedHost{ID: service.TowerString(fake.ID())} elfMachine.Status.VMRef = *vm.LocalID - elfCluster.Spec.Cluster = clusterKey + elfCluster.Spec.Cluster = clusterInsufficientMemoryKey ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) machineContext.VMService = mockVMService - recordIsUnmet(machineContext, clusterKey, true) + recordOrClearError(machineContext, clusterInsufficientMemoryKey, true) reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} err := reconciler.powerOnVM(machineContext, vm) Expect(err).NotTo(HaveOccurred()) @@ -912,7 +926,7 @@ var _ = Describe("ElfMachineReconciler", func() { task := fake.NewTowerTask() mockVMService.EXPECT().PowerOn(elfMachine.Status.VMRef, "").Return(task, nil) - expireELFScheduleVMError(machineContext, clusterKey) + expireELFScheduleVMError(machineContext, clusterInsufficientMemoryKey) err = reconciler.powerOnVM(machineContext, vm) Expect(err).NotTo(HaveOccurred()) Expect(logBuffer.String()).To(ContainSubstring("and the retry silence period passes, will try to power on the VM again")) @@ -3258,11 +3272,28 @@ var _ = Describe("ElfMachineReconciler", func() { Expect(err.Error()).To(ContainSubstring("The placement group policy can not be satisfied")) Expect(logBuffer.String()).To(ContainSubstring("VM task failed")) + logBuffer.Reset() ok, msg, err := isELFScheduleVMErrorRecorded(machineContext) Expect(ok).To(BeTrue()) Expect(msg).To(ContainSubstring("Insufficient memory detected for the ELF cluster")) Expect(err).ShouldNot(HaveOccurred()) + resetMemoryCache() + logBuffer.Reset() + elfMachine.Status.TaskRef = *task.ID + task.ErrorMessage = service.TowerString(service.StorageInsufficientError) + ok, err = reconciler.reconcileVMTask(machineContext, nil) + Expect(ok).Should(BeTrue()) + Expect(err.Error()).To(ContainSubstring("Insufficient storage detected for the ELF cluster")) + Expect(elfMachine.Status.TaskRef).To(Equal("")) + Expect(logBuffer.String()).To(ContainSubstring("VM task failed")) + + logBuffer.Reset() + ok, msg, err = isELFScheduleVMErrorRecorded(machineContext) + Expect(ok).To(BeTrue()) + Expect(msg).To(ContainSubstring("Insufficient storage detected for the ELF cluster")) + Expect(err).ShouldNot(HaveOccurred()) + // Start VM task.Status = models.NewTaskStatus(models.TaskStatusSUCCESSED) task.Description = service.TowerString("Start VM") diff --git a/controllers/tower_cache.go b/controllers/tower_cache.go index a65f97dd..9e3d9c86 100644 --- a/controllers/tower_cache.go +++ b/controllers/tower_cache.go @@ -50,7 +50,8 @@ type clusterResource struct { // // Includes these scenarios: // 1. ELF cluster has insufficient memory. -// 2. Placement group not satisfy policy. +// 2. ELF cluster has insufficient storage. +// 3. Cannot satisfy the PlacementGroup policy. func isELFScheduleVMErrorRecorded(ctx *context.MachineContext) (bool, string, error) { lock.Lock() defer lock.Unlock() @@ -59,6 +60,10 @@ func isELFScheduleVMErrorRecorded(ctx *context.MachineContext) (bool, string, er conditions.MarkFalse(ctx.ElfMachine, infrav1.VMProvisionedCondition, infrav1.WaitingForELFClusterWithSufficientMemoryReason, clusterv1.ConditionSeverityInfo, "") return true, fmt.Sprintf("Insufficient memory detected for the ELF cluster %s", ctx.ElfCluster.Spec.Cluster), nil + } else if resource := getClusterResource(getKeyForInsufficientStorageError(ctx.ElfCluster.Spec.Cluster)); resource != nil { + conditions.MarkFalse(ctx.ElfMachine, infrav1.VMProvisionedCondition, infrav1.WaitingForELFClusterWithSufficientStorageReason, clusterv1.ConditionSeverityInfo, "") + + return true, fmt.Sprintf("Insufficient storage detected for the ELF cluster %s", ctx.ElfCluster.Spec.Cluster), nil } placementGroupName, err := towerresources.GetVMPlacementGroupName(ctx, ctx.Client, ctx.Machine, ctx.Cluster) @@ -85,6 +90,16 @@ func recordElfClusterMemoryInsufficient(ctx *context.MachineContext, isInsuffici } } +// recordElfClusterStorageInsufficient records whether the storage is insufficient. +func recordElfClusterStorageInsufficient(ctx *context.MachineContext, isError bool) { + key := getKeyForInsufficientStorageError(ctx.ElfCluster.Spec.Cluster) + if isError { + inMemoryCache.Set(key, newClusterResource(), resourceDuration) + } else { + inMemoryCache.Delete(key) + } +} + // recordPlacementGroupPolicyNotSatisfied records whether the placement group not satisfy policy. func recordPlacementGroupPolicyNotSatisfied(ctx *context.MachineContext, isPGPolicyNotSatisfied bool) error { placementGroupName, err := towerresources.GetVMPlacementGroupName(ctx, ctx.Client, ctx.Machine, ctx.Cluster) @@ -116,7 +131,9 @@ func canRetryVMOperation(ctx *context.MachineContext) (bool, error) { lock.Lock() defer lock.Unlock() - if ok := canRetry(getKeyForInsufficientMemoryError(ctx.ElfCluster.Spec.Cluster)); ok { + if ok := canRetry(getKeyForInsufficientStorageError(ctx.ElfCluster.Spec.Cluster)); ok { + return true, nil + } else if ok := canRetry(getKeyForInsufficientMemoryError(ctx.ElfCluster.Spec.Cluster)); ok { return true, nil } @@ -158,6 +175,10 @@ func getClusterResource(key string) *clusterResource { return nil } +func getKeyForInsufficientStorageError(clusterID string) string { + return fmt.Sprintf("insufficient:storage:%s", clusterID) +} + func getKeyForInsufficientMemoryError(clusterID string) string { return fmt.Sprintf("insufficient:memory:%s", clusterID) } diff --git a/controllers/tower_cache_test.go b/controllers/tower_cache_test.go index 3d3a69e9..d8c52b24 100644 --- a/controllers/tower_cache_test.go +++ b/controllers/tower_cache_test.go @@ -34,8 +34,10 @@ import ( ) const ( - clusterKey = "clusterID" - placementGroupKey = "getPlacementGroupName" + clusterKey = "clusterID" + clusterInsufficientMemoryKey = "clusterInsufficientMemory" + clusterInsufficientStorageKey = "clusterInsufficientStorage" + placementGroupKey = "getPlacementGroupName" ) var _ = Describe("TowerCache", func() { @@ -44,7 +46,7 @@ var _ = Describe("TowerCache", func() { }) It("should set memoryInsufficient/policyNotSatisfied", func() { - for _, name := range []string{clusterKey, placementGroupKey} { + for _, name := range []string{clusterInsufficientMemoryKey, clusterInsufficientStorageKey, placementGroupKey} { resetMemoryCache() elfCluster, cluster, elfMachine, machine, secret := fake.NewClusterAndMachineObjects() elfCluster.Spec.Cluster = name @@ -59,19 +61,19 @@ var _ = Describe("TowerCache", func() { _, found := inMemoryCache.Get(key) Expect(found).To(BeFalse()) - recordIsUnmet(machineContext, name, true) + recordOrClearError(machineContext, name, true) _, found = inMemoryCache.Get(key) Expect(found).To(BeTrue()) resource := getClusterResource(key) Expect(resource.LastDetected).To(Equal(resource.LastRetried)) - recordIsUnmet(machineContext, name, true) + recordOrClearError(machineContext, name, true) lastDetected := resource.LastDetected resource = getClusterResource(key) Expect(resource.LastDetected).To(Equal(resource.LastRetried)) Expect(resource.LastDetected.After(lastDetected)).To(BeTrue()) - recordIsUnmet(machineContext, name, false) + recordOrClearError(machineContext, name, false) resource = getClusterResource(key) Expect(resource).To(BeNil()) @@ -79,17 +81,17 @@ var _ = Describe("TowerCache", func() { _, found = inMemoryCache.Get(key) Expect(found).To(BeFalse()) - recordIsUnmet(machineContext, name, false) + recordOrClearError(machineContext, name, false) resource = getClusterResource(key) _, found = inMemoryCache.Get(key) Expect(found).To(BeFalse()) Expect(resource).To(BeNil()) - recordIsUnmet(machineContext, name, false) + recordOrClearError(machineContext, name, false) resource = getClusterResource(key) Expect(resource).To(BeNil()) - recordIsUnmet(machineContext, name, true) + recordOrClearError(machineContext, name, true) _, found = inMemoryCache.Get(key) Expect(found).To(BeTrue()) resource = getClusterResource(key) @@ -98,7 +100,7 @@ var _ = Describe("TowerCache", func() { }) It("should return whether need to detect", func() { - for _, name := range []string{clusterKey, placementGroupKey} { + for _, name := range []string{clusterInsufficientMemoryKey, clusterInsufficientStorageKey, placementGroupKey} { resetMemoryCache() elfCluster, cluster, elfMachine, machine, secret := fake.NewClusterAndMachineObjects() elfCluster.Spec.Cluster = name @@ -116,12 +118,12 @@ var _ = Describe("TowerCache", func() { Expect(ok).To(BeFalse()) Expect(err).ShouldNot(HaveOccurred()) - recordIsUnmet(machineContext, name, false) + recordOrClearError(machineContext, name, false) ok, err = canRetryVMOperation(machineContext) Expect(ok).To(BeFalse()) Expect(err).ShouldNot(HaveOccurred()) - recordIsUnmet(machineContext, name, true) + recordOrClearError(machineContext, name, true) ok, err = canRetryVMOperation(machineContext) Expect(ok).To(BeFalse()) Expect(err).ShouldNot(HaveOccurred()) @@ -154,7 +156,8 @@ var _ = Describe("TowerCache", func() { Expect(err).ShouldNot(HaveOccurred()) expectConditions(elfMachine, []conditionAssertion{}) - recordIsUnmet(machineContext, clusterKey, true) + elfCluster.Spec.Cluster = clusterInsufficientMemoryKey + recordOrClearError(machineContext, clusterInsufficientMemoryKey, true) ok, msg, err = isELFScheduleVMErrorRecorded(machineContext) Expect(ok).To(BeTrue()) Expect(msg).To(ContainSubstring("Insufficient memory detected for the ELF cluster")) @@ -162,7 +165,16 @@ var _ = Describe("TowerCache", func() { expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.WaitingForELFClusterWithSufficientMemoryReason}}) resetMemoryCache() - recordIsUnmet(machineContext, placementGroupKey, true) + elfCluster.Spec.Cluster = clusterInsufficientStorageKey + recordOrClearError(machineContext, clusterInsufficientStorageKey, true) + ok, msg, err = isELFScheduleVMErrorRecorded(machineContext) + Expect(ok).To(BeTrue()) + Expect(msg).To(ContainSubstring("Insufficient storage detected for the ELF cluster clusterInsufficientStorage")) + Expect(err).ShouldNot(HaveOccurred()) + expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.WaitingForELFClusterWithSufficientStorageReason}}) + + resetMemoryCache() + recordOrClearError(machineContext, placementGroupKey, true) ok, msg, err = isELFScheduleVMErrorRecorded(machineContext) Expect(ok).To(BeTrue()) Expect(msg).To(ContainSubstring("Not satisfy policy detected for the placement group")) @@ -220,8 +232,10 @@ func removeGPUVMInfosCache(gpuIDs []string) { } func getKey(ctx *context.MachineContext, name string) string { - if name == clusterKey { + if name == clusterInsufficientMemoryKey { return getKeyForInsufficientMemoryError(name) + } else if name == clusterInsufficientStorageKey { + return getKeyForInsufficientStorageError(name) } placementGroupName, err := towerresources.GetVMPlacementGroupName(ctx, ctx.Client, ctx.Machine, ctx.Cluster) @@ -230,13 +244,16 @@ func getKey(ctx *context.MachineContext, name string) string { return getKeyForDuplicatePlacementGroupError(placementGroupName) } -func recordIsUnmet(ctx *context.MachineContext, key string, isUnmet bool) { - if strings.Contains(key, clusterKey) { - recordElfClusterMemoryInsufficient(ctx, isUnmet) +func recordOrClearError(ctx *context.MachineContext, key string, record bool) { + if strings.Contains(key, clusterInsufficientMemoryKey) { + recordElfClusterMemoryInsufficient(ctx, record) + return + } else if strings.Contains(key, clusterInsufficientStorageKey) { + recordElfClusterStorageInsufficient(ctx, record) return } - Expect(recordPlacementGroupPolicyNotSatisfied(ctx, isUnmet)).ShouldNot(HaveOccurred()) + Expect(recordPlacementGroupPolicyNotSatisfied(ctx, record)).ShouldNot(HaveOccurred()) } func expireELFScheduleVMError(ctx *context.MachineContext, name string) { diff --git a/pkg/service/errors.go b/pkg/service/errors.go index d4435878..f783cf3b 100644 --- a/pkg/service/errors.go +++ b/pkg/service/errors.go @@ -37,6 +37,7 @@ const ( LabelAddFailed = "LABEL_ADD_FAILED" CloudInitError = "VM_CLOUD_INIT_CONFIG_ERROR" MemoryInsufficientError = "HostAvailableMemoryFilter" + StorageInsufficientError = "EAllocSpace" PlacementGroupError = "PlacementGroupFilter" // SMTX OS <= 5.0.4 PlacementGroupMustError = "PlacementGroupMustFilter" PlacementGroupPriorError = "PlacementGroupPriorFilter" @@ -111,6 +112,10 @@ func ParseGPUAssignFailed(message string) string { return message[index:] } +func IsStorageInsufficientError(message string) bool { + return strings.Contains(message, StorageInsufficientError) +} + func IsMemoryInsufficientError(message string) bool { return strings.Contains(message, MemoryInsufficientError) }