diff --git a/cluster-autoscaler/clusterstate/clusterstate_test.go b/cluster-autoscaler/clusterstate/clusterstate_test.go
index b22a2fd5b7e0..ca1b72df5668 100644
--- a/cluster-autoscaler/clusterstate/clusterstate_test.go
+++ b/cluster-autoscaler/clusterstate/clusterstate_test.go
@@ -135,6 +135,77 @@ func TestEmptyOK(t *testing.T) {
 	assert.True(t, clusterstate.IsNodeGroupScalingUp("ng1"))
 }
 
+// TestRecalculateStateAfterNodeGroupSizeChanged checks that Recalculate updates state correctly after
+// some node group size changed. We verify that acceptable ranges are updated accordingly
+// and that the UpcomingNodes reflect the node group size change (important for recalculating state after
+// deleting scale-up nodes that failed to create).
+func TestRecalculateStateAfterNodeGroupSizeChanged(t *testing.T) {
+	ngName := "ng1"
+	testCases := []struct {
+		name                string
+		acceptableRange     AcceptableRange
+		readiness           Readiness
+		newTarget           int
+		scaleUpRequest      *ScaleUpRequest
+		wantAcceptableRange AcceptableRange
+		wantUpcoming        int
+	}{
+		{
+			name:                "failed scale up by 3 nodes",
+			acceptableRange:     AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4},
+			readiness:           Readiness{Ready: make([]string, 1)},
+			newTarget:           1,
+			wantAcceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 1, MaxNodes: 1},
+			wantUpcoming:        0,
+		}, {
+			name:                "partially failed scale up",
+			acceptableRange:     AcceptableRange{MinNodes: 5, CurrentTarget: 7, MaxNodes: 8},
+			readiness:           Readiness{Ready: make([]string, 5)},
+			newTarget:           6,
+			wantAcceptableRange: AcceptableRange{MinNodes: 5, CurrentTarget: 6, MaxNodes: 6},
+			scaleUpRequest:      &ScaleUpRequest{Increase: 1},
+			wantUpcoming:        1,
+		}, {
+			name:                "scale up ongoing, no change",
+			acceptableRange:     AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4},
+			readiness:           Readiness{Ready: make([]string, 1)},
+			newTarget:           4,
+			wantAcceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4},
+			scaleUpRequest:      &ScaleUpRequest{Increase: 3},
+			wantUpcoming:        3,
+		}, {
+			name:                "no scale up, no change",
+			acceptableRange:     AcceptableRange{MinNodes: 4, CurrentTarget: 4, MaxNodes: 4},
+			readiness:           Readiness{Ready: make([]string, 4)},
+			newTarget:           4,
+			wantAcceptableRange: AcceptableRange{MinNodes: 4, CurrentTarget: 4, MaxNodes: 4},
+			wantUpcoming:        0,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			provider := testprovider.NewTestCloudProvider(nil, nil)
+			provider.AddNodeGroup(ngName, 0, 1000, tc.newTarget)
+
+			fakeLogRecorder, _ := utils.NewStatusMapRecorder(&fake.Clientset{}, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap")
+			clusterState := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{}, fakeLogRecorder,
+				newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{}))
+			clusterState.acceptableRanges = map[string]AcceptableRange{ngName: tc.acceptableRange}
+			clusterState.perNodeGroupReadiness = map[string]Readiness{ngName: tc.readiness}
+			if tc.scaleUpRequest != nil {
+				clusterState.scaleUpRequests = map[string]*ScaleUpRequest{ngName: tc.scaleUpRequest}
+			}
+
+			clusterState.Recalculate()
+			assert.Equal(t, tc.wantAcceptableRange, clusterState.acceptableRanges[ngName])
+			upcomingCounts, _ := clusterState.GetUpcomingNodes()
+			if upcoming, found := upcomingCounts[ngName]; found {
+				assert.Equal(t, tc.wantUpcoming, upcoming, "Unexpected upcoming nodes count, want: %d got: %d", tc.wantUpcoming, upcomingCounts[ngName])
+			}
+		})
+	}
+}
+
 func TestOKOneUnreadyNode(t *testing.T) {
 	now := time.Now()
diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index 905d56f2019f..56f16f59c423 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -421,10 +421,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
 		return nil
 	}
 
-	if deletedNodes := a.deleteCreatedNodesWithErrors(); deletedNodes {
-		klog.V(0).Infof("Some nodes that failed to create were removed, skipping iteration")
-		return nil
-	}
+	a.deleteCreatedNodesWithErrors()
 
 	// Check if there has been a constant difference between the number of nodes in k8s and
 	// the number of nodes on the cloud provider side.
@@ -808,7 +805,7 @@ func toNodes(unregisteredNodes []clusterstate.UnregisteredNode) []*apiv1.Node {
 	return nodes
 }
 
-func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
+func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {
 	// We always schedule deleting of incoming errornous nodes
 	// TODO[lukaszos] Consider adding logic to not retry delete every loop iteration
 	nodeGroups := a.nodeGroupsById()
@@ -850,7 +847,10 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
 		}
 	}
 
-	return deletedAny
+	if deletedAny {
+		klog.V(0).Infof("Some nodes that failed to create were removed, recalculating cluster state.")
+		a.clusterStateRegistry.Recalculate()
+	}
 }
 
 // instancesToNodes returns a list of fake nodes with just names populated,
diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go
index bf958ad4af59..474439a5891a 100644
--- a/cluster-autoscaler/core/static_autoscaler_test.go
+++ b/cluster-autoscaler/core/static_autoscaler_test.go
@@ -1164,8 +1164,11 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
 	clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)
 
 	// delete nodes with create errors
-	removedNodes := autoscaler.deleteCreatedNodesWithErrors()
-	assert.True(t, removedNodes)
+	autoscaler.deleteCreatedNodesWithErrors()
+
+	// nodes should be deleted
+	expectedDeleteCalls := 1
+	nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", expectedDeleteCalls)
 
 	// check delete was called on correct nodes
 	nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
@@ -1189,10 +1192,12 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
 	clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)
 
 	// delete nodes with create errors
-	removedNodes = autoscaler.deleteCreatedNodesWithErrors()
-	assert.True(t, removedNodes)
+	autoscaler.deleteCreatedNodesWithErrors()
 
 	// nodes should be deleted again
+	expectedDeleteCalls += 1
+	nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", expectedDeleteCalls)
+
 	nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
 		func(nodes []*apiv1.Node) bool {
 			if len(nodes) != 4 {
@@ -1253,11 +1258,10 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
 	clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)
 
 	// delete nodes with create errors
-	removedNodes = autoscaler.deleteCreatedNodesWithErrors()
-	assert.False(t, removedNodes)
+	autoscaler.deleteCreatedNodesWithErrors()
 
-	// we expect no more Delete Nodes
-	nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", 2)
+	// we expect no more Delete Nodes, don't increase expectedDeleteCalls
+	nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", expectedDeleteCalls)
 
 	// failed node not included by NodeGroupForNode
 	nodeGroupC := &mockprovider.NodeGroup{}
@@ -1295,8 +1299,7 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
 	clusterState.UpdateNodes([]*apiv1.Node{}, nil, time.Now())
 
 	// No nodes are deleted when failed nodes don't have matching node groups
-	removedNodes = autoscaler.deleteCreatedNodesWithErrors()
-	assert.False(t, removedNodes)
+	autoscaler.deleteCreatedNodesWithErrors()
 	nodeGroupC.AssertNumberOfCalls(t, "DeleteNodes", 0)
 
 	nodeGroupAtomic := &mockprovider.NodeGroup{}
@@ -1351,8 +1354,7 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
 	clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)
 
 	// delete nodes with create errors
-	removedNodes = autoscaler.deleteCreatedNodesWithErrors()
-	assert.True(t, removedNodes)
+	autoscaler.deleteCreatedNodesWithErrors()
 
 	nodeGroupAtomic.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
 		func(nodes []*apiv1.Node) bool {
@@ -1365,6 +1367,56 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
 			}
 			return names["D1"] && names["D2"] && names["D3"]
 		}))
+
+	// Node group with getOptions error gets no deletes.
+	nodeGroupError := &mockprovider.NodeGroup{}
+	nodeGroupError.On("Exist").Return(true)
+	nodeGroupError.On("Autoprovisioned").Return(false)
+	nodeGroupError.On("TargetSize").Return(1, nil)
+	nodeGroupError.On("Id").Return("E")
+	nodeGroupError.On("DeleteNodes", mock.Anything).Return(nil)
+	nodeGroupError.On("GetOptions", options.NodeGroupDefaults).Return(nil, fmt.Errorf("Failed to get options"))
+	nodeGroupError.On("Nodes").Return([]cloudprovider.Instance{
+		{
+			Id: "E1",
+			Status: &cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceRunning,
+			},
+		},
+		{
+
+			Id: "E2",
+			Status: &cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceCreating,
+				ErrorInfo: &cloudprovider.InstanceErrorInfo{
+					ErrorClass: cloudprovider.OutOfResourcesErrorClass,
+					ErrorCode:  "QUOTA",
+				},
+			},
+		},
+	}, nil)
+
+	provider = &mockprovider.CloudProvider{}
+	provider.On("NodeGroups").Return([]cloudprovider.NodeGroup{nodeGroupError})
+	provider.On("NodeGroupForNode", mock.Anything).Return(
+		func(node *apiv1.Node) cloudprovider.NodeGroup {
+			if strings.HasPrefix(node.Spec.ProviderID, "E") {
+				return nodeGroupError
+			}
+			return nil
+		}, nil).Times(2)
+
+	clusterState = clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff(), nodeGroupConfigProcessor)
+	clusterState.RefreshCloudProviderNodeInstancesCache()
+	autoscaler.CloudProvider = provider
+	autoscaler.clusterStateRegistry = clusterState
+	// propagate nodes info in cluster state
+	clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)
+
+	// delete nodes with create errors
+	autoscaler.deleteCreatedNodesWithErrors()
+
+	nodeGroupError.AssertNumberOfCalls(t, "DeleteNodes", 0)
 }
 
 type candidateTrackingFakePlanner struct {
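Not part of the patch: a minimal, self-contained Go sketch of the control-flow change this diff makes in RunOnce. The stub types below (fakeRegistry, fakeAutoscaler, runOnce) are hypothetical stand-ins for ClusterStateRegistry and StaticAutoscaler; the only point illustrated is that deleteCreatedNodesWithErrors no longer returns a bool that short-circuits the iteration, but instead triggers Recalculate when anything was deleted, so the rest of the loop still runs against fresh state.

// Sketch only (not part of the patch): hypothetical stand-ins illustrating the
// new control flow, where deleting failed-to-create nodes recalculates cluster
// state instead of ending the RunOnce iteration early.
package main

import "fmt"

// fakeRegistry is a hypothetical stand-in for clusterstate.ClusterStateRegistry.
type fakeRegistry struct{}

func (r *fakeRegistry) Recalculate() {
	fmt.Println("recalculating acceptable ranges and upcoming nodes")
}

// fakeAutoscaler is a hypothetical stand-in for core.StaticAutoscaler.
type fakeAutoscaler struct {
	registry *fakeRegistry
}

// deleteCreatedNodesWithErrors mirrors the patched signature: no bool result;
// a successful deletion now triggers a state recalculation instead.
func (a *fakeAutoscaler) deleteCreatedNodesWithErrors() {
	deletedAny := true // pretend some failed nodes were removed this iteration
	if deletedAny {
		a.registry.Recalculate()
	}
}

func (a *fakeAutoscaler) runOnce() {
	// Previously a successful deletion returned early and skipped the rest of
	// the iteration; now the loop simply continues.
	a.deleteCreatedNodesWithErrors()
	fmt.Println("continuing with the rest of the iteration")
}

func main() {
	(&fakeAutoscaler{registry: &fakeRegistry{}}).runOnce()
}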