[release-0.47] Timeout if Pending takes too long (#1027)
* Timeout if Pending takes too long

Add a LastUnavailableNodeCountUpdate timestamp to avoid
getting stuck in Pending forever.

Signed-off-by: Radim Hrazdil <[email protected]>

* handler, container: Use centos stream 8

Signed-off-by: Radim Hrazdil <[email protected]>
rhrazdil authored Mar 28, 2022
1 parent c2b607d commit 3a832ad
Showing 7 changed files with 64 additions and 33 deletions.
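
The gist of the change: UnavailableNodeCount acts as a cluster-wide budget for how many nodes may apply a policy at once, and a node that never finished could previously hold a slot forever, leaving the policy Pending. This commit stamps every increment and decrement with LastUnavailableNodeCountUpdate and stops honoring the counter once that stamp is older than the configuration timeout plus the probe timeout. A minimal standalone sketch of the idea (not the project's reconciler; the type, the helper name shouldHonorUnavailableNodeCount, and the concrete durations are illustrative stand-ins) could look like this:

package main

import (
    "fmt"
    "time"
)

// policyStatus is a simplified stand-in for the status fields this commit touches.
type policyStatus struct {
    UnavailableNodeCount           int
    LastUnavailableNodeCountUpdate *time.Time
}

// Illustrative values; in the real code these are
// nmstate.DesiredStateConfigurationTimeout and probe.ProbesTotalTimeout.
const (
    desiredStateConfigurationTimeout = 8 * time.Minute
    probesTotalTimeout               = 10 * time.Minute
)

// shouldHonorUnavailableNodeCount (hypothetical helper) mirrors the new check:
// trust the counter while its timestamp is missing or fresh; once the stamp is
// older than the total timeout, assume the holder got stuck and ignore the
// counter so the policy cannot stay Pending forever.
func shouldHonorUnavailableNodeCount(status policyStatus, now time.Time) bool {
    if status.LastUnavailableNodeCountUpdate == nil {
        return true
    }
    return now.Sub(*status.LastUnavailableNodeCountUpdate) < desiredStateConfigurationTimeout+probesTotalTimeout
}

func main() {
    stale := time.Now().Add(-30 * time.Minute)
    busy := policyStatus{UnavailableNodeCount: 1, LastUnavailableNodeCountUpdate: &stale}
    fmt.Println(shouldHonorUnavailableNodeCount(busy, time.Now()))           // false: stamp is stale, skip the counter
    fmt.Println(shouldHonorUnavailableNodeCount(policyStatus{}, time.Now())) // true: no stamp yet, respect the counter
}

In the controller change below, this staleness test is combined with !enactmentstatus.IsProgressing(...) inside shouldIncrementUnavailableNodeCount, so a stale counter no longer blocks other nodes from making progress.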
api/shared/nodenetworkconfigurationpolicy_types.go (7 additions, 1 deletion)
@@ -1,6 +1,9 @@
 package shared

-import "k8s.io/apimachinery/pkg/util/intstr"
+import (
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/util/intstr"
+)

 // NodeNetworkConfigurationPolicySpec defines the desired state of NodeNetworkConfigurationPolicy
 type NodeNetworkConfigurationPolicySpec struct {
@@ -28,6 +31,9 @@ type NodeNetworkConfigurationPolicyStatus struct {
     // processing a NodeNetworkConfigurationPolicy
     // +optional
     UnavailableNodeCount int `json:"unavailableNodeCount,omitempty" optional:"true"`
+    // LastUnavailableNodeCountUpdate is time of the last UnavailableNodeCount update
+    // +optional
+    LastUnavailableNodeCountUpdate *metav1.Time `json:"lastUnavailableNodeCountUpdate,omitempty" optional:"true"`
 }

 const (
build/Dockerfile (1 addition, 1 deletion)
@@ -1,4 +1,4 @@
-FROM centos:8
+FROM quay.io/centos/centos:stream8
 ARG NMSTATE_COPR_REPO
 ARG NM_COPR_REPO
 RUN dnf install -b -y dnf-plugins-core && \
controllers/nodenetworkconfigurationpolicy_controller.go (8 additions, 3 deletions)
@@ -38,6 +38,7 @@ import (

     corev1 "k8s.io/api/core/v1"
     apierrors "k8s.io/apimachinery/pkg/api/errors"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/runtime"
     "k8s.io/apimachinery/pkg/runtime/schema"
     "k8s.io/apimachinery/pkg/types"
@@ -52,6 +53,7 @@ import (
     nmstate "github.com/nmstate/kubernetes-nmstate/pkg/helper"
     "github.com/nmstate/kubernetes-nmstate/pkg/node"
     "github.com/nmstate/kubernetes-nmstate/pkg/policyconditions"
+    "github.com/nmstate/kubernetes-nmstate/pkg/probe"
     "github.com/nmstate/kubernetes-nmstate/pkg/selectors"
 )

@@ -178,7 +180,7 @@ func (r *NodeNetworkConfigurationPolicyReconciler) Reconcile(ctx context.Context
         return ctrl.Result{}, nil
     }

-    if r.shouldIncrementUnavailableNodeCount(previousConditions) {
+    if r.shouldIncrementUnavailableNodeCount(instance, previousConditions) {
         err = r.incrementUnavailableNodeCount(instance)
         if err != nil {
             if apierrors.IsConflict(err) {
@@ -313,8 +315,9 @@ func (r *NodeNetworkConfigurationPolicyReconciler) waitEnactmentCreated(enactmen
     return pollErr
 }

-func (r *NodeNetworkConfigurationPolicyReconciler) shouldIncrementUnavailableNodeCount(conditions *nmstateapi.ConditionList) bool {
-    return !enactmentstatus.IsProgressing(conditions)
+func (r *NodeNetworkConfigurationPolicyReconciler) shouldIncrementUnavailableNodeCount(policy *nmstatev1beta1.NodeNetworkConfigurationPolicy, conditions *nmstateapi.ConditionList) bool {
+    return !enactmentstatus.IsProgressing(conditions) && (policy.Status.LastUnavailableNodeCountUpdate == nil ||
+        time.Since(policy.Status.LastUnavailableNodeCountUpdate.Time) < (nmstate.DesiredStateConfigurationTimeout+probe.ProbesTotalTimeout))
 }

 func (r *NodeNetworkConfigurationPolicyReconciler) incrementUnavailableNodeCount(policy *nmstatev1beta1.NodeNetworkConfigurationPolicy) error {
@@ -330,6 +333,7 @@ func (r *NodeNetworkConfigurationPolicyReconciler) incrementUnavailableNodeCount
         if policy.Status.UnavailableNodeCount >= maxUnavailable {
             return apierrors.NewConflict(schema.GroupResource{Resource: "nodenetworkconfigurationpolicies"}, policy.Name, fmt.Errorf("maximal number of %d nodes are already processing policy configuration", policy.Status.UnavailableNodeCount))
         }
+        policy.Status.LastUnavailableNodeCountUpdate = &metav1.Time{Time: time.Now()}
         policy.Status.UnavailableNodeCount += 1
         err = r.Client.Status().Update(context.TODO(), policy)
         if err != nil {
@@ -349,6 +353,7 @@ func (r *NodeNetworkConfigurationPolicyReconciler) decrementUnavailableNodeCount
         if instance.Status.UnavailableNodeCount <= 0 {
             return fmt.Errorf("no unavailable nodes")
         }
+        instance.Status.LastUnavailableNodeCountUpdate = &metav1.Time{Time: time.Now()}
         instance.Status.UnavailableNodeCount -= 1
         return r.Client.Status().Update(context.TODO(), instance)
     })
controllers/nodenetworkconfigurationpolicy_controller_test.go (28 additions, 20 deletions)
@@ -73,10 +73,11 @@ var _ = Describe("NodeNetworkConfigurationPolicy controller predicates", func()
     )

     type incrementUnavailableNodeCountCase struct {
-        currentUnavailableNodeCount  int
-        expectedUnavailableNodeCount int
-        expectedReconcileResult      ctrl.Result
-        previousEnactmentConditions  func(*shared.ConditionList, string)
+        currentUnavailableNodeCount      int
+        expectedUnavailableNodeCount     int
+        expectedReconcileResult          ctrl.Result
+        previousEnactmentConditions      func(*shared.ConditionList, string)
+        shouldUpdateUnavailableNodeCount bool
     }
     DescribeTable("when claimNodeRunningUpdate is called and",
         func(c incrementUnavailableNodeCountCase) {
@@ -132,34 +133,41 @@ var _ = Describe("NodeNetworkConfigurationPolicy controller predicates", func()
             obtainedNNCP := nmstatev1beta1.NodeNetworkConfigurationPolicy{}
             cl.Get(context.TODO(), types.NamespacedName{Name: nncp.Name}, &obtainedNNCP)
             Expect(obtainedNNCP.Status.UnavailableNodeCount).To(Equal(c.expectedUnavailableNodeCount))
+            if c.shouldUpdateUnavailableNodeCount {
+                Expect(obtainedNNCP.Status.LastUnavailableNodeCountUpdate).ToNot(BeNil())
+            }
         },
         Entry("No node applying policy with empty enactment, should succeed incrementing UnavailableNodeCount",
             incrementUnavailableNodeCountCase{
-                currentUnavailableNodeCount:  0,
-                expectedUnavailableNodeCount: 0,
-                previousEnactmentConditions:  func(*shared.ConditionList, string) {},
-                expectedReconcileResult:      ctrl.Result{},
+                currentUnavailableNodeCount:      0,
+                expectedUnavailableNodeCount:     0,
+                previousEnactmentConditions:      func(*shared.ConditionList, string) {},
+                expectedReconcileResult:          ctrl.Result{},
+                shouldUpdateUnavailableNodeCount: true,
             }),
         Entry("No node applying policy with progressing enactment, should succeed incrementing UnavailableNodeCount",
             incrementUnavailableNodeCountCase{
-                currentUnavailableNodeCount:  0,
-                expectedUnavailableNodeCount: 0,
-                previousEnactmentConditions:  conditions.SetProgressing,
-                expectedReconcileResult:      ctrl.Result{},
+                currentUnavailableNodeCount:      0,
+                expectedUnavailableNodeCount:     0,
+                previousEnactmentConditions:      conditions.SetProgressing,
+                expectedReconcileResult:          ctrl.Result{},
+                shouldUpdateUnavailableNodeCount: false,
             }),
         Entry("One node applying policy with empty enactment, should conflict incrementing UnavailableNodeCount",
            incrementUnavailableNodeCountCase{
-                currentUnavailableNodeCount:  1,
-                expectedUnavailableNodeCount: 1,
-                previousEnactmentConditions:  func(*shared.ConditionList, string) {},
-                expectedReconcileResult:      ctrl.Result{RequeueAfter: nodeRunningUpdateRetryTime},
+                currentUnavailableNodeCount:      1,
+                expectedUnavailableNodeCount:     1,
+                previousEnactmentConditions:      func(*shared.ConditionList, string) {},
+                expectedReconcileResult:          ctrl.Result{RequeueAfter: nodeRunningUpdateRetryTime},
+                shouldUpdateUnavailableNodeCount: false,
            }),
        Entry("One node applying policy with Progressing enactment, should succeed incrementing UnavailableNodeCount",
            incrementUnavailableNodeCountCase{
-                currentUnavailableNodeCount:  1,
-                expectedUnavailableNodeCount: 0,
-                previousEnactmentConditions:  conditions.SetProgressing,
-                expectedReconcileResult:      ctrl.Result{},
+                currentUnavailableNodeCount:      1,
+                expectedUnavailableNodeCount:     0,
+                previousEnactmentConditions:      conditions.SetProgressing,
+                expectedReconcileResult:          ctrl.Result{},
+                shouldUpdateUnavailableNodeCount: false,
            }),
     )
 })
deploy/crds/nmstate.io_nodenetworkconfigurationpolicies.yaml (10 additions, 0 deletions)
@@ -90,6 +90,11 @@ spec:
               - type
               type: object
             type: array
+          lastUnavailableNodeCountUpdate:
+            description: LastUnavailableNodeCountUpdate is time of the last UnavailableNodeCount
+              update
+            format: date-time
+            type: string
           unavailableNodeCount:
             description: UnavailableNodeCount represents the total number of potentially
               unavailable nodes that are processing a NodeNetworkConfigurationPolicy
@@ -172,6 +177,11 @@ spec:
               - type
               type: object
             type: array
+          lastUnavailableNodeCountUpdate:
+            description: LastUnavailableNodeCountUpdate is time of the last UnavailableNodeCount
+              update
+            format: date-time
+            type: string
           unavailableNodeCount:
             description: UnavailableNodeCount represents the total number of potentially
               unavailable nodes that are processing a NodeNetworkConfigurationPolicy
pkg/helper/client.go (9 additions, 8 deletions)
@@ -22,9 +22,14 @@
     log = logf.Log.WithName("client")
 )

-const defaultGwRetrieveTimeout = 120 * time.Second
-const defaultGwProbeTimeout = 120 * time.Second
-const apiServerProbeTimeout = 120 * time.Second
+const (
+    defaultGwProbeTimeout = 120 * time.Second
+    apiServerProbeTimeout = 120 * time.Second
+    // DesiredStateConfigurationTimeout doubles the default gw ping probe and API server
+    // connectivity check timeout to ensure the Checkpoint is alive before rolling it back
+    // https://nmstate.github.io/cli_guide#manual-transaction-control
+    DesiredStateConfigurationTimeout = (defaultGwProbeTimeout + apiServerProbeTimeout) * 2
+)

 func InitializeNodeNetworkState(client client.Client, node *corev1.Node) (*nmstatev1beta1.NodeNetworkState, error) {
     ownerRefList := []metav1.OwnerReference{{Name: node.ObjectMeta.Name, Kind: "Node", APIVersion: "v1", UID: node.UID}}
@@ -112,11 +117,7 @@ func ApplyDesiredState(client client.Client, desiredState shared.State) (string,
     // working fine after apply
     probes := probe.Select(client)

-    // commit timeout doubles the default gw ping probe and check API server
-    // connectivity timeout, to
-    // ensure the Checkpoint is alive before rolling it back
-    // https://nmstate.github.io/cli_guide#manual-transaction-control
-    setOutput, err := nmstatectl.Set(desiredState, (defaultGwProbeTimeout+apiServerProbeTimeout)*2)
+    setOutput, err := nmstatectl.Set(desiredState, DesiredStateConfigurationTimeout)
     if err != nil {
         return setOutput, err
     }
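
A quick sanity check of the new constant, using the values in this hunk:

    DesiredStateConfigurationTimeout = (120 s + 120 s) * 2 = 480 s = 8 min

so each nmstatectl transaction gets up to eight minutes of probing before its checkpoint is rolled back.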
pkg/probe/probes.go (1 addition, 0 deletions)
@@ -40,6 +40,7 @@ const (
     defaultDnsProbeTimeout    = 120 * time.Second
     apiServerProbeTimeout     = 120 * time.Second
     nodeReadinessProbeTimeout = 120 * time.Second
+    ProbesTotalTimeout        = defaultGwRetrieveTimeout + defaultDnsProbeTimeout + defaultDnsProbeTimeout + apiServerProbeTimeout + nodeReadinessProbeTimeout
 )

 func currentStateAsGJson() (gjson.Result, error) {
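
Putting the two new constants together gives the grace period checked by shouldIncrementUnavailableNodeCount in the controller hunk above. A small illustrative calculation (it assumes defaultGwRetrieveTimeout, defined just above the visible hunk, is also 120 s like the other probe timeouts; the variable names below are local stand-ins, not the package constants):

package main

import (
    "fmt"
    "time"
)

func main() {
    probeTimeout := 120 * time.Second

    // pkg/helper/client.go: (defaultGwProbeTimeout + apiServerProbeTimeout) * 2
    desiredStateConfigurationTimeout := (probeTimeout + probeTimeout) * 2 // 8m0s

    // pkg/probe/probes.go: five 120 s terms (gw retrieve, dns twice, API server, node readiness),
    // assuming defaultGwRetrieveTimeout is also 120 s.
    probesTotalTimeout := 5 * probeTimeout // 10m0s

    // Staleness threshold used when deciding whether to still honor UnavailableNodeCount.
    fmt.Println(desiredStateConfigurationTimeout + probesTotalTimeout) // 18m0s
}

Under that assumption, a node that stops updating the counter is ignored after roughly 18 minutes, so other nodes can resume applying the policy instead of waiting at Pending indefinitely.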
