Skip to content

Commit

Permalink
feat: add a buffer period for ReplicaSet health evaluation (#120)
Browse files Browse the repository at this point in the history
* feat: add a buffer period for ReplicaSet health evaluation

* chore: don't exit early due to buffer period

* feat: new way to handle buffer period for replicaset
  • Loading branch information
adityathebe authored Oct 21, 2024
1 parent 4d2b366 commit 63838eb
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 2 deletions.
13 changes: 11 additions & 2 deletions pkg/health/health_replicaset.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ import (
"k8s.io/apimachinery/pkg/runtime"
)

// replicaSetBufferPeriod is the duration after the creation of a ReplicaSet
// within which we never deem it to be unhealthy: an Unhealthy or Warning
// result computed during this window is reported as Unknown instead.
const replicaSetBufferPeriod = time.Minute * 10

func getReplicaSetHealth(obj *unstructured.Unstructured) (*HealthStatus, error) {
gvk := obj.GroupVersionKind()
switch gvk {
Expand All @@ -27,6 +31,8 @@ func getReplicaSetHealth(obj *unstructured.Unstructured) (*HealthStatus, error)
}

func getAppsv1ReplicaSetHealth(replicaSet *appsv1.ReplicaSet) (*HealthStatus, error) {
isWithinBufferPeriod := replicaSet.CreationTimestamp.Add(replicaSetBufferPeriod).After(time.Now())

var containersWaitingForReadiness []string
for _, container := range replicaSet.Spec.Template.Spec.Containers {
if container.ReadinessProbe != nil && container.ReadinessProbe.InitialDelaySeconds > 0 {
Expand Down Expand Up @@ -62,6 +68,11 @@ func getAppsv1ReplicaSetHealth(replicaSet *appsv1.ReplicaSet) (*HealthStatus, er
health = HealthUnhealthy
}

if (health == HealthUnhealthy || health == HealthWarning) && isWithinBufferPeriod {
// within the buffer period, we don't mark a ReplicaSet as unhealthy
health = HealthUnknown
}

if replicaSet.Generation == replicaSet.Status.ObservedGeneration && replicaSet.Status.ReadyReplicas == *replicaSet.Spec.Replicas {
return &HealthStatus{
Health: health,
Expand All @@ -84,7 +95,6 @@ func getAppsv1ReplicaSetHealth(replicaSet *appsv1.ReplicaSet) (*HealthStatus, er
Health: health,
Status: HealthStatusScalingUp,
Message: fmt.Sprintf("%d of %d pods ready", replicaSet.Status.ReadyReplicas, *replicaSet.Spec.Replicas),
Ready: true,
}, nil
}

Expand All @@ -93,7 +103,6 @@ func getAppsv1ReplicaSetHealth(replicaSet *appsv1.ReplicaSet) (*HealthStatus, er
Health: health,
Status: HealthStatusScalingDown,
Message: fmt.Sprintf("%d pods terminating", replicaSet.Status.ReadyReplicas-*replicaSet.Spec.Replicas),
Ready: true,
}, nil
}

Expand Down
11 changes: 11 additions & 0 deletions pkg/health/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,17 @@ func TestHPA(t *testing.T) {
assertAppHealth(t, "./testdata/hpa-v1-progressing-with-no-annotations.yaml", health.HealthStatusProgressing, health.HealthHealthy, false)
}

// TestReplicaSet checks ReplicaSet health for fixtures whose creation
// timestamp is swapped for a moment two minutes in the past, i.e. well inside
// the 10 minute ReplicaSet buffer period.
//
// The map keys are the creationTimestamp values present in the fixtures.
// NOTE(review): assertAppHealthWithOverwrite presumably substitutes each key
// with its value in the fixture before evaluating health — confirm against
// the helper.
func TestReplicaSet(t *testing.T) {
	const timestampLayout = "2006-01-02T15:04:05Z"

	// recentCreation renders "two minutes ago" in UTC using the same textual
	// form as the timestamps stored in the fixtures.
	recentCreation := func() string {
		return time.Now().Add(-2 * time.Minute).UTC().Format(timestampLayout)
	}

	// A fully ready ReplicaSet is reported as running, healthy and ready.
	assertAppHealthWithOverwrite(
		t,
		"./testdata/replicaset-ittools.yml",
		map[string]string{"2024-08-03T06:06:18Z": recentCreation()},
		health.HealthStatusRunning,
		health.HealthHealthy,
		true,
	)

	// A ReplicaSet with no ready pods that is still inside the buffer period
	// is expected to be scaling up with unknown health and not ready.
	assertAppHealthWithOverwrite(
		t,
		"./testdata/replicaset-unhealthy-pods.yaml",
		map[string]string{"2024-10-21T11:20:19Z": recentCreation()},
		health.HealthStatusScalingUp,
		health.HealthUnknown,
		false,
	)
}

func TestPod(t *testing.T) {
assertAppHealth(t, "./testdata/terminating-stuck.yaml", "TerminatingStalled", health.HealthUnhealthy, false)
assertAppHealth(t, "./testdata/terminating-namespace.yaml", "TerminatingStalled", health.HealthUnhealthy, false)
Expand Down
70 changes: 70 additions & 0 deletions pkg/health/testdata/replicaset-ittools.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
apiVersion: apps/v1
kind: ReplicaSet
metadata:
annotations:
deployment.kubernetes.io/desired-replicas: "1"
deployment.kubernetes.io/max-replicas: "1"
deployment.kubernetes.io/revision: "2"
meta.helm.sh/release-name: ittools
meta.helm.sh/release-namespace: default
creationTimestamp: "2024-08-03T06:06:18Z"
generation: 52
labels:
app.kubernetes.io/component: main
app.kubernetes.io/instance: ittools
app.kubernetes.io/name: ittools
pod-template-hash: 5fbf458f49
name: ittools-5fbf458f49
namespace: default
ownerReferences:
- apiVersion: apps/v1
blockOwnerDeletion: true
controller: true
kind: Deployment
name: ittools
uid: d2beccff-8da9-42e8-8459-e7ff938b2ffd
resourceVersion: "96413911"
uid: c044b250-2445-4813-b00f-22b696c5fcf2
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/component: main
app.kubernetes.io/instance: ittools
app.kubernetes.io/name: ittools
pod-template-hash: 5fbf458f49
template:
metadata:
labels:
app.kubernetes.io/component: main
app.kubernetes.io/instance: ittools
app.kubernetes.io/name: ittools
pod-template-hash: 5fbf458f49
spec:
automountServiceAccountToken: true
containers:
- image: corentinth/it-tools:latest
imagePullPolicy: Always
name: it-tools
resources:
limits:
memory: 50Mi
requests:
cpu: 25m
memory: 10Mi
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
dnsPolicy: ClusterFirst
enableServiceLinks: false
restartPolicy: Always
schedulerName: default-scheduler
securityContext: {}
serviceAccount: default
serviceAccountName: default
terminationGracePeriodSeconds: 30
status:
availableReplicas: 1
fullyLabeledReplicas: 1
observedGeneration: 52
readyReplicas: 1
replicas: 1
53 changes: 53 additions & 0 deletions pkg/health/testdata/replicaset-unhealthy-pods.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
apiVersion: apps/v1
kind: ReplicaSet
metadata:
uid: f6579017-448f-425a-9645-ea3c93700948
name: failing-deployment-866585899d
labels:
app: failing-app
pod-template-hash: 866585899d
namespace: default
annotations:
deployment.kubernetes.io/revision: "1"
deployment.kubernetes.io/max-replicas: "2"
deployment.kubernetes.io/desired-replicas: "1"
ownerReferences:
- uid: 1ab20b2b-e2c8-4e85-b7b6-5709ba594c0d
kind: Deployment
name: failing-deployment
apiVersion: apps/v1
controller: true
blockOwnerDeletion: true
creationTimestamp: 2024-10-21T11:20:19Z
spec:
replicas: 1
selector:
matchLabels:
app: failing-app
pod-template-hash: 866585899d
template:
spec:
dnsPolicy: ClusterFirst
containers:
- args:
- -c
- sleep 5 && exit 1
name: failing-container
image: busybox
command:
- /bin/sh
resources: {}
imagePullPolicy: Always
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
restartPolicy: Always
schedulerName: default-scheduler
securityContext: {}
terminationGracePeriodSeconds: 30
metadata:
labels:
app: failing-app
pod-template-hash: 866585899d
status:
replicas: 1
fullyLabeledReplicas: 1

0 comments on commit 63838eb

Please sign in to comment.