Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add a buffer period for ReplicaSet health evaluation #120

Merged
merged 3 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions pkg/health/health_replicaset.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ import (
"k8s.io/apimachinery/pkg/runtime"
)

// replicaSetBufferPeriod is the duration after the creation of a ReplicaSet
// within which we never deem it to be unhealthy.
const replicaSetBufferPeriod = time.Minute * 10

func getReplicaSetHealth(obj *unstructured.Unstructured) (*HealthStatus, error) {
gvk := obj.GroupVersionKind()
switch gvk {
Expand All @@ -27,6 +31,8 @@ func getReplicaSetHealth(obj *unstructured.Unstructured) (*HealthStatus, error)
}

func getAppsv1ReplicaSetHealth(replicaSet *appsv1.ReplicaSet) (*HealthStatus, error) {
isWithinBufferPeriod := replicaSet.CreationTimestamp.Add(replicaSetBufferPeriod).After(time.Now())

var containersWaitingForReadiness []string
for _, container := range replicaSet.Spec.Template.Spec.Containers {
if container.ReadinessProbe != nil && container.ReadinessProbe.InitialDelaySeconds > 0 {
Expand Down Expand Up @@ -62,6 +68,11 @@ func getAppsv1ReplicaSetHealth(replicaSet *appsv1.ReplicaSet) (*HealthStatus, er
health = HealthUnhealthy
}

if (health == HealthUnhealthy || health == HealthWarning) && isWithinBufferPeriod {
// within the buffer period, we don't mark a ReplicaSet as unhealthy
health = HealthUnknown
}

if replicaSet.Generation == replicaSet.Status.ObservedGeneration && replicaSet.Status.ReadyReplicas == *replicaSet.Spec.Replicas {
return &HealthStatus{
Health: health,
Expand All @@ -84,7 +95,6 @@ func getAppsv1ReplicaSetHealth(replicaSet *appsv1.ReplicaSet) (*HealthStatus, er
Health: health,
Status: HealthStatusScalingUp,
Message: fmt.Sprintf("%d of %d pods ready", replicaSet.Status.ReadyReplicas, *replicaSet.Spec.Replicas),
Ready: true,
}, nil
}

Expand All @@ -93,7 +103,6 @@ func getAppsv1ReplicaSetHealth(replicaSet *appsv1.ReplicaSet) (*HealthStatus, er
Health: health,
Status: HealthStatusScalingDown,
Message: fmt.Sprintf("%d pods terminating", replicaSet.Status.ReadyReplicas-*replicaSet.Spec.Replicas),
Ready: true,
}, nil
}

Expand Down
11 changes: 11 additions & 0 deletions pkg/health/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,17 @@ func TestHPA(t *testing.T) {
assertAppHealth(t, "./testdata/hpa-v1-progressing-with-no-annotations.yaml", health.HealthStatusProgressing, health.HealthHealthy, false)
}

// TestReplicaSet checks the status, health and readiness reported for two
// ReplicaSet fixtures. Each fixture's hard-coded creation timestamp is mapped
// to one taken two minutes before the test runs (presumably substituted into
// the fixture by assertAppHealthWithOverwrite; verify against that helper).
func TestReplicaSet(t *testing.T) {
	// createdTwoMinutesAgo renders "now minus two minutes" in UTC, using the
	// same second-resolution layout as the timestamps in the fixtures.
	createdTwoMinutesAgo := func() string {
		return time.Now().Add(-time.Minute * 2).UTC().Format("2006-01-02T15:04:05Z")
	}

	// Fully ready ReplicaSet: expected to be running, healthy and ready.
	readyOverwrite := map[string]string{
		"2024-08-03T06:06:18Z": createdTwoMinutesAgo(),
	}
	assertAppHealthWithOverwrite(
		t,
		"./testdata/replicaset-ittools.yml",
		readyOverwrite,
		health.HealthStatusRunning,
		health.HealthHealthy,
		true,
	)

	// Recently created ReplicaSet with no ready pods: expected to be scaling
	// up with unknown health and not ready.
	failingOverwrite := map[string]string{
		"2024-10-21T11:20:19Z": createdTwoMinutesAgo(),
	}
	assertAppHealthWithOverwrite(
		t,
		"./testdata/replicaset-unhealthy-pods.yaml",
		failingOverwrite,
		health.HealthStatusScalingUp,
		health.HealthUnknown,
		false,
	)
}

func TestPod(t *testing.T) {
assertAppHealth(t, "./testdata/terminating-stuck.yaml", "TerminatingStalled", health.HealthUnhealthy, false)
assertAppHealth(t, "./testdata/terminating-namespace.yaml", "TerminatingStalled", health.HealthUnhealthy, false)
Expand Down
70 changes: 70 additions & 0 deletions pkg/health/testdata/replicaset-ittools.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
apiVersion: apps/v1
kind: ReplicaSet
metadata:
annotations:
deployment.kubernetes.io/desired-replicas: "1"
deployment.kubernetes.io/max-replicas: "1"
deployment.kubernetes.io/revision: "2"
meta.helm.sh/release-name: ittools
meta.helm.sh/release-namespace: default
creationTimestamp: "2024-08-03T06:06:18Z"
generation: 52
labels:
app.kubernetes.io/component: main
app.kubernetes.io/instance: ittools
app.kubernetes.io/name: ittools
pod-template-hash: 5fbf458f49
name: ittools-5fbf458f49
namespace: default
ownerReferences:
- apiVersion: apps/v1
blockOwnerDeletion: true
controller: true
kind: Deployment
name: ittools
uid: d2beccff-8da9-42e8-8459-e7ff938b2ffd
resourceVersion: "96413911"
uid: c044b250-2445-4813-b00f-22b696c5fcf2
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/component: main
app.kubernetes.io/instance: ittools
app.kubernetes.io/name: ittools
pod-template-hash: 5fbf458f49
template:
metadata:
labels:
app.kubernetes.io/component: main
app.kubernetes.io/instance: ittools
app.kubernetes.io/name: ittools
pod-template-hash: 5fbf458f49
spec:
automountServiceAccountToken: true
containers:
- image: corentinth/it-tools:latest
imagePullPolicy: Always
name: it-tools
resources:
limits:
memory: 50Mi
requests:
cpu: 25m
memory: 10Mi
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
dnsPolicy: ClusterFirst
enableServiceLinks: false
restartPolicy: Always
schedulerName: default-scheduler
securityContext: {}
serviceAccount: default
serviceAccountName: default
terminationGracePeriodSeconds: 30
status:
availableReplicas: 1
fullyLabeledReplicas: 1
observedGeneration: 52
readyReplicas: 1
replicas: 1
53 changes: 53 additions & 0 deletions pkg/health/testdata/replicaset-unhealthy-pods.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
apiVersion: apps/v1
kind: ReplicaSet
metadata:
uid: f6579017-448f-425a-9645-ea3c93700948
name: failing-deployment-866585899d
labels:
app: failing-app
pod-template-hash: 866585899d
namespace: default
annotations:
deployment.kubernetes.io/revision: "1"
deployment.kubernetes.io/max-replicas: "2"
deployment.kubernetes.io/desired-replicas: "1"
ownerReferences:
- uid: 1ab20b2b-e2c8-4e85-b7b6-5709ba594c0d
kind: Deployment
name: failing-deployment
apiVersion: apps/v1
controller: true
blockOwnerDeletion: true
creationTimestamp: 2024-10-21T11:20:19Z
spec:
replicas: 1
selector:
matchLabels:
app: failing-app
pod-template-hash: 866585899d
template:
spec:
dnsPolicy: ClusterFirst
containers:
- args:
- -c
- sleep 5 && exit 1
name: failing-container
image: busybox
command:
- /bin/sh
resources: {}
imagePullPolicy: Always
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
restartPolicy: Always
schedulerName: default-scheduler
securityContext: {}
terminationGracePeriodSeconds: 30
metadata:
labels:
app: failing-app
pod-template-hash: 866585899d
status:
replicas: 1
fullyLabeledReplicas: 1
Loading