From 80f69a189e3c8d569a3a033014c071eeb17500b0 Mon Sep 17 00:00:00 2001 From: SuperQ Date: Wed, 31 Jul 2024 11:48:47 +0200 Subject: [PATCH] Fix: Handle float apiserver buckets Depending on the version of client_golang and metrics format (Prometheus vs OpenMetrics), bucket names may contain either integer or float formatting. Use a regexp match to select either format. Signed-off-by: SuperQ --- rules/kube_apiserver-availability.libsonnet | 16 ++++++++-------- rules/kube_apiserver-config.libsonnet | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/rules/kube_apiserver-availability.libsonnet b/rules/kube_apiserver-availability.libsonnet index 18c9e4ef7..486e2f5ec 100644 --- a/rules/kube_apiserver-availability.libsonnet +++ b/rules/kube_apiserver-availability.libsonnet @@ -61,7 +61,7 @@ # write too slow sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s}) - - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"}) + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le=~"%(kubeApiserverWriteLatency)s"}) ) + ( # read too slow @@ -69,14 +69,14 @@ - ( ( - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}) + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le=~"%(kubeApiserverReadResourceLatency)s"}) or vector(0) ) + - sum by (%(clusterLabel)s) 
(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}) + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le=~"%(kubeApiserverReadNamespaceLatency)s"}) + - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}) + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le=~"%(kubeApiserverReadClusterLatency)s"}) ) ) + # errors @@ -98,14 +98,14 @@ ( # too slow ( - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}) + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le=~"%(kubeApiserverReadResourceLatency)s"}) or vector(0) ) + - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}) + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le=~"%(kubeApiserverReadNamespaceLatency)s"}) + - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}) + sum by (%(clusterLabel)s) 
(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le=~"%(kubeApiserverReadClusterLatency)s"}) ) + # errors @@ -126,7 +126,7 @@ # too slow sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s}) - - sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"}) + sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le=~"%(kubeApiserverWriteLatency)s"}) ) + # errors diff --git a/rules/kube_apiserver-config.libsonnet b/rules/kube_apiserver-config.libsonnet index ad0a013bd..80bff2b7d 100644 --- a/rules/kube_apiserver-config.libsonnet +++ b/rules/kube_apiserver-config.libsonnet @@ -8,9 +8,9 @@ // These are buckets that exist on the apiserver_request_sli_duration_seconds_bucket histogram. // They are what the Kubernetes SIG Scalability is using to measure availability of Kubernetes clusters. // If you want to change these, make sure the "le" buckets exist on the histogram! - kubeApiserverReadResourceLatency: '1', - kubeApiserverReadNamespaceLatency: '5', - kubeApiserverReadClusterLatency: '30', - kubeApiserverWriteLatency: '1', + kubeApiserverReadResourceLatency: '1(\\\\.0)?', + kubeApiserverReadNamespaceLatency: '5(\\\\.0)?', + kubeApiserverReadClusterLatency: '30(\\\\.0)?', + kubeApiserverWriteLatency: '1(\\\\.0)?', }, }