Skip to content

Commit

Permalink
fixup! Prefer kube-scheduler's resource metrics to kube-state-metrics'
Browse files Browse the repository at this point in the history
Refactor kube_pod_status_phase, since statuses other than "Pending" or
"Running" are excluded or deprecated.

Signed-off-by: Pranshu Srivastava <[email protected]>
  • Loading branch information
rexagod committed Sep 23, 2024
1 parent 67a87ba commit 86d83ae
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 50 deletions.
2 changes: 1 addition & 1 deletion DESIGN.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ Jsonnet offers the ability to parameterise configuration, allowing for basic cus
alert: "KubePodNotReady",
expr: |||
sum by (namespace, pod) (
kube_pod_status_phase{%(kubeStateMetricsSelector)s, phase!~"Running|Succeeded"}
kube_pod_status_phase{%(kubeStateMetricsSelector)s, phase!~"Running"}
) > %(allowedNotReadyPods)s
||| % $._config,
"for": "1h",
Expand Down
2 changes: 1 addition & 1 deletion alerts/apps_alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
expr: |||
sum by (namespace, pod, %(clusterLabel)s) (
max by(namespace, pod, %(clusterLabel)s) (
kube_pod_status_phase{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, phase=~"Pending|Unknown|Failed"}
kube_pod_status_phase{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, phase="Pending"}
) * on(namespace, pod, %(clusterLabel)s) group_left(owner_kind) topk by(namespace, pod, %(clusterLabel)s) (
1, max by(namespace, pod, owner_kind, %(clusterLabel)s) (kube_pod_owner{owner_kind!="Job"})
)
Expand Down
28 changes: 4 additions & 24 deletions rules/apps.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,7 @@
{
record: 'cluster:namespace:pod_memory:active:kube_pod_resource_request_or_kube_pod_container_resource_requests',
expr: |||
(kube_pod_resource_request{resource="memory",%(kubeSchedulerSelector)s} or (kube_pod_container_resource_requests{resource="memory",%(kubeStateMetricsSelector)s}) * on (namespace, pod, %(clusterLabel)s)
group_left() max by (namespace, pod, %(clusterLabel)s) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
(kube_pod_resource_request{resource="memory",%(kubeSchedulerSelector)s} or kube_pod_container_resource_requests{resource="memory",%(kubeStateMetricsSelector)s})
||| % $._config,
},
{
Expand All @@ -99,8 +96,6 @@
sum by (namespace, pod, %(clusterLabel)s) (
max by (namespace, pod, container, %(clusterLabel)s) (
kube_pod_resource_request{resource="memory",%(kubeSchedulerSelector)s} or kube_pod_container_resource_requests{resource="memory",%(kubeStateMetricsSelector)s}
) * on(namespace, pod, %(clusterLabel)s) group_left() max by (namespace, pod, %(clusterLabel)s) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
Expand All @@ -114,10 +109,7 @@
{
record: 'cluster:namespace:pod_cpu:active:kube_pod_resource_request_or_kube_pod_container_resource_requests',
expr: |||
(kube_pod_resource_request{resource="memory",%(kubeSchedulerSelector)s} or (kube_pod_container_resource_requests{resource="cpu",%(kubeStateMetricsSelector)s}) * on (namespace, pod, %(clusterLabel)s)
group_left() max by (namespace, pod, %(clusterLabel)s) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
(kube_pod_resource_request{resource="cpu",%(kubeSchedulerSelector)s} or kube_pod_container_resource_requests{resource="cpu",%(kubeStateMetricsSelector)s})
||| % $._config,
},
{
Expand All @@ -127,8 +119,6 @@
sum by (namespace, pod, %(clusterLabel)s) (
max by (namespace, pod, container, %(clusterLabel)s) (
kube_pod_resource_request{resource="cpu",%(kubeSchedulerSelector)s} or kube_pod_container_resource_requests{resource="cpu",%(kubeStateMetricsSelector)s}
) * on(namespace, pod, %(clusterLabel)s) group_left() max by (namespace, pod, %(clusterLabel)s) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
Expand All @@ -142,10 +132,7 @@
{
record: 'cluster:namespace:pod_memory:active:kube_pod_resource_limit_or_kube_pod_container_resource_limits',
expr: |||
(kube_pod_resource_limit{resource="memory",%(kubeSchedulerSelector)s} or kube_pod_container_resource_limits{resource="memory",%(kubeStateMetricsSelector)s}) * on (namespace, pod, %(clusterLabel)s)
group_left() max by (namespace, pod, %(clusterLabel)s) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
(kube_pod_resource_limit{resource="memory",%(kubeSchedulerSelector)s} or kube_pod_container_resource_limits{resource="memory",%(kubeStateMetricsSelector)s})
||| % $._config,
},
{
Expand All @@ -155,8 +142,6 @@
sum by (namespace, pod, %(clusterLabel)s) (
max by (namespace, pod, container, %(clusterLabel)s) (
kube_pod_container_resource_limits{resource="memory",%(kubeStateMetricsSelector)s}
) * on(namespace, pod, %(clusterLabel)s) group_left() max by (namespace, pod, %(clusterLabel)s) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
Expand All @@ -170,10 +155,7 @@
{
record: 'cluster:namespace:pod_cpu:active:kube_pod_resource_limit_or_kube_pod_container_resource_limits',
expr: |||
(kube_pod_resource_limit{resource="memory",%(kubeSchedulerSelector)s} or kube_pod_container_resource_limits{resource="cpu",%(kubeStateMetricsSelector)s}) * on (namespace, pod, %(clusterLabel)s)
group_left() max by (namespace, pod, %(clusterLabel)s) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
(kube_pod_resource_limit{resource="cpu",%(kubeSchedulerSelector)s} or kube_pod_container_resource_limits{resource="cpu",%(kubeStateMetricsSelector)s})
||| % $._config,
},
{
Expand All @@ -183,8 +165,6 @@
sum by (namespace, pod, %(clusterLabel)s) (
max by (namespace, pod, container, %(clusterLabel)s) (
kube_pod_resource_limit{resource="cpu",%(kubeSchedulerSelector)s} or kube_pod_container_resource_limits{resource="cpu",%(kubeStateMetricsSelector)s}
) * on(namespace, pod, %(clusterLabel)s) group_left() max by (namespace, pod, %(clusterLabel)s) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
Expand Down
28 changes: 4 additions & 24 deletions tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -461,23 +461,13 @@ tests:
- eval_time: 0m
expr: namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum
exp_samples:
- value: 0.15
- value: 0.3
labels: 'namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}'
- eval_time: 0m
expr: namespace_memory:kube_pod_container_resource_requests:sum
exp_samples:
- value: 1.0e+9
labels: 'namespace_memory:kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}'
- eval_time: 1m
expr: namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum
exp_samples:
- value: 0.15
- value: 0.3
labels: 'namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}'
- eval_time: 1m
expr: namespace_memory:kube_pod_container_resource_requests:sum
exp_samples:
- value: 1.0e+9
labels: 'namespace_memory:kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}'

- interval: 1m
input_series:
Expand Down Expand Up @@ -506,23 +496,13 @@ tests:
- eval_time: 0m
expr: namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum
exp_samples:
- value: 0.15
- value: 0.3
labels: 'namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}'
- eval_time: 0m
expr: namespace_memory:kube_pod_container_resource_requests:sum
exp_samples:
- value: 1.0e+9
labels: 'namespace_memory:kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}'
- eval_time: 1m
expr: namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum
exp_samples:
- value: 0.15
- value: 0.3
labels: 'namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}'
- eval_time: 1m
expr: namespace_memory:kube_pod_container_resource_requests:sum
exp_samples:
- value: 1.0e+9
labels: 'namespace_memory:kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}'

- interval: 1m
input_series:
Expand Down

0 comments on commit 86d83ae

Please sign in to comment.