diff --git a/assets/control-plane/prometheus-rule.yaml b/assets/control-plane/prometheus-rule.yaml index c477f58dbb..f003bb0673 100644 --- a/assets/control-plane/prometheus-rule.yaml +++ b/assets/control-plane/prometheus-rule.yaml @@ -94,7 +94,7 @@ spec: summary: StatefulSet update has not been rolled out. expr: | ( - max without (revision) ( + max by(namespace, statefulset, job, cluster) ( kube_statefulset_status_current_revision{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"} @@ -232,7 +232,7 @@ spec: description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure. summary: Cluster has overcommitted CPU resource requests. expr: | - sum(namespace_cpu:kube_pod_container_resource_requests:sum{job="kube-state-metrics",}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 + sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 and (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 for: 10m @@ -336,7 +336,7 @@ spec: description: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests. summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests. expr: | - sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 for: 5m labels: severity: warning @@ -477,7 +477,7 @@ spec: max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) ) record: node_namespace_pod_container:container_memory_swap - - name: k8s.rules.container_resource + - name: k8s.rules.container_memory_requests rules: - expr: | kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) @@ -496,6 +496,8 @@ spec: ) ) record: namespace_memory:kube_pod_container_resource_requests:sum + - name: k8s.rules.container_cpu_requests + rules: - expr: | kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( @@ -513,6 +515,8 @@ spec: ) ) record: namespace_cpu:kube_pod_container_resource_requests:sum + - name: k8s.rules.container_memory_limits + rules: - expr: | kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( @@ -530,6 +534,8 @@ spec: ) ) record: namespace_memory:kube_pod_container_resource_limits:sum + - name: k8s.rules.container_cpu_limits + rules: - expr: | kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( diff --git a/jsonnet/jsonnetfile.json b/jsonnet/jsonnetfile.json index c887a826ea..673d619e3d 100644 --- a/jsonnet/jsonnetfile.json +++ b/jsonnet/jsonnetfile.json @@ -58,14 +58,13 @@ "version": "main" }, { - "name": "kubernetes-mixin is pinned because newer versions are breaking MON-3837", "source": { "git": { "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin.git", "subdir": "" } }, - "version": "b247371d1780f530587a8d9dd04ccb19ea970ba0" + "version": "cb72d737459a655e7575c09f7859815ae3690981" } ], "legacyImports": true diff --git a/jsonnet/jsonnetfile.lock.json b/jsonnet/jsonnetfile.lock.json index 4e456a6664..05e0f3df23 100644 --- a/jsonnet/jsonnetfile.lock.json +++ b/jsonnet/jsonnetfile.lock.json @@ -129,9 +129,8 @@ "subdir": "" } }, - "version": "b247371d1780f530587a8d9dd04ccb19ea970ba0", - "sum": "7M2QHK3WhOc1xT7T7KhL9iKsCYTfsIXpmcItffAcbL0=", - "name": "kubernetes-mixin is pinned because newer versions are breaking MON-3837" + "version": "cb72d737459a655e7575c09f7859815ae3690981", + "sum": "JaPnO5N/KUBgA9v6qE7CYzp8OWDTpzjM0+l/SPqL4m4=" }, { "source": { @@ -140,7 +139,7 @@ "subdir": "jsonnet/kube-state-metrics" } }, - "version": "0738de0be2ba1607aac8b58a0d783891664d48a9", + "version": "3c6bc839119b271bf9fa86809d5b3b1421fc1d36", "sum": "lO7jUSzAIy8Yk9pOWJIWgPRhubkWzVh56W6wtYfbVH4=" }, { @@ -150,7 +149,7 @@ "subdir": "jsonnet/kube-state-metrics-mixin" } }, - "version": "0738de0be2ba1607aac8b58a0d783891664d48a9", + "version": "3c6bc839119b271bf9fa86809d5b3b1421fc1d36", "sum": "qclI7LwucTjBef3PkGBkKxF0mfZPbHnn4rlNWKGtR4c=" }, { @@ -192,7 +191,7 @@ "subdir": "jsonnet/mixin" } }, - "version": "3c35a6d7baf761cc2e4426d508528e913cc9aab2", + "version": "f4ecf80bd1051cb62a95a7c49ef228a67e5c2f45", "sum": "gi+knjdxs2T715iIQIntrimbHRgHnpM8IFBJDD1gYfs=", "name": "prometheus-operator-mixin" }, @@ -234,7 +233,7 @@ "subdir": "documentation/prometheus-mixin" } }, - "version": "3c5551df68442dd07668987e0685d12d9c3138dd", + "version": "4b56af7eb824d8e7dc13994b662b5c5b39928629", "sum": "dYLcLzGH4yF3qB7OGC/7z4nqeTNjv42L7Q3BENU8XJI=", "name": "prometheus" }, @@ -266,7 +265,7 @@ "subdir": "mixin" } }, - "version": "df3df36986e07b21aaa88adefb5fbf0b648129b8", + "version": "62038110b1bc47b1324b06a0f7d91c8bce680e5e", "sum": "ieCD4eMgGbOlrI8GmckGPHBGQDcLasE1rULYq56W/bs=" } ],