diff --git a/.github/workflows/check-with-upstream.yaml b/.github/workflows/check-with-upstream.yaml new file mode 100644 index 000000000..00fd90fb2 --- /dev/null +++ b/.github/workflows/check-with-upstream.yaml @@ -0,0 +1,14 @@ +name: check-with-upstream +# Run every Monday. +on: + schedule: + - cron: '0 0 * * 1' +jobs: + check-selectors-ksm: + runs-on: ubuntu-latest + name: Check if KSM selectors are present on applicable metrics. + steps: + - uses: actions/checkout@v2 + with: + persist-credentials: false + - run: make --always-make check-selectors-ksm diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index a58b73731..bd17701af 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -2,8 +2,6 @@ name: ci on: - push - pull_request -env: - golang-version: '1.17' jobs: generate: runs-on: ubuntu-latest @@ -12,9 +10,6 @@ jobs: - uses: actions/checkout@v2 with: persist-credentials: false - - uses: actions/setup-go@v2 - with: - go-version: ${{ env.golang-version }} - run: make --always-make generate && git diff --exit-code jsonnet-lint: runs-on: ubuntu-latest @@ -23,9 +18,6 @@ jobs: - uses: actions/checkout@v2 with: persist-credentials: false - - uses: actions/setup-go@v2 - with: - go-version: ${{ env.golang-version }} - run: make --always-make jsonnet-lint dashboards-lint: runs-on: ubuntu-latest @@ -34,9 +26,6 @@ jobs: - uses: actions/checkout@v2 with: persist-credentials: false - - uses: actions/setup-go@v2 - with: - go-version: ${{ env.golang-version }} - run: make --always-make dashboards-lint alerts-lint: runs-on: ubuntu-latest @@ -45,9 +34,6 @@ jobs: - uses: actions/checkout@v2 with: persist-credentials: false - - uses: actions/setup-go@v2 - with: - go-version: ${{ env.golang-version }} - run: make --always-make alerts-lint fmt: runs-on: ubuntu-latest @@ -56,9 +42,6 @@ jobs: - uses: actions/checkout@v2 with: persist-credentials: false - - uses: actions/setup-go@v2 - with: - go-version: ${{ env.golang-version }} - run: make 
--always-make fmt && git diff --exit-code unit-tests: runs-on: ubuntu-latest @@ -67,7 +50,4 @@ jobs: - uses: actions/checkout@v2 with: persist-credentials: false - - uses: actions/setup-go@v2 - with: - go-version: ${{ env.golang-version }} - run: make --always-make test diff --git a/Makefile b/Makefile index d291f3acc..f88facaa5 100644 --- a/Makefile +++ b/Makefile @@ -76,3 +76,9 @@ $(TOOLING): $(BIN_DIR) @echo Installing tools from hack/tools.go @cd scripts && go list -mod=mod -tags tools -f '{{ range .Imports }}{{ printf "%s\n" .}}{{end}}' ./ | xargs -tI % go build -mod=mod -o $(BIN_DIR) % +######################################## +# "check-with-upstream" workflow checks. +######################################## + +check-selectors-ksm: + @./scripts/check-selectors-ksm.sh diff --git a/alerts/apps_alerts.libsonnet b/alerts/apps_alerts.libsonnet index 65c8349c3..63e8d9edf 100644 --- a/alerts/apps_alerts.libsonnet +++ b/alerts/apps_alerts.libsonnet @@ -87,6 +87,21 @@ 'for': '15m', alert: 'KubeDeploymentReplicasMismatch', }, + { + expr: ||| + kube_deployment_status_condition{condition="Progressing", status="false",%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} + != 0 + ||| % $._config, + labels: { + severity: 'warning', + }, + annotations: { + description: 'Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes.', + summary: 'Deployment rollout is not progressing.', + }, + 'for': '15m', + alert: 'KubeDeploymentRolloutStuck', + }, { expr: ||| ( @@ -104,7 +119,7 @@ }, annotations: { description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.', - summary: 'Deployment has not matched the expected number of replicas.', + summary: 'StatefulSet has not matched the expected number of replicas.', }, 'for': '15m', alert: 'KubeStatefulSetReplicasMismatch', diff --git a/alerts/kube_apiserver.libsonnet 
b/alerts/kube_apiserver.libsonnet index 275cf711f..5639454d5 100644 --- a/alerts/kube_apiserver.libsonnet +++ b/alerts/kube_apiserver.libsonnet @@ -77,7 +77,7 @@ local utils = import '../lib/utils.libsonnet'; { alert: 'KubeAggregatedAPIErrors', expr: ||| - sum by(name, namespace, %(clusterLabel)s)(increase(aggregator_unavailable_apiservice_total[10m])) > 4 + sum by(name, namespace, %(clusterLabel)s)(increase(aggregator_unavailable_apiservice_total{%(kubeApiserverSelector)s}[10m])) > 4 ||| % $._config, labels: { severity: 'warning', @@ -90,7 +90,7 @@ local utils = import '../lib/utils.libsonnet'; { alert: 'KubeAggregatedAPIDown', expr: ||| - (1 - max by(name, namespace, %(clusterLabel)s)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 + (1 - max by(name, namespace, %(clusterLabel)s)(avg_over_time(aggregator_unavailable_apiservice{%(kubeApiserverSelector)s}[10m]))) * 100 < 85 ||| % $._config, 'for': '5m', labels: { diff --git a/alerts/kubelet.libsonnet b/alerts/kubelet.libsonnet index 95475b4c3..2b206dec1 100644 --- a/alerts/kubelet.libsonnet +++ b/alerts/kubelet.libsonnet @@ -72,7 +72,7 @@ { alert: 'KubeNodeReadinessFlapping', expr: ||| - sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2 + sum(changes(kube_node_status_condition{%(kubeStateMetricsSelector)s,status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2 ||| % $._config, 'for': '15m', labels: { diff --git a/alerts/resource_alerts.libsonnet b/alerts/resource_alerts.libsonnet index a3e4fa406..46225afcf 100644 --- a/alerts/resource_alerts.libsonnet +++ b/alerts/resource_alerts.libsonnet @@ -34,18 +34,18 @@ } + if $._config.showMultiCluster then { expr: ||| - sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="cpu"}) by 
(%(clusterLabel)s)) > 0 + sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0 and - (sum(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s)) > 0 + (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0 ||| % $._config, annotations+: { description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config, }, } else { expr: ||| - sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0 + sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0 and - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0 + (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0 ||| % $._config, annotations+: { description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' 
% $._config, @@ -63,9 +63,9 @@ } + if $._config.showMultiCluster then { expr: ||| - sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s)) > 0 + sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0 and - (sum(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s)) > 0 + (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0 ||| % $._config, annotations+: { description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' 
% $._config, @@ -73,9 +73,9 @@ } else { expr: ||| - sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0 + sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0 and - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0 + (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0 ||| % $._config, annotations+: { description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.', diff --git a/alerts/system_alerts.libsonnet b/alerts/system_alerts.libsonnet index f91e982b6..2a434a30f 100644 --- a/alerts/system_alerts.libsonnet +++ b/alerts/system_alerts.libsonnet @@ -1,6 +1,7 @@ { _config+:: { notKubeDnsCoreDnsSelector: 'job!~"kube-dns|coredns"', + kubeApiserverSelector: 'job="kube-apiserver"', }, prometheusAlerts+:: { @@ -28,9 +29,9 @@ // this is normal and an expected error, therefore it should be // ignored in this alert. 
expr: ||| - (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (%(clusterLabel)s, instance, job, namespace) + (sum(rate(rest_client_requests_total{%(kubeApiserverSelector)s,code=~"5.."}[5m])) by (%(clusterLabel)s, instance, job, namespace) / - sum(rate(rest_client_requests_total[5m])) by (%(clusterLabel)s, instance, job, namespace)) + sum(rate(rest_client_requests_total{%(kubeApiserverSelector)s}[5m])) by (%(clusterLabel)s, instance, job, namespace)) > 0.01 ||| % $._config, 'for': '15m', diff --git a/dashboards/network-usage/namespace-by-workload.libsonnet b/dashboards/network-usage/namespace-by-workload.libsonnet index fe645879a..310107d6a 100644 --- a/dashboards/network-usage/namespace-by-workload.libsonnet +++ b/dashboards/network-usage/namespace-by-workload.libsonnet @@ -263,10 +263,10 @@ local annotation = grafana.annotation; name='type', datasource='$datasource', query='label_values(namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+"}, workload_type)' % $._config, - current='deployment', + current='', hide='', refresh=2, - includeAll=false, + includeAll=true, sort=0 ) + { auto: false, @@ -412,7 +412,7 @@ local annotation = grafana.annotation; graphQuery=||| sort_desc(sum(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster",namespace="$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, legendFormat='{{ workload }}', ), @@ -424,7 +424,7 @@ local annotation = grafana.annotation; graphQuery=||| 
sort_desc(sum(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster",namespace="$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, legendFormat='{{ workload }}', ), @@ -437,42 +437,42 @@ local annotation = grafana.annotation; ||| sort_desc(sum(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster",namespace="$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, ||| sort_desc(sum(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster",namespace="$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, ||| sort_desc(avg(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster",namespace="$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) 
namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, ||| sort_desc(avg(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster",namespace="$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, ||| sort_desc(sum(irate(container_network_receive_packets_total{%(clusterLabel)s="$cluster",namespace="$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, ||| sort_desc(sum(irate(container_network_transmit_packets_total{%(clusterLabel)s="$cluster",namespace="$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", 
workload_type=~"$type"}) by (workload)) ||| % $._config, ||| sort_desc(sum(irate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster",namespace="$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, ||| sort_desc(sum(irate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster",namespace="$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, ] ), @@ -486,7 +486,7 @@ local annotation = grafana.annotation; graphQuery=||| sort_desc(avg(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster",namespace="$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, legendFormat='{{ workload }}', ), @@ -498,7 +498,7 @@ local annotation = grafana.annotation; graphQuery=||| 
sort_desc(avg(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster",namespace="$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, legendFormat='{{ workload }}', ), @@ -515,7 +515,7 @@ local annotation = grafana.annotation; graphQuery=||| sort_desc(sum(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster",namespace="$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, ), gridPos={ h: 9, w: 12, x: 0, y: 38 } @@ -526,7 +526,7 @@ local annotation = grafana.annotation; graphQuery=||| sort_desc(sum(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster",namespace="$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, ), gridPos={ h: 9, w: 12, x: 12, y: 38 } @@ -539,7 +539,7 @@ local annotation = grafana.annotation; graphQuery=||| 
sort_desc(sum(irate(container_network_receive_packets_total{%(clusterLabel)s="$cluster",namespace="$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, graphFormat='pps' ), @@ -551,7 +551,7 @@ local annotation = grafana.annotation; graphQuery=||| sort_desc(sum(irate(container_network_transmit_packets_total{%(clusterLabel)s="$cluster",namespace="$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, graphFormat='pps' ), @@ -567,7 +567,7 @@ local annotation = grafana.annotation; graphQuery=||| sort_desc(sum(irate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster",namespace="$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, graphFormat='pps' ), @@ -579,7 +579,7 @@ local annotation = grafana.annotation; graphQuery=||| 
sort_desc(sum(irate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster",namespace="$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, graphFormat='pps' ), diff --git a/dashboards/network-usage/workload-total.libsonnet b/dashboards/network-usage/workload-total.libsonnet index 8cc4130cd..5bff78686 100644 --- a/dashboards/network-usage/workload-total.libsonnet +++ b/dashboards/network-usage/workload-total.libsonnet @@ -146,10 +146,10 @@ local annotation = grafana.annotation; name='type', datasource='$datasource', query='label_values(namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload"}, workload_type)' % $._config, - current='deployment', + current='', hide='', refresh=2, - includeAll=false, + includeAll=true, sort=0 ) + { auto: false, @@ -296,7 +296,7 @@ local annotation = grafana.annotation; graphQuery=||| sort_desc(sum(irate(container_network_receive_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster",namespace=~"$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, legendFormat='{{ pod }}', ), @@ -308,7 +308,7 @@ local annotation = grafana.annotation; graphQuery=||| 
sort_desc(sum(irate(container_network_transmit_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster",namespace=~"$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, legendFormat='{{ pod }}', ), @@ -322,7 +322,7 @@ local annotation = grafana.annotation; graphQuery=||| sort_desc(avg(irate(container_network_receive_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster",namespace=~"$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, legendFormat='{{ pod }}', ), @@ -334,7 +334,7 @@ local annotation = grafana.annotation; graphQuery=||| sort_desc(avg(irate(container_network_transmit_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster",namespace=~"$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, legendFormat='{{ pod }}', ), @@ -351,7 +351,7 
@@ local annotation = grafana.annotation; graphQuery=||| sort_desc(sum(irate(container_network_receive_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster",namespace=~"$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, ), gridPos={ h: 9, w: 12, x: 0, y: 12 } @@ -362,7 +362,7 @@ local annotation = grafana.annotation; graphQuery=||| sort_desc(sum(irate(container_network_transmit_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster",namespace=~"$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, ), gridPos={ h: 9, w: 12, x: 12, y: 12 } @@ -375,7 +375,7 @@ local annotation = grafana.annotation; graphQuery=||| sort_desc(sum(irate(container_network_receive_packets_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster",namespace=~"$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", 
workload_type=~"$type"}) by (pod)) ||| % $._config, graphFormat='pps' ), @@ -387,7 +387,7 @@ local annotation = grafana.annotation; graphQuery=||| sort_desc(sum(irate(container_network_transmit_packets_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster",namespace=~"$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, graphFormat='pps' ), @@ -403,7 +403,7 @@ local annotation = grafana.annotation; graphQuery=||| sort_desc(sum(irate(container_network_receive_packets_dropped_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster",namespace=~"$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, graphFormat='pps' ), @@ -415,7 +415,7 @@ local annotation = grafana.annotation; graphQuery=||| sort_desc(sum(irate(container_network_transmit_packets_dropped_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster",namespace=~"$namespace"}[$interval:$resolution]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) 
namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster",namespace=~"$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, graphFormat='pps' ), diff --git a/dashboards/resources/workload-namespace.libsonnet b/dashboards/resources/workload-namespace.libsonnet index 65f03f52f..5c95a48ee 100644 --- a/dashboards/resources/workload-namespace.libsonnet +++ b/dashboards/resources/workload-namespace.libsonnet @@ -22,10 +22,10 @@ local template = grafana.template; name='type', datasource='$datasource', query='label_values(namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload=~".+"}, workload_type)' % $._config.clusterLabel, - current='deployment', + current='', hide='', refresh=2, - includeAll=false, + includeAll=true, sort=0 ) + { auto: false, @@ -63,32 +63,32 @@ local template = grafana.template; ||| (sum(irate(container_network_receive_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload_type=~"$type"}) by (workload)) ||| % $._config, ||| (sum(irate(container_network_transmit_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", 
workload_type=~"$type"}) by (workload)) ||| % $._config, ||| (sum(irate(container_network_receive_packets_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload_type=~"$type"}) by (workload)) ||| % $._config, ||| (sum(irate(container_network_transmit_packets_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload_type=~"$type"}) by (workload)) ||| % $._config, ||| (sum(irate(container_network_receive_packets_dropped_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload_type=~"$type"}) by (workload)) ||| % $._config, ||| (sum(irate(container_network_transmit_packets_dropped_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - 
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload_type=~"$type"}) by (workload)) ||| % $._config, ]; @@ -131,7 +131,7 @@ local template = grafana.template; sum( node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace"} * on(namespace,pod) - group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"} + group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type=~"$type"} ) by (workload, workload_type) ||| % $._config; @@ -139,18 +139,18 @@ local template = grafana.template; sum( kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace", resource="cpu"} * on(namespace,pod) - group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"} + group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type=~"$type"} ) by (workload, workload_type) ||| % $._config; - local podCountQuery = 'count(namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"}) by (workload, workload_type)' % $._config; + local podCountQuery = 'count(namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type=~"$type"}) by (workload, workload_type)' % $._config; local cpuLimitsQuery = std.strReplace(cpuRequestsQuery, 
'requests', 'limits'); local memUsageQuery = ||| sum( container_memory_working_set_bytes{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace", container!="", image!=""} * on(namespace,pod) - group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"} + group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type=~"$type"} ) by (workload, workload_type) ||| % $._config; local memRequestsQuery = std.strReplace(cpuRequestsQuery, 'cpu', 'memory'); @@ -292,7 +292,7 @@ local template = grafana.template; g.queryPanel(||| (sum(irate(container_network_receive_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, '{{workload}}') + g.stack + { yaxes: g.yaxes('Bps') }, @@ -302,7 +302,7 @@ local template = grafana.template; g.queryPanel(||| (sum(irate(container_network_transmit_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~".+", workload_type=~"$type"}) 
by (workload)) ||| % $._config, '{{workload}}') + g.stack + { yaxes: g.yaxes('Bps') }, @@ -315,7 +315,7 @@ local template = grafana.template; g.queryPanel(||| (avg(irate(container_network_receive_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, '{{workload}}') + g.stack + { yaxes: g.yaxes('Bps') }, @@ -325,7 +325,7 @@ local template = grafana.template; g.queryPanel(||| (avg(irate(container_network_transmit_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, '{{workload}}') + g.stack + { yaxes: g.yaxes('Bps') }, @@ -338,7 +338,7 @@ local template = grafana.template; g.queryPanel(||| (sum(irate(container_network_receive_packets_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) 
namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, '{{workload}}') + g.stack + { yaxes: g.yaxes('pps') }, @@ -348,7 +348,7 @@ local template = grafana.template; g.queryPanel(||| (sum(irate(container_network_transmit_packets_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, '{{workload}}') + g.stack + { yaxes: g.yaxes('pps') }, @@ -361,7 +361,7 @@ local template = grafana.template; g.queryPanel(||| (sum(irate(container_network_receive_packets_dropped_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, '{{workload}}') + g.stack + { yaxes: g.yaxes('pps') }, @@ -371,7 +371,7 @@ local template = grafana.template; g.queryPanel(||| (sum(irate(container_network_transmit_packets_dropped_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) 
namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~".+", workload_type="$type"}) by (workload)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~".+", workload_type=~"$type"}) by (workload)) ||| % $._config, '{{workload}}') + g.stack + { yaxes: g.yaxes('pps') }, diff --git a/dashboards/resources/workload.libsonnet b/dashboards/resources/workload.libsonnet index 7b405588a..666a639bb 100644 --- a/dashboards/resources/workload.libsonnet +++ b/dashboards/resources/workload.libsonnet @@ -37,15 +37,15 @@ local template = grafana.template; current='', hide='', refresh=2, - includeAll=false, - sort=1 + includeAll=true, + sort=0 ), local workloadTemplate = template.new( name='workload', datasource='$datasource', - query='label_values(namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"}, workload)' % $._config.clusterLabel, + query='label_values(namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type=~"$type"}, workload)' % $._config.clusterLabel, current='', hide='', refresh=2, @@ -64,32 +64,32 @@ local template = grafana.template; ||| (sum(irate(container_network_receive_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, ||| 
(sum(irate(container_network_transmit_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, ||| (sum(irate(container_network_receive_packets_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, ||| (sum(irate(container_network_transmit_packets_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, ||| (sum(irate(container_network_receive_packets_dropped_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace"}[%(grafanaIntervalVar)s]) * on 
(namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, ||| (sum(irate(container_network_transmit_packets_dropped_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, ]; @@ -129,7 +129,7 @@ local template = grafana.template; sum( node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster", namespace="$namespace"} * on(namespace,pod) - group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type="$type"} + group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type=~"$type"} ) by (pod) ||| % $._config; @@ -137,7 +137,7 @@ local template = grafana.template; sum( kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace", resource="cpu"} * on(namespace,pod) - group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", 
namespace="$namespace", workload="$workload", workload_type="$type"} + group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type=~"$type"} ) by (pod) ||| % $._config; @@ -147,7 +147,7 @@ local template = grafana.template; sum( container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", container!="", image!=""} * on(namespace,pod) - group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type="$type"} + group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type=~"$type"} ) by (pod) ||| % $._config; local memRequestsQuery = std.strReplace(cpuRequestsQuery, 'cpu', 'memory'); @@ -229,7 +229,7 @@ local template = grafana.template; g.queryPanel(||| (sum(irate(container_network_receive_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, '{{pod}}') + g.stack + { yaxes: g.yaxes('Bps') }, @@ -239,7 +239,7 @@ local template = grafana.template; g.queryPanel(||| (sum(irate(container_network_transmit_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) 
namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, '{{pod}}') + g.stack + { yaxes: g.yaxes('Bps') }, @@ -252,7 +252,7 @@ local template = grafana.template; g.queryPanel(||| (avg(irate(container_network_receive_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, '{{pod}}') + g.stack + { yaxes: g.yaxes('Bps') }, @@ -262,7 +262,7 @@ local template = grafana.template; g.queryPanel(||| (avg(irate(container_network_transmit_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, '{{pod}}') + g.stack + { yaxes: g.yaxes('Bps') }, @@ -275,7 +275,7 @@ local template = grafana.template; g.queryPanel(||| 
(sum(irate(container_network_receive_packets_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, '{{pod}}') + g.stack + { yaxes: g.yaxes('pps') }, @@ -285,7 +285,7 @@ local template = grafana.template; g.queryPanel(||| (sum(irate(container_network_transmit_packets_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, '{{pod}}') + g.stack + { yaxes: g.yaxes('pps') }, @@ -298,7 +298,7 @@ local template = grafana.template; g.queryPanel(||| (sum(irate(container_network_receive_packets_dropped_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", 
workload_type=~"$type"}) by (pod)) ||| % $._config, '{{pod}}') + g.stack + { yaxes: g.yaxes('pps') }, @@ -308,7 +308,7 @@ local template = grafana.template; g.queryPanel(||| (sum(irate(container_network_transmit_packets_dropped_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", namespace="$namespace"}[%(grafanaIntervalVar)s]) * on (namespace,pod) - group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type="$type"}) by (pod)) + group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s="$namespace", workload=~"$workload", workload_type=~"$type"}) by (pod)) ||| % $._config, '{{pod}}') + g.stack + { yaxes: g.yaxes('pps') }, diff --git a/rules/kube_apiserver.libsonnet b/rules/kube_apiserver.libsonnet index 1d75dbe02..be20c25c3 100644 --- a/rules/kube_apiserver.libsonnet +++ b/rules/kube_apiserver.libsonnet @@ -151,7 +151,7 @@ { record: 'cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h', expr: ||| - sum by (%(clusterLabel)s, verb, scope) (increase(apiserver_request_slo_duration_seconds_count[1h])) + sum by (%(clusterLabel)s, verb, scope) (increase(apiserver_request_slo_duration_seconds_count{%(kubeApiserverSelector)s}[1h])) ||| % $._config, }, { diff --git a/rules/kubelet.libsonnet b/rules/kubelet.libsonnet index 397ee9d09..e932a8744 100644 --- a/rules/kubelet.libsonnet +++ b/rules/kubelet.libsonnet @@ -11,7 +11,7 @@ { record: 'node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile', expr: ||| - histogram_quantile(%(quantile)s, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (%(clusterLabel)s, instance, le) * on(%(clusterLabel)s, instance) group_left(node) kubelet_node_name{%(kubeletSelector)s}) + histogram_quantile(%(quantile)s, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{%(kubeletSelector)s}[5m])) by 
(%(clusterLabel)s, instance, le) * on(%(clusterLabel)s, instance) group_left(node) kubelet_node_name{%(kubeletSelector)s}) ||| % ({ quantile: quantile } + $._config), labels: { quantile: quantile, diff --git a/rules/windows.libsonnet b/rules/windows.libsonnet index 014032ef7..0d720b08b 100644 --- a/rules/windows.libsonnet +++ b/rules/windows.libsonnet @@ -180,37 +180,37 @@ { record: 'windows_pod_container_available', expr: ||| - windows_container_available{%(windowsExporterSelector)s} * on(container_id) group_left(container, pod, namespace, %(clusterLabel)s) max(kube_pod_container_info{%(kubeStateMetricsSelector)s}) by(container, container_id, pod, namespace, %(clusterLabel)s) + windows_container_available{%(windowsExporterSelector)s, container_id != ""} * on(container_id) group_left(container, pod, namespace, %(clusterLabel)s) max(kube_pod_container_info{%(kubeStateMetricsSelector)s, container_id != ""}) by(container, container_id, pod, namespace, %(clusterLabel)s) ||| % $._config, }, { record: 'windows_container_total_runtime', expr: ||| - windows_container_cpu_usage_seconds_total{%(windowsExporterSelector)s} * on(container_id) group_left(container, pod, namespace, %(clusterLabel)s) max(kube_pod_container_info{%(kubeStateMetricsSelector)s}) by(container, container_id, pod, namespace, %(clusterLabel)s) + windows_container_cpu_usage_seconds_total{%(windowsExporterSelector)s, container_id != ""} * on(container_id) group_left(container, pod, namespace, %(clusterLabel)s) max(kube_pod_container_info{%(kubeStateMetricsSelector)s, container_id != ""}) by(container, container_id, pod, namespace, %(clusterLabel)s) ||| % $._config, }, { record: 'windows_container_memory_usage', expr: ||| - windows_container_memory_usage_commit_bytes{%(windowsExporterSelector)s} * on(container_id) group_left(container, pod, namespace, %(clusterLabel)s) max(kube_pod_container_info{%(kubeStateMetricsSelector)s}) by(container, container_id, pod, namespace, %(clusterLabel)s) + 
windows_container_memory_usage_commit_bytes{%(windowsExporterSelector)s, container_id != ""} * on(container_id) group_left(container, pod, namespace, %(clusterLabel)s) max(kube_pod_container_info{%(kubeStateMetricsSelector)s, container_id != ""}) by(container, container_id, pod, namespace, %(clusterLabel)s) ||| % $._config, }, { record: 'windows_container_private_working_set_usage', expr: ||| - windows_container_memory_usage_private_working_set_bytes{%(windowsExporterSelector)s} * on(container_id) group_left(container, pod, namespace, %(clusterLabel)s) max(kube_pod_container_info{%(kubeStateMetricsSelector)s}) by(container, container_id, pod, namespace, %(clusterLabel)s) + windows_container_memory_usage_private_working_set_bytes{%(windowsExporterSelector)s, container_id != ""} * on(container_id) group_left(container, pod, namespace, %(clusterLabel)s) max(kube_pod_container_info{%(kubeStateMetricsSelector)s, container_id != ""}) by(container, container_id, pod, namespace, %(clusterLabel)s) ||| % $._config, }, { record: 'windows_container_network_received_bytes_total', expr: ||| - windows_container_network_receive_bytes_total{%(windowsExporterSelector)s} * on(container_id) group_left(container, pod, namespace, %(clusterLabel)s) max(kube_pod_container_info{%(kubeStateMetricsSelector)s}) by(container, container_id, pod, namespace, %(clusterLabel)s) + windows_container_network_receive_bytes_total{%(windowsExporterSelector)s, container_id != ""} * on(container_id) group_left(container, pod, namespace, %(clusterLabel)s) max(kube_pod_container_info{%(kubeStateMetricsSelector)s, container_id != ""}) by(container, container_id, pod, namespace, %(clusterLabel)s) ||| % $._config, }, { record: 'windows_container_network_transmitted_bytes_total', expr: ||| - windows_container_network_transmit_bytes_total{%(windowsExporterSelector)s} * on(container_id) group_left(container, pod, namespace, %(clusterLabel)s) max(kube_pod_container_info{%(kubeStateMetricsSelector)s}) by(container, 
container_id, pod, namespace, %(clusterLabel)s) + windows_container_network_transmit_bytes_total{%(windowsExporterSelector)s, container_id != ""} * on(container_id) group_left(container, pod, namespace, %(clusterLabel)s) max(kube_pod_container_info{%(kubeStateMetricsSelector)s, container_id != ""}) by(container, container_id, pod, namespace, %(clusterLabel)s) ||| % $._config, }, { diff --git a/runbook.md b/runbook.md index fd42f9b7c..e136d1e59 100644 --- a/runbook.md +++ b/runbook.md @@ -24,6 +24,10 @@ This page collects this repositories alerts and begins the process of describing ##### Alert Name: KubeletDown + *Message*: `Kubelet has disappeared from Prometheus target discovery.` + *Severity*: critical +##### Alert Name: KubeProxyDown ++ *Message*: `KubeProxy has disappeared from Prometheus target discovery.` ++ *Severity*: critical ++ *Runbook*: [Link](https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeproxydown/) ### Group Name: kubernetes-apps ##### Alert Name: KubePodCrashLooping + *Message*: `{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf \"%.2f\" $value }} / second` @@ -37,6 +41,9 @@ This page collects this repositories alerts and begins the process of describing ##### Alert Name: "KubeDeploymentReplicasMismatch" + *Message*: `Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch` + *Severity*: warning +##### Alert Name: "KubeDeploymentRolloutStuck" ++ *Message*: `Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing` ++ *Severity*: warning ##### Alert Name: "KubeStatefulSetReplicasMismatch" + *Message*: `StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} replica mismatch` + *Severity*: warning diff --git a/scripts/check-selectors-ksm.sh b/scripts/check-selectors-ksm.sh new file mode 100755 index 000000000..b6b6a9e65 --- /dev/null +++ b/scripts/check-selectors-ksm.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +# Set -u to error out
if we use an unset variable. +# Set -o pipefail to propagate errors in a pipeline. +set -uo pipefail + +# Remove kube-state-metrics directory if it exists. +rm -rf kube-state-metrics + +# Clone kube-state-metrics repository. +git clone https://github.com/kubernetes/kube-state-metrics --depth 1 + +# Set the repository root. +repository_root=$(git rev-parse --show-toplevel) + +# Change directory to kube-state-metrics. +cd kube-state-metrics || exit + +# Grep all metrics in the codebase. +find internal/store -type f -not -name '*_test.go' -exec sed -nE 's/.*"(kube_[^"]+)".*/\1/p' {} \; | sort -u > metrics.txt + +# Set the KSM selector specifier. +ksm_selector="kubeStateMetricsSelector" + +# Set the paths to the alerts, lib and rules directories. +alerts_path="$repository_root/alerts" +lib_path="$repository_root/lib" +rules_path="$repository_root/rules" + +# Read metrics.txt line by line. +while IFS= read -r metric; do + selector_misses=$(\ + grep --only-matching --color=always --line-number "$metric{[^}]*}" --directories=recurse "$alerts_path" "$lib_path" "$rules_path" |\ + grep --invert-match "$ksm_selector" \ + ) + if [ -n "$selector_misses" ]; then + echo "The following $metric metrics are missing the $ksm_selector specifier:" + echo "$selector_misses" + fi +done < metrics.txt + +# Clean artefacts. +rm metrics.txt +cd .. || exit +rm -rf kube-state-metrics + +# TODO: Currently, there are only two possible states the workflow can report: success or failure. +# We could benefit from a third "warning" state, for cases where we observe an overlap of selectors for the same metric. +# Ref: https://docs.github.com/en/actions/creating-actions/setting-exit-codes-for-actions#about-exit-codes diff --git a/tests.yaml b/tests.yaml index 91cf98d3b..a2fb0bb0f 100644 --- a/tests.yaml +++ b/tests.yaml @@ -518,29 +518,29 @@ tests: - interval: 1m input_series: # Create a histogram where all of the last 10 samples are in the +Inf (> 10 seconds) bucket. 
- - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="0.005", instance="10.0.2.15:10250"}' + - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.005", instance="10.0.2.15:10250"}' values: '1+0x10' - - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="0.01", instance="10.0.2.15:10250"}' + - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.01", instance="10.0.2.15:10250"}' values: '1+0x10' - - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="0.025", instance="10.0.2.15:10250"}' + - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.025", instance="10.0.2.15:10250"}' values: '1+0x10' - - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="0.05", instance="10.0.2.15:10250"}' + - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.05", instance="10.0.2.15:10250"}' values: '1+0x10' - - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="0.1", instance="10.0.2.15:10250"}' + - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.1", instance="10.0.2.15:10250"}' values: '1+0x10' - - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="0.25", instance="10.0.2.15:10250"}' + - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.25", instance="10.0.2.15:10250"}' values: '1+0x10' - - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="0.5", instance="10.0.2.15:10250"}' + - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.5", instance="10.0.2.15:10250"}' values: '1+0x10' - - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="1", instance="10.0.2.15:10250"}' + - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="1", instance="10.0.2.15:10250"}' values: '1+0x10' - - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="2.5", 
instance="10.0.2.15:10250"}' + - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="2.5", instance="10.0.2.15:10250"}' values: '1+0x10' - - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="5", instance="10.0.2.15:10250"}' + - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="5", instance="10.0.2.15:10250"}' values: '1+0x10' - - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="10", instance="10.0.2.15:10250"}' + - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="10", instance="10.0.2.15:10250"}' values: '1+0x10' - - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="+Inf", instance="10.0.2.15:10250"}' + - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="+Inf", instance="10.0.2.15:10250"}' values: '30+1x10' - series: 'kubelet_node_name{endpoint="https-metrics",instance="10.0.2.15:10250",job="kubelet",namespace="kube-system",node="minikube",service="kubelet"}' values: '1 1 1 1 1 1 1 1 1 1' @@ -813,6 +813,30 @@ tests: runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck - eval_time: 34m alertname: KubeDaemonSetRolloutStuck +# KubeDeploymentRolloutStuck +- interval: 1m + input_series: + - series: 'kube_deployment_status_condition{job="kube-state-metrics",namespace="monitoring",deployment="stuck", condition="Progressing", status="false"}' + values: '1+0x17 0+0x5' + alert_rule_test: + - eval_time: 14m + alertname: KubeDeploymentRolloutStuck + - eval_time: 16m + alertname: KubeDeploymentRolloutStuck + exp_alerts: + - exp_labels: + job: kube-state-metrics + namespace: monitoring + deployment: stuck + severity: warning + condition: Progressing + status: "false" + exp_annotations: + summary: 'Deployment rollout is not progressing.' + description: 'Rollout of deployment monitoring/stuck is not progressing for longer than 15 minutes.' 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentrolloutstuck + - eval_time: 18m + alertname: KubeDeploymentRolloutStuck - interval: 1m # Misscheduled is non zero. input_series: