Skip to content

Commit

Permalink
Merge branch 'master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
TBurda authored Sep 8, 2023
2 parents 4e72db2 + 63337d9 commit 559d01e
Show file tree
Hide file tree
Showing 18 changed files with 222 additions and 126 deletions.
14 changes: 14 additions & 0 deletions .github/workflows/check-with-upstream.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
name: check-with-upstream
# Run every Monday at 00:00 UTC (GitHub evaluates cron schedules in UTC).
on:
  schedule:
    - cron: '0 0 * * 1'
jobs:
  check-selectors-ksm:
    runs-on: ubuntu-latest
    name: Check if KSM selectors are present on applicable metrics.
    steps:
      # checkout@v2 targets the deprecated Node 12 action runtime; v4 is the
      # maintained release and accepts the same inputs used here.
      - uses: actions/checkout@v4
        with:
          # Do not keep the token in the local git config; the only later
          # step runs make.
          persist-credentials: false
      # --always-make forces the target to run regardless of file timestamps.
      - run: make --always-make check-selectors-ksm
20 changes: 0 additions & 20 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ name: ci
on:
- push
- pull_request
env:
golang-version: '1.17'
jobs:
generate:
runs-on: ubuntu-latest
Expand All @@ -12,9 +10,6 @@ jobs:
- uses: actions/checkout@v2
with:
persist-credentials: false
- uses: actions/setup-go@v2
with:
go-version: ${{ env.golang-version }}
- run: make --always-make generate && git diff --exit-code
jsonnet-lint:
runs-on: ubuntu-latest
Expand All @@ -23,9 +18,6 @@ jobs:
- uses: actions/checkout@v2
with:
persist-credentials: false
- uses: actions/setup-go@v2
with:
go-version: ${{ env.golang-version }}
- run: make --always-make jsonnet-lint
dashboards-lint:
runs-on: ubuntu-latest
Expand All @@ -34,9 +26,6 @@ jobs:
- uses: actions/checkout@v2
with:
persist-credentials: false
- uses: actions/setup-go@v2
with:
go-version: ${{ env.golang-version }}
- run: make --always-make dashboards-lint
alerts-lint:
runs-on: ubuntu-latest
Expand All @@ -45,9 +34,6 @@ jobs:
- uses: actions/checkout@v2
with:
persist-credentials: false
- uses: actions/setup-go@v2
with:
go-version: ${{ env.golang-version }}
- run: make --always-make alerts-lint
fmt:
runs-on: ubuntu-latest
Expand All @@ -56,9 +42,6 @@ jobs:
- uses: actions/checkout@v2
with:
persist-credentials: false
- uses: actions/setup-go@v2
with:
go-version: ${{ env.golang-version }}
- run: make --always-make fmt && git diff --exit-code
unit-tests:
runs-on: ubuntu-latest
Expand All @@ -67,7 +50,4 @@ jobs:
- uses: actions/checkout@v2
with:
persist-credentials: false
- uses: actions/setup-go@v2
with:
go-version: ${{ env.golang-version }}
- run: make --always-make test
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,9 @@ $(TOOLING): $(BIN_DIR)
@echo Installing tools from hack/tools.go
@cd scripts && go list -mod=mod -tags tools -f '{{ range .Imports }}{{ printf "%s\n" .}}{{end}}' ./ | xargs -tI % go build -mod=mod -o $(BIN_DIR) %

########################################
# "check-with-upstream" workflow checks.
########################################

# Runs scripts/check-selectors-ksm.sh; invoked by the "check-with-upstream"
# workflow. Declared phony because the target produces no file, so a stray
# file or directory named "check-selectors-ksm" cannot make it look up to date.
.PHONY: check-selectors-ksm
check-selectors-ksm:
	@./scripts/check-selectors-ksm.sh
17 changes: 16 additions & 1 deletion alerts/apps_alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,21 @@
'for': '15m',
alert: 'KubeDeploymentReplicasMismatch',
},
// KubeDeploymentRolloutStuck: warning raised when kube_deployment_status_condition
// reports a non-zero value for condition="Progressing", status="false"
// continuously for 15 minutes. Selectors are filled in from $._config.
{
expr: |||
kube_deployment_status_condition{condition="Progressing", status="false",%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
!= 0
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: 'Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes.',
summary: 'Deployment rollout is not progressing.',
},
'for': '15m',
alert: 'KubeDeploymentRolloutStuck',
},
{
expr: |||
(
Expand All @@ -104,7 +119,7 @@
},
annotations: {
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.',
summary: 'Deployment has not matched the expected number of replicas.',
summary: 'StatefulSet has not matched the expected number of replicas.',
},
'for': '15m',
alert: 'KubeStatefulSetReplicasMismatch',
Expand Down
4 changes: 2 additions & 2 deletions alerts/kube_apiserver.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ local utils = import '../lib/utils.libsonnet';
{
alert: 'KubeAggregatedAPIErrors',
expr: |||
sum by(name, namespace, %(clusterLabel)s)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
sum by(name, namespace, %(clusterLabel)s)(increase(aggregator_unavailable_apiservice_total{%(kubeApiserverSelector)s}[10m])) > 4
||| % $._config,
labels: {
severity: 'warning',
Expand All @@ -90,7 +90,7 @@ local utils = import '../lib/utils.libsonnet';
{
alert: 'KubeAggregatedAPIDown',
expr: |||
(1 - max by(name, namespace, %(clusterLabel)s)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
(1 - max by(name, namespace, %(clusterLabel)s)(avg_over_time(aggregator_unavailable_apiservice{%(kubeApiserverSelector)s}[10m]))) * 100 < 85
||| % $._config,
'for': '5m',
labels: {
Expand Down
2 changes: 1 addition & 1 deletion alerts/kubelet.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@
{
alert: 'KubeNodeReadinessFlapping',
expr: |||
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2
sum(changes(kube_node_status_condition{%(kubeStateMetricsSelector)s,status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2
||| % $._config,
'for': '15m',
labels: {
Expand Down
16 changes: 8 additions & 8 deletions alerts/resource_alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,18 @@
} +
if $._config.showMultiCluster then {
expr: |||
sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s)) > 0
sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(kubeStateMetricsSelector)s,%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
and
(sum(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s)) > 0
(sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
||| % $._config,
annotations+: {
description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
},
} else {
expr: |||
sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
and
(sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
(sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
||| % $._config,
annotations+: {
description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
Expand All @@ -63,19 +63,19 @@
} +
if $._config.showMultiCluster then {
expr: |||
sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s)) > 0
sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
and
(sum(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s)) > 0
(sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
||| % $._config,
annotations+: {
description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' % $._config,
},
} else
{
expr: |||
sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
and
(sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
(sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
||| % $._config,
annotations+: {
description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.',
Expand Down
5 changes: 3 additions & 2 deletions alerts/system_alerts.libsonnet
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{
_config+:: {
notKubeDnsCoreDnsSelector: 'job!~"kube-dns|coredns"',
kubeApiserverSelector: 'job="kube-apiserver"',
},

prometheusAlerts+:: {
Expand Down Expand Up @@ -28,9 +29,9 @@
// this is normal and an expected error, therefore it should be
// ignored in this alert.
expr: |||
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (%(clusterLabel)s, instance, job, namespace)
(sum(rate(rest_client_requests_total{%(kubeApiserverSelector)s,code=~"5.."}[5m])) by (%(clusterLabel)s, instance, job, namespace)
/
sum(rate(rest_client_requests_total[5m])) by (%(clusterLabel)s, instance, job, namespace))
sum(rate(rest_client_requests_total{%(kubeApiserverSelector)s}[5m])) by (%(clusterLabel)s, instance, job, namespace))
> 0.01
||| % $._config,
'for': '15m',
Expand Down
Loading

0 comments on commit 559d01e

Please sign in to comment.