From e3c44dbcaf4b8b25d44efa3a3078d29d42f524f2 Mon Sep 17 00:00:00 2001
From: vsaveliev
Date: Fri, 27 Oct 2017 14:11:57 +0700
Subject: [PATCH] added alert rules for Prometheus, configured Slack messages

---
 roles/prometheus/templates/alert-manager.yaml | 13 ++-
 roles/prometheus/templates/alerts/app.conf    | 32 ++++++++
 .../templates/alerts/deployment.conf          | 21 +++++
 .../prometheus/templates/alerts/ingress.conf  | 33 ++++++++
 roles/prometheus/templates/alerts/node.conf   | 82 +++++++++++++++++++
 roles/prometheus/templates/alerts/pods.conf   | 35 ++++++++
 .../prometheus/templates/alerts/service.conf  | 21 +++++
 roles/prometheus/templates/server.yaml        | 12 +++
 8 files changed, 248 insertions(+), 1 deletion(-)
 create mode 100644 roles/prometheus/templates/alerts/app.conf
 create mode 100644 roles/prometheus/templates/alerts/deployment.conf
 create mode 100644 roles/prometheus/templates/alerts/ingress.conf
 create mode 100644 roles/prometheus/templates/alerts/node.conf
 create mode 100644 roles/prometheus/templates/alerts/pods.conf
 create mode 100644 roles/prometheus/templates/alerts/service.conf

diff --git a/roles/prometheus/templates/alert-manager.yaml b/roles/prometheus/templates/alert-manager.yaml
index 37b7358..dc6d076 100644
--- a/roles/prometheus/templates/alert-manager.yaml
+++ b/roles/prometheus/templates/alert-manager.yaml
@@ -12,12 +12,23 @@ data:
   alertmanager.yml: |-
     global:
       slack_api_url: '{{ prometheus_slack_api_url }}'
+
     receivers:
     - name: slack-receiver
      slack_configs:
      - channel: '{{ prometheus_slack_channel }}'
        send_resolved: true
-
+{% raw %}
+        title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonAnnotations.summary }}'
+        text: >-
+          {{ range .Alerts }}
+          *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
+          *Description:* {{ .Annotations.description }}
+          *Details*:
+          {{ range .Labels.SortedPairs }} • {{ .Name }}: `{{ .Value }}`
+          {{ end }}
+          {{ end }}
+{% endraw %}
     route:
      group_wait: 10s
      group_interval: 5m
diff --git a/roles/prometheus/templates/alerts/app.conf b/roles/prometheus/templates/alerts/app.conf
new file mode 100644
index 0000000..60c1269
--- /dev/null
+++ b/roles/prometheus/templates/alerts/app.conf
@@ -0,0 +1,32 @@
+{% raw %}
+    #
+    # Application alerts (from the /metrics handler)
+    #
+    - name: application-alerts
+      rules:
+      #
+      # Alert on every 5XX response code from any HTTP service
+      #
+      - alert: HttpServiceFatalError
+        expr: rate(http_request_duration_seconds{job="kubernetes-service-endpoints",status=~"5.."}[1m])
+          > 0
+        labels:
+          notify: sre
+          severity: warning
+        annotations:
+          summary: "{{ $labels.namespace }}/{{ $labels.name }}: Service returned a 5XX error"
+          description: "{{ $labels.namespace }}/{{ $labels.name }}: Service has processed a request incorrectly (code = {{ $labels.status }})"
+
+      #
+      # Alert for any instance that has a median request latency > 0.5s
+      #
+      - alert: HttpServiceHighRequestLatency
+        expr: http_request_duration_seconds{quantile="0.5"} > 0.5
+        for: 1m
+        labels:
+          notify: sre
+          severity: warning
+        annotations:
+          summary: "{{ $labels.namespace }}/{{ $labels.name }}: High request latency"
+          description: "{{ $labels.namespace }}/{{ $labels.name }} (instance {{ $labels.instance }}): Service has a median request latency > 0.5s (current value: {{ $value }}s)"
+{% endraw %}
diff --git a/roles/prometheus/templates/alerts/deployment.conf b/roles/prometheus/templates/alerts/deployment.conf
new file mode 100644
index 0000000..39e8d38
--- /dev/null
+++ b/roles/prometheus/templates/alerts/deployment.conf
@@ -0,0 +1,21 @@
+{% raw %}
+    #
+    # Deployment alerts (from kube-state-metrics)
+    #
+    - name: deployments-alerts
+      rules:
+      #
+      # Alert when a deployment does not have enough available replicas
+      #
+      - alert: DeploymentReplicasMismatch
+        expr: (kube_deployment_spec_replicas != kube_deployment_status_replicas_available)
+          or (kube_deployment_spec_replicas unless kube_deployment_status_replicas_unavailable)
+        for: 5m
+        labels:
+          notify: sre
+          severity: warning
+        annotations:
+          summary: "{{ $labels.namespace }}/{{ $labels.deployment }}: Deployment has failed"
+          description: "{{ $labels.namespace }}/{{ $labels.deployment }}: Deployment has failed - observed replicas != intended replicas"
+
+{% endraw %}
diff --git a/roles/prometheus/templates/alerts/ingress.conf b/roles/prometheus/templates/alerts/ingress.conf
new file mode 100644
index 0000000..3bb89a4
--- /dev/null
+++ b/roles/prometheus/templates/alerts/ingress.conf
@@ -0,0 +1,33 @@
+{% raw %}
+    #
+    # Ingress alerts (from blackbox-exporter)
+    #
+    - name: ingress-alerts
+      rules:
+      #
+      # Alert when a host (from an ingress) has been unreachable for > 5 min
+      #
+      - alert: IngressIsUnreachable
+        expr: probe_success{job="kubernetes-ingresses"} == 0
+        for: 5m
+        labels:
+          notify: sre
+          severity: warning
+        annotations:
+          summary: "{{ $labels.instance }}: Ingress is unreachable"
+          description: "{{ $labels.instance }}: Ingress has been unreachable for more than 5 minutes"
+
+      #
+      # Alert when an SSL certificate expires in < 30 days
+      #
+      - alert: SSLCertExpiringSoon
+        expr: probe_ssl_earliest_cert_expiry{job="kubernetes-ingresses"} - time() < 86400 * 30
+        for: 5m
+        labels:
+          notify: sre
+          severity: warning
+        annotations:
+          summary: "{{ $labels.instance }}: SSL certificate expires soon"
+          description: "{{ $labels.instance }}: SSL certificate expires in less than 30 days"
+
+{% endraw %}
diff --git a/roles/prometheus/templates/alerts/node.conf b/roles/prometheus/templates/alerts/node.conf
new file mode 100644
index 0000000..684552e
--- /dev/null
+++ b/roles/prometheus/templates/alerts/node.conf
@@ -0,0 +1,82 @@
+{% raw %}
+    #
+    # Node alerts (from kube-state-metrics / node-exporter)
+    #
+    # Some alerts are adapted from https://github.com/kayrus/prometheus-kubernetes/tree/master/prometheus-rules
+    - name: node-alerts
+      rules:
+
+      #
+      # Alert when CPU usage on a node is > 75%
+      #
+      - alert: NodeCPUUsage
+        expr: (100 - (avg(irate(node_cpu{component="node-exporter",mode="idle"}[5m]))
+          BY (instance) * 100)) > 75
+        for: 2m
+        labels:
+          notify: sre
+          severity: info
+        annotations:
+          summary: "{{ $labels.instance }}: High CPU usage detected"
+          description: "{{ $labels.instance }}: CPU usage is above 75% (current value is: {{ $value }})"
+
+      #
+      # Alert when the 5-minute load average per core is > 1
+      #
+      - alert: NodeLoadAverage
+        expr: ((node_load5 / count(node_cpu{mode="system"}) WITHOUT (cpu, mode)) > 1)
+        for: 5m
+        labels:
+          notify: sre
+          severity: info
+        annotations:
+          summary: "{{ $labels.instance }}: High load average detected"
+          description: "{{ $labels.instance }}: The 5-minute load average per core is above 1"
+
+      #
+      # Alert when swap usage is > 75%
+      #
+      - alert: NodeSwapUsage
+        expr: (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal)
+          * 100) > 75
+        for: 5m
+        labels:
+          notify: sre
+          severity: info
+        annotations:
+          summary: "{{ $labels.instance }}: High swap usage detected"
+          description: "{{ $labels.instance }}: Swap usage is above 75% (current value is: {{ $value }})"
+
+      #
+      # Alert when memory usage is > 75%
+      #
+      - alert: NodeMemoryUsage
+        expr: (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / (node_memory_MemTotal)
+          * 100)) > 75
+        for: 5m
+        labels:
+          notify: sre
+          severity: info
+        annotations:
+          summary: "{{ $labels.instance }}: High memory usage detected"
+          description: "{{ $labels.instance }}: Memory usage is above 75% (current value is: {{ $value }})"
+
+      #
+      # Alert when free disk space is < 25%
+      #
+      - alert: HighNodeDiskUsage
+        expr: (avg(node_filesystem_avail{device=~"^/dev/[sv]da1$"}) BY (instance)) / (avg(node_filesystem_size{device=~"^/dev/[sv]da1$"})
+          BY (instance)) * 100 < 25
+        for: 5m
+        labels:
+          notify: sre
+          severity: info
+        annotations:
+          summary: "{{ $labels.instance }}: High disk usage detected"
+          description: "{{ $labels.instance }}: Disk usage is above 75% (free space: {{ $value }}%)"
+
+      #
+      # TODO: alert when the disk will be full within 4 hours
+      # something like: predict_linear((avg by (instance) (node_filesystem_avail{device=~"^/dev/[sv]da1$"})), 4*3600) < 0
+
+{% endraw %}
diff --git a/roles/prometheus/templates/alerts/pods.conf b/roles/prometheus/templates/alerts/pods.conf
new file mode 100644
index 0000000..313d6c0
--- /dev/null
+++ b/roles/prometheus/templates/alerts/pods.conf
@@ -0,0 +1,35 @@
+{% raw %}
+    #
+    # Pod alerts (from cAdvisor / kube-state-metrics)
+    #
+    - name: pod-alerts
+      rules:
+
+      #
+      # Alert when a pod restarts more often than once per 5 minutes
+      #
+      - alert: PodRestartingTooMuch
+        expr: rate(kube_pod_container_status_restarts{job="kubernetes-service-endpoints"}[1m])
+          > 1 / (5 * 60)
+        for: 1h
+        labels:
+          notify: tech
+          severity: warning
+        annotations:
+          summary: "{{ $labels.instance }}: Pod restarts too often"
+          description: "{{ $labels.instance }}: Pod has been restarting more often than once per 5 minutes"
+
+      #
+      # Alert for any instance that is unreachable for > 5 minutes
+      #
+      - alert: InstanceDown
+        expr: up == 0
+        for: 5m
+        labels:
+          notify: sre
+          severity: warning
+        annotations:
+          summary: "{{ $labels.instance }}: Pod is down"
+          description: "{{ $labels.instance }}: Pod has been down for more than 5 minutes"
+
+{% endraw %}
\ No newline at end of file
diff --git a/roles/prometheus/templates/alerts/service.conf b/roles/prometheus/templates/alerts/service.conf
new file mode 100644
index 0000000..8d132b1
--- /dev/null
+++ b/roles/prometheus/templates/alerts/service.conf
@@ -0,0 +1,21 @@
+{% raw %}
+    #
+    # Service alerts (from blackbox-exporter)
+    #
+    - name: service-alerts
+      rules:
+
+      #
+      # Alert when a service has been unreachable for > 5 min
+      #
+      - alert: ServiceIsUnreachable
+        expr: probe_success{job="kubernetes-services"} == 0
+        for: 5m
+        labels:
+          notify: sre
+          severity: warning
+        annotations:
+          summary: "{{ $labels.instance }}: Service is unreachable"
+          description: "{{ $labels.instance }}: Service has been unreachable for more than 5 minutes"
+
+{% endraw %}
diff --git a/roles/prometheus/templates/server.yaml b/roles/prometheus/templates/server.yaml
index 59064dc..cec390f 100644
--- a/roles/prometheus/templates/server.yaml
+++ b/roles/prometheus/templates/server.yaml
@@ -11,6 +11,18 @@ metadata:
 data:
   alerts: |-
 {% include 'alerts/common.conf' %}
+    groups:
+{% include 'alerts/app.conf' %}
+
+{% include 'alerts/deployment.conf' %}
+
+{% include 'alerts/pods.conf' %}
+
+{% include 'alerts/node.conf' %}
+
+{% include 'alerts/ingress.conf' %}
+
+{% include 'alerts/service.conf' %}
   prometheus.yml: |-
     alerting: