added alert rules for Prometheus, configured slack messages
vsaveliev committed Oct 27, 2017
1 parent 1001fdb commit e3c44db
Showing 8 changed files with 248 additions and 1 deletion.
13 changes: 12 additions & 1 deletion roles/prometheus/templates/alert-manager.yaml
@@ -12,12 +12,23 @@ data:
alertmanager.yml: |-
global:
slack_api_url: '{{ prometheus_slack_api_url }}'
receivers:
- name: slack-receiver
slack_configs:
- channel: '{{ prometheus_slack_channel }}'
send_resolved: true
{% raw %}
title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]{{.CommonAnnotations.summary}}'
text: >-
{{ range .Alerts }}
*Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
*Description:* {{ .Annotations.description }}
*Details*:
{{ range .Labels.SortedPairs }} • {{ .Name }}: `{{ .Value }}`
{{ end }}
{{ end }}
{% endraw %}
route:
group_wait: 10s
group_interval: 5m
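The title and text values are Alertmanager notification (Go) templates, so they are wrapped in {% raw %} to keep Ansible's Jinja2 renderer from expanding them. The rest of the route block is collapsed in this diff; a minimal sketch of how it would typically point at the receiver defined above (the receiver and repeat_interval values are assumptions, not taken from this commit):

route:
  receiver: slack-receiver    # assumption: default receiver for all alerts
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 3h         # assumption: not visible in this diff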
32 changes: 32 additions & 0 deletions roles/prometheus/templates/alerts/app.conf
@@ -0,0 +1,32 @@
{% raw %}
#
# Applications alerts (from /metrics handler)
#
- name: application-alerts
rules:
#
# Alert on every 5XX response from any HTTP service
#
- alert: HttpServiceFatalError
expr: rate(http_request_duration_seconds{job="kubernetes-service-endpoints",status=~"5.."}[1m])
> 0
labels:
notify: sre
severity: warning
annotations:
summary: "{{ $labels.namespace }}/{{ $labels.name }}: Service returns 5XX error"
description: "{{ $labels.namespace }}/{{ $labels.name }}: Service has processed a request incorrectly (code = {{ $labels.status }})"

#
# Alert for any instance that has a median request latency > 0.5s
#
- alert: HttpServiceHighRequestLatency
expr: http_request_duration_seconds{quantile="0.5"} > 0.5
for: 1m
labels:
notify: sre
severity: warning
annotations:
summary: "{{ $labels.namespace }}/{{ $labels.name }}: High request latency"
description: "{{ $labels.namespace }}/{{ $labels.name }} (instance {{ $labels.instance }}): Service has a median request latency > 0.5s (current value: {{ $value }}s)"
{% endraw %}
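To illustrate how these annotations feed the Slack template in alert-manager.yaml above, a single firing HttpServiceFatalError with hypothetical label values (namespace="shop", name="cart", status="502", severity="warning") would render roughly as:

[FIRING:1]shop/cart: Service returns 5XX error
*Alert:* shop/cart: Service returns 5XX error - `warning`
*Description:* shop/cart: Service has processed a request incorrectly (code = 502)
*Details*:
 • alertname: `HttpServiceFatalError`
 • name: `cart`
 • namespace: `shop`
 • severity: `warning`
 • status: `502`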
21 changes: 21 additions & 0 deletions roles/prometheus/templates/alerts/deployment.conf
@@ -0,0 +1,21 @@
{% raw %}
#
# Deployment alerts (from kube-state-metrics)
#
- name: deployments-alerts
rules:
#
# Alert when a deployment does not have enough available replicas
#
- alert: DeploymentReplicasMismatch
expr: (kube_deployment_spec_replicas != kube_deployment_status_replicas_available)
or (kube_deployment_spec_replicas unless kube_deployment_status_replicas_unavailable)
for: 5m
labels:
notify: sre
severity: warning
annotations:
summary: "{{ $labels.namespace }}/{{ $labels.deployment }}: Deployment has failed"
description: "{{ $labels.namespace }}/{{ $labels.deployment }}: Deployment has failed - observed replicas != intended replicas"

{% endraw %}
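For reference, a worked reading of the DeploymentReplicasMismatch expression (standard PromQL semantics, not part of the commit):

# kube_deployment_spec_replicas != kube_deployment_status_replicas_available
#   fires while the number of available replicas differs from the spec.
# kube_deployment_spec_replicas unless kube_deployment_status_replicas_unavailable
#   "unless" keeps only the spec series that have no ..._replicas_unavailable
#   series with an exactly matching label set, so this half also fires when
#   the unavailable metric is missing for a deployment.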
33 changes: 33 additions & 0 deletions roles/prometheus/templates/alerts/ingress.conf
@@ -0,0 +1,33 @@
{% raw %}
#
# Ingress alerts (from blackbox-exporter)
#
- name: ingress-alerts
rules:
#
# Alert when a host (behind an ingress) is unreachable for > 5 min
#
- alert: IngressIsUnreachable
expr: probe_success{job="kubernetes-ingresses"} == 0
for: 5m
labels:
notify: sre
severity: warning
annotations:
summary: "{{ $labels.instance }}: Ingress is unreachable"
description: "{{ $labels.instance }}: Ingress has been unreachable for more than 5 minutes"

#
# Alert when an SSL certificate expires in < 30 days
#
- alert: SSLCertExpiringSoon
expr: probe_ssl_earliest_cert_expiry{job="kubernetes-ingresses"} - time() < 86400 * 30
for: 5m
labels:
notify: sre
severity: warning
annotations:
summary: "{{ $labels.instance }}: SSL certificate expires soon"
description: "{{ $labels.instance }}: SSL certificate expires in less than 30 days"

{% endraw %}
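For reference, the arithmetic behind the SSLCertExpiringSoon threshold:

# probe_ssl_earliest_cert_expiry is a Unix timestamp (in seconds), so
#   probe_ssl_earliest_cert_expiry - time()  = seconds until the certificate expires
#   86400 * 30 = 2592000 seconds = 30 days
# The alert fires once fewer than 30 days remain, sustained for 5 minutes.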
82 changes: 82 additions & 0 deletions roles/prometheus/templates/alerts/node.conf
@@ -0,0 +1,82 @@
{% raw %}
#
# Node alerts (from kube-state-metrics / node-exporter)
#
# Some of these alerts are adapted from https://github.com/kayrus/prometheus-kubernetes/tree/master/prometheus-rules
- name: node-alerts
rules:

#
# Alert when CPU usage > 75% on any node
#
- alert: NodeCPUUsage
expr: (100 - (avg(irate(node_cpu{component="node-exporter",mode="idle"}[5m]))
BY (instance) * 100)) > 75
for: 2m
labels:
notify: sre
severity: info
annotations:
summary: "{{ $labels.instance }}: High CPU usage is detected"
description: "{{ $labels.instance }}: CPU usage is above 75% (current value is: {{ $value }})"

#
# Alert when the per-core load average > 1
#
- alert: NodeLoadAverage
expr: ((node_load5 / count(node_cpu{mode="system"}) WITHOUT (cpu, mode)) > 1)
for: 5m
labels:
notify: sre
severity: info
annotations:
description: "{{ $labels.instance }}: Load average is high"
summary: "{{ $labels.instance }}: High load average detected"

#
# Alert on swap > 75%
#
- alert: NodeSwapUsage
expr: (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal)
* 100) > 75
for: 5m
labels:
notify: sre
severity: info
annotations:
summary: "{{ $labels.instance }}: High swap usage is detected"
description: "{{ $labels.instance }}: Swap usage is above 75% (current value is: {{ $value }})"

#
# Alert on memory > 75%
#
- alert: NodeMemoryUsage
expr: (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / (node_memory_MemTotal)
* 100)) > 75
for: 5m
labels:
notify: sre
severity: info
annotations:
summary: "{{ $labels.instance }}: High memory usage is detected"
description: "{{ $labels.instance }}: Memory usage is above 75% (current value is: {{ $value }})"

#
# Alert when free disk space < 25%
#
- alert: HighNodeDiskUsage
expr: (avg(node_filesystem_avail{device=~"^/dev/[sv]da1$"}) BY (instance)) / (avg(node_filesystem_size{device=~"^/dev/[sv]da1$"})
BY (instance)) * 100 < 25
for: 5m
labels:
notify: sre
severity: info
annotations:
summary: "{{ $labels.instance }}: High disk usage is detected"
description: "{{ $labels.instance }}: Disk usage is above 75% (current value is: {{ $value }})"

#
# TODO: Disk will be full in 4 hours
# something like: predict_linear((avg by (instance) (node_filesystem_avail{device=~"^/dev/[sv]da1$"})), 4*3600) < 0

{% endraw %}
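A possible shape for the rule sketched in the TODO above (a sketch only, not part of this commit; predict_linear() takes a range vector, so the raw series is sampled over the last hour, and the window, "for" duration and severity below are assumptions):

- alert: NodeDiskWillFillIn4Hours
  expr: predict_linear(node_filesystem_avail{device=~"^/dev/[sv]da1$"}[1h], 4 * 3600) < 0
  for: 30m
  labels:
    notify: sre
    severity: info
  annotations:
    summary: "{{ $labels.instance }}: Disk is predicted to run out of space"
    description: "{{ $labels.instance }}: Based on the last hour of data, the filesystem is predicted to fill up within 4 hours"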
35 changes: 35 additions & 0 deletions roles/prometheus/templates/alerts/pods.conf
@@ -0,0 +1,35 @@
{% raw %}
#
# Pod alerts (from cAdvisor / kube-state-metrics)
#
- name: pod-alerts
rules:

#
# Alert when a container keeps restarting (restart rate above one restart per 5 minutes)
#
- alert: PodRestartingTooMuch
expr: rate(kube_pod_container_status_restarts{job="kubernetes-service-endpoints"}[1m])
> 1 / (5 * 60)
for: 1h
labels:
notify: tech
severity: warning
annotations:
summary: "{{ $labels.instance }}: Pod restarts too often"
description: "{{ $labels.instance }}: Pod restart rate is above one restart per 5 minutes"

#
# Alert for any instance that is unreachable for > 5 minutes
#
- alert: InstanceDown
expr: up == 0
for: 5m
labels:
notify: sre
severity: warning
annotations:
summary: "{{ $labels.instance }}: Instance is down"
description: "{{ $labels.instance }}: Instance has been down for more than 5 minutes"

{% endraw %}
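For reference, the arithmetic behind the PodRestartingTooMuch threshold:

# rate(kube_pod_container_status_restarts[1m]) is measured in restarts per second, so
#   1 / (5 * 60) = 1/300 ≈ 0.0033 restarts per second,
# i.e. the alert fires when the restart rate stays above one restart per
# 5 minutes for the whole 1h "for" window.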
21 changes: 21 additions & 0 deletions roles/prometheus/templates/alerts/service.conf
@@ -0,0 +1,21 @@
{% raw %}
#
# Service alerts (from blackbox-exporter)
#
- name: service-alerts
rules:

#
# Alert when a service is unreachable for > 5 min
#
- alert: ServiceIsUnreachable
expr: probe_success{job="kubernetes-services"} == 0
for: 5m
labels:
notify: sre
severity: warning
annotations:
description: "{{ $labels.instance }}: Service has been unreachable for more than 5 minutes"
summary: "{{ $labels.instance }}: Service is unreachable"

{% endraw %}
12 changes: 12 additions & 0 deletions roles/prometheus/templates/server.yaml
@@ -11,6 +11,18 @@ metadata:
data:
alerts: |-
{% include 'alerts/common.conf' %}
groups:
{% include 'alerts/app.conf' %}

{% include 'alerts/deployment.conf' %}

{% include 'alerts/pods.conf' %}

{% include 'alerts/node.conf' %}

{% include 'alerts/ingress.conf' %}

{% include 'alerts/service.conf' %}

prometheus.yml: |-
alerting:
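Once Ansible renders this template, the alerts key of the ConfigMap holds a single Prometheus 2.x rule file assembled from the includes; roughly (a sketch: only the group names come from the files in this commit, whatever common.conf contributes is omitted, and the rules are abbreviated):

alerts: |-
  groups:
  - name: application-alerts    # from alerts/app.conf
    rules: ...
  - name: deployments-alerts    # from alerts/deployment.conf
    rules: ...
  - name: pod-alerts            # from alerts/pods.conf
    rules: ...
  - name: node-alerts           # from alerts/node.conf
    rules: ...
  - name: ingress-alerts        # from alerts/ingress.conf
    rules: ...
  - name: service-alerts        # from alerts/service.conf
    rules: ...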
