Merge pull request #3 from vsaveliev/feature/basic_alerts
added alert rules for Prometheus, configured slack messages
vsaveliev authored Oct 30, 2017
2 parents 1001fdb + e3c44db commit 82818d8
Showing 8 changed files with 248 additions and 1 deletion.
13 changes: 12 additions & 1 deletion roles/prometheus/templates/alert-manager.yaml
@@ -12,12 +12,23 @@ data:
alertmanager.yml: |-
global:
slack_api_url: '{{ prometheus_slack_api_url }}'
receivers:
- name: slack-receiver
slack_configs:
- channel: '{{ prometheus_slack_channel }}'
send_resolved: true
{% raw %}
title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]{{ .CommonAnnotations.summary }}'
text: >-
{{ range .Alerts }}
*Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
*Description:* {{ .Annotations.description }}
*Details*:
{{ range .Labels.SortedPairs }} • {{ .Name }}: `{{ .Value }}`
{{ end }}
{{ end }}
{% endraw %}
route:
group_wait: 10s
group_interval: 5m
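The rest of the route block is collapsed in this diff. For the slack-receiver defined above to actually receive alerts, the route normally has to reference it by name; a minimal sketch of that wiring, where group_by and repeat_interval are assumptions not shown in this commit:

    route:
      receiver: slack-receiver               # assumed: send everything to the Slack receiver defined above
      group_by: ['alertname', 'namespace']   # assumption: grouping keys are not visible in this diff
      group_wait: 10s                        # visible in the diff above
      group_interval: 5m                     # visible in the diff above
      repeat_interval: 3h                    # assumption: not visible in this diff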
32 changes: 32 additions & 0 deletions roles/prometheus/templates/alerts/app.conf
@@ -0,0 +1,32 @@
{% raw %}
#
# Applications alerts (from /metrics handler)
#
- name: application-alerts
rules:
#
# Alert on any 5xx response from any HTTP service
#
- alert: HttpServiceFatalError
expr: rate(http_request_duration_seconds{job="kubernetes-service-endpoints",status=~"5.."}[1m])
> 0
labels:
notify: sre
severity: warning
annotations:
summary: "{{ $labels.namespace }}/{{ $label.name }}: Service returns 5XX error"
description: "{{ $labels.namespace }}/{{ $label.name }}: Service has processed some request incorrectly (code = {{ $labels.status }})"

#
# Alert for any instance that has a median request latency > 0.5s
#
- alert: HttpServiceHighRequestLatency
expr: http_request_duration_seconds{quantile="0.5"} > 0.5
for: 1m
labels:
notify: sre
severity: warning
annotations:
summary: "{{ $labels.namespace }}/{{ $label.name }}: High request latency"
description: "{{ $labels.namespace }}/{{ $label.name }} (instance {{ $labels.instance }}): Service has a median request latency > 0.5s (current value: {{ $value }}s)"
{% endraw %}
21 changes: 21 additions & 0 deletions roles/prometheus/templates/alerts/deployment.conf
@@ -0,0 +1,21 @@
{% raw %}
#
# Deployment alerts (from kube-state-metrics)
#
- name: deployments-alerts
rules:
#
# Alert when a deployment does not have enough available replicas
#
- alert: DeploymentReplicasMismatch
expr: (kube_deployment_spec_replicas != kube_deployment_status_replicas_available)
or (kube_deployment_spec_replicas unless kube_deployment_status_replicas_unavailable)
for: 5m
labels:
notify: sre
severity: warning
annotations:
summary: "{{ $labels.namespace }}/{{ $labels.deployment }}: Deployment is failed"
description: "{{ $labels.namespace }}/{{ $labels.deployment }}: Deployment is failed - observed replicas != intended replicas"

{% endraw %}
33 changes: 33 additions & 0 deletions roles/prometheus/templates/alerts/ingress.conf
@@ -0,0 +1,33 @@
{% raw %}
#
# Ingress alerts (from blackbox-exporter)
#
- name: ingress-alerts
rules:
#
# Alert when a host (behind an ingress) is unreachable for > 5 min
#
- alert: IngressIsUnreachable
expr: probe_success{job="kubernetes-ingresses"} == 0
for: 5m
labels:
notify: sre
severity: warning
annotations:
summary: "{{ $labels.instance }}: Ingress is unreachable"
description: "{{ $labels.instance }}: Ingress is unreachable more than 5 minutes"

#
# Alert when an SSL certificate expires in < 30 days
#
- alert: SSLCertExpiringSoon
expr: probe_ssl_earliest_cert_expiry{job="kubernetes-ingresses"} - time() < 86400 * 30
for: 5m
labels:
notify: sre
severity: warning
annotations:
summary: "{{ $labels.instance }}: SSL certificate will be expired soon"
description: "{{ $labels.instance }}: SSL certificate will be expired less than 30 days"

{% endraw %}
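For reference, the SSLCertExpiringSoon threshold is expressed in seconds: 86400 s/day * 30 days = 2 592 000 s, so the alert fires once the certificate's earliest expiry is less than 30 days away. An equivalent, arguably more readable form of the same expression (not part of this commit) would be:

    probe_ssl_earliest_cert_expiry{job="kubernetes-ingresses"} - time() < 30 * 24 * 3600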
82 changes: 82 additions & 0 deletions roles/prometheus/templates/alerts/node.conf
@@ -0,0 +1,82 @@
{% raw %}
#
# Node alerts (from kube-state-metrics / node-exporter)
#
# Some of these alerts are taken from https://github.com/kayrus/prometheus-kubernetes/tree/master/prometheus-rules
- name: node-alerts
rules:

#
# Alert when CPU usage > 75% on any node
#
- alert: NodeCPUUsage
expr: (100 - (avg(irate(node_cpu{component="node-exporter",mode="idle"}[5m]))
BY (instance) * 100)) > 75
for: 2m
labels:
notify: sre
severity: info
annotations:
summary: "{{ $labels.instance }}: High CPU usage is detected"
description: "{{ $labels.instance }}: CPU usage is above 75% (current value is: {{ $value }})"

#
# Alert on load average per core > 1
#
- alert: NodeLoadAverage
expr: ((node_load5 / count(node_cpu{mode="system"}) WITHOUT (cpu, mode)) > 1)
for: 5m
labels:
notify: sre
severity: info
annotations:
description: "{{ $labels.instance }}: Load average is high"
summary: "{{ $labels.instance }}: High LA detected"

#
# Alert on swap usage > 75%
#
- alert: NodeSwapUsage
expr: (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal)
* 100) > 75
for: 5m
labels:
notify: sre
severity: info
annotations:
summary: "{{ $labels.instance }}: Swap usage is detected"
description: "{{ $labels.instance }}: Swap usage usage is above 75% (current value is: {{ $value }})"

#
# Alert on memory > 75%
#
- alert: NodeMemoryUsage
expr: (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / (node_memory_MemTotal)
* 100)) > 75
for: 5m
labels:
notify: sre
severity: info
annotations:
summary: "{{ $labels.instance }}: High memory usage is detected"
description: "{{ $labels.instance }}: Memory usage is above 75% (current value is: {{ $value }})"

#
# Alert when free disk space < 25%
#
- alert: HighNodeDiskUsage
expr: (avg(node_filesystem_avail{device=~"^/dev/[sv]da1$"}) BY (instance)) / (avg(node_filesystem_size{device=~"^/dev/[sv]da1$"})
BY (instance)) * 100 < 25
for: 5m
labels:
notify: sre
severity: info
annotations:
summary: "{{$labels.instance}}: High disk usage is detected"
description: "{{$labels.instance}}: Disk usage is above 75% (current value is: {{ $value }})"

#
# TODO: Disk will be full in 4 hours
# something like: predict_linear((avg by (instance) (node_filesystem_avail{device=~"^/dev/[sv]da1$"})), 4*3600) < 0

{% endraw %}
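The TODO above hints at a predictive disk alert. A possible rule along those lines, given only as a sketch (the alert name, the 1h lookback window and the annotation wording are assumptions, not part of this commit):

    - alert: NodeDiskWillFillSoon
      expr: predict_linear(node_filesystem_avail{device=~"^/dev/[sv]da1$"}[1h], 4 * 3600) < 0
      for: 5m
      labels:
        notify: sre
        severity: info
      annotations:
        summary: "{{ $labels.instance }}: Disk is predicted to fill up within 4 hours"
        description: "{{ $labels.instance }}: Based on the last hour of data, {{ $labels.device }} is predicted to run out of space within 4 hours"

If added to node.conf, this would also have to sit inside the {% raw %} block so that Ansible does not try to expand the Prometheus template variables.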
35 changes: 35 additions & 0 deletions roles/prometheus/templates/alerts/pods.conf
@@ -0,0 +1,35 @@
{% raw %}
#
# Pod alerts (from cAdvisor / kube-state-metrics)
#
- name: pod-alerts
rules:

#
# Pod restarts more than once per 5 minutes
#
- alert: PodRestartingTooMuch
expr: rate(kube_pod_container_status_restarts{job="kubernetes-service-endpoints"}[1m])
> 1 / (5 * 60)
for: 1h
labels:
notify: tech
severity: warning
annotations:
summary: "{{ $labels.instance }}: Pod restarts too much"
description: "{{ $labels.instance }}: Pod restarts more than 5 times per minute"

#
# Alert for any instance that is unreachable for > 5 minutes
#
- alert: InstanceDown
expr: up == 0
for: 5m
labels:
notify: sre
severity: warning
annotations:
summary: "{{ $labels.instance }}: Pod is down"
description: "{{ $labels.instance }}: Pod is down for more than 5 minutes"

{% endraw %}
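For reference on the PodRestartingTooMuch threshold: rate(...[1m]) yields a per-second restart rate, and 1 / (5 * 60) = 1/300 ≈ 0.0033 restarts per second, so the condition means roughly "more than one restart every 5 minutes"; it must additionally hold for 1h (the for: clause) before the alert fires.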
21 changes: 21 additions & 0 deletions roles/prometheus/templates/alerts/service.conf
@@ -0,0 +1,21 @@
{% raw %}
#
# Service alerts (from blackbox-exporter)
#
- name: service-alerts
rules:

#
# Alert when a service is unreachable for > 5 min
#
- alert: ServiceIsUnreachable
expr: probe_success{job="kubernetes-services"} == 0
for: 5m
labels:
notify: sre
severity: warning
annotations:
description: "{{ $labels.instance }}: Service is unreachable"
summary: "{{ $labels.instance }}: Service is unreachable more than 5 minutes"

{% endraw %}
12 changes: 12 additions & 0 deletions roles/prometheus/templates/server.yaml
@@ -11,6 +11,18 @@ metadata:
data:
alerts: |-
{% include 'alerts/common.conf' %}
groups:
{% include 'alerts/app.conf' %}

{% include 'alerts/deployment.conf' %}

{% include 'alerts/pods.conf' %}

{% include 'alerts/node.conf' %}

{% include 'alerts/ingress.conf' %}

{% include 'alerts/service.conf' %}

prometheus.yml: |-
alerting:
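For orientation, after Ansible renders this template, the alerts key of the ConfigMap ends up containing whatever alerts/common.conf already provides, followed by a single groups: list with one group per included file. A sketch of the rendered shape (not literal output):

    alerts: |-
      ...contents of alerts/common.conf (not part of this commit)...
      groups:
      - name: application-alerts    # from app.conf
        rules: [...]
      - name: deployments-alerts    # from deployment.conf
        rules: [...]
      - name: pod-alerts            # from pods.conf
        rules: [...]
      - name: node-alerts           # from node.conf
        rules: [...]
      - name: ingress-alerts        # from ingress.conf
        rules: [...]
      - name: service-alerts        # from service.conf
        rules: [...]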
