From 4fc29f296daa0485ada11074d29c173ba2fa453d Mon Sep 17 00:00:00 2001 From: vsaveliev Date: Wed, 1 Nov 2017 13:01:35 +0700 Subject: [PATCH 01/21] made alerts more readable, updated scrape configs for it --- roles/prometheus/templates/alerts/app.conf | 24 +++++------ .../prometheus/templates/alerts/ingress.conf | 8 ++-- roles/prometheus/templates/alerts/node.conf | 41 ++++++++++++------- roles/prometheus/templates/alerts/pods.conf | 17 ++++---- .../prometheus/templates/alerts/service.conf | 4 +- .../templates/scrape_configs/kubernetes.yml | 21 +++++----- 6 files changed, 64 insertions(+), 51 deletions(-) diff --git a/roles/prometheus/templates/alerts/app.conf b/roles/prometheus/templates/alerts/app.conf index 60c1269..d2fcd55 100644 --- a/roles/prometheus/templates/alerts/app.conf +++ b/roles/prometheus/templates/alerts/app.conf @@ -8,25 +8,25 @@ # Alert on each 5XX code of any http service # - alert: HttpServiceFatalError - expr: rate(http_request_duration_seconds{job="kubernetes-service-endpoints",status=~"5.."}[1m]) + expr: rate(http_requests_total{job="kubernetes-service-endpoints",status=~"5.."}[1m]) > 0 labels: notify: sre severity: warning annotations: - summary: "{{ $labels.namespace }}/{{ $label.name }}: Service returns 5XX error" - description: "{{ $labels.namespace }}/{{ $label.name }}: Service has processed some request incorrectly (code = {{ $labels.status }})" + summary: "{{ $labels.namespace }}/{{ $label.kubernetes_pod_name }}: Service returns 5XX error" + description: "{{ $labels.namespace }}/{{ $label.kubernetes_pod_name }}: Service has processed some request incorrectly (code = {{ $labels.status }})" # # Alert for any instance that have a median request latency > 0.5s # - - alert: HttpServiceHighRequestLatency - expr: http_request_duration_seconds{quantile="0.5"} > 0.5 - for: 1m - labels: - notify: sre - severity: warning - annotations: - summary: "{{ $labels.namespace }}/{{ $label.name }}: High request latency" - description: "{{ 
$labels.namespace }}/{{ $label.name }} (instance {{ $labels.instance }}): Service has a median request latency > 0.5s (current value: {{ $value }}s)" + #- alert: HttpServiceHighRequestLatency + # expr: http_request_duration_seconds_bucket{le="0.5"} > 0.5 + # for: 1m + # labels: + # notify: sre + # severity: warning + # annotations: + # summary: "{{ $labels.namespace }}/{{ $label.kubernetes_pod_name }}: High request latency" + # description: "{{ $labels.namespace }}/{{ $label.kubernetes_pod_name }} (instance {{ $labels.instance }}): Service has a median request latency > 0.5s (current value: {{ $value }}s)" {% endraw %} diff --git a/roles/prometheus/templates/alerts/ingress.conf b/roles/prometheus/templates/alerts/ingress.conf index 3bb89a4..23116c3 100644 --- a/roles/prometheus/templates/alerts/ingress.conf +++ b/roles/prometheus/templates/alerts/ingress.conf @@ -14,8 +14,8 @@ notify: sre severity: warning annotations: - summary: "{{ $labels.instance }}: Ingress is unreachable" - description: "{{ $labels.instance }}: Ingress is unreachable more than 5 minutes" + summary: "{{ $labels.namespace }}/{{ $labels.kubernetes_ingress_name }}: Ingress is unreachable" + description: "{{ $labels.namespace }}/{{ $labels.kubernetes_ingress_name }}: Ingress is unreachable more than 5 minutes (domain is {{ $labels.instance }})" # # Alert on SSL certificate will be expired < 30 days @@ -27,7 +27,7 @@ notify: sre severity: warning annotations: - summary: "{{ $labels.instance }}: SSL certificate will be expired soon" - description: "{{ $labels.instance }}: SSL certificate will be expired less than 30 days" + summary: "{{ $labels.namespace }}/{{ $labels.kubernetes_ingress_name }}: SSL certificate will be expired soon" + description: "{{ $labels.namespace }}/{{ $labels.kubernetes_ingress_name }}: SSL certificate will be expired less than 30 days (domain is {{ $labels.instance }})" {% endraw %} diff --git a/roles/prometheus/templates/alerts/node.conf 
b/roles/prometheus/templates/alerts/node.conf index 684552e..4931f0e 100644 --- a/roles/prometheus/templates/alerts/node.conf +++ b/roles/prometheus/templates/alerts/node.conf @@ -11,27 +11,27 @@ # - alert: NodeCPUUsage expr: (100 - (avg(irate(node_cpu{component="node-exporter",mode="idle"}[5m])) - BY (instance) * 100)) > 75 + BY (kubernetes_pod_node_name) * 100)) > 75 for: 2m labels: notify: sre severity: info annotations: - summary: "{{ $labels.instance }}: High CPU usage is detected" - description: "{{ $labels.instance }}: CPU usage is above 75% (current value is: {{ $value }})" + summary: "{{ $labels.kubernetes_pod_node_name }}: High CPU usage is detected" + description: "{{ $labels.kubernetes_pod_node_name }}: CPU usage is above 75% (current value is: {{ $value }})" # # Alert on load average > 1 # - alert: NodeLoadAverage - expr: ((node_load5 / count(node_cpu{mode="system"}) WITHOUT (cpu, mode)) > 1) + expr: ((node_load5 / count(node_cpu{mode="system"}) WITHOUT (cpu, mode)) > 1.5) for: 5m labels: notify: sre severity: info annotations: - description: "{{ $labels.instance }}: Load average is high" - summary: "{{ $labels.instance }}: High LA detected" + description: "{{ $labels.kubernetes_pod_node_name }}: Load average is high" + summary: "{{ $labels.kubernetes_pod_node_name }}: High LA detected" # # Alert on swap > 75% @@ -44,8 +44,8 @@ notify: sre severity: info annotations: - summary: "{{ $labels.instance }}: Swap usage is detected" - description: "{{ $labels.instance }}: Swap usage usage is above 75% (current value is: {{ $value }})" + summary: "{{ $labels.kubernetes_pod_node_name }}: Swap usage is detected" + description: "{{ $labels.kubernetes_pod_node_name }}: Swap usage usage is above 75% (current value is: {{ $value }})" # # Alert on memory > 75% @@ -58,22 +58,35 @@ notify: sre severity: info annotations: - summary: "{{ $labels.instance }}: High memory usage is detected" - description: "{{ $labels.instance }}: Memory usage is above 75% (current value 
is: {{ $value }})" + summary: "{{ $labels.kubernetes_pod_node_name }}: High memory usage is detected" + description: "{{ $labels.kubernetes_pod_node_name }}: Memory usage is above 75% (current value is: {{ $value }})" # # Disk is free < 25% # - alert: HighNodeDiskUsage - expr: (avg(node_filesystem_avail{device=~"^/dev/[sv]da1$"}) BY (instance)) / (avg(node_filesystem_size{device=~"^/dev/[sv]da1$"}) - BY (instance)) * 100 < 25 + expr: (avg(node_filesystem_avail{device=~"^/dev/[sv]da1$"}) BY (kubernetes_pod_node_name)) / (avg(node_filesystem_size{device=~"^/dev/[sv]da1$"}) + BY (kubernetes_pod_node_name)) * 100 < 25 for: 5m labels: notify: sre severity: info annotations: - summary: "{{$labels.instance}}: High disk usage is detected" - description: "{{$labels.instance}}: Disk usage is above 75% (current value is: {{ $value }})" + summary: "{{$labels.kubernetes_pod_node_name}}: High disk usage is detected" + description: "{{$labels.kubernetes_pod_node_name}}: Disk is free less than 25% (current value is: {{ $value }})" + + # + # Alert for node that is unreachable for > 5 minutes + # + - alert: NodeIsDown + expr: up{job="kubernetes-nodes"} == 0 + for: 5m + labels: + notify: sre + severity: warning + annotations: + summary: "{{ $labels.kubernetes_node_name }}: Node is down" + description: "{{ $labels.kubernetes_node_name }}: Node is down for more than 5 minutes" # # TODO: Disk will be busy after 4 hours diff --git a/roles/prometheus/templates/alerts/pods.conf b/roles/prometheus/templates/alerts/pods.conf index 313d6c0..b13cdee 100644 --- a/roles/prometheus/templates/alerts/pods.conf +++ b/roles/prometheus/templates/alerts/pods.conf @@ -9,27 +9,26 @@ # Pod restarting count > 5 times per minute # - alert: PodRestartingTooMuch - expr: rate(kube_pod_container_status_restarts{job="kubernetes-service-endpoints"}[1m]) - > 1 / (5 * 60) + expr: rate(kube_pod_container_status_restarts[1m]) > 1 / (5 * 60) for: 1h labels: notify: tech severity: warning annotations: - summary: "{{ 
$labels.instance }}: Pod restarts too much" - description: "{{ $labels.instance }}: Pod restarts more than 5 times per minute" + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.pod }}: Pod restarts too much" + description: "{{ $labels.kubernetes_namespace }}/{{ $labels.pod }}: Pod restarts more than 5 times per minute" # - # Alert for any instance that is unreachable for > 5 minutes + # Alert for pod that is unreachable for > 5 minutes # - - alert: InstanceDown - expr: up == 0 + - alert: PodIsDown + expr: up{job="kubernetes-service-endpoints"} == 0 for: 5m labels: notify: sre severity: warning annotations: - summary: "{{ $labels.instance }}: Pod is down" - description: "{{ $labels.instance }}: Pod is down for more than 5 minutes" + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Pod is down" + description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Pod is down for more than 5 minutes" {% endraw %} \ No newline at end of file diff --git a/roles/prometheus/templates/alerts/service.conf b/roles/prometheus/templates/alerts/service.conf index 8d132b1..ac3553f 100644 --- a/roles/prometheus/templates/alerts/service.conf +++ b/roles/prometheus/templates/alerts/service.conf @@ -15,7 +15,7 @@ notify: sre severity: warning annotations: - description: "{{ $labels.instance }}: Service is unreachable" - summary: "{{ $labels.instance }}: Service is unreachable more than 5 minutes" + description: "{{ $labels.namespace }}/{{ $labels.kubernetes_service_name }}: Service is unreachable" + summary: "{{ $labels.namespace }}/{{ $labels.kubernetes_service_name }}: Service is unreachable more than 5 minutes" {% endraw %} diff --git a/roles/prometheus/templates/scrape_configs/kubernetes.yml b/roles/prometheus/templates/scrape_configs/kubernetes.yml index 46b30e1..aff8b7e 100644 --- a/roles/prometheus/templates/scrape_configs/kubernetes.yml +++ b/roles/prometheus/templates/scrape_configs/kubernetes.yml @@ -111,6 +111,9 @@ 
regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/${1}/proxy/metrics + - source_labels: [__meta_kubernetes_node_name] + action: replace + target_label: kubernetes_node_name # Scrape config for service endpoints. # @@ -145,17 +148,17 @@ target_label: __address__ regex: (.+)(?::\d+);(\d+) replacement: $1:$2 + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: kubernetes_pod_node_name - action: labelmap regex: __meta_kubernetes_service_label_(.+) - source_labels: [__meta_kubernetes_namespace] action: replace target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_service_name] - action: replace - target_label: kubernetes_name - - source_labels: [__meta_kubernetes_node_name] + - source_labels: [__meta_kubernetes_pod_name] action: replace - target_label: kubernetes_node_name + target_label: kubernetes_pod_name # Scrape config for pods # @@ -178,10 +181,8 @@ action: replace target_label: __metrics_path__ regex: (.+) - - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + - source_labels: [__meta_kubernetes_pod_node_name] action: replace - regex: (.+):(?:\d+);(\d+) - replacement: ${1}:${2} target_label: __address__ - action: labelmap regex: __meta_kubernetes_pod_label_(.+) @@ -219,7 +220,7 @@ - source_labels: [__meta_kubernetes_namespace] target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_service_name] - target_label: kubernetes_name + target_label: kubernetes_service_name # Scrape config for probing ingresses via the Blackbox Exporter. 
# @@ -253,4 +254,4 @@ - source_labels: [__meta_kubernetes_namespace] target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_ingress_name] - target_label: kubernetes_name + target_label: kubernetes_ingress_name From 0739bb6e534b46f7b301171d99873545735b6c50 Mon Sep 17 00:00:00 2001 From: vsaveliev Date: Wed, 1 Nov 2017 13:13:32 +0700 Subject: [PATCH 02/21] updated prometheus role default values --- roles/prometheus/defaults/main.yml | 23 +++++-- roles/prometheus/tasks/main.yml | 9 +-- roles/prometheus/templates/alert-manager.yaml | 17 ++---- roles/prometheus/templates/config.yaml | 60 +++++++++++++++++++ .../prometheus/templates/deploy-prometheus.sh | 16 ++++- roles/prometheus/templates/server.yaml | 3 +- 6 files changed, 104 insertions(+), 24 deletions(-) create mode 100644 roles/prometheus/templates/config.yaml diff --git a/roles/prometheus/defaults/main.yml b/roles/prometheus/defaults/main.yml index 2572655..e96a3c4 100644 --- a/roles/prometheus/defaults/main.yml +++ b/roles/prometheus/defaults/main.yml @@ -1,7 +1,7 @@ --- # Kubernetes configs path k8s_conf_dir: /etc/kubernetes -k8s_addons_dir: '{{ k8s_conf_dir }}/addons' +k8s_prometheus_dir: '{{ k8s_conf_dir }}/addons/prometheus' # Master hosts names k8s_master_hosts: {} @@ -14,7 +14,7 @@ k8s_prometheus_grafana_name: '' k8s_prometheus_blackbox_name: '' # Prometheus namespace -k8s_prometheus_namespace: default +k8s_prometheus_namespace: prometheus # Prometheus images k8s_prometheus_image: prom/prometheus @@ -39,7 +39,20 @@ k8s_prometheus_blackbox_image_tag: v0.10.0 k8s_prometheus_configreload_image: jimmidyson/configmap-reload k8s_prometheus_configreload_image_tag: v0.1 -# Prometheus alert configs for Slack -prometheus_slack_api_url: '' -prometheus_slack_channel: '' +# Prometheus scrape configs +k8s_prometheus_scrape_istio_metrics: false +# Prometheus alert configs for Slack +k8s_prometheus_slack_api_url: '' +k8s_prometheus_slack_channel: '' +k8s_prometheus_slack_message_title: '{% raw %}[{{ 
.Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]{{.CommonAnnotations.summary}}{% endraw %}' +k8s_prometheus_slack_message_body: | + {% raw %} + {{ range .Alerts }} + *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` + *Description:* {{ .Annotations.description }} + *Details*: + {{ range .Labels.SortedPairs }} • {{ .Name }}: `{{ .Value }}` + {{ end }} + {{ end }} + {% endraw %} diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index b9c83fc..4567ad7 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -5,21 +5,22 @@ state: directory mode: 0755 with_items: - - '{{ k8s_addons_dir }}' + - '{{ k8s_prometheus_dir }}' - name: Prometheus template: src: "{{ item }}" - dest: "{{ k8s_addons_dir }}/{{ item }}" + dest: "{{ k8s_prometheus_dir }}/{{ item }}" with_items: + - config.yaml - prometheus.yaml - name: Deploy script for Prometheus template: src: deploy-prometheus.sh - dest: "{{ k8s_addons_dir }}/deploy-prometheus.sh" + dest: "{{ k8s_prometheus_dir }}/deploy-prometheus.sh" mode: 0755 - name: Run deploy script for Prometheus - command: "{{ k8s_addons_dir }}/deploy-prometheus.sh" + command: "{{ k8s_prometheus_dir }}/deploy-prometheus.sh" when: inventory_hostname in k8s_master_hosts[0] diff --git a/roles/prometheus/templates/alert-manager.yaml b/roles/prometheus/templates/alert-manager.yaml index dc6d076..1f65314 100644 --- a/roles/prometheus/templates/alert-manager.yaml +++ b/roles/prometheus/templates/alert-manager.yaml @@ -11,24 +11,17 @@ metadata: data: alertmanager.yml: |- global: - slack_api_url: '{{ prometheus_slack_api_url }}' + slack_api_url: '{{ k8s_prometheus_slack_api_url }}' receivers: - name: slack-receiver slack_configs: - - channel: '{{ prometheus_slack_channel }}' + - channel: '{{ k8s_prometheus_slack_channel }}' send_resolved: true -{% raw %} - title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end 
}}]{{.CommonAnnotations.summary}}' + title: '{{ k8s_prometheus_slack_message_title }}' text: >- - {{ range .Alerts }} - *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` - *Description:* {{ .Annotations.description }} - *Details*: - {{ range .Labels.SortedPairs }} • {{ .Name }}: `{{ .Value }}` - {{ end }} - {{ end }} -{% endraw %} + {{ k8s_prometheus_slack_message_body }} + route: group_wait: 10s group_interval: 5m diff --git a/roles/prometheus/templates/config.yaml b/roles/prometheus/templates/config.yaml new file mode 100644 index 0000000..d28daa1 --- /dev/null +++ b/roles/prometheus/templates/config.yaml @@ -0,0 +1,60 @@ +--- + +apiVersion: v1 +kind: Namespace +metadata: + name: {{ k8s_prometheus_namespace }} + +--- + +apiVersion: v1 +kind: Secret +metadata: + name: tls-secret + namespace: {{ k8s_prometheus_namespace }} +data: + tls.crt: {{ k8s_services_cert | b64encode }} + tls.key: {{ k8s_services_cert_key | b64encode }} + +--- + +apiVersion: v1 +kind: Secret +metadata: + name: registry-pull-secret + namespace: {{ k8s_prometheus_namespace }} +type: kubernetes.io/dockerconfigjson +data: + .dockerconfigjson: {{ k8s_docker_registry_auth_token }} + +--- + +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: Role +metadata: + name: {{ k8s_prometheus_namespace }}-admin + namespace: {{ k8s_prometheus_namespace }} + labels: + basic.auth/role: {{ k8s_prometheus_namespace }}-admin +rules: +- apiGroups: + - '*' + resources: + - '*' + verbs: + - '*' + +--- + +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: {{ k8s_prometheus_namespace }} + namespace: {{ k8s_prometheus_namespace }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ k8s_prometheus_namespace }}-admin +subjects: +- kind: User + name: {{ k8s_release_username }} diff --git a/roles/prometheus/templates/deploy-prometheus.sh b/roles/prometheus/templates/deploy-prometheus.sh index 4001f1e..a880035 100755 --- 
a/roles/prometheus/templates/deploy-prometheus.sh +++ b/roles/prometheus/templates/deploy-prometheus.sh @@ -1,15 +1,27 @@ #!/bin/bash +function deploy_prometheus_config { + if kubectl get namespaces | grep {{ k8s_prometheus_namespace }} &> /dev/null; then + echo "Prometheus config already exists" + else + echo "Creating Prometheus config" + kubectl apply -f {{ k8s_prometheus_dir }}/config.yaml + fi + + echo +} + function deploy_prometheus { if kubectl get deploy -l app=prometheus -n {{ k8s_prometheus_namespace }} | grep prometheus &> /dev/null; then echo "Updating Prometheus (config-maps will be auto applied)" - kubectl apply -f {{ k8s_addons_dir }}/prometheus.yaml + kubectl apply -f {{ k8s_prometheus_dir }}/prometheus.yaml else echo "Creating Prometheus" - kubectl create -f {{ k8s_addons_dir }}/prometheus.yaml + kubectl create -f {{ k8s_prometheus_dir }}/prometheus.yaml fi echo } +deploy_prometheus_config deploy_prometheus diff --git a/roles/prometheus/templates/server.yaml b/roles/prometheus/templates/server.yaml index cec390f..fc0d66a 100644 --- a/roles/prometheus/templates/server.yaml +++ b/roles/prometheus/templates/server.yaml @@ -41,8 +41,9 @@ data: {% include 'scrape_configs/kubernetes.yml' %} +{% if k8s_prometheus_scrape_istio_metrics %} {% include 'scrape_configs/istio.yml' %} - +{% endif %} --- apiVersion: v1 From 6eb39bdf4fe88162168cb4b3655508076fb35b6c Mon Sep 17 00:00:00 2001 From: vsaveliev Date: Wed, 1 Nov 2017 13:14:15 +0700 Subject: [PATCH 03/21] updated grafana dashboard - Nodes - now we can see data by hostname not by IP --- roles/prometheus/templates/grafana-dashboards/nodes.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/prometheus/templates/grafana-dashboards/nodes.json b/roles/prometheus/templates/grafana-dashboards/nodes.json index 8baef30..6a24def 100644 --- a/roles/prometheus/templates/grafana-dashboards/nodes.json +++ b/roles/prometheus/templates/grafana-dashboards/nodes.json @@ -1,3 +1,3 @@ {% raw %} - 
{"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"grafana","id":"grafana","name":"Grafana","version":"4.4.1"},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"},{"type":"panel","id":"singlestat","name":"Singlestat","version":""}],"annotations":{"list":[]},"description":"Dashboard to get an overview of one server","editable":true,"gnetId":22,"graphTooltip":0,"hideControls":false,"id":null,"links":[],"refresh":false,"rows":[{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":3,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)","hide":false,"intervalFactor":10,"legendFormat":"{{cpu}}","refId":"A","step":50}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Idle cpu","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"percent","label":"cpu 
usage","logBase":1,"max":100,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":9,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"node_load1{instance=\"$server\"}","intervalFactor":4,"legendFormat":"load 1m","refId":"A","step":20,"target":""},{"expr":"node_load5{instance=\"$server\"}","intervalFactor":4,"legendFormat":"load 5m","refId":"B","step":20,"target":""},{"expr":"node_load15{instance=\"$server\"}","intervalFactor":4,"legendFormat":"load 15m","refId":"C","step":20,"target":""}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"System load","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"percentunit","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New 
row","titleSize":"h6"},{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":4,"legend":{"alignAsTable":false,"avg":false,"current":false,"hideEmpty":false,"hideZero":false,"max":false,"min":false,"rightSide":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}","yaxis":2}],"spaceLength":10,"span":9,"stack":true,"steppedLine":false,"targets":[{"expr":"node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}","hide":false,"interval":"","intervalFactor":2,"legendFormat":"memory used","metric":"","refId":"C","step":10},{"expr":"node_memory_Buffers{instance=\"$server\"}","interval":"","intervalFactor":2,"legendFormat":"memory buffers","metric":"","refId":"E","step":10},{"expr":"node_memory_Cached{instance=\"$server\"}","intervalFactor":2,"legendFormat":"memory cached","metric":"","refId":"F","step":10},{"expr":"node_memory_MemFree{instance=\"$server\"}","intervalFactor":2,"legendFormat":"memory free","metric":"","refId":"D","step":10}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Memory usage","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":"0","show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 
0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":5,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100","intervalFactor":2,"refId":"A","step":60,"target":""}],"thresholds":"80, 90","title":"Memory usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"},{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":6,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"read","yaxis":1},{"alias":"{instance=\"172.17.0.1:9100\"}","yaxis":2},{"alias":"io time","yaxis":2}],"spaceLength":10,"span":9,"stack":false,"steppedLine":false,"targets":[{"expr":"sum by (instance) 
(rate(node_disk_bytes_read{instance=\"$server\"}[2m]))","hide":false,"intervalFactor":4,"legendFormat":"read","refId":"A","step":20,"target":""},{"expr":"sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))","intervalFactor":4,"legendFormat":"written","refId":"B","step":20},{"expr":"sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))","intervalFactor":4,"legendFormat":"io time","refId":"C","step":20}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Disk I/O","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"ms","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percentunit","gauge":{"maxValue":1,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":7,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})","intervalFactor":2,"refId":"A","step":60,"target":""}],"thresholds":"0.75, 0.9","title":"Disk space 
usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"},{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":8,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"transmitted ","yaxis":2}],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])","hide":false,"intervalFactor":2,"legendFormat":"{{device}}","refId":"A","step":10,"target":""}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Network received","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":10,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"transmitted 
","yaxis":2}],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])","hide":false,"intervalFactor":2,"legendFormat":"{{device}}","refId":"B","step":10,"target":""}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Network transmitted","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"}],"schemaVersion":14,"style":"dark","tags":[],"templating":{"list":[{"allValue":null,"current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":false,"label":null,"multi":false,"name":"server","options":[],"query":"label_values(node_boot_time, instance)","refresh":1,"regex":"","sort":0,"tagValuesQuery":"","tags":[],"tagsQuery":"","type":"query","useTags":false}]},"time":{"from":"now-1h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"timezone":"browser","title":"Nodes","version":2},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} + 
{"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"grafana","id":"grafana","name":"Grafana","version":"4.4.1"},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"},{"type":"panel","id":"singlestat","name":"Singlestat","version":""}],"annotations":{"list":[]},"description":"Dashboard to get an overview of one server","editable":true,"gnetId":22,"graphTooltip":0,"hideControls":false,"id":null,"links":[],"refresh":false,"rows":[{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":3,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", kubernetes_pod_node_name=\"$server\"}[5m])) * 100)","hide":false,"intervalFactor":10,"legendFormat":"{{cpu}}","refId":"A","step":50}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Idle cpu","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"percent","label":"cpu 
usage","logBase":1,"max":100,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":9,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"node_load1{kubernetes_pod_node_name=\"$server\"}","intervalFactor":4,"legendFormat":"load 1m","refId":"A","step":20,"target":""},{"expr":"node_load5{kubernetes_pod_node_name=\"$server\"}","intervalFactor":4,"legendFormat":"load 5m","refId":"B","step":20,"target":""},{"expr":"node_load15{kubernetes_pod_node_name=\"$server\"}","intervalFactor":4,"legendFormat":"load 15m","refId":"C","step":20,"target":""}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"System load","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"percentunit","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New 
row","titleSize":"h6"},{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":4,"legend":{"alignAsTable":false,"avg":false,"current":false,"hideEmpty":false,"hideZero":false,"max":false,"min":false,"rightSide":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}","yaxis":2}],"spaceLength":10,"span":9,"stack":true,"steppedLine":false,"targets":[{"expr":"node_memory_MemTotal{kubernetes_pod_node_name=\"$server\"} - node_memory_MemFree{kubernetes_pod_node_name=\"$server\"} - node_memory_Buffers{kubernetes_pod_node_name=\"$server\"} - node_memory_Cached{kubernetes_pod_node_name=\"$server\"}","hide":false,"interval":"","intervalFactor":2,"legendFormat":"memory used","metric":"","refId":"C","step":10},{"expr":"node_memory_Buffers{kubernetes_pod_node_name=\"$server\"}","interval":"","intervalFactor":2,"legendFormat":"memory buffers","metric":"","refId":"E","step":10},{"expr":"node_memory_Cached{kubernetes_pod_node_name=\"$server\"}","intervalFactor":2,"legendFormat":"memory cached","metric":"","refId":"F","step":10},{"expr":"node_memory_MemFree{kubernetes_pod_node_name=\"$server\"}","intervalFactor":2,"legendFormat":"memory free","metric":"","refId":"D","step":10}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Memory 
usage","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":"0","show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":5,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"((node_memory_MemTotal{kubernetes_pod_node_name=\"$server\"} - node_memory_MemFree{kubernetes_pod_node_name=\"$server\"} - node_memory_Buffers{kubernetes_pod_node_name=\"$server\"} - node_memory_Cached{kubernetes_pod_node_name=\"$server\"}) / node_memory_MemTotal{kubernetes_pod_node_name=\"$server\"}) * 100","intervalFactor":2,"refId":"A","step":60,"target":""}],"thresholds":"80, 90","title":"Memory usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New 
row","titleSize":"h6"},{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":6,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"read","yaxis":1},{"alias":"{instance=\"172.17.0.1:9100\"}","yaxis":2},{"alias":"io time","yaxis":2}],"spaceLength":10,"span":9,"stack":false,"steppedLine":false,"targets":[{"expr":"sum by (instance) (rate(node_disk_bytes_read{kubernetes_pod_node_name=\"$server\"}[2m]))","hide":false,"intervalFactor":4,"legendFormat":"read","refId":"A","step":20,"target":""},{"expr":"sum by (instance) (rate(node_disk_bytes_written{kubernetes_pod_node_name=\"$server\"}[2m]))","intervalFactor":4,"legendFormat":"written","refId":"B","step":20},{"expr":"sum by (instance) (rate(node_disk_io_time_ms{kubernetes_pod_node_name=\"$server\"}[2m]))","intervalFactor":4,"legendFormat":"io time","refId":"C","step":20}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Disk I/O","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"ms","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 
0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percentunit","gauge":{"maxValue":1,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":7,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"(sum(node_filesystem_size{device!=\"rootfs\",kubernetes_pod_node_name=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",kubernetes_pod_node_name=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",kubernetes_pod_node_name=\"$server\"})","intervalFactor":2,"refId":"A","step":60,"target":""}],"thresholds":"0.75, 0.9","title":"Disk space usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"},{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":8,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"transmitted 
","yaxis":2}],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(node_network_receive_bytes{kubernetes_pod_node_name=\"$server\",device!~\"lo\"}[5m])","hide":false,"intervalFactor":2,"legendFormat":"{{device}}","refId":"A","step":10,"target":""}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Network received","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":10,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"transmitted ","yaxis":2}],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(node_network_transmit_bytes{kubernetes_pod_node_name=\"$server\",device!~\"lo\"}[5m])","hide":false,"intervalFactor":2,"legendFormat":"{{device}}","refId":"B","step":10,"target":""}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Network transmitted","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New 
row","titleSize":"h6"}],"schemaVersion":14,"style":"dark","tags":[],"templating":{"list":[{"allValue":null,"current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":false,"label":null,"multi":false,"name":"server","options":[],"query":"label_values(node_boot_time, kubernetes_pod_node_name)","refresh":1,"regex":"","sort":0,"tagValuesQuery":"","tags":[],"tagsQuery":"","type":"query","useTags":false}]},"time":{"from":"now-1h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"timezone":"browser","title":"Nodes","version":2},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} {% endraw %} From 77cf02c8116070b130603fd9831b5f2d6df8e871 Mon Sep 17 00:00:00 2001 From: vsaveliev Date: Wed, 1 Nov 2017 14:41:15 +0700 Subject: [PATCH 04/21] updated group vars --- inventory/group_vars/all.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/inventory/group_vars/all.yml b/inventory/group_vars/all.yml index 4ee94e6..bde1f5a 100644 --- a/inventory/group_vars/all.yml +++ b/inventory/group_vars/all.yml @@ -44,7 +44,6 @@ k8s_prometheus_namespace: prometheus # Additional Kubernetes namespaces k8s_namespaces: - dev - - '{{ k8s_prometheus_namespace }}' # On-prem LB services # List of services which use TCP LB for k8s masters/nodes @@ -399,7 +398,7 @@ gce_credentials_file: '{{ ansible_env.HOME }}/gcloud.json' gce_project_id: my-project-id # Slack webhook URL for Prometheus alerts -prometheus_slack_api_url: '' +k8s_prometheus_slack_api_url: '' # Slack channel for Prometheus alerts -prometheus_slack_channel: '' +k8s_prometheus_slack_channel: '' From 73d99bff28d582b9c51c14200c49bc1a9a3fe9e5 Mon Sep 17 00:00:00 2001 From: vsaveliev Date: Wed, 1 Nov 2017 14:42:02 +0700 Subject: [PATCH 05/21] small changes in prometheus role --- roles/prometheus/defaults/main.yml | 17 
++++++++--------- roles/prometheus/templates/alert-manager.yaml | 2 +- .../prometheus/templates/blackbox-exporter.yaml | 2 +- roles/prometheus/templates/grafana.yaml | 2 +- .../templates/scrape_configs/prometheus.yml | 2 +- roles/prometheus/templates/server.yaml | 2 +- 6 files changed, 13 insertions(+), 14 deletions(-) diff --git a/roles/prometheus/defaults/main.yml b/roles/prometheus/defaults/main.yml index e96a3c4..25d8b6f 100644 --- a/roles/prometheus/defaults/main.yml +++ b/roles/prometheus/defaults/main.yml @@ -46,13 +46,12 @@ k8s_prometheus_scrape_istio_metrics: false k8s_prometheus_slack_api_url: '' k8s_prometheus_slack_channel: '' k8s_prometheus_slack_message_title: '{% raw %}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]{{.CommonAnnotations.summary}}{% endraw %}' +# be careful with tabs - it can break structure of yml file k8s_prometheus_slack_message_body: | - {% raw %} - {{ range .Alerts }} - *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` - *Description:* {{ .Annotations.description }} - *Details*: - {{ range .Labels.SortedPairs }} • {{ .Name }}: `{{ .Value }}` - {{ end }} - {{ end }} - {% endraw %} + {% raw %}{{ range .Alerts }} + *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` + *Description:* {{ .Annotations.description }} + *Details*: + {{ range .Labels.SortedPairs }} • {{ .Name }}: `{{ .Value }}` + {{ end }} + {{ end }}{% endraw %} diff --git a/roles/prometheus/templates/alert-manager.yaml b/roles/prometheus/templates/alert-manager.yaml index 1f65314..886f978 100644 --- a/roles/prometheus/templates/alert-manager.yaml +++ b/roles/prometheus/templates/alert-manager.yaml @@ -142,7 +142,7 @@ spec: - name: storage-volume emptyDir: {} -{% if k8s_prometheus_alertmanager_name is defined and k8s_prometheus_alertmanager_name != '' %} +{% if k8s_prometheus_alertmanager_name != '' %} --- apiVersion: extensions/v1beta1 diff --git a/roles/prometheus/templates/blackbox-exporter.yaml 
b/roles/prometheus/templates/blackbox-exporter.yaml index 231485c..cc836a1 100644 --- a/roles/prometheus/templates/blackbox-exporter.yaml +++ b/roles/prometheus/templates/blackbox-exporter.yaml @@ -135,7 +135,7 @@ spec: configMap: name: prometheus-blackbox-exporter -{% if k8s_prometheus_blackbox_name is defined and k8s_prometheus_blackbox_name != '' %} +{% if k8s_prometheus_blackbox_name != '' %} --- apiVersion: extensions/v1beta1 diff --git a/roles/prometheus/templates/grafana.yaml b/roles/prometheus/templates/grafana.yaml index 32e4281..8877200 100644 --- a/roles/prometheus/templates/grafana.yaml +++ b/roles/prometheus/templates/grafana.yaml @@ -133,7 +133,7 @@ spec: configMap: name: prometheus-grafana -{% if k8s_prometheus_grafana_name is defined and k8s_prometheus_grafana_name != '' %} +{% if k8s_prometheus_grafana_name != '' %} --- apiVersion: extensions/v1beta1 diff --git a/roles/prometheus/templates/scrape_configs/prometheus.yml b/roles/prometheus/templates/scrape_configs/prometheus.yml index 2ce66f3..802e3f9 100644 --- a/roles/prometheus/templates/scrape_configs/prometheus.yml +++ b/roles/prometheus/templates/scrape_configs/prometheus.yml @@ -1,4 +1,4 @@ - # A scrape configuration for monitoring itself + # A scrape configuration for short-lived jobs (ephemeral and batch jobs) # - job_name: prometheus static_configs: diff --git a/roles/prometheus/templates/server.yaml b/roles/prometheus/templates/server.yaml index fc0d66a..9c94ef1 100644 --- a/roles/prometheus/templates/server.yaml +++ b/roles/prometheus/templates/server.yaml @@ -205,7 +205,7 @@ spec: - name: storage-volume emptyDir: {} -{% if k8s_prometheus_name is defined and k8s_prometheus_name != '' %} +{% if k8s_prometheus_name != '' %} --- apiVersion: extensions/v1beta1 From 5ae0351deb728e8a00d78ed7a8a33abd92a3a02c Mon Sep 17 00:00:00 2001 From: vsaveliev Date: Wed, 1 Nov 2017 15:09:22 +0700 Subject: [PATCH 06/21] fixed devices in Prometheus alerts, formatted file --- 
roles/prometheus/templates/alerts/node.conf | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/roles/prometheus/templates/alerts/node.conf b/roles/prometheus/templates/alerts/node.conf index 4931f0e..6051cdd 100644 --- a/roles/prometheus/templates/alerts/node.conf +++ b/roles/prometheus/templates/alerts/node.conf @@ -37,8 +37,7 @@ # Alert on swap > 75% # - alert: NodeSwapUsage - expr: (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal) - * 100) > 75 + expr: (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal) * 100) > 75 for: 5m labels: notify: sre @@ -51,8 +50,7 @@ # Alert on memory > 75% # - alert: NodeMemoryUsage - expr: (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / (node_memory_MemTotal) - * 100)) > 75 + expr: (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / (node_memory_MemTotal)* 100)) > 75 for: 5m labels: notify: sre @@ -65,7 +63,7 @@ # Disk is free < 25% # - alert: HighNodeDiskUsage - expr: (avg(node_filesystem_avail{device=~"^/dev/[sv]da1$"}) BY (kubernetes_pod_node_name)) / (avg(node_filesystem_size{device=~"^/dev/[sv]da1$"}) + expr: (avg(node_filesystem_avail{device=~"^/dev/[sv]da[0-9]$"}) BY (kubernetes_pod_node_name)) / (avg(node_filesystem_size{device=~"^/dev/[sv]da[0-9]$"}) BY (kubernetes_pod_node_name)) * 100 < 25 for: 5m labels: @@ -90,6 +88,6 @@ # # TODO: Disk will be busy after 4 hours - # something like: predict_linear((avg by (instance) (node_filesystem_avail{device=~"^/dev/[sv]da1$"})), 4*3600) < 0 + # something like: predict_linear((avg by (instance) (node_filesystem_avail{device=~"^/dev/[sv]da[0-9]$"})), 4*3600) < 0 {% endraw %} From a365fea981957afd885cd15feba83a84c4bfc333 Mon Sep 17 00:00:00 2001 From: vsaveliev Date: Wed, 1 Nov 2017 19:05:26 +0700 Subject: [PATCH 07/21] updated namespaces in prometheus alerts --- roles/prometheus/templates/alerts/app.conf | 8 ++++---- roles/prometheus/templates/alerts/deployment.conf | 4 ++-- 
roles/prometheus/templates/alerts/ingress.conf | 8 ++++---- roles/prometheus/templates/alerts/service.conf | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/roles/prometheus/templates/alerts/app.conf b/roles/prometheus/templates/alerts/app.conf index d2fcd55..6d2f037 100644 --- a/roles/prometheus/templates/alerts/app.conf +++ b/roles/prometheus/templates/alerts/app.conf @@ -14,8 +14,8 @@ notify: sre severity: warning annotations: - summary: "{{ $labels.namespace }}/{{ $label.kubernetes_pod_name }}: Service returns 5XX error" - description: "{{ $labels.namespace }}/{{ $label.kubernetes_pod_name }}: Service has processed some request incorrectly (code = {{ $labels.status }})" + summary: "{{ $labels.kubernetes_namespace }}/{{ $label.kubernetes_pod_name }}: Service returns 5XX error" + description: "{{ $labels.kubernetes_namespace }}/{{ $label.kubernetes_pod_name }}: Service has processed some request incorrectly (code = {{ $labels.status }})" # # Alert for any instance that have a median request latency > 0.5s @@ -27,6 +27,6 @@ # notify: sre # severity: warning # annotations: - # summary: "{{ $labels.namespace }}/{{ $label.kubernetes_pod_name }}: High request latency" - # description: "{{ $labels.namespace }}/{{ $label.kubernetes_pod_name }} (instance {{ $labels.instance }}): Service has a median request latency > 0.5s (current value: {{ $value }}s)" + # summary: "{{ $labels.kubernetes_namespace }}/{{ $label.kubernetes_pod_name }}: High request latency" + # description: "{{ $labels.kubernetes_namespace }}/{{ $label.kubernetes_pod_name }} (instance {{ $labels.instance }}): Service has a median request latency > 0.5s (current value: {{ $value }}s)" {% endraw %} diff --git a/roles/prometheus/templates/alerts/deployment.conf b/roles/prometheus/templates/alerts/deployment.conf index 39e8d38..645ba90 100644 --- a/roles/prometheus/templates/alerts/deployment.conf +++ b/roles/prometheus/templates/alerts/deployment.conf @@ -15,7 +15,7 @@ notify: sre 
severity: warning annotations: - summary: "{{ $labels.namespace }}/{{ $labels.deployment }}: Deployment is failed" - description: "{{ $labels.namespace }}/{{ $labels.deployment }}: Deployment is failed - observed replicas != intended replicas" + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.deployment }}: Deployment is failed" + description: "{{ $labels.kubernetes_namespace }}/{{ $labels.deployment }}: Deployment is failed - observed replicas != intended replicas" {% endraw %} diff --git a/roles/prometheus/templates/alerts/ingress.conf b/roles/prometheus/templates/alerts/ingress.conf index 23116c3..5d6d78d 100644 --- a/roles/prometheus/templates/alerts/ingress.conf +++ b/roles/prometheus/templates/alerts/ingress.conf @@ -14,8 +14,8 @@ notify: sre severity: warning annotations: - summary: "{{ $labels.namespace }}/{{ $labels.kubernetes_ingress_name }}: Ingress is unreachable" - description: "{{ $labels.namespace }}/{{ $labels.kubernetes_ingress_name }}: Ingress is unreachable more than 5 minutes (domain is {{ $labels.instance }})" + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_ingress_name }}: Ingress is unreachable" + description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_ingress_name }}: Ingress is unreachable more than 5 minutes (domain is {{ $labels.instance }})" # # Alert on SSL certificate will be expired < 30 days @@ -27,7 +27,7 @@ notify: sre severity: warning annotations: - summary: "{{ $labels.namespace }}/{{ $labels.kubernetes_ingress_name }}: SSL certificate will be expired soon" - description: "{{ $labels.namespace }}/{{ $labels.kubernetes_ingress_name }}: SSL certificate will be expired less than 30 days (domain is {{ $labels.instance }})" + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_ingress_name }}: SSL certificate will be expired soon" + description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_ingress_name }}: SSL certificate will be expired less than 30 days (domain 
is {{ $labels.instance }})" {% endraw %} diff --git a/roles/prometheus/templates/alerts/service.conf b/roles/prometheus/templates/alerts/service.conf index ac3553f..ab722f4 100644 --- a/roles/prometheus/templates/alerts/service.conf +++ b/roles/prometheus/templates/alerts/service.conf @@ -15,7 +15,7 @@ notify: sre severity: warning annotations: - description: "{{ $labels.namespace }}/{{ $labels.kubernetes_service_name }}: Service is unreachable" - summary: "{{ $labels.namespace }}/{{ $labels.kubernetes_service_name }}: Service is unreachable more than 5 minutes" + description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_service_name }}: Service is unreachable" + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_service_name }}: Service is unreachable more than 5 minutes" {% endraw %} From 7a1d3c27a9b67c7685b1ab134c66ce2e29224204 Mon Sep 17 00:00:00 2001 From: vsaveliev Date: Mon, 6 Nov 2017 10:07:00 +0700 Subject: [PATCH 08/21] formatted default vars in prometheus role --- roles/prometheus/defaults/main.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/roles/prometheus/defaults/main.yml b/roles/prometheus/defaults/main.yml index 25d8b6f..667678a 100644 --- a/roles/prometheus/defaults/main.yml +++ b/roles/prometheus/defaults/main.yml @@ -25,17 +25,16 @@ k8s_prometheus_ksm_image: gcr.io/google_containers/kube-state-metrics k8s_prometheus_ksm_image_tag: v0.5.0 k8s_prometheus_nodeexport_image: prom/node-exporter k8s_prometheus_nodeexport_image_tag: v0.15.0 +k8s_prometheus_blackbox_image: prom/blackbox-exporter +k8s_prometheus_blackbox_image_tag: v0.10.0 + +# Prometheus additional images k8s_prometheus_pushgateway_image: prom/pushgateway k8s_prometheus_pushgateway_image_tag: v0.4.0 k8s_prometheus_grafana_image: grafana/grafana k8s_prometheus_grafana_image_tag: 4.5.2 k8s_prometheus_grafana_watcher_image: quay.io/coreos/grafana-watcher k8s_prometheus_grafana_watcher_image_tag: v0.0.8 - 
-k8s_prometheus_blackbox_image: prom/blackbox-exporter -k8s_prometheus_blackbox_image_tag: v0.10.0 - -# Prometheus additional images k8s_prometheus_configreload_image: jimmidyson/configmap-reload k8s_prometheus_configreload_image_tag: v0.1 From 9a35933d8d65023a7add2a8b952308169051904f Mon Sep 17 00:00:00 2001 From: vsaveliev Date: Tue, 7 Nov 2017 11:55:25 +0700 Subject: [PATCH 09/21] updated prometheus role - readme, alerts, comments --- roles/prometheus/README.md | 127 ++++++++++++++---- roles/prometheus/defaults/main.yml | 22 ++- roles/prometheus/templates/alert-manager.yaml | 5 + roles/prometheus/templates/alerts/app.conf | 4 +- .../templates/alerts/kubernetes.conf | 103 ++++++++++++++ roles/prometheus/templates/alerts/node.conf | 65 +++++++-- roles/prometheus/templates/alerts/pods.conf | 49 ++++++- .../templates/alerts/prometheus.conf | 20 +++ roles/prometheus/templates/server.yaml | 4 + 9 files changed, 355 insertions(+), 44 deletions(-) create mode 100644 roles/prometheus/templates/alerts/kubernetes.conf create mode 100644 roles/prometheus/templates/alerts/prometheus.conf diff --git a/roles/prometheus/README.md b/roles/prometheus/README.md index 9494ddd..ba2f8bb 100644 --- a/roles/prometheus/README.md +++ b/roles/prometheus/README.md @@ -1,7 +1,9 @@ Prometheus role -========= +=============== -This role installs Prometheus for Kubernetes cluster (endpoints, pods, nodes, istio, ...) +This role installs Prometheus for Kubernetes cluster (endpoints, pods, nodes, istio, ...) with some basic alerts, dashboards and etc. + +[Official documentation](https://prometheus.io/docs/introduction/overview/) [![Contributions Welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/k8s-community/cluster-deploy/issues) @@ -13,36 +15,111 @@ No special requirements. 
Role Variables -------------- -Available variables are listed below, along with default values (see `defaults/main.yml`): +All available parameters are listed in `defaults/main.yml`, along with their default values and descriptions of what they are for. By default, all Prometheus components are created in the `prometheus` namespace (even if it doesn't exist yet). Node exporter runs on all nodes (including master nodes). -Prometheus version: -```yaml -k8s_prometheus_image_tag: v1.5.1 -``` -Domain name for prometheus (if it's empty so ingress object isn't created): -```yaml -k8s_prometheus_name: '' -``` +How to create new alert +----------------------- -Domain name for prometheus alert manager (if it's empty so ingress isn't created): -```yaml -k8s_prometheus_alertmanager_name: '' -``` +New alerts can be added in the `templates/alerts` directory (see the existing alerts for reference). For example: -Domain name for prometheus push gateway (if it's empty so ingress isn't created): -```yaml -k8s_prometheus_pushgateway_name: '' -``` + # + # Alert on deployment has not enough replicas + # + - alert: DeploymentReplicasMismatch + expr: (kube_deployment_spec_replicas != kube_deployment_status_replicas_available) + or (kube_deployment_spec_replicas unless kube_deployment_status_replicas_unavailable) + for: 5m + labels: + notify: sre + severity: warning + annotations: + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.deployment }}: Deployment is failed" + description: "{{ $labels.kubernetes_namespace }}/{{ $labels.deployment }}: Deployment is failed - observed replicas != intended replicas" -New alerts --------------- +It's better to put a short description of the alert at the top for other people. You can use different labels for alerts; we suggest following the recommendations in `templates/alerts/common.conf`. -New alerts can be added in templates/alerts directory.
+If you create a new file with alerts in `templates/alerts`, you also need to add a line with its file name to `templates/server.yaml`. Template: -New scrape configs --------------- + {% raw %} + # + # Some alerts for something + # + - name: some-alerts + rules: + + # + # Alert on something + # + - alert: SomethingWrong + .... + + {% endraw %} + +More details about alerts: [Official documentation about alerts](https://prometheus.io/docs/alerting/rules/) + +What configs should app have +---------------------------- + +You will have to make some changes to your manifests / charts if you want to monitor your apps. The changes are described below. + +Ingress should have: + + annotations: + prometheus.io/probe: 'true' + +The blackbox exporter will check your app with an HTTPS probe if it needs this check (including SSL certificate expiration). + +Service should have: + + annotations: + prometheus.io/scrape: 'true' + prometheus.io/probe: 'true' + # defaults (pass these values only if they should be different) + prometheus.io/port: '8080' + prometheus.io/path: '/metrics' + +The blackbox exporter will use an HTTP check to verify that other apps can communicate with your app inside the Kubernetes cluster. In addition, all metrics will be scraped from each pod of your app so that you can create custom alerts in the future. By default, the system monitors only 5XX HTTP codes for apps. + +How to create new scrape configs +-------------------------------- + +New scrape configs can be added in the `templates/scrape_configs` directory (see the existing scrape configs for reference). If you create a new file, you also need to add it to `templates/server.yaml`. Template: + + {% raw %} + # A scrape configuration for something. + # + - job_name: some-thing + ... + + {% endraw %} + +All details about scrape configs: [Official documentation about scrape configs](https://prometheus.io/docs/operating/configuration/#) + +How to add own dashboard in Grafana +----------------------------------- + +1.
You can add a new dashboard manually via `Dashboards --> New` or find an existing one on [grafana.com](https://grafana.com/dashboards) +2. export (download) it to your computer +3. copy the content of the downloaded JSON file +4. create a new file in the `templates/grafana-dashboards` directory. Template: + + {% raw %} + { + "dashboard": { + ... ... + }, + "overwrite": true, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "type": "datasource", + "pluginId": "prometheus", + "value": "prometheus" + } + ] + }{% endraw %} -New scrape configs can be added in templates/scrape_configs directory. +5. add a line with the file name to `templates/grafana.yaml`. Example Playbook ---------------- diff --git a/roles/prometheus/defaults/main.yml b/roles/prometheus/defaults/main.yml index 667678a..06bcc2d 100644 --- a/roles/prometheus/defaults/main.yml +++ b/roles/prometheus/defaults/main.yml @@ -6,39 +6,51 @@ k8s_prometheus_dir: '{{ k8s_conf_dir }}/addons/prometheus' # Master hosts names k8s_master_hosts: {} -# Prometheus host names (for Prometheus, AlertManager, PushGateway, Grafana) +# Prometheus host name (it will be created if it doesn't exist) k8s_prometheus_name: '' +# AlertManager host name (it will be created if it doesn't exist) k8s_prometheus_alertmanager_name: '' +# PushGateway host name (it will be created if it doesn't exist) k8s_prometheus_pushgateway_name: '' +# Grafana host name (it will be created if it doesn't exist) k8s_prometheus_grafana_name: '' +# Blackbox exporter host name (it will be created if it doesn't exist) k8s_prometheus_blackbox_name: '' -# Prometheus namespace +# Prometheus namespace (it will be created if it doesn't exist) k8s_prometheus_namespace: prometheus -# Prometheus images +# Prometheus image k8s_prometheus_image: prom/prometheus k8s_prometheus_image_tag: v2.0.0-rc.1 +# Alert manager image k8s_prometheus_alertmanager_image: prom/alertmanager k8s_prometheus_alertmanager_image_tag: v0.9.1 +# Kube state metrics image k8s_prometheus_ksm_image:
gcr.io/google_containers/kube-state-metrics k8s_prometheus_ksm_image_tag: v0.5.0 +# Node exporter image k8s_prometheus_nodeexport_image: prom/node-exporter k8s_prometheus_nodeexport_image_tag: v0.15.0 +# Black box exporter image k8s_prometheus_blackbox_image: prom/blackbox-exporter k8s_prometheus_blackbox_image_tag: v0.10.0 -# Prometheus additional images +# Prometheus additional components + +# Push gateway image k8s_prometheus_pushgateway_image: prom/pushgateway k8s_prometheus_pushgateway_image_tag: v0.4.0 +# Grafana image (+ grafana watcher) k8s_prometheus_grafana_image: grafana/grafana k8s_prometheus_grafana_image_tag: 4.5.2 k8s_prometheus_grafana_watcher_image: quay.io/coreos/grafana-watcher k8s_prometheus_grafana_watcher_image_tag: v0.0.8 +# Config reload image k8s_prometheus_configreload_image: jimmidyson/configmap-reload k8s_prometheus_configreload_image_tag: v0.1 -# Prometheus scrape configs +# Prometheus scrape configs for Istio k8s_prometheus_scrape_istio_metrics: false # Prometheus alert configs for Slack diff --git a/roles/prometheus/templates/alert-manager.yaml b/roles/prometheus/templates/alert-manager.yaml index 886f978..627c44d 100644 --- a/roles/prometheus/templates/alert-manager.yaml +++ b/roles/prometheus/templates/alert-manager.yaml @@ -21,12 +21,17 @@ data: title: '{{ k8s_prometheus_slack_message_title }}' text: >- {{ k8s_prometheus_slack_message_body }} + - name: null route: group_wait: 10s group_interval: 5m receiver: slack-receiver repeat_interval: 3h + routes: + - receiver: null + match: + alertname: DeadMansSwitch --- diff --git a/roles/prometheus/templates/alerts/app.conf b/roles/prometheus/templates/alerts/app.conf index 6d2f037..f89c14a 100644 --- a/roles/prometheus/templates/alerts/app.conf +++ b/roles/prometheus/templates/alerts/app.conf @@ -8,8 +8,7 @@ # Alert on each 5XX code of any http service # - alert: HttpServiceFatalError - expr: rate(http_requests_total{job="kubernetes-service-endpoints",status=~"5.."}[1m]) - > 0 + expr:
rate(http_requests_total{job="kubernetes-service-endpoints",status=~"5.."}[1m]) > 0 labels: notify: sre severity: warning @@ -29,4 +28,5 @@ # annotations: # summary: "{{ $labels.kubernetes_namespace }}/{{ $label.kubernetes_pod_name }}: High request latency" # description: "{{ $labels.kubernetes_namespace }}/{{ $label.kubernetes_pod_name }} (instance {{ $labels.instance }}): Service has a median request latency > 0.5s (current value: {{ $value }}s)" + {% endraw %} diff --git a/roles/prometheus/templates/alerts/kubernetes.conf b/roles/prometheus/templates/alerts/kubernetes.conf new file mode 100644 index 0000000..4460b88 --- /dev/null +++ b/roles/prometheus/templates/alerts/kubernetes.conf @@ -0,0 +1,103 @@ +{% raw %} + # + # Kubernetes API server alerts + # + - name: kube-api-server-alerts + rules: + # + # Alert that Kubernetes API server is down + # + - alert: K8SApiServerDown + expr: absent(up{job="kubernetes-apiservers"} == 1) + for: 5m + labels: + severity: critical + notify: sre + annotations: + summary: "Kubernetes API server unreachable" + description: "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery" + + # + # Alert that Kubernetes API has high latency + # + - alert: K8SApiServerLatency + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}) + WITHOUT (instance, resource)) / 1e+06 > 1 + for: 10m + labels: + severity: warning + notify: sre + annotations: + summary: "Kubernetes apiserver latency is high" + description: "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s" + + # + # Kubernetes kubelet alerts + # + - name: kubelet-alerts + rules: + # + # Alert that Kubernetes node isn't ready to work + # + - alert: K8SNodeNotReady + expr: kube_node_status_ready{condition="true"} == 0 + for: 1h + labels: + severity: warning + notify: sre + annotations: + summary: "Node status is 
NotReady" + description: "{{ $labels.kubernetes_pod_node_name }}: The Kubelet has not checked in with the API, or has set itself to NotReady, for more than an hour" + + # + # Alert that Kubernetes few nodes aren't ready to work + # + - alert: K8SManyNodesNotReady + expr: count(kube_node_status_ready{condition="true"} == 0) > 1 + and (count(kube_node_status_ready{condition="true"} == 0) / count(kube_node_status_ready{condition="true"})) > 0.2 + for: 1m + labels: + severity: critical + annotations: + summary: "Many Kubernetes nodes are Not Ready" + description: "{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state)" + + # + # Alert that kubelet is down + # + #- alert: K8SKubeletDown + # expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 + # for: 1h + # labels: + # severity: warning + # notify: sre + # annotations: + # summary: "Many Kubelets cannot be scraped" + # description: "Prometheus failed to scrape {{ $value }}% of kubelets" + + # + # Alert that kubelet is down + # + #- alert: K8SKubeletDown + # expr: absent(up{job="kubelet"} == 1) + # or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1 + # for: 1h + # labels: + # severity: critical + # notify: sre + # annotations: + # summary: "Many Kubelets cannot be scraped" + # description: "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery" + + # + # Alert that kubelet can reach limit of running pods on one node + # + - alert: K8SKubeletTooManyPods + expr: kubelet_running_pod_count > 100 + labels: + severity: warning + notify: sre + annotations: + summary: "Kubelet is close to pod limit" + description: "{{$labels.kubernetes_node_name}}: Kubelet is running {{$value}} pods, close to the limit of 110" +{% endraw %} diff --git a/roles/prometheus/templates/alerts/node.conf b/roles/prometheus/templates/alerts/node.conf index 6051cdd..bc4bdbf 100644 --- a/roles/prometheus/templates/alerts/node.conf +++ 
b/roles/prometheus/templates/alerts/node.conf @@ -14,8 +14,8 @@ BY (kubernetes_pod_node_name) * 100)) > 75 for: 2m labels: - notify: sre severity: info + notify: sre annotations: summary: "{{ $labels.kubernetes_pod_node_name }}: High CPU usage is detected" description: "{{ $labels.kubernetes_pod_node_name }}: CPU usage is above 75% (current value is: {{ $value }})" @@ -27,8 +27,8 @@ expr: ((node_load5 / count(node_cpu{mode="system"}) WITHOUT (cpu, mode)) > 1.5) for: 5m labels: - notify: sre severity: info + notify: sre annotations: description: "{{ $labels.kubernetes_pod_node_name }}: Load average is high" summary: "{{ $labels.kubernetes_pod_node_name }}: High LA detected" @@ -40,8 +40,8 @@ expr: (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal) * 100) > 75 for: 5m labels: - notify: sre severity: info + notify: sre annotations: summary: "{{ $labels.kubernetes_pod_node_name }}: Swap usage is detected" description: "{{ $labels.kubernetes_pod_node_name }}: Swap usage usage is above 75% (current value is: {{ $value }})" @@ -53,39 +53,84 @@ expr: (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / (node_memory_MemTotal)* 100)) > 75 for: 5m labels: - notify: sre severity: info + notify: sre annotations: summary: "{{ $labels.kubernetes_pod_node_name }}: High memory usage is detected" description: "{{ $labels.kubernetes_pod_node_name }}: Memory usage is above 75% (current value is: {{ $value }})" # - # Disk is free < 25% + # Disk is free < 10% # - alert: HighNodeDiskUsage expr: (avg(node_filesystem_avail{device=~"^/dev/[sv]da[0-9]$"}) BY (kubernetes_pod_node_name)) / (avg(node_filesystem_size{device=~"^/dev/[sv]da[0-9]$"}) - BY (kubernetes_pod_node_name)) * 100 < 25 + BY (kubernetes_pod_node_name)) * 100 < 10 for: 5m labels: + severity: high notify: sre - severity: info annotations: summary: "{{$labels.kubernetes_pod_node_name}}: High disk usage is detected" - description: "{{$labels.kubernetes_pod_node_name}}: Disk is free less 
than 25% (current value is: {{ $value }})" + description: "{{$labels.kubernetes_pod_node_name}}: Disk is free less than 10% (current value is: {{ $value }})" # # Alert for node that is unreachable for > 5 minutes # - alert: NodeIsDown - expr: up{job="kubernetes-nodes"} == 0 + expr: up{job="kubernetes-nodes"} == 0 or absent(up{job="kubernetes-nodes"} == 1) for: 5m labels: - notify: sre severity: warning + notify: sre annotations: summary: "{{ $labels.kubernetes_node_name }}: Node is down" description: "{{ $labels.kubernetes_node_name }}: Node is down for more than 5 minutes" + # + # Defines ratio between max and open file descriptors + # + - record: instance:fd_node_utilization + expr: process_open_fds{job="kubernetes-nodes"} / process_max_fds{job="kubernetes-nodes"} + + # + # Alert for node that uses 80% of available file descriptors + # + - alert: TooManyOpenFDNode + expr: 100 * instance:fd_node_utilization > 80 + for: 10m + labels: + severity: critical + notify: sre + annotations: + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_node_name }}: High usage of file descriptors by node" + description: '{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_node_name }}: Node is using {{ $value }}% of the available file/socket descriptors' + + # + # Alert on expected limit of open file descriptors after 4 hours + # + - alert: FdNodeExhaustionClose + expr: predict_linear(instance:fd_node_utilization[1h], 3600 * 4) > 1 + for: 10m + labels: + severity: warning + notify: sre + annotations: + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_node_name }}: Expected high usage of file descriptors by node" + description: '{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_node_name }}: Node will exhaust in file/socket descriptors after 4 hours' + + # + # Alert on expected limit of open file descriptors after 1h + # + - alert: FdNodeExhaustionTooClose + expr: predict_linear(instance:fd_node_utilization[10m], 3600) > 1 + for: 10m + 
labels: + severity: critical + notify: sre + annotations: + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_node_name }}: Expected high usage of file descriptors by node" + description: '{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_node_name }}: Node will exhaust in file/socket descriptors after 1 hour' + # # TODO: Disk will be busy after 4 hours # something like: predict_linear((avg by (instance) (node_filesystem_avail{device=~"^/dev/[sv]da[0-9]$"})), 4*3600) < 0 diff --git a/roles/prometheus/templates/alerts/pods.conf b/roles/prometheus/templates/alerts/pods.conf index b13cdee..36aafc2 100644 --- a/roles/prometheus/templates/alerts/pods.conf +++ b/roles/prometheus/templates/alerts/pods.conf @@ -22,7 +22,7 @@ # Alert for pod that is unreachable for > 5 minutes # - alert: PodIsDown - expr: up{job="kubernetes-service-endpoints"} == 0 + expr: up{job="kubernetes-service-endpoints"} == 0 or absent(up{job="kubernetes-service-endpoints"} == 1) for: 5m labels: notify: sre @@ -31,4 +31,49 @@ summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Pod is down" description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Pod is down for more than 5 minutes" -{% endraw %} \ No newline at end of file + # + # Defines ratio between max and open file descriptors + # + - record: instance:fd_pod_utilization + expr: process_open_fds{job="kubernetes-service-endpoints"} / process_max_fds{job="kubernetes-service-endpoints"} + + # + # Alert for pod that uses 80% of available file descriptors + # + - alert: TooManyOpenFDPod + expr: 100 * instance:fd_pod_utilization > 80 + for: 10m + labels: + notify: sre + severity: critical + annotations: + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: High usage of file descriptors by pod" + description: '{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Pod is using {{ $value }}% of the available file/socket descriptors' 
+ + # + # Alert on expected limit of open file descriptors after 4 hours + # + - alert: FdPodExhaustionClose + expr: predict_linear(instance:fd_pod_utilization[1h], 3600 * 4) > 1 + for: 10m + labels: + notify: sre + severity: warning + annotations: + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Expected high usage of file descriptors by pod" + description: '{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Pod will exhaust in file/socket descriptors after 4 hours' + + # + # Alert on expected limit of open file descriptors after 1h + # + - alert: FdPodExhaustionTooClose + expr: predict_linear(instance:fd_pod_utilization[10m], 3600) > 1 + for: 10m + labels: + notify: sre + severity: critical + annotations: + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Expected high usage of file descriptors by pod" + description: '{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Pod will exhaust in file/socket descriptors after 1 hour' + +{% endraw %} diff --git a/roles/prometheus/templates/alerts/prometheus.conf b/roles/prometheus/templates/alerts/prometheus.conf new file mode 100644 index 0000000..eaa738d --- /dev/null +++ b/roles/prometheus/templates/alerts/prometheus.conf @@ -0,0 +1,20 @@ +{% raw %} + # + # Applications alerts (from /metrics handler) + # + - name: prometheus-alerts + rules: + # + # Alert on failed config reload + # + - alert: FailedReload + expr: prometheus_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Prometheus' configuration has failed for {{ $labels.namespace + }}/{{ $labels.pod}}. 
+ summary: Prometheus configuration reload has failed + +{% endraw %} diff --git a/roles/prometheus/templates/server.yaml b/roles/prometheus/templates/server.yaml index 9c94ef1..28d508a 100644 --- a/roles/prometheus/templates/server.yaml +++ b/roles/prometheus/templates/server.yaml @@ -24,6 +24,10 @@ data: {% include 'alerts/service.conf' %} +{% include 'alerts/prometheus.conf' %} + +{% include 'alerts/kubernetes.conf' %} + prometheus.yml: |- alerting: alertmanagers: From e967b4c87b8e9a9b15dac225582c9cba3e3e6c54 Mon Sep 17 00:00:00 2001 From: vsaveliev Date: Wed, 8 Nov 2017 08:51:30 +0700 Subject: [PATCH 10/21] updated alerts, readme for prometheus role --- roles/prometheus/README.md | 4 ++-- roles/prometheus/templates/alert-manager.yaml | 4 ++-- roles/prometheus/templates/alerts/app.conf | 4 ++-- .../prometheus/templates/alerts/deployment.conf | 2 +- roles/prometheus/templates/alerts/ingress.conf | 4 ++-- roles/prometheus/templates/alerts/pods.conf | 10 +++++----- .../prometheus/templates/alerts/prometheus.conf | 16 +++++++++++++--- roles/prometheus/templates/alerts/service.conf | 2 +- 8 files changed, 28 insertions(+), 18 deletions(-) diff --git a/roles/prometheus/README.md b/roles/prometheus/README.md index ba2f8bb..4a3bda3 100644 --- a/roles/prometheus/README.md +++ b/roles/prometheus/README.md @@ -95,10 +95,10 @@ New scrape configs can be added in `templates/scrape_configs` directory (check a All details about scrape config on: [Official documentation about scrape configs](https://prometheus.io/docs/operating/configuration/#) -How to add own dashboard in Grafana +How to add dashboard in Grafana ----------------------------------- -1. You can add a new dashboard manually via `Dashboards --> New` or find existing on [grafana.com](https://grafana.com/dashboards) +1. create a new dashboard manually via `Dashboards --> New` or find existing on [grafana.com](https://grafana.com/dashboards) 2. export (download) it on your computer 3. 
copy content of downloaded JSON file 4. create a new file in `templates/grafana-dashboards` directory. Template: diff --git a/roles/prometheus/templates/alert-manager.yaml b/roles/prometheus/templates/alert-manager.yaml index 627c44d..af5d8d8 100644 --- a/roles/prometheus/templates/alert-manager.yaml +++ b/roles/prometheus/templates/alert-manager.yaml @@ -21,7 +21,7 @@ data: title: '{{ k8s_prometheus_slack_message_title }}' text: >- {{ k8s_prometheus_slack_message_body }} - - name: null + - name: 'null' route: group_wait: 10s @@ -29,7 +29,7 @@ data: receiver: slack-receiver repeat_interval: 3h routes: - - receiver: null + - receiver: 'null' match: alertname: DeadMansSwitch diff --git a/roles/prometheus/templates/alerts/app.conf b/roles/prometheus/templates/alerts/app.conf index f89c14a..ca395bd 100644 --- a/roles/prometheus/templates/alerts/app.conf +++ b/roles/prometheus/templates/alerts/app.conf @@ -10,8 +10,8 @@ - alert: HttpServiceFatalError expr: rate(http_requests_total{job="kubernetes-service-endpoints",status=~"5.."}[1m]) > 0 labels: - notify: sre severity: warning + notify: sre annotations: summary: "{{ $labels.kubernetes_namespace }}/{{ $label.kubernetes_pod_name }}: Service returns 5XX error" description: "{{ $labels.kubernetes_namespace }}/{{ $label.kubernetes_pod_name }}: Service has processed some request incorrectly (code = {{ $labels.status }})" @@ -23,8 +23,8 @@ # expr: http_request_duration_seconds_bucket{le="0.5"} > 0.5 # for: 1m # labels: - # notify: sre # severity: warning + # notify: sre # annotations: # summary: "{{ $labels.kubernetes_namespace }}/{{ $label.kubernetes_pod_name }}: High request latency" # description: "{{ $labels.kubernetes_namespace }}/{{ $label.kubernetes_pod_name }} (instance {{ $labels.instance }}): Service has a median request latency > 0.5s (current value: {{ $value }}s)" diff --git a/roles/prometheus/templates/alerts/deployment.conf b/roles/prometheus/templates/alerts/deployment.conf index 645ba90..56395c8 100644 --- 
a/roles/prometheus/templates/alerts/deployment.conf +++ b/roles/prometheus/templates/alerts/deployment.conf @@ -12,8 +12,8 @@ or (kube_deployment_spec_replicas unless kube_deployment_status_replicas_unavailable) for: 5m labels: - notify: sre severity: warning + notify: sre annotations: summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.deployment }}: Deployment is failed" description: "{{ $labels.kubernetes_namespace }}/{{ $labels.deployment }}: Deployment is failed - observed replicas != intended replicas" diff --git a/roles/prometheus/templates/alerts/ingress.conf b/roles/prometheus/templates/alerts/ingress.conf index 5d6d78d..d029f2e 100644 --- a/roles/prometheus/templates/alerts/ingress.conf +++ b/roles/prometheus/templates/alerts/ingress.conf @@ -11,8 +11,8 @@ expr: probe_success{job="kubernetes-ingresses"} == 0 for: 5m labels: - notify: sre severity: warning + notify: sre annotations: summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_ingress_name }}: Ingress is unreachable" description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_ingress_name }}: Ingress is unreachable more than 5 minutes (domain is {{ $labels.instance }})" @@ -24,8 +24,8 @@ expr: probe_ssl_earliest_cert_expiry{job="kubernetes-ingresses"} - time() < 86400 * 30 for: 5m labels: - notify: sre severity: warning + notify: sre annotations: summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_ingress_name }}: SSL certificate will be expired soon" description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_ingress_name }}: SSL certificate will be expired less than 30 days (domain is {{ $labels.instance }})" diff --git a/roles/prometheus/templates/alerts/pods.conf b/roles/prometheus/templates/alerts/pods.conf index 36aafc2..d4b4dc7 100644 --- a/roles/prometheus/templates/alerts/pods.conf +++ b/roles/prometheus/templates/alerts/pods.conf @@ -12,8 +12,8 @@ expr: rate(kube_pod_container_status_restarts[1m]) > 1 / (5 * 60) for: 1h labels: - 
notify: tech severity: warning + notify: tech annotations: summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.pod }}: Pod restarts too much" description: "{{ $labels.kubernetes_namespace }}/{{ $labels.pod }}: Pod restarts more than 5 times per minute" @@ -25,8 +25,8 @@ expr: up{job="kubernetes-service-endpoints"} == 0 or absent(up{job="kubernetes-service-endpoints"} == 1) for: 5m labels: - notify: sre severity: warning + notify: sre annotations: summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Pod is down" description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Pod is down for more than 5 minutes" @@ -44,8 +44,8 @@ expr: 100 * instance:fd_pod_utilization > 80 for: 10m labels: - notify: sre severity: critical + notify: sre annotations: summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: High usage of file descriptors by pod" description: '{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Pod is using {{ $value }}% of the available file/socket descriptors' @@ -57,8 +57,8 @@ expr: predict_linear(instance:fd_pod_utilization[1h], 3600 * 4) > 1 for: 10m labels: - notify: sre severity: warning + notify: sre annotations: summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Expected high usage of file descriptors by pod" description: '{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Pod will exhaust in file/socket descriptors after 4 hours' @@ -70,8 +70,8 @@ expr: predict_linear(instance:fd_pod_utilization[10m], 3600) > 1 for: 10m labels: - notify: sre severity: critical + notify: sre annotations: summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Expected high usage of file descriptors by pod" description: '{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Pod will exhaust in file/socket descriptors after 1 hour' diff --git 
a/roles/prometheus/templates/alerts/prometheus.conf b/roles/prometheus/templates/alerts/prometheus.conf index eaa738d..0aa84ed 100644 --- a/roles/prometheus/templates/alerts/prometheus.conf +++ b/roles/prometheus/templates/alerts/prometheus.conf @@ -12,9 +12,19 @@ for: 10m labels: severity: warning + notify: sre annotations: - description: Reloading Prometheus' configuration has failed for {{ $labels.namespace - }}/{{ $labels.pod}}. - summary: Prometheus configuration reload has failed + description: "Reloading Prometheus' configuration has failed" + summary: "Prometheus configuration reload has failed" + # + # Alert to check pipeline + # + - alert: DeadMansSwitch + expr: vector(1) + labels: + severity: none + annotations: + summary: "Alerting DeadMansSwitch" + description: "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional" {% endraw %} diff --git a/roles/prometheus/templates/alerts/service.conf b/roles/prometheus/templates/alerts/service.conf index ab722f4..2ff3f22 100644 --- a/roles/prometheus/templates/alerts/service.conf +++ b/roles/prometheus/templates/alerts/service.conf @@ -12,8 +12,8 @@ expr: probe_success{job="kubernetes-services"} == 0 for: 5m labels: - notify: sre severity: warning + notify: sre annotations: description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_service_name }}: Service is unreachable" summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_service_name }}: Service is unreachable more than 5 minutes" From 4ee828c358d4be04fac5362b8152c85f87446115 Mon Sep 17 00:00:00 2001 From: vsaveliev Date: Wed, 8 Nov 2017 19:46:26 +0700 Subject: [PATCH 11/21] fixed slow requests of blackbox exporter --- roles/prometheus/templates/alerts/ingress.conf | 2 +- roles/prometheus/templates/alerts/service.conf | 2 +- roles/prometheus/templates/scrape_configs/kubernetes.yml | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/roles/prometheus/templates/alerts/ingress.conf 
b/roles/prometheus/templates/alerts/ingress.conf index d029f2e..353c322 100644 --- a/roles/prometheus/templates/alerts/ingress.conf +++ b/roles/prometheus/templates/alerts/ingress.conf @@ -8,7 +8,7 @@ # Alert on host (in ingress) is unreachable > 5 min # - alert: IngressIsUnreachable - expr: probe_success{job="kubernetes-ingresses"} == 0 + expr: probe_success{job="kubernetes-ingresses"} == 0 or absent(probe_success{job="kubernetes-ingresses"} == 1) for: 5m labels: severity: warning diff --git a/roles/prometheus/templates/alerts/service.conf b/roles/prometheus/templates/alerts/service.conf index 2ff3f22..6a9173c 100644 --- a/roles/prometheus/templates/alerts/service.conf +++ b/roles/prometheus/templates/alerts/service.conf @@ -9,7 +9,7 @@ # Alert on service is unreachable > 5 min # - alert: ServiceIsUnreachable - expr: probe_success{job="kubernetes-services"} == 0 + expr: probe_success{job="kubernetes-services"} == 0 or absent(probe_success{job="kubernetes-services"} == 1) for: 5m labels: severity: warning diff --git a/roles/prometheus/templates/scrape_configs/kubernetes.yml b/roles/prometheus/templates/scrape_configs/kubernetes.yml index aff8b7e..8c22d90 100644 --- a/roles/prometheus/templates/scrape_configs/kubernetes.yml +++ b/roles/prometheus/templates/scrape_configs/kubernetes.yml @@ -200,6 +200,8 @@ # # * `prometheus.io/probe`: Only probe services that have a value of `true` - job_name: 'kubernetes-services' + scrape_interval: 60s + scrape_timeout: 50s metrics_path: /probe params: module: [http_2xx] @@ -229,7 +231,8 @@ # # * `prometheus.io/probe`: Only probe ingresses that have a value of `true` - job_name: 'kubernetes-ingresses' - + scrape_interval: 60s + scrape_timeout: 50s metrics_path: /probe params: module: [https_2xx] From 44e24bc8cbc1b82e8a0c9941e2539ea881f358d5 Mon Sep 17 00:00:00 2001 From: vsaveliev Date: Wed, 8 Nov 2017 23:27:13 +0700 Subject: [PATCH 12/21] updated prometheus and node exporter image tags --- roles/prometheus/defaults/main.yml | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/roles/prometheus/defaults/main.yml b/roles/prometheus/defaults/main.yml index 06bcc2d..fbee590 100644 --- a/roles/prometheus/defaults/main.yml +++ b/roles/prometheus/defaults/main.yml @@ -22,7 +22,7 @@ k8s_prometheus_namespace: prometheus # Prometheus image k8s_prometheus_image: prom/prometheus -k8s_prometheus_image_tag: v2.0.0-rc.1 +k8s_prometheus_image_tag: v2.0.0 # Alert manager image k8s_prometheus_alertmanager_image: prom/alertmanager k8s_prometheus_alertmanager_image_tag: v0.9.1 @@ -31,7 +31,7 @@ k8s_prometheus_ksm_image: gcr.io/google_containers/kube-state-metrics k8s_prometheus_ksm_image_tag: v0.5.0 # Node exporter image k8s_prometheus_nodeexport_image: prom/node-exporter -k8s_prometheus_nodeexport_image_tag: v0.15.0 +k8s_prometheus_nodeexport_image_tag: v0.15.1 # Black box exporter image k8s_prometheus_blackbox_image: prom/blackbox-exporter k8s_prometheus_blackbox_image_tag: v0.10.0 From edbe71786897e26684a0ff0c523cf100691e4af7 Mon Sep 17 00:00:00 2001 From: vsaveliev Date: Mon, 13 Nov 2017 10:16:06 +0700 Subject: [PATCH 13/21] added node exporter port in firewall --- playbooks/system/firewall.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/playbooks/system/firewall.yml b/playbooks/system/firewall.yml index 8d4ef0d..b5644be 100644 --- a/playbooks/system/firewall.yml +++ b/playbooks/system/firewall.yml @@ -16,6 +16,7 @@ - 10250/tcp # kubelet port - 10255/tcp # kubelet port - 4194/tcp # cAdvisor port + - 9100/tcp # Prometheus node-exporter port nodes_ports: - 10250/tcp # kubelet port - 10255/tcp # kubelet port From 638b1be4bec4763c7ef9b5b48c494168f7413020 Mon Sep 17 00:00:00 2001 From: Vlad Saveliev Date: Thu, 16 Nov 2017 11:53:04 +0700 Subject: [PATCH 14/21] updated manifests for prometheus - namespace creation and etc. 
--- roles/prometheus/templates/prometheus.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/roles/prometheus/templates/prometheus.yaml b/roles/prometheus/templates/prometheus.yaml index e205fee..2b574e4 100644 --- a/roles/prometheus/templates/prometheus.yaml +++ b/roles/prometheus/templates/prometheus.yaml @@ -1,3 +1,6 @@ +{% include 'config.yaml' %} + + {% include 'server.yaml' %} From 3da6d1c687090c1ced0c8d1a388101e94b990efa Mon Sep 17 00:00:00 2001 From: Vlad Saveliev Date: Thu, 16 Nov 2017 18:27:23 +0700 Subject: [PATCH 15/21] added defaults for alertmanager, increased timeout for blackbox exporter in prometheus role --- roles/prometheus/defaults/main.yml | 7 ++++++ roles/prometheus/templates/alert-manager.yaml | 25 +++++++++++++++---- .../templates/blackbox-exporter.yaml | 4 +-- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/roles/prometheus/defaults/main.yml b/roles/prometheus/defaults/main.yml index fbee590..f010b37 100644 --- a/roles/prometheus/defaults/main.yml +++ b/roles/prometheus/defaults/main.yml @@ -53,7 +53,10 @@ k8s_prometheus_configreload_image_tag: v0.1 # Prometheus scrape configs for Istio k8s_prometheus_scrape_istio_metrics: false +k8s_prometheus_alerts_default_route: 'null' + # Prometheus alert configs for Slack +k8s_prometheus_slack_alerts_enabled: true k8s_prometheus_slack_api_url: '' k8s_prometheus_slack_channel: '' k8s_prometheus_slack_message_title: '{% raw %}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]{{.CommonAnnotations.summary}}{% endraw %}' @@ -66,3 +69,7 @@ k8s_prometheus_slack_message_body: | {{ range .Labels.SortedPairs }} • {{ .Name }}: `{{ .Value }}` {{ end }} {{ end }}{% endraw %} + +# Prometheus alert configs for Telegram +k8s_prometheus_telegram_alerts_enabled: true +k8s_prometheus_telegram_webhook: '' diff --git a/roles/prometheus/templates/alert-manager.yaml b/roles/prometheus/templates/alert-manager.yaml index af5d8d8..03f88b5 100644 --- 
a/roles/prometheus/templates/alert-manager.yaml +++ b/roles/prometheus/templates/alert-manager.yaml @@ -10,24 +10,39 @@ metadata: name: prometheus-alertmanager data: alertmanager.yml: |- + {% if k8s_prometheus_slack_alerts_enabled %} + global: slack_api_url: '{{ k8s_prometheus_slack_api_url }}' + {% endif %} receivers: - - name: slack-receiver + - name: 'null' + + {% if k8s_prometheus_slack_alerts_enabled %} + + - name: slack slack_configs: - channel: '{{ k8s_prometheus_slack_channel }}' send_resolved: true title: '{{ k8s_prometheus_slack_message_title }}' text: >- {{ k8s_prometheus_slack_message_body }} - - name: 'null' + {% endif %} + + {% if k8s_prometheus_telegram_alerts_enabled %} + + - name: telegram + webhook_configs: + - send_resolved: True + url: {{ k8s_prometheus_telegram_webhook }} + {% endif %} route: group_wait: 10s - group_interval: 5m - receiver: slack-receiver - repeat_interval: 3h + group_interval: 1h + receiver: {{ k8s_prometheus_alerts_default_route }} + repeat_interval: 4h routes: - receiver: 'null' match: diff --git a/roles/prometheus/templates/blackbox-exporter.yaml b/roles/prometheus/templates/blackbox-exporter.yaml index cc836a1..821f697 100644 --- a/roles/prometheus/templates/blackbox-exporter.yaml +++ b/roles/prometheus/templates/blackbox-exporter.yaml @@ -13,11 +13,11 @@ data: modules: http_2xx: prober: http - timeout: 20s + timeout: 50s http: https_2xx: prober: http - timeout: 20s + timeout: 50s http: method: GET valid_status_codes: [200] From b6791b40ef0ba5686329fe5f4cc51ef13aad1aa5 Mon Sep 17 00:00:00 2001 From: Vlad Saveliev Date: Thu, 16 Nov 2017 19:12:15 +0700 Subject: [PATCH 16/21] blackbox exporter works via ipv4 in prometheus role --- roles/prometheus/templates/blackbox-exporter.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/roles/prometheus/templates/blackbox-exporter.yaml b/roles/prometheus/templates/blackbox-exporter.yaml index 821f697..9def698 100644 --- a/roles/prometheus/templates/blackbox-exporter.yaml +++ 
b/roles/prometheus/templates/blackbox-exporter.yaml @@ -15,6 +15,7 @@ data: prober: http timeout: 50s http: + preferred_ip_protocol: ip4 https_2xx: prober: http timeout: 50s @@ -24,6 +25,7 @@ data: no_follow_redirects: false fail_if_ssl: false fail_if_not_ssl: true + preferred_ip_protocol: ip4 --- From 15a4ae24da6528612b2a55ebca9764b04eb1825d Mon Sep 17 00:00:00 2001 From: Vlad Saveliev Date: Mon, 27 Nov 2017 18:24:00 +0700 Subject: [PATCH 17/21] updated alerts, increased replicas of blackbox exporter to 2, updated alertmanager configs, move retention days in params --- roles/prometheus/defaults/main.yml | 7 +++++-- roles/prometheus/templates/alert-manager.yaml | 15 +++++++++++++++ roles/prometheus/templates/alerts/app.conf | 2 +- roles/prometheus/templates/alerts/deployment.conf | 4 ++-- roles/prometheus/templates/alerts/node.conf | 4 ++-- roles/prometheus/templates/blackbox-exporter.yaml | 2 +- roles/prometheus/templates/server.yaml | 2 +- 7 files changed, 27 insertions(+), 9 deletions(-) diff --git a/roles/prometheus/defaults/main.yml b/roles/prometheus/defaults/main.yml index f010b37..03e731d 100644 --- a/roles/prometheus/defaults/main.yml +++ b/roles/prometheus/defaults/main.yml @@ -6,6 +6,9 @@ k8s_prometheus_dir: '{{ k8s_conf_dir }}/addons/prometheus' # Master hosts names k8s_master_hosts: {} +# Retention days (how long to store data) +k8s_prometheus_retention_days: 7 + # Prometheus host name (it will be created if it doesn't exist) k8s_prometheus_name: '' # AlertManager host name (it will be created if it doesn't exist) @@ -56,7 +59,7 @@ k8s_prometheus_scrape_istio_metrics: false k8s_prometheus_alerts_default_route: 'null' # Prometheus alert configs for Slack -k8s_prometheus_slack_alerts_enabled: true +k8s_prometheus_slack_alerts_enabled: false k8s_prometheus_slack_api_url: '' k8s_prometheus_slack_channel: '' k8s_prometheus_slack_message_title: '{% raw %}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end 
}}]{{.CommonAnnotations.summary}}{% endraw %}' @@ -71,5 +74,5 @@ k8s_prometheus_slack_message_body: | {{ end }}{% endraw %} # Prometheus alert configs for Telegram -k8s_prometheus_telegram_alerts_enabled: true +k8s_prometheus_telegram_alerts_enabled: false k8s_prometheus_telegram_webhook: '' diff --git a/roles/prometheus/templates/alert-manager.yaml b/roles/prometheus/templates/alert-manager.yaml index 03f88b5..04ba66a 100644 --- a/roles/prometheus/templates/alert-manager.yaml +++ b/roles/prometheus/templates/alert-manager.yaml @@ -48,6 +48,21 @@ data: match: alertname: DeadMansSwitch + {% if k8s_prometheus_slack_alerts_enabled %} + + - receiver: slack + match_re: + notify: ^sre|dev$ + continue: true + {% endif %} + + {% if k8s_prometheus_telegram_alerts_enabled %} + + - receiver: telegram + match: + notify: sre + {% endif %} + --- apiVersion: v1 diff --git a/roles/prometheus/templates/alerts/app.conf b/roles/prometheus/templates/alerts/app.conf index ca395bd..aa56601 100644 --- a/roles/prometheus/templates/alerts/app.conf +++ b/roles/prometheus/templates/alerts/app.conf @@ -11,7 +11,7 @@ expr: rate(http_requests_total{job="kubernetes-service-endpoints",status=~"5.."}[1m]) > 0 labels: severity: warning - notify: sre + notify: dev annotations: summary: "{{ $labels.kubernetes_namespace }}/{{ $label.kubernetes_pod_name }}: Service returns 5XX error" description: "{{ $labels.kubernetes_namespace }}/{{ $label.kubernetes_pod_name }}: Service has processed some request incorrectly (code = {{ $labels.status }})" diff --git a/roles/prometheus/templates/alerts/deployment.conf b/roles/prometheus/templates/alerts/deployment.conf index 56395c8..378093e 100644 --- a/roles/prometheus/templates/alerts/deployment.conf +++ b/roles/prometheus/templates/alerts/deployment.conf @@ -15,7 +15,7 @@ severity: warning notify: sre annotations: - summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.deployment }}: Deployment is failed" - description: "{{ $labels.kubernetes_namespace }}/{{ 
$labels.deployment }}: Deployment is failed - observed replicas != intended replicas" + summary: "{{ $labels.namespace }}/{{ $labels.deployment }}: Deployment is failed" + description: "{{ $labels.namespace }}/{{ $labels.deployment }}: Deployment is failed - observed replicas != intended replicas" {% endraw %} diff --git a/roles/prometheus/templates/alerts/node.conf b/roles/prometheus/templates/alerts/node.conf index bc4bdbf..2fd1cda 100644 --- a/roles/prometheus/templates/alerts/node.conf +++ b/roles/prometheus/templates/alerts/node.conf @@ -21,7 +21,7 @@ description: "{{ $labels.kubernetes_pod_node_name }}: CPU usage is above 75% (current value is: {{ $value }})" # - # Alert on load average > 1 + # Alert on load average > 1.5 # - alert: NodeLoadAverage expr: ((node_load5 / count(node_cpu{mode="system"}) WITHOUT (cpu, mode)) > 1.5) @@ -30,8 +30,8 @@ severity: info notify: sre annotations: - description: "{{ $labels.kubernetes_pod_node_name }}: Load average is high" summary: "{{ $labels.kubernetes_pod_node_name }}: High LA detected" + description: "{{ $labels.kubernetes_pod_node_name }}: Load average is high (current value is: {{ $value }})" # # Alert on swap > 75% diff --git a/roles/prometheus/templates/blackbox-exporter.yaml b/roles/prometheus/templates/blackbox-exporter.yaml index 9def698..3fb92d0 100644 --- a/roles/prometheus/templates/blackbox-exporter.yaml +++ b/roles/prometheus/templates/blackbox-exporter.yaml @@ -90,7 +90,7 @@ metadata: namespace: {{ k8s_prometheus_namespace }} name: prometheus-blackbox-exporter spec: - replicas: 1 + replicas: 2 template: metadata: labels: diff --git a/roles/prometheus/templates/server.yaml b/roles/prometheus/templates/server.yaml index 28d508a..f098e55 100644 --- a/roles/prometheus/templates/server.yaml +++ b/roles/prometheus/templates/server.yaml @@ -177,7 +177,7 @@ spec: args: - --config.file=/etc/config/prometheus.yml - --storage.tsdb.path=/data - - --storage.tsdb.retention=7d + - --storage.tsdb.retention={{ 
k8s_prometheus_retention_days }}d - --web.console.libraries=/etc/prometheus/console_libraries - --web.console.templates=/etc/prometheus/consoles ports: From 5aa45ae1bd2775d521800deaef5838aad06ddc86 Mon Sep 17 00:00:00 2001 From: Vlad Saveliev Date: Tue, 28 Nov 2017 15:47:09 +0700 Subject: [PATCH 18/21] removed double using config in prometheus role --- roles/prometheus/templates/prometheus.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/roles/prometheus/templates/prometheus.yaml b/roles/prometheus/templates/prometheus.yaml index 2b574e4..e205fee 100644 --- a/roles/prometheus/templates/prometheus.yaml +++ b/roles/prometheus/templates/prometheus.yaml @@ -1,6 +1,3 @@ -{% include 'config.yaml' %} - - {% include 'server.yaml' %} From 436d83a062299b918443474a0d6b59a03264060a Mon Sep 17 00:00:00 2001 From: Vlad Saveliev Date: Wed, 29 Nov 2017 15:22:37 +0700 Subject: [PATCH 19/21] refactoring, small fixes, added cockroachdb alerts --- roles/prometheus/defaults/main.yml | 4 + roles/prometheus/templates/alerts/all.yml | 23 +++ .../templates/alerts/cockroachdb.conf | 185 ++++++++++++++++++ roles/prometheus/templates/alerts/common.conf | 2 - .../templates/scrape_configs/all.yml | 7 + .../templates/scrape_configs/kubernetes.yml | 3 + roles/prometheus/templates/server.yaml | 25 +-- 7 files changed, 224 insertions(+), 25 deletions(-) create mode 100644 roles/prometheus/templates/alerts/all.yml create mode 100644 roles/prometheus/templates/alerts/cockroachdb.conf create mode 100644 roles/prometheus/templates/scrape_configs/all.yml diff --git a/roles/prometheus/defaults/main.yml b/roles/prometheus/defaults/main.yml index 03e731d..59f975b 100644 --- a/roles/prometheus/defaults/main.yml +++ b/roles/prometheus/defaults/main.yml @@ -56,6 +56,7 @@ k8s_prometheus_configreload_image_tag: v0.1 # Prometheus scrape configs for Istio k8s_prometheus_scrape_istio_metrics: false +# Alerts default route is stub k8s_prometheus_alerts_default_route: 'null' # Prometheus alert configs for 
Slack @@ -76,3 +77,6 @@ k8s_prometheus_slack_message_body: | # Prometheus alert configs for Telegram k8s_prometheus_telegram_alerts_enabled: false k8s_prometheus_telegram_webhook: '' + +# Prometheus scrape configs for Cockroachdb +k8s_prometheus_scrape_cockroachdb_metrics: false diff --git a/roles/prometheus/templates/alerts/all.yml b/roles/prometheus/templates/alerts/all.yml new file mode 100644 index 0000000..630116a --- /dev/null +++ b/roles/prometheus/templates/alerts/all.yml @@ -0,0 +1,23 @@ +{% include 'alerts/common.conf' %} + + groups: +{% include 'alerts/app.conf' %} + +{% include 'alerts/deployment.conf' %} + +{% include 'alerts/pods.conf' %} + +{% include 'alerts/node.conf' %} + +{% include 'alerts/ingress.conf' %} + +{% include 'alerts/service.conf' %} + +{% include 'alerts/prometheus.conf' %} + +{% include 'alerts/kubernetes.conf' %} + +{% if k8s_prometheus_scrape_cockroachdb_metrics %} + +{% include 'alerts/cockroachdb.conf' %} +{% endif %} \ No newline at end of file diff --git a/roles/prometheus/templates/alerts/cockroachdb.conf b/roles/prometheus/templates/alerts/cockroachdb.conf new file mode 100644 index 0000000..f36a7a6 --- /dev/null +++ b/roles/prometheus/templates/alerts/cockroachdb.conf @@ -0,0 +1,185 @@ +{% raw %} + # + # CockroachDB alerts + # + - name: cockroachdb-alerts + rules: + # + # Aggregation rules + # + # This file contains aggregation rules, specifically: + # "node:X" node-level aggregation of a per-store metric X + # "cluster:X" cluster-level aggregation of a per-store or per-node metric X + # + # Most aggregation rules should use the "without (label1, label2, ...)" keyword + # to keep all labels but the ones specified. 
+ # + - record: node:capacity + expr: sum(capacity{app="cockroachdb"}) WITHOUT (store) + - record: cluster:capacity + expr: sum(node:capacity{app="cockroachdb"}) WITHOUT (instance) + - record: node:capacity_available + expr: sum(capacity_available{app="cockroachdb"}) WITHOUT (store) + - record: cluster:capacity_available + expr: sum(node:capacity_available{app="cockroachdb"}) WITHOUT (instance) + - record: capacity_available:ratio + expr: capacity_available{app="cockroachdb"} / capacity{app="cockroachdb"} + - record: node:capacity_available:ratio + expr: node:capacity_available{app="cockroachdb"} / node:capacity{app="cockroachdb"} + - record: cluster:capacity_available:ratio + expr: cluster:capacity_available{app="cockroachdb"} / cluster:capacity{app="cockroachdb"} + - record: mutex_storenanos_bucket:rate1m + expr: rate(mutex_storenanos_bucket{app="cockroachdb"}[1m]) + - record: node:mutex_storenanos_bucket:rate1m + expr: sum(mutex_storenanos_bucket:rate1m{app="cockroachdb"}) WITHOUT (store) + - record: cluster:mutex_storenanos_bucket:rate1m + expr: sum(node:mutex_storenanos_bucket:rate1m{app="cockroachdb"}) WITHOUT (instance) + - record: cluster:mutex_storenanos:rate1m:quantile_50 + expr: histogram_quantile(0.5, cluster:mutex_storenanos_bucket:rate1m) + - record: cluster:mutex_storenanos:rate1m:quantile_75 + expr: histogram_quantile(0.75, cluster:mutex_storenanos_bucket:rate1m) + - record: cluster:mutex_storenanos:rate1m:quantile_90 + expr: histogram_quantile(0.9, cluster:mutex_storenanos_bucket:rate1m) + - record: cluster:mutex_storenanos:rate1m:quantile_95 + expr: histogram_quantile(0.95, cluster:mutex_storenanos_bucket:rate1m) + - record: mutex_raftnanos_bucket:rate1m + expr: rate(mutex_raftnanos_bucket{app="cockroachdb"}[1m]) + - record: node:mutex_raftnanos_bucket:rate1m + expr: sum(mutex_raftnanos_bucket:rate1m{app="cockroachdb"}) WITHOUT (store) + - record: cluster:mutex_raftnanos_bucket:rate1m + expr: 
sum(node:mutex_raftnanos_bucket:rate1m{app="cockroachdb"}) WITHOUT (instance) + - record: cluster:mutex_raftnanos:rate1m:quantile_50 + expr: histogram_quantile(0.5, cluster:mutex_raftnanos_bucket:rate1m) + - record: cluster:mutex_raftnanos:rate1m:quantile_75 + expr: histogram_quantile(0.75, cluster:mutex_raftnanos_bucket:rate1m) + - record: cluster:mutex_raftnanos:rate1m:quantile_90 + expr: histogram_quantile(0.9, cluster:mutex_raftnanos_bucket:rate1m) + - record: cluster:mutex_raftnanos:rate1m:quantile_95 + expr: histogram_quantile(0.95, cluster:mutex_raftnanos_bucket:rate1m) + - record: mutex_replicananos_bucket:rate1m + expr: rate(mutex_replicananos_bucket{app="cockroachdb"}[1m]) + - record: node:mutex_replicananos_bucket:rate1m + expr: sum(mutex_replicananos_bucket:rate1m{app="cockroachdb"}) WITHOUT (store) + - record: cluster:mutex_replicananos_bucket:rate1m + expr: sum(node:mutex_replicananos_bucket:rate1m{app="cockroachdb"}) WITHOUT (instance) + - record: cluster:mutex_replicananos:rate1m:quantile_50 + expr: histogram_quantile(0.5, cluster:mutex_replicananos_bucket:rate1m) + - record: cluster:mutex_replicananos:rate1m:quantile_75 + expr: histogram_quantile(0.75, cluster:mutex_replicananos_bucket:rate1m) + - record: cluster:mutex_replicananos:rate1m:quantile_90 + expr: histogram_quantile(0.9, cluster:mutex_replicananos_bucket:rate1m) + - record: cluster:mutex_replicananos:rate1m:quantile_95 + expr: histogram_quantile(0.95, cluster:mutex_replicananos_bucket:rate1m) + - record: mutex_commandqueuenanos_bucket:rate1m + expr: rate(mutex_commandqueuenanos_bucket{app="cockroachdb"}[1m]) + - record: node:mutex_commandqueuenanos_bucket:rate1m + expr: sum(mutex_commandqueuenanos_bucket:rate1m{app="cockroachdb"}) WITHOUT (store) + - record: cluster:mutex_commandqueuenanos_bucket:rate1m + expr: sum(node:mutex_commandqueuenanos_bucket:rate1m{app="cockroachdb"}) WITHOUT + (instance) + - record: cluster:mutex_commandqueuenanos:rate1m:quantile_50 + expr: 
histogram_quantile(0.5, cluster:mutex_commandqueuenanos_bucket:rate1m) + - record: cluster:mutex_commandqueuenanos:rate1m:quantile_75 + expr: histogram_quantile(0.75, cluster:mutex_commandqueuenanos_bucket:rate1m) + - record: cluster:mutex_commandqueuenanos:rate1m:quantile_90 + expr: histogram_quantile(0.9, cluster:mutex_commandqueuenanos_bucket:rate1m) + - record: cluster:mutex_commandqueuenanos:rate1m:quantile_95 + expr: histogram_quantile(0.95, cluster:mutex_commandqueuenanos_bucket:rate1m) + - record: mutex_schedulernanos_bucket:rate1m + expr: rate(mutex_schedulernanos_bucket{app="cockroachdb"}[1m]) + - record: node:mutex_schedulernanos_bucket:rate1m + expr: sum(mutex_schedulernanos_bucket:rate1m{app="cockroachdb"}) WITHOUT (store) + - record: cluster:mutex_schedulernanos_bucket:rate1m + expr: sum(node:mutex_schedulernanos_bucket:rate1m{app="cockroachdb"}) WITHOUT + (instance) + - record: cluster:mutex_schedulernanos:rate1m:quantile_50 + expr: histogram_quantile(0.5, cluster:mutex_schedulernanos_bucket:rate1m) + - record: cluster:mutex_schedulernanos:rate1m:quantile_75 + expr: histogram_quantile(0.75, cluster:mutex_schedulernanos_bucket:rate1m) + - record: cluster:mutex_schedulernanos:rate1m:quantile_90 + expr: histogram_quantile(0.9, cluster:mutex_schedulernanos_bucket:rate1m) + - record: cluster:mutex_schedulernanos:rate1m:quantile_95 + expr: histogram_quantile(0.95, cluster:mutex_schedulernanos_bucket:rate1m) + + # + # Alert rules + # + + # + # Alert on version mismatch. + # + # This alert is intentionally loose (30 minutes) to allow for rolling upgrades. + # This may need to be adjusted for large clusters. 
+ # + - alert: VersionMismatch + expr: count(count_values("version", build_timestamp{app="cockroachdb"}) BY (tag)) > 1 + for: 30m + labels: + severity: warning + notify: sre + annotations: + description: "CockroachDB: Cluster running {{ $value }} different versions" + summary: "CockroachDB: Binary version mismatch" + + # + # Alert on available capacity < 15% + # + - alert: StoreDiskLow + expr: capacity_available:ratio{app="cockroachdb"} < 0.15 + labels: + severity: warning + notify: sre + annotations: + description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Store {{ $labels.store }} on node {{ $labels.kubernetes_pod_node_name }} at {{ $value }} available disk fraction" + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: CockroachDB store has low disk" + + # + # Alert on low disk < 20% + # + - alert: ClusterDiskLow + expr: cluster:capacity_available:ratio{app="cockroachdb"} < 0.2 + labels: + severity: warning + notify: sre + annotations: + description: "CockroachDB cluster at {{ $value }} available disk fraction" + summary: "CockroachDB cluster at {{ $value }} available disk fraction" + + # + # Alert on zero SQL qps + # + - alert: ZeroSQLQps + expr: sql_conns{app="cockroachdb"} > 0 and rate(sql_query_count{app="cockroachdb"}[5m]) == 0 + for: 10m + labels: + severity: warning + notify: sre + annotations: + description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: CockroachDB pod has SQL connections but no queries" + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: CockroachDB pod has SQL connections but no queries" + + # + # Alert on certificate expiration (< 1 year) + # + - alert: CACertificateExpiresSoon + expr: (security_certificate_expiration_ca{app="cockroachdb"} > 0) and (security_certificate_expiration_ca{app="cockroachdb"} - time()) < 86400 * 366 + labels: + severity: warning + notify: sre + annotations: + description: "{{ 
$labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: CA certificate for CockroachDB pod expires in less than a year" + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: CA certificate for CockroachDB pod expires in less than a year" + + # + # Alert on certificate expiration (< 0.5 year) + # + - alert: NodeCertificateExpiresSoon + expr: (security_certificate_expiration_node{app="cockroachdb"} > 0) and (security_certificate_expiration_node{app="cockroachdb"} - time()) < 86400 * 183 + labels: + severity: warning + notify: sre + annotations: + description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Node certificate expires in less than 6 months" + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Node certificate expires in less than 6 months" + +{% endraw %} diff --git a/roles/prometheus/templates/alerts/common.conf b/roles/prometheus/templates/alerts/common.conf index a1e5a59..c5f83e3 100644 --- a/roles/prometheus/templates/alerts/common.conf +++ b/roles/prometheus/templates/alerts/common.conf @@ -1,5 +1,3 @@ - ## alert.rules ## - # # Common rules of monitoring: # diff --git a/roles/prometheus/templates/scrape_configs/all.yml b/roles/prometheus/templates/scrape_configs/all.yml new file mode 100644 index 0000000..c4018fa --- /dev/null +++ b/roles/prometheus/templates/scrape_configs/all.yml @@ -0,0 +1,7 @@ +{% include 'scrape_configs/prometheus.yml' %} + +{% include 'scrape_configs/kubernetes.yml' %} + +{% if k8s_prometheus_scrape_istio_metrics %} +{% include 'scrape_configs/istio.yml' %} +{% endif %} diff --git a/roles/prometheus/templates/scrape_configs/kubernetes.yml b/roles/prometheus/templates/scrape_configs/kubernetes.yml index 8c22d90..bffeb7a 100644 --- a/roles/prometheus/templates/scrape_configs/kubernetes.yml +++ b/roles/prometheus/templates/scrape_configs/kubernetes.yml @@ -131,6 +131,9 @@ kubernetes_sd_configs: - role: endpoints + tls_config: + 
insecure_skip_verify: true + relabel_configs: - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] action: keep diff --git a/roles/prometheus/templates/server.yaml b/roles/prometheus/templates/server.yaml index f098e55..87ea667 100644 --- a/roles/prometheus/templates/server.yaml +++ b/roles/prometheus/templates/server.yaml @@ -10,23 +10,7 @@ metadata: name: prometheus-server data: alerts: |- -{% include 'alerts/common.conf' %} - groups: -{% include 'alerts/app.conf' %} - -{% include 'alerts/deployment.conf' %} - -{% include 'alerts/pods.conf' %} - -{% include 'alerts/node.conf' %} - -{% include 'alerts/ingress.conf' %} - -{% include 'alerts/service.conf' %} - -{% include 'alerts/prometheus.conf' %} - -{% include 'alerts/kubernetes.conf' %} +{% include 'alerts/all.yml' %} prometheus.yml: |- alerting: @@ -41,13 +25,8 @@ data: # A list of scrape configurations. scrape_configs: -{% include 'scrape_configs/prometheus.yml' %} +{% include 'scrape_configs/all.yml' %} -{% include 'scrape_configs/kubernetes.yml' %} - -{% if k8s_prometheus_scrape_istio_metrics %} -{% include 'scrape_configs/istio.yml' %} -{% endif %} --- apiVersion: v1 From 8ea0cb5f67b19cc0fd8f75333969681af7103be9 Mon Sep 17 00:00:00 2001 From: Vlad Saveliev Date: Wed, 29 Nov 2017 15:54:39 +0700 Subject: [PATCH 20/21] renamed alerts files in prometheus role --- roles/prometheus/templates/alerts/all.yml | 20 +++++++++---------- .../templates/alerts/{app.conf => app.yml} | 0 .../{cockroachdb.conf => cockroachdb.yml} | 0 .../alerts/{common.conf => common.yml} | 0 .../{deployment.conf => deployment.yml} | 0 .../alerts/{ingress.conf => ingress.yml} | 0 .../{kubernetes.conf => kubernetes.yml} | 0 .../templates/alerts/{node.conf => node.yml} | 0 .../templates/alerts/{pods.conf => pods.yml} | 0 .../{prometheus.conf => prometheus.yml} | 0 .../alerts/{service.conf => service.yml} | 0 11 files changed, 10 insertions(+), 10 deletions(-) rename roles/prometheus/templates/alerts/{app.conf => 
app.yml} (100%) rename roles/prometheus/templates/alerts/{cockroachdb.conf => cockroachdb.yml} (100%) rename roles/prometheus/templates/alerts/{common.conf => common.yml} (100%) rename roles/prometheus/templates/alerts/{deployment.conf => deployment.yml} (100%) rename roles/prometheus/templates/alerts/{ingress.conf => ingress.yml} (100%) rename roles/prometheus/templates/alerts/{kubernetes.conf => kubernetes.yml} (100%) rename roles/prometheus/templates/alerts/{node.conf => node.yml} (100%) rename roles/prometheus/templates/alerts/{pods.conf => pods.yml} (100%) rename roles/prometheus/templates/alerts/{prometheus.conf => prometheus.yml} (100%) rename roles/prometheus/templates/alerts/{service.conf => service.yml} (100%) diff --git a/roles/prometheus/templates/alerts/all.yml b/roles/prometheus/templates/alerts/all.yml index 630116a..8e82d07 100644 --- a/roles/prometheus/templates/alerts/all.yml +++ b/roles/prometheus/templates/alerts/all.yml @@ -1,23 +1,23 @@ -{% include 'alerts/common.conf' %} +{% include 'alerts/common.yml' %} groups: -{% include 'alerts/app.conf' %} +{% include 'alerts/app.yml' %} -{% include 'alerts/deployment.conf' %} +{% include 'alerts/deployment.yml' %} -{% include 'alerts/pods.conf' %} +{% include 'alerts/pods.yml' %} -{% include 'alerts/node.conf' %} +{% include 'alerts/node.yml' %} -{% include 'alerts/ingress.conf' %} +{% include 'alerts/ingress.yml' %} -{% include 'alerts/service.conf' %} +{% include 'alerts/service.yml' %} -{% include 'alerts/prometheus.conf' %} +{% include 'alerts/prometheus.yml' %} -{% include 'alerts/kubernetes.conf' %} +{% include 'alerts/kubernetes.yml' %} {% if k8s_prometheus_scrape_cockroachdb_metrics %} -{% include 'alerts/cockroachdb.conf' %} +{% include 'alerts/cockroachdb.yml' %} {% endif %} \ No newline at end of file diff --git a/roles/prometheus/templates/alerts/app.conf b/roles/prometheus/templates/alerts/app.yml similarity index 100% rename from roles/prometheus/templates/alerts/app.conf rename to 
roles/prometheus/templates/alerts/app.yml diff --git a/roles/prometheus/templates/alerts/cockroachdb.conf b/roles/prometheus/templates/alerts/cockroachdb.yml similarity index 100% rename from roles/prometheus/templates/alerts/cockroachdb.conf rename to roles/prometheus/templates/alerts/cockroachdb.yml diff --git a/roles/prometheus/templates/alerts/common.conf b/roles/prometheus/templates/alerts/common.yml similarity index 100% rename from roles/prometheus/templates/alerts/common.conf rename to roles/prometheus/templates/alerts/common.yml diff --git a/roles/prometheus/templates/alerts/deployment.conf b/roles/prometheus/templates/alerts/deployment.yml similarity index 100% rename from roles/prometheus/templates/alerts/deployment.conf rename to roles/prometheus/templates/alerts/deployment.yml diff --git a/roles/prometheus/templates/alerts/ingress.conf b/roles/prometheus/templates/alerts/ingress.yml similarity index 100% rename from roles/prometheus/templates/alerts/ingress.conf rename to roles/prometheus/templates/alerts/ingress.yml diff --git a/roles/prometheus/templates/alerts/kubernetes.conf b/roles/prometheus/templates/alerts/kubernetes.yml similarity index 100% rename from roles/prometheus/templates/alerts/kubernetes.conf rename to roles/prometheus/templates/alerts/kubernetes.yml diff --git a/roles/prometheus/templates/alerts/node.conf b/roles/prometheus/templates/alerts/node.yml similarity index 100% rename from roles/prometheus/templates/alerts/node.conf rename to roles/prometheus/templates/alerts/node.yml diff --git a/roles/prometheus/templates/alerts/pods.conf b/roles/prometheus/templates/alerts/pods.yml similarity index 100% rename from roles/prometheus/templates/alerts/pods.conf rename to roles/prometheus/templates/alerts/pods.yml diff --git a/roles/prometheus/templates/alerts/prometheus.conf b/roles/prometheus/templates/alerts/prometheus.yml similarity index 100% rename from roles/prometheus/templates/alerts/prometheus.conf rename to 
roles/prometheus/templates/alerts/prometheus.yml diff --git a/roles/prometheus/templates/alerts/service.conf b/roles/prometheus/templates/alerts/service.yml similarity index 100% rename from roles/prometheus/templates/alerts/service.conf rename to roles/prometheus/templates/alerts/service.yml From aeb387451cb4f799e9a06aeba71dcb09f87c01de Mon Sep 17 00:00:00 2001 From: Vlad Saveliev Date: Wed, 29 Nov 2017 22:46:17 +0700 Subject: [PATCH 21/21] updated cockroachdb scrape configs, added grafana dashboards in prometheus role --- roles/prometheus/README.md | 2 +- .../templates/grafana-dashboards/all.yml | 44 ++++ .../cockroachdb/replicas.json | 3 + .../cockroachdb/runtime.json | 3 + .../grafana-dashboards/cockroachdb/sql.json | 3 + .../cockroachdb/storage.json | 3 + .../grafana-dashboards/http-service.json | 249 ++++++++++++++++++ .../grafana-dashboards/template.json | 15 ++ roles/prometheus/templates/grafana.yaml | 27 +- .../templates/scrape_configs/all.yml | 4 + .../templates/scrape_configs/cockroachdb.yml | 44 ++++ .../templates/scrape_configs/kubernetes.yml | 6 + 12 files changed, 376 insertions(+), 27 deletions(-) create mode 100644 roles/prometheus/templates/grafana-dashboards/all.yml create mode 100644 roles/prometheus/templates/grafana-dashboards/cockroachdb/replicas.json create mode 100644 roles/prometheus/templates/grafana-dashboards/cockroachdb/runtime.json create mode 100644 roles/prometheus/templates/grafana-dashboards/cockroachdb/sql.json create mode 100644 roles/prometheus/templates/grafana-dashboards/cockroachdb/storage.json create mode 100644 roles/prometheus/templates/grafana-dashboards/http-service.json create mode 100644 roles/prometheus/templates/grafana-dashboards/template.json create mode 100644 roles/prometheus/templates/scrape_configs/cockroachdb.yml diff --git a/roles/prometheus/README.md b/roles/prometheus/README.md index 4a3bda3..7cdbf78 100644 --- a/roles/prometheus/README.md +++ b/roles/prometheus/README.md @@ -101,7 +101,7 @@ How to add 
dashboard in Grafana 1. create a new dashboard manually via `Dashboards --> New` or find existing on [grafana.com](https://grafana.com/dashboards) 2. export (download) it on your computer 3. copy content of downloaded JSON file -4. create a new file in `templates/grafana-dashboards` directory. Template: +4. create a new file in `templates/grafana-dashboards` directory. Template (`templates/grafana-dashboards/template.json`): {% raw %} { diff --git a/roles/prometheus/templates/grafana-dashboards/all.yml b/roles/prometheus/templates/grafana-dashboards/all.yml new file mode 100644 index 0000000..23b2137 --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/all.yml @@ -0,0 +1,44 @@ + capacity-planning-dashboard.json: |- +{% include 'grafana-dashboards/capacity-planning.json' %} + + cluster-health-dashboard.json: |- +{% include 'grafana-dashboards/cluster-health.json' %} + + cluster-monitoring-dashboard.json: |- +{% include 'grafana-dashboards/cluster-monitoring.json' %} + + cluster-status-dashboard.json: |- +{% include 'grafana-dashboards/cluster-status.json' %} + + control-plane-status-dashboard.json: |- +{% include 'grafana-dashboards/control-plane-status.json' %} + + deployment-dashboard.json: |- +{% include 'grafana-dashboards/deployment.json' %} + + nodes-dashboard.json: |- +{% include 'grafana-dashboards/nodes.json' %} + + pods-dashboard.json: |- +{% include 'grafana-dashboards/pods.json' %} + + resource-requests-dashboard.json: |- +{% include 'grafana-dashboards/resource-requests.json' %} + +{% if k8s_prometheus_scrape_cockroachdb_metrics %} + + cockroachdb-replicas-dashboard.json: |- +{% include 'grafana-dashboards/cockroachdb/replicas.json' %} + + cockroachdb-runtime-dashboard.json: |- +{% include 'grafana-dashboards/cockroachdb/runtime.json' %} + + cockroachdb-sql-dashboard.json: |- +{% include 'grafana-dashboards/cockroachdb/sql.json' %} + + cockroachdb-storage-dashboard.json: |- +{% include 'grafana-dashboards/cockroachdb/storage.json' 
%} + + + +{% endif %} \ No newline at end of file diff --git a/roles/prometheus/templates/grafana-dashboards/cockroachdb/replicas.json b/roles/prometheus/templates/grafana-dashboards/cockroachdb/replicas.json new file mode 100644 index 0000000..ee325a4 --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/cockroachdb/replicas.json @@ -0,0 +1,3 @@ +{% raw %} + {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"panel","id":"table","name":"Table","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.1"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Cockroach Replicas","tags":["cockroach"],"style":"dark","timezone":"utc","editable":true,"hideControls":false,"sharedCrosshair":true,"rows":[{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":16,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"unavailable","yaxis":2},{"alias":"raft leaders not lease holders","yaxis":2},{"alias":"under-replicated","yaxis":1}],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sum(ranges{cluster=\"$cluster\",instance=~\"$node\"}) by (instance))","intervalFactor":2,"legendFormat":"ranges","refId":"D","step":120},{"expr":"sum(sum(replicas_leaders{cluster=\"$cluster\",instance=~\"$node\"}) by 
(instance))","interval":"","intervalFactor":2,"legendFormat":"raft leaders","metric":"","refId":"A","step":120},{"expr":"sum(sum(replicas_leaders_not_leaseholders{cluster=\"$cluster\",instance=~\"$node\"}) by (instance))","interval":"","intervalFactor":2,"legendFormat":"raft leaders not lease holders","metric":"","refId":"C","step":120},{"expr":"sum(sum(ranges_unavailable{cluster=\"$cluster\",instance=~\"$node\"}) by (instance))","interval":"","intervalFactor":2,"legendFormat":"unavailable","metric":"","refId":"B","step":120},{"expr":"sum(sum(ranges_underreplicated{cluster=\"$cluster\",instance=~\"$node\"}) by (instance))","intervalFactor":2,"legendFormat":"under-replicated","refId":"E","step":120}],"timeFrom":null,"timeShift":null,"title":"Ranges: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":17,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":false},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"sum(ranges{cluster=\"$cluster\",instance=~\"$node\"}) by (instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}} - ranges","metric":"","refId":"A","step":240},{"expr":"sum(ranges_unavailable{cluster=\"$cluster\",instance=~\"$node\"}) by (instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}} - unavailable","metric":"","refId":"B","step":240}],"title":"Ranges: 
$node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":28,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(replicas{cluster=\"$cluster\",instance=~\"$node\"}) by (instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}}","metric":"","refId":"A","step":120}],"timeFrom":null,"timeShift":null,"title":"Replicas per node: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":29,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":false},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"sum(replicas{cluster=\"$cluster\",instance=~\"$node\"}) by (instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}}","metric":"","refId":"A","step":240}],"title":"Replicas: 
$node","transform":"timeseries_aggregations","type":"table"},{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":41,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(replicas_leaseholders{cluster=\"$cluster\",instance=~\"$node\"}) by (instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}}","metric":"","refId":"A","step":120}],"timeFrom":null,"timeShift":null,"title":"Replica leaseholders per node: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":40,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":false},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"sum(replicas_leaseholders{cluster=\"$cluster\",instance=~\"$node\"}) by (instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}}","metric":"","refId":"A","step":240}],"title":"Replica leaseholders: 
$node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":30,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sum(replicas{cluster=\"$cluster\",instance=~\"$node\"}) by (instance))","interval":"","intervalFactor":2,"legendFormat":"Replicas","refId":"A","step":120},{"expr":"sum(sum(replicas_quiescent{cluster=\"$cluster\",instance=~\"$node\"}) by (instance))","interval":"","intervalFactor":2,"legendFormat":"Quiescent","refId":"B","step":120},{"expr":"sum(sum(replicas{cluster=\"$cluster\",instance=~\"$node\"}) by (instance) - sum(replicas_quiescent{cluster=\"$cluster\",instance=~\"$node\"}) by (instance))","intervalFactor":2,"legendFormat":"Active","refId":"C","step":120}],"timeFrom":null,"timeShift":null,"title":"Replicas: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":31,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD 
HH:mm:ss","pattern":"Time","type":"date"},{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"sum(replicas_quiescent{cluster=\"$cluster\",instance=~\"$node\"}) by (instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}}","refId":"A","step":240}],"title":"Quiescent Replicas: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":9,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sum(rate(range_splits{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance))","interval":"","intervalFactor":2,"legendFormat":"splits","metric":"","refId":"A","step":120},{"expr":"sum(sum(rate(range_adds{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance))","interval":"","intervalFactor":2,"legendFormat":"adds","metric":"","refId":"B","step":120},{"expr":"sum(sum(rate(range_removes{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance))","interval":"","intervalFactor":2,"legendFormat":"removes","metric":"","refId":"C","step":120}],"timeFrom":null,"timeShift":null,"title":"Range Ops: 
$node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"ops","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":14,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD HH:mm:ss","pattern":"Time","type":"date"},{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"sum(rate(range_splits{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance) + sum(rate(range_adds{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance) + sum(rate(range_removes{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}}","metric":"","refId":"A","step":240}],"title":"Range Ops: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":7,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"Reserved 
Capacity","yaxis":2}],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sum(rate(range_snapshots_generated{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance))","interval":"","intervalFactor":2,"legendFormat":"Generated","metric":"","refId":"A","step":120},{"expr":"sum(sum(rate(range_snapshots_normal_applied{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance))","interval":"","intervalFactor":2,"legendFormat":"Normal-applied","metric":"","refId":"B","step":120},{"expr":"sum(sum(rate(range_snapshots_preemptive_applied{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance))","interval":"","intervalFactor":2,"legendFormat":"Preemptive-applied","refId":"C","step":120},{"expr":"sum(capacity_reserved{cluster=\"$cluster\",instance=~\"$node\"})","hide":false,"interval":"","intervalFactor":2,"legendFormat":"Reserved Capacity","refId":"D","step":120},{"expr":"sum(replicas_reserved{cluster=\"$cluster\",instance=~\"$node\"})","intervalFactor":2,"legendFormat":"Reservations","refId":"E","step":120}],"timeFrom":null,"timeShift":null,"title":"Snapshots: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"ops","label":"Snapshots","logBase":1,"max":null,"min":0,"show":true},{"format":"KBs","label":"Reservations","logBase":1,"max":null,"min":0,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","hideTimeOverride":false,"id":13,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"dateFormat":"YYYY-MM-DD 
HH:mm:ss","decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"sum(rate(range_snapshots_generated{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}} - generated","refId":"A","step":240},{"expr":"sum(rate(range_snapshots_normal_applied{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance) + sum(rate(range_snapshots_preemptive_applied{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}} - applied","metric":"","refId":"B","step":240}],"timeFrom":null,"timeShift":null,"title":"Snapshots: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"}],"time":{"from":"now-24h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[{"current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":false,"label":"Cluster name","multi":false,"name":"cluster","options":[],"query":"sys_uptime","refresh":1,"regex":"/cluster=\"([^\"]+)\"/","type":"query"},{"allValue":".*","current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":true,"label":"Node","multi":false,"name":"node","options":[],"query":"label_values(sys_uptime{job=\"cockroach\",cluster=\"$cluster\"},instance)","refresh":1,"regex":"","type":"query"},{"current":{"selected":true,"text":"1m","value":"1m"},"datasource":null,"hide":0,"includeAll":false,"label":"Rate 
Interval","multi":false,"name":"rate_interval","options":[{"selected":false,"text":"30s","value":"30s"},{"selected":true,"text":"1m","value":"1m"},{"selected":false,"text":"5m","value":"5m"},{"selected":false,"text":"10m","value":"10m"},{"selected":false,"text":"30m","value":"30m"},{"selected":false,"text":"1h","value":"1h"},{"selected":false,"text":"6h","value":"6h"},{"selected":false,"text":"12h","value":"12h"},{"selected":false,"text":"1d","value":"1d"}],"query":"30s,1m,5m,10m,30m,1h,6h,12h,1d","refresh":0,"type":"interval"}]},"annotations":{"list":[]},"refresh":false,"schemaVersion":12,"version":7,"links":[{"asDropdown":true,"icon":"external link","includeVars":true,"keepTime":true,"tags":["cockroach"],"targetBlank":true,"title":"Dashboards","type":"dashboards"}],"gnetId":null},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} +{% endraw %} \ No newline at end of file diff --git a/roles/prometheus/templates/grafana-dashboards/cockroachdb/runtime.json b/roles/prometheus/templates/grafana-dashboards/cockroachdb/runtime.json new file mode 100644 index 0000000..74f728a --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/cockroachdb/runtime.json @@ -0,0 +1,3 @@ +{% raw %} + {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"panel","id":"table","name":"Table","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.1"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Cockroach 
Runtime","tags":["cockroach"],"style":"dark","timezone":"utc","editable":true,"hideControls":false,"sharedCrosshair":true,"rows":[{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":2,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"Live nodes","yaxis":1},{"alias":"All nodes","yaxis":1}],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"count(up{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"})","hide":false,"interval":"","intervalFactor":2,"legendFormat":"All nodes","metric":"","refId":"A","step":120},{"expr":"count(up{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"} == 1)","interval":"","intervalFactor":2,"legendFormat":"Live nodes","metric":"","refId":"B","step":120}],"timeFrom":null,"timeShift":null,"title":"Nodes: $node","tooltip":{"msResolution":true,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":"","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":"","logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"}],"editable":true,"error":false,"fontSize":"90%","id":19,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[],"targets":[{"expr":"build_timestamp{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}} - 
{{tag}}","metric":"","refId":"A","step":240}],"title":"Built Timestamp","transform":"timeseries_aggregations","type":"table"}],"showTitle":false,"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":12,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sys_uptime{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":120}],"timeFrom":null,"timeShift":null,"title":"Uptime: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"s","label":null,"logBase":10,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"}],"editable":true,"error":false,"fontSize":"90%","id":13,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":1,"desc":false},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD HH:mm:ss","pattern":"Current","type":"number","unit":"s"}],"targets":[{"expr":"sys_uptime{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","metric":"","refId":"A","step":240}],"title":"Uptime: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New 
row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":9,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sys_rss{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"})","interval":"","intervalFactor":2,"legendFormat":"RSS","metric":"sys_rss","refId":"A","step":120},{"expr":"sum(sys_go_allocbytes{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"})","interval":"","intervalFactor":2,"legendFormat":"Go Allocated","metric":"sys_rss","refId":"C","step":120},{"expr":"sum(sys_go_totalbytes{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"})","interval":"","intervalFactor":2,"legendFormat":"Go Total","metric":"sys_cgo","refId":"D","step":120},{"expr":"sum(sys_cgo_allocbytes{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"})","interval":"","intervalFactor":2,"legendFormat":"CGo Allocated","metric":"sys_rss","refId":"B","step":120},{"expr":"sum(sys_cgo_totalbytes{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"})","interval":"","intervalFactor":2,"legendFormat":"CGo Total","metric":"sys_rss","refId":"E","step":120}],"timeFrom":null,"timeShift":null,"title":"Memory: 
$node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","hideTimeOverride":false,"id":10,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":null,"desc":false},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":".*","thresholds":[],"type":"number","unit":"bytes"}],"targets":[{"expr":"sys_rss{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":240}],"title":"RSS: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":11,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sys_goroutines{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"})","interval":"","intervalFactor":2,"legendFormat":"Goroutines","metric":"sys","refId":"A","step":120}],"timeFrom":null,"timeShift":null,"title":"Goroutines: 
$node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":15,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":false},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"dateFormat":"YYYY-MM-DD HH:mm:ss","decimals":0,"pattern":"/.*/","thresholds":[],"type":"number","unit":"none"}],"targets":[{"expr":"sys_goroutines{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":240}],"title":"Goroutines: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":14,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"GC Pauses","yaxis":2}],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(rate(sys_gc_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"GC 
Runs","refId":"A","step":120},{"expr":"sum(rate(sys_gc_pause_ns{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"GC Pauses","metric":"","refId":"B","step":120}],"timeFrom":null,"timeShift":null,"title":"GC: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"ns","label":null,"logBase":1,"max":null,"min":0,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":16,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":false},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD HH:mm:ss","pattern":"Time","type":"date"},{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"ns"}],"targets":[{"expr":"rate(sys_gc_pause_ns{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval])","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":240}],"title":"GC Pauses: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":17,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as 
zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(rate(sys_cpu_user_ns{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"User","refId":"A","step":120},{"expr":"sum(rate(sys_cpu_sys_ns{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","intervalFactor":2,"legendFormat":"System","metric":"","refId":"B","step":120}],"timeFrom":null,"timeShift":null,"title":"CPU Time: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"ns","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":18,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"ns"}],"targets":[{"expr":"rate(sys_cpu_user_ns{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]) + rate(sys_cpu_sys_ns{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval])","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","metric":"","refId":"A","step":240}],"title":"CPU Time: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[],"title":"New 
row"}],"time":{"from":"now-24h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[{"current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":false,"label":"Cluster name","multi":false,"name":"cluster","options":[],"query":"sys_uptime","refresh":1,"regex":"/cluster=\"([^\"]+)\"/","type":"query"},{"allValue":".*","current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":true,"label":"Node","multi":false,"name":"node","options":[],"query":"label_values(sys_uptime{job=\"cockroach\",cluster=\"$cluster\"},kubernetes_pod_node_name)","refresh":1,"regex":"","type":"query"},{"auto":false,"current":{"selected":true,"text":"1m","value":"1m"},"datasource":null,"hide":0,"includeAll":false,"label":"Rate Interval","multi":false,"name":"rate_interval","options":[{"selected":false,"text":"30s","value":"30s"},{"selected":true,"text":"1m","value":"1m"},{"selected":false,"text":"5m","value":"5m"},{"selected":false,"text":"10m","value":"10m"},{"selected":false,"text":"30m","value":"30m"},{"selected":false,"text":"1h","value":"1h"},{"selected":false,"text":"6h","value":"6h"},{"selected":false,"text":"12h","value":"12h"},{"selected":false,"text":"1d","value":"1d"}],"query":"30s,1m,5m,10m,30m,1h,6h,12h,1d","refresh":0,"regex":"","type":"interval"}]},"annotations":{"list":[]},"refresh":"1m","schemaVersion":12,"version":62,"links":[{"asDropdown":true,"icon":"external link","includeVars":true,"keepTime":true,"tags":["cockroach"],"targetBlank":true,"title":"Dashboards","type":"dashboards"}],"gnetId":null},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} +{% endraw %} \ No newline at end of file diff --git a/roles/prometheus/templates/grafana-dashboards/cockroachdb/sql.json b/roles/prometheus/templates/grafana-dashboards/cockroachdb/sql.json new file mode 100644 index 
0000000..6189fbb --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/cockroachdb/sql.json @@ -0,0 +1,3 @@ +{% raw %} + {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"panel","id":"table","name":"Table","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.1"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Cockroach SQL","tags":["cockroach"],"style":"dark","timezone":"utc","editable":true,"hideControls":false,"sharedCrosshair":true,"rows":[{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":6,"isNew":true,"legend":{"alignAsTable":false,"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sql_conns{cluster=\"$cluster\",job=\"cockroach\",kubernetes_pod_node_name=~\"$node\"})","hide":false,"interval":"","intervalFactor":2,"legendFormat":"Connections","metric":"","refId":"A","step":120}],"timeFrom":null,"timeShift":null,"title":"SQL Connections: 
$node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":12,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":false},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD HH:mm:ss","pattern":"Time","type":"date"},{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":0,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"sql_conns{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":240}],"title":"Connections: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":7,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(rate(sql_bytesin{job=\"cockroach\", 
cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"In","metric":"","refId":"A","step":120},{"expr":"sum(rate(sql_bytesout{job=\"cockroach\", cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"Out","metric":"","refId":"B","step":120}],"timeFrom":null,"timeShift":null,"title":"Bytes: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","hideTimeOverride":false,"id":13,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"dateFormat":"YYYY-MM-DD HH:mm:ss","decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"Bps"}],"targets":[{"expr":"rate(sql_bytesin{job=\"cockroach\", cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval])","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}} - in","refId":"A","step":240},{"expr":"rate(sql_bytesout{job=\"cockroach\", cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval])","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}} - out","metric":"","refId":"B","step":240}],"timeFrom":null,"timeShift":null,"title":"Bytes in/out: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New 
row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":9,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(rate(sql_select_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"select","metric":"","refId":"A","step":120},{"expr":"sum(rate(sql_insert_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"insert","metric":"","refId":"B","step":120},{"expr":"sum(rate(sql_update_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"update","metric":"","refId":"C","step":120},{"expr":"sum(rate(sql_delete_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"delete","metric":"","refId":"D","step":120}],"timeFrom":null,"timeShift":null,"title":"Queries: 
$node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":14,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD HH:mm:ss","pattern":"Time","type":"date"},{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"rate(sql_select_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]) + rate(sql_insert_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]) + rate(sql_update_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]) + rate(sql_delete_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval])","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","metric":"","refId":"A","step":240}],"title":"Queries: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":10,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(rate(sql_txn_begin_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"begin","metric":"","refId":"A","step":120},{"expr":"sum(rate(sql_txn_commit_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"commit","refId":"B","step":120},{"expr":"sum(rate(sql_txn_abort_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"abort","refId":"C","step":120},{"expr":"sum(rate(sql_txn_rollback_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"rollback","refId":"D","step":120}],"timeFrom":null,"timeShift":null,"title":"Transactions: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":15,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD HH:mm:ss","pattern":"Time","type":"date"},{"colorMode":null,"colors":["rgba(245, 54, 54, 
0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"rate(sql_txn_begin_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]) + rate(sql_txn_commit_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]) + rate(sql_txn_abort_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]) + rate(sql_txn_rollback_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval])","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":240}],"title":"Transactions: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":11,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(rate(sql_ddl_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"DDL","metric":"","refId":"A","step":120}],"timeFrom":null,"timeShift":null,"title":"Schema changes: 
$node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":16,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD HH:mm:ss","pattern":"Time","type":"date"},{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"rate(sql_ddl_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval])","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","metric":"","refId":"A","step":240}],"title":"Schema Changes: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"}],"time":{"from":"now-24h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[{"current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":false,"label":"Cluster 
name","multi":false,"name":"cluster","options":[],"query":"sys_uptime","refresh":1,"regex":"/cluster=\"([^\"]+)\"/","type":"query"},{"allValue":".*","current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":true,"label":"Node","multi":false,"name":"node","options":[],"query":"label_values(sys_uptime{job=\"cockroach\",cluster=\"$cluster\"},kubernetes_pod_node_name)","refresh":1,"regex":"","type":"query"},{"current":{"selected":true,"text":"1m","value":"1m"},"datasource":null,"hide":0,"includeAll":false,"label":"Rate Interval","multi":false,"name":"rate_interval","options":[{"selected":false,"text":"30s","value":"30s"},{"selected":true,"text":"1m","value":"1m"},{"selected":false,"text":"5m","value":"5m"},{"selected":false,"text":"10m","value":"10m"},{"selected":false,"text":"30m","value":"30m"},{"selected":false,"text":"1h","value":"1h"},{"selected":false,"text":"6h","value":"6h"},{"selected":false,"text":"12h","value":"12h"},{"selected":false,"text":"1d","value":"1d"}],"query":"30s,1m,5m,10m,30m,1h,6h,12h,1d","refresh":0,"type":"interval"}]},"annotations":{"list":[]},"refresh":false,"schemaVersion":12,"version":33,"links":[{"asDropdown":true,"icon":"external link","includeVars":true,"keepTime":true,"tags":["cockroach"],"targetBlank":true,"title":"Dashboards","type":"dashboards"}],"gnetId":null},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} +{% endraw %} \ No newline at end of file diff --git a/roles/prometheus/templates/grafana-dashboards/cockroachdb/storage.json b/roles/prometheus/templates/grafana-dashboards/cockroachdb/storage.json new file mode 100644 index 0000000..f2a1e82 --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/cockroachdb/storage.json @@ -0,0 +1,3 @@ +{% raw %} + 
{"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"panel","id":"table","name":"Table","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.1"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Cockroach Storage","tags":["cockroach"],"style":"dark","timezone":"utc","editable":true,"hideControls":false,"sharedCrosshair":true,"rows":[{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":16,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sum(capacity{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name)) - sum(sum(capacity_available{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name))","interval":"","intervalFactor":2,"legendFormat":"Used","refId":"A","step":120},{"expr":"sum(sum(capacity{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name))","interval":"","intervalFactor":2,"legendFormat":"Capacity","metric":"","refId":"B","step":120}],"timeFrom":null,"timeShift":null,"title":"Capacity: 
$node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":"","logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":17,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":false},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"percentunit"}],"targets":[{"expr":"(sum(capacity{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name) - sum(capacity_available{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name)) / sum(capacity{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name)","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":240}],"title":"Capacity Used: $node","transform":"timeseries_aggregations","type":"table"},{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":22,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"1 - sum(capacity_available{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by 
(kubernetes_pod_node_name) / sum(capacity{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name) ","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":120}],"timeFrom":null,"timeShift":null,"title":"Percentage of capacity used per node: All","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"percentunit","label":"","logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":23,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":false},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"percentunit"}],"targets":[{"expr":"1 - sum(capacity_available{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name) / sum(capacity{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name) ","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":240}],"title":"Percentage of capacity used per node: All","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":6,"isNew":true,"legend":{"alignAsTable":false,"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sum(livebytes{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name))","hide":false,"interval":"","intervalFactor":2,"legendFormat":"Live","metric":"","refId":"A","step":120},{"expr":"sum(sum(sysbytes{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name))","interval":"","intervalFactor":2,"legendFormat":"System","metric":"","refId":"C","step":120}],"timeFrom":null,"timeShift":null,"title":"Bytes: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":12,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":false},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"bytes"}],"targets":[{"expr":"sum(livebytes{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name)","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","metric":"","refId":"A","step":240}],"title":"Live Bytes: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New 
row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":9,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"avg(avg(rocksdb_read_amplification{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name))","interval":"","intervalFactor":2,"legendFormat":"Read Amplification","metric":"","refId":"A","step":120}],"timeFrom":null,"timeShift":null,"title":"Read Amplification: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":14,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD HH:mm:ss","pattern":"Time","type":"date"},{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"avg(rocksdb_read_amplification{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by 
(kubernetes_pod_node_name)","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","metric":"","refId":"A","step":240}],"title":"Read Amplification: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":18,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sum(rocksdb_num_sstables{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name))","interval":"","intervalFactor":2,"legendFormat":"SSTables","refId":"A","step":120}],"timeFrom":null,"timeShift":null,"title":"RocksDB SSTables: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":19,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD HH:mm:ss","pattern":"Time","type":"date"},{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 
0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"sum(rocksdb_num_sstables{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name)","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","metric":"","refId":"A","step":240}],"title":"RocksDB SSTables: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":20,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sum(sys_fd_open{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name))","interval":"","intervalFactor":2,"legendFormat":"Open FDs","refId":"A","step":120},{"expr":"sum(sum(sys_fd_softlimit{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name))","intervalFactor":2,"legendFormat":"Limit","refId":"B","step":120}],"timeFrom":null,"timeShift":null,"title":"File Descriptors: 
$node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":21,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"percentunit"}],"targets":[{"expr":"sum(sys_fd_open{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name) / sum(sys_fd_softlimit{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name)","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":240}],"title":"File descriptor usage: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"}],"time":{"from":"now-24h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[{"current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":false,"label":"Cluster 
name","multi":false,"name":"cluster","options":[],"query":"sys_uptime","refresh":1,"regex":"/cluster=\"([^\"]+)\"/","type":"query"},{"allValue":".*","current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":true,"label":"Node","multi":false,"name":"node","options":[],"query":"label_values(sys_uptime{job=\"cockroach\",cluster=\"$cluster\"},kubernetes_pod_node_name)","refresh":1,"regex":"","type":"query"},{"current":{"selected":true,"text":"1m","value":"1m"},"datasource":null,"hide":0,"includeAll":false,"label":"Rate Interval","multi":false,"name":"rate_interval","options":[{"selected":false,"text":"30s","value":"30s"},{"selected":true,"text":"1m","value":"1m"},{"selected":false,"text":"5m","value":"5m"},{"selected":false,"text":"10m","value":"10m"},{"selected":false,"text":"30m","value":"30m"},{"selected":false,"text":"1h","value":"1h"},{"selected":false,"text":"6h","value":"6h"},{"selected":false,"text":"12h","value":"12h"},{"selected":false,"text":"1d","value":"1d"}],"query":"30s,1m,5m,10m,30m,1h,6h,12h,1d","refresh":0,"type":"interval"}]},"annotations":{"list":[]},"refresh":false,"schemaVersion":12,"version":15,"links":[{"asDropdown":true,"icon":"external link","includeVars":true,"keepTime":true,"tags":["cockroach"],"targetBlank":true,"title":"Dashboards","type":"dashboards"}],"gnetId":null},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} +{% endraw %} \ No newline at end of file diff --git a/roles/prometheus/templates/grafana-dashboards/http-service.json b/roles/prometheus/templates/grafana-dashboards/http-service.json new file mode 100644 index 0000000..d2fe2fc --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/http-service.json @@ -0,0 +1,249 @@ +{% raw %} + +{ + "dashboard": { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + 
"__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 1, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(sum(rate(http_requests_total{job=\"kubernetes-service-endpoints\",status=~\"2..\",component=\"$component\"}[1m]))", + "interval": "1m", + "intervalFactor": 1, + "legendFormat": "2xx", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 15 + }, + { + "expr": "(sum(rate(http_requests_total{job=\"kubernetes-service-endpoints\",status=~\"5..\",component=\"$component\"}[1m])) / sum(rate(http_requests_total{job=\"kubernetes-service-endpoints\",component=\"$component\"}[1m]))) * 100", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "5xx", + "metric": "kube_pod_container_resource_requests_memory_bytes", + "refId": "B", + "step": 20 + }, + { + "expr": "sum(rate(http_requests_total{job=\"kubernetes-service-endpoints\",component=\"$component\"}[1m]))", + "interval": "1m", + 
"intervalFactor": 2, + "legendFormat": "total", + "metric": "kube_pod_container_resource_requests_memory_bytes", + "refId": "B", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Request / error rate ", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(http_request_duration_seconds_sum, kubernetes_namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "HTTP service", + "multi": false, + "name": "component", + "options": [], + "query": "label_values(http_request_duration_seconds_sum{kubernetes_namespace=~\"$namespace\"}, component)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + 
], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "HTTP service", + "version": 1 + }, + "overwrite": true, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "type": "datasource", + "pluginId": "prometheus", + "value": "prometheus" + } + ] +} +{% endraw %} diff --git a/roles/prometheus/templates/grafana-dashboards/template.json b/roles/prometheus/templates/grafana-dashboards/template.json new file mode 100644 index 0000000..822c008 --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/template.json @@ -0,0 +1,15 @@ +{% raw %} +{ + "dashboard": { + ... ... + }, + "overwrite": true, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "type": "datasource", + "pluginId": "prometheus", + "value": "prometheus" + } + ] +}{% endraw %} \ No newline at end of file diff --git a/roles/prometheus/templates/grafana.yaml b/roles/prometheus/templates/grafana.yaml index 8877200..eb315ab 100644 --- a/roles/prometheus/templates/grafana.yaml +++ b/roles/prometheus/templates/grafana.yaml @@ -9,32 +9,7 @@ metadata: namespace: {{ k8s_prometheus_namespace }} name: prometheus-grafana data: - capacity-planning-dashboard.json: |- -{% include 'grafana-dashboards/capacity-planning.json' %} - - cluster-health-dashboard.json: |- -{% include 'grafana-dashboards/cluster-health.json' %} - - cluster-monitoring-dashboard.json: |- -{% include 'grafana-dashboards/cluster-monitoring.json' %} - - cluster-status-dashboard.json: |- -{% include 'grafana-dashboards/cluster-status.json' %} - - control-plane-status-dashboard.json: |- -{% include 'grafana-dashboards/control-plane-status.json' %} - - deployment-dashboard.json: |- -{% include 'grafana-dashboards/deployment.json' %} - - nodes-dashboard.json: |- -{% include 'grafana-dashboards/nodes.json' %} - - pods-dashboard.json: |- -{% include 'grafana-dashboards/pods.json' %} - - resource-requests-dashboard.json: |- -{% include 
'grafana-dashboards/resource-requests.json' %} +{% include 'grafana-dashboards/all.yml' %} prometheus-datasource.json: |- { diff --git a/roles/prometheus/templates/scrape_configs/all.yml b/roles/prometheus/templates/scrape_configs/all.yml index c4018fa..a397e59 100644 --- a/roles/prometheus/templates/scrape_configs/all.yml +++ b/roles/prometheus/templates/scrape_configs/all.yml @@ -5,3 +5,7 @@ {% if k8s_prometheus_scrape_istio_metrics %} {% include 'scrape_configs/istio.yml' %} {% endif %} + +{% if k8s_prometheus_scrape_cockroachdb_metrics %} +{% include 'scrape_configs/cockroachdb.yml' %} +{% endif %} diff --git a/roles/prometheus/templates/scrape_configs/cockroachdb.yml b/roles/prometheus/templates/scrape_configs/cockroachdb.yml new file mode 100644 index 0000000..c7727a8 --- /dev/null +++ b/roles/prometheus/templates/scrape_configs/cockroachdb.yml @@ -0,0 +1,44 @@ + - job_name: 'cockroach' + + kubernetes_sd_configs: + - role: endpoints + + tls_config: + insecure_skip_verify: true + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_name] + action: keep + regex: cockroachdb-(\d+) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) 
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: (.+)(?::\d+);(\d+) + replacement: $1:$2 + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: kubernetes_pod_node_name + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + - source_labels: [__address__] + action: replace + target_label: cluster + regex: (.+) + replacement: cockroachdb-cluster \ No newline at end of file diff --git a/roles/prometheus/templates/scrape_configs/kubernetes.yml b/roles/prometheus/templates/scrape_configs/kubernetes.yml index bffeb7a..08a5e8b 100644 --- a/roles/prometheus/templates/scrape_configs/kubernetes.yml +++ b/roles/prometheus/templates/scrape_configs/kubernetes.yml @@ -138,6 +138,12 @@ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] action: keep regex: true +{% if k8s_prometheus_scrape_cockroachdb_metrics %} + + - source_labels: [__meta_kubernetes_pod_name] + action: drop + regex: cockroachdb-(\d+) +{% endif %} - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] action: replace target_label: __scheme__