diff --git a/inventory/group_vars/all.yml b/inventory/group_vars/all.yml index 4ee94e6..bde1f5a 100644 --- a/inventory/group_vars/all.yml +++ b/inventory/group_vars/all.yml @@ -44,7 +44,6 @@ k8s_prometheus_namespace: prometheus # Additional Kubernetes namespaces k8s_namespaces: - dev - - '{{ k8s_prometheus_namespace }}' # On-prem LB services # List of services which use TCP LB for k8s masters/nodes @@ -399,7 +398,7 @@ gce_credentials_file: '{{ ansible_env.HOME }}/gcloud.json' gce_project_id: my-project-id # Slack webhook URL for Prometheus alerts -prometheus_slack_api_url: '' +k8s_prometheus_slack_api_url: '' # Slack channel for Prometheus alerts -prometheus_slack_channel: '' +k8s_prometheus_slack_channel: '' diff --git a/playbooks/system/firewall.yml b/playbooks/system/firewall.yml index 8d4ef0d..b5644be 100644 --- a/playbooks/system/firewall.yml +++ b/playbooks/system/firewall.yml @@ -16,6 +16,7 @@ - 10250/tcp # kubelet port - 10255/tcp # kubelet port - 4194/tcp # cAdvisor port + - 9100/tcp # Prometheus node-exporter port nodes_ports: - 10250/tcp # kubelet port - 10255/tcp # kubelet port diff --git a/roles/prometheus/README.md b/roles/prometheus/README.md index 9494ddd..7cdbf78 100644 --- a/roles/prometheus/README.md +++ b/roles/prometheus/README.md @@ -1,7 +1,9 @@ Prometheus role -========= +=============== -This role installs Prometheus for Kubernetes cluster (endpoints, pods, nodes, istio, ...) +This role installs Prometheus for Kubernetes cluster (endpoints, pods, nodes, istio, ...) with some basic alerts, dashboards and etc. + +[Official documentation](https://prometheus.io/docs/introduction/overview/) [![Contributions Welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/k8s-community/cluster-deploy/issues) @@ -13,36 +15,111 @@ No special requirements. 
Role Variables -------------- -Available variables are listed below, along with default values (see `defaults/main.yml`): +All available parameters are listed in `defaults/main.yml`, together with their default values and a description of what each one is for. By default, all prometheus components will be created in the `prometheus` namespace (the namespace is created if it doesn't exist yet). Node exporter will be run on all nodes (including master nodes). -Prometheus version: -```yaml -k8s_prometheus_image_tag: v1.5.1 -``` -Domain name for prometheus (if it's empty so ingress object isn't created): -```yaml -k8s_prometheus_name: '' -``` +How to create new alert +----------------------- -Domain name for prometheus alert manager (if it's empty so ingress isn't created): -```yaml -k8s_prometheus_alertmanager_name: '' -``` +New alerts can be added in `templates/alerts` directory (check already existing alerts). For example: -Domain name for prometheus push gateway (if it's empty so ingress isn't created): -```yaml -k8s_prometheus_pushgateway_name: '' -``` + # + # Alert on deployment has not enough replicas + # + - alert: DeploymentReplicasMismatch + expr: (kube_deployment_spec_replicas != kube_deployment_status_replicas_available) + or (kube_deployment_spec_replicas unless kube_deployment_status_replicas_unavailable) + for: 5m + labels: + notify: sre + severity: warning + annotations: + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.deployment }}: Deployment is failed" + description: "{{ $labels.kubernetes_namespace }}/{{ $labels.deployment }}: Deployment is failed - observed replicas != intended replicas" -New alerts --------------- +It's better to put a short description at the top of each alert for other people. You can use different labels for alerts; we suggest following the recommendations in `templates/alerts/common.yml`. -New alerts can be added in templates/alerts directory. 
+If you want to create a new file with alerts in `templates/alerts` then you need to add line with file name also in `templates/server.yaml` after creation. Template: -New scrape configs --------------- + {% raw %} + # + # Some alerts for something + # + - name: some-alerts + rules: + + # + # Alert on something + # + - alert: SomethingWrong + .... + + {% endraw %} + +More details about alerts on: [Official documentation about alerts](https://prometheus.io/docs/alerting/rules/) + +What configs should app have +---------------------------- + +You will have to do some changes in manifests / charts if you want to monitor your apps. Changes are described below. + +Ingress should have: + + annotations: + prometheus.io/probe: 'true' + +Black box exporter would check your app via HTTPS check if it needs this check (and SSL certificate expiration). + +Service should have: + + annotations: + prometheus.io/scrape: 'true' + prometheus.io/probe: 'true' + # by default (pass this values only if it should be different) + prometheus.io/port: '8080' + prometheus.io/path: '/metrics' + +Black box exporter would check your app via HTTP check if other apps can communicate with it inside Kubernetes cluster. Also all metrics would be scraped from each pod of your app to create your custom alerts in future. By default, the system monitors only 5XX HTTP codes for apps. + +How to create new scrape configs +-------------------------------- + +New scrape configs can be added in `templates/scrape_configs` directory (check already existing scrapes). If you create a new file then you need to add it also in `templates/server.yaml`. Template: + + {% raw %} + # A scrape configuration for something. + # + - job_name: some-thing + ... + + {% endraw %} + +All details about scrape config on: [Official documentation about scrape configs](https://prometheus.io/docs/operating/configuration/#) + +How to add dashboard in Grafana +----------------------------------- + +1. 
create a new dashboard manually via `Dashboards --> New` or find existing on [grafana.com](https://grafana.com/dashboards) +2. export (download) it on your computer +3. copy content of downloaded JSON file +4. create a new file in `templates/grafana-dashboards` directory. Template (`templates/grafana-dashboards/dashboard-template.json`): + + {% raw %} + { + "dashboard": { + ... ... + }, + "overwrite": true, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "type": "datasource", + "pluginId": "prometheus", + "value": "prometheus" + } + ] + }{% endraw %} -New scrape configs can be added in templates/scrape_configs directory. +5. add line with file name in `templates/grafana.yaml`. Example Playbook ---------------- diff --git a/roles/prometheus/defaults/main.yml b/roles/prometheus/defaults/main.yml index 2572655..59f975b 100644 --- a/roles/prometheus/defaults/main.yml +++ b/roles/prometheus/defaults/main.yml @@ -1,45 +1,82 @@ --- # Kubernetes configs path k8s_conf_dir: /etc/kubernetes -k8s_addons_dir: '{{ k8s_conf_dir }}/addons' +k8s_prometheus_dir: '{{ k8s_conf_dir }}/addons/prometheus' # Master hosts names k8s_master_hosts: {} -# Prometheus host names (for Prometheus, AlertManager, PushGateway, Grafana) +# Retention days (how long to store data) +k8s_prometheus_retention_days: 7 + +# Prometheus host name (it will be created if it doesn't exist) k8s_prometheus_name: '' +# AlertManager host name (it will be created if it doesn't exist) k8s_prometheus_alertmanager_name: '' +# PushGateway host name (it will be created if it doesn't exist) k8s_prometheus_pushgateway_name: '' +# Grafana host name (it will be created if it doesn't exist) k8s_prometheus_grafana_name: '' +# Blackbox exporter host name (it will be created if it doesn't exist) k8s_prometheus_blackbox_name: '' -# Prometheus namespace -k8s_prometheus_namespace: default +# Prometheus namespace (it will be created if it doesn't exist) +k8s_prometheus_namespace: prometheus -# Prometheus images +# Prometheus image 
k8s_prometheus_image: prom/prometheus -k8s_prometheus_image_tag: v2.0.0-rc.1 +k8s_prometheus_image_tag: v2.0.0 +# Alert manager image k8s_prometheus_alertmanager_image: prom/alertmanager k8s_prometheus_alertmanager_image_tag: v0.9.1 +# Kube state metrics image k8s_prometheus_ksm_image: gcr.io/google_containers/kube-state-metrics k8s_prometheus_ksm_image_tag: v0.5.0 +# Node exporter image k8s_prometheus_nodeexport_image: prom/node-exporter -k8s_prometheus_nodeexport_image_tag: v0.15.0 +k8s_prometheus_nodeexport_image_tag: v0.15.1 +# Black box exporter image +k8s_prometheus_blackbox_image: prom/blackbox-exporter +k8s_prometheus_blackbox_image_tag: v0.10.0 + +# Prometheus additional components + +# Push gateway image k8s_prometheus_pushgateway_image: prom/pushgateway k8s_prometheus_pushgateway_image_tag: v0.4.0 +# Grafana image (+ grafana watcher) k8s_prometheus_grafana_image: grafana/grafana k8s_prometheus_grafana_image_tag: 4.5.2 k8s_prometheus_grafana_watcher_image: quay.io/coreos/grafana-watcher k8s_prometheus_grafana_watcher_image_tag: v0.0.8 - -k8s_prometheus_blackbox_image: prom/blackbox-exporter -k8s_prometheus_blackbox_image_tag: v0.10.0 - -# Prometheus additional images +# Config reload image k8s_prometheus_configreload_image: jimmidyson/configmap-reload k8s_prometheus_configreload_image_tag: v0.1 +# Prometheus scrape configs for Istio +k8s_prometheus_scrape_istio_metrics: false + +# Default alert route: name of the receiver used when no other route matches ('null' is a stub receiver) +k8s_prometheus_alerts_default_route: 'null' + # Prometheus alert configs for Slack -prometheus_slack_api_url: '' -prometheus_slack_channel: '' +k8s_prometheus_slack_alerts_enabled: false +k8s_prometheus_slack_api_url: '' +k8s_prometheus_slack_channel: '' +k8s_prometheus_slack_message_title: '{% raw %}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]{{.CommonAnnotations.summary}}{% endraw %}' +# be careful with tabs - it can break structure of yml file +k8s_prometheus_slack_message_body: | + {% raw %}{{ range 
.Alerts }} + *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` + *Description:* {{ .Annotations.description }} + *Details*: + {{ range .Labels.SortedPairs }} • {{ .Name }}: `{{ .Value }}` + {{ end }} + {{ end }}{% endraw %} + +# Prometheus alert configs for Telegram +k8s_prometheus_telegram_alerts_enabled: false +k8s_prometheus_telegram_webhook: '' +# Prometheus scrape configs for Cockroachdb +k8s_prometheus_scrape_cockroachdb_metrics: false diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index b9c83fc..4567ad7 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -5,21 +5,22 @@ state: directory mode: 0755 with_items: - - '{{ k8s_addons_dir }}' + - '{{ k8s_prometheus_dir }}' - name: Prometheus template: src: "{{ item }}" - dest: "{{ k8s_addons_dir }}/{{ item }}" + dest: "{{ k8s_prometheus_dir }}/{{ item }}" with_items: + - config.yaml - prometheus.yaml - name: Deploy script for Prometheus template: src: deploy-prometheus.sh - dest: "{{ k8s_addons_dir }}/deploy-prometheus.sh" + dest: "{{ k8s_prometheus_dir }}/deploy-prometheus.sh" mode: 0755 - name: Run deploy script for Prometheus - command: "{{ k8s_addons_dir }}/deploy-prometheus.sh" + command: "{{ k8s_prometheus_dir }}/deploy-prometheus.sh" when: inventory_hostname in k8s_master_hosts[0] diff --git a/roles/prometheus/templates/alert-manager.yaml b/roles/prometheus/templates/alert-manager.yaml index dc6d076..04ba66a 100644 --- a/roles/prometheus/templates/alert-manager.yaml +++ b/roles/prometheus/templates/alert-manager.yaml @@ -10,30 +10,58 @@ metadata: name: prometheus-alertmanager data: alertmanager.yml: |- + {% if k8s_prometheus_slack_alerts_enabled %} + global: - slack_api_url: '{{ prometheus_slack_api_url }}' + slack_api_url: '{{ k8s_prometheus_slack_api_url }}' + {% endif %} receivers: - - name: slack-receiver + - name: 'null' + + {% if k8s_prometheus_slack_alerts_enabled %} + + - name: slack slack_configs: - - channel: '{{ 
prometheus_slack_channel }}' + - channel: '{{ k8s_prometheus_slack_channel }}' send_resolved: true -{% raw %} - title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]{{.CommonAnnotations.summary}}' + title: '{{ k8s_prometheus_slack_message_title }}' text: >- - {{ range .Alerts }} - *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` - *Description:* {{ .Annotations.description }} - *Details*: - {{ range .Labels.SortedPairs }} • {{ .Name }}: `{{ .Value }}` - {{ end }} - {{ end }} -{% endraw %} + {{ k8s_prometheus_slack_message_body }} + {% endif %} + + {% if k8s_prometheus_telegram_alerts_enabled %} + + - name: telegram + webhook_configs: + - send_resolved: true + url: '{{ k8s_prometheus_telegram_webhook }}' + {% endif %} + route: group_wait: 10s - group_interval: 5m - receiver: slack-receiver - repeat_interval: 3h + group_interval: 1h + receiver: '{{ k8s_prometheus_alerts_default_route }}' + repeat_interval: 4h + routes: + - receiver: 'null' + match: + alertname: DeadMansSwitch + + {% if k8s_prometheus_slack_alerts_enabled %} + + - receiver: slack + match_re: + notify: ^sre|dev$ + continue: true + {% endif %} + + {% if k8s_prometheus_telegram_alerts_enabled %} + + - receiver: telegram + match: + notify: sre + {% endif %} --- @@ -149,7 +177,7 @@ spec: - name: storage-volume emptyDir: {} -{% if k8s_prometheus_alertmanager_name is defined and k8s_prometheus_alertmanager_name != '' %} +{% if k8s_prometheus_alertmanager_name != '' %} --- apiVersion: extensions/v1beta1 diff --git a/roles/prometheus/templates/alerts/all.yml b/roles/prometheus/templates/alerts/all.yml new file mode 100644 index 0000000..8e82d07 --- /dev/null +++ b/roles/prometheus/templates/alerts/all.yml @@ -0,0 +1,23 @@ +{% include 'alerts/common.yml' %} + + groups: +{% include 'alerts/app.yml' %} + +{% include 'alerts/deployment.yml' %} + +{% include 'alerts/pods.yml' %} + +{% include 'alerts/node.yml' %} + +{% include 'alerts/ingress.yml' %} + +{% 
include 'alerts/service.yml' %} + +{% include 'alerts/prometheus.yml' %} + +{% include 'alerts/kubernetes.yml' %} + +{% if k8s_prometheus_scrape_cockroachdb_metrics %} + +{% include 'alerts/cockroachdb.yml' %} +{% endif %} \ No newline at end of file diff --git a/roles/prometheus/templates/alerts/app.conf b/roles/prometheus/templates/alerts/app.conf deleted file mode 100644 index 60c1269..0000000 --- a/roles/prometheus/templates/alerts/app.conf +++ /dev/null @@ -1,32 +0,0 @@ -{% raw %} - # - # Applications alerts (from /metrics handler) - # - - name: application-alerts - rules: - # - # Alert on each 5XX code of any http service - # - - alert: HttpServiceFatalError - expr: rate(http_request_duration_seconds{job="kubernetes-service-endpoints",status=~"5.."}[1m]) - > 0 - labels: - notify: sre - severity: warning - annotations: - summary: "{{ $labels.namespace }}/{{ $label.name }}: Service returns 5XX error" - description: "{{ $labels.namespace }}/{{ $label.name }}: Service has processed some request incorrectly (code = {{ $labels.status }})" - - # - # Alert for any instance that have a median request latency > 0.5s - # - - alert: HttpServiceHighRequestLatency - expr: http_request_duration_seconds{quantile="0.5"} > 0.5 - for: 1m - labels: - notify: sre - severity: warning - annotations: - summary: "{{ $labels.namespace }}/{{ $label.name }}: High request latency" - description: "{{ $labels.namespace }}/{{ $label.name }} (instance {{ $labels.instance }}): Service has a median request latency > 0.5s (current value: {{ $value }}s)" -{% endraw %} diff --git a/roles/prometheus/templates/alerts/app.yml b/roles/prometheus/templates/alerts/app.yml new file mode 100644 index 0000000..aa56601 --- /dev/null +++ b/roles/prometheus/templates/alerts/app.yml @@ -0,0 +1,32 @@ +{% raw %} + # + # Applications alerts (from /metrics handler) + # + - name: application-alerts + rules: + # + # Alert on each 5XX code of any http service + # + - alert: HttpServiceFatalError + expr: 
rate(http_requests_total{job="kubernetes-service-endpoints",status=~"5.."}[1m]) > 0 + labels: + severity: warning + notify: dev + annotations: + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Service returns 5XX error" + description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Service has processed some request incorrectly (code = {{ $labels.status }})" + + # + # Alert for any instance that have a median request latency > 0.5s + # + #- alert: HttpServiceHighRequestLatency + # expr: http_request_duration_seconds_bucket{le="0.5"} > 0.5 + # for: 1m + # labels: + # severity: warning + # notify: sre + # annotations: + # summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: High request latency" + # description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }} (instance {{ $labels.instance }}): Service has a median request latency > 0.5s (current value: {{ $value }}s)" + +{% endraw %} diff --git a/roles/prometheus/templates/alerts/cockroachdb.yml b/roles/prometheus/templates/alerts/cockroachdb.yml new file mode 100644 index 0000000..f36a7a6 --- /dev/null +++ b/roles/prometheus/templates/alerts/cockroachdb.yml @@ -0,0 +1,185 @@ +{% raw %} + # + # CockroachDB alerts + # + - name: cockroachdb-alerts + rules: + # + # Aggregation rules + # + # This file contains aggregation rules, specifically: + # "node:X" node-level aggregation of a per-store metric X + # "cluster:X" cluster-level aggregation of a per-store or per-node metric X + # + # Most aggregation rules should use the "without (label1, label2, ...)" keyword + # to keep all labels but the ones specified. 
+ # + - record: node:capacity + expr: sum(capacity{app="cockroachdb"}) WITHOUT (store) + - record: cluster:capacity + expr: sum(node:capacity{app="cockroachdb"}) WITHOUT (instance) + - record: node:capacity_available + expr: sum(capacity_available{app="cockroachdb"}) WITHOUT (store) + - record: cluster:capacity_available + expr: sum(node:capacity_available{app="cockroachdb"}) WITHOUT (instance) + - record: capacity_available:ratio + expr: capacity_available{app="cockroachdb"} / capacity{app="cockroachdb"} + - record: node:capacity_available:ratio + expr: node:capacity_available{app="cockroachdb"} / node:capacity{app="cockroachdb"} + - record: cluster:capacity_available:ratio + expr: cluster:capacity_available{app="cockroachdb"} / cluster:capacity{app="cockroachdb"} + - record: mutex_storenanos_bucket:rate1m + expr: rate(mutex_storenanos_bucket{app="cockroachdb"}[1m]) + - record: node:mutex_storenanos_bucket:rate1m + expr: sum(mutex_storenanos_bucket:rate1m{app="cockroachdb"}) WITHOUT (store) + - record: cluster:mutex_storenanos_bucket:rate1m + expr: sum(node:mutex_storenanos_bucket:rate1m{app="cockroachdb"}) WITHOUT (instance) + - record: cluster:mutex_storenanos:rate1m:quantile_50 + expr: histogram_quantile(0.5, cluster:mutex_storenanos_bucket:rate1m) + - record: cluster:mutex_storenanos:rate1m:quantile_75 + expr: histogram_quantile(0.75, cluster:mutex_storenanos_bucket:rate1m) + - record: cluster:mutex_storenanos:rate1m:quantile_90 + expr: histogram_quantile(0.9, cluster:mutex_storenanos_bucket:rate1m) + - record: cluster:mutex_storenanos:rate1m:quantile_95 + expr: histogram_quantile(0.95, cluster:mutex_storenanos_bucket:rate1m) + - record: mutex_raftnanos_bucket:rate1m + expr: rate(mutex_raftnanos_bucket{app="cockroachdb"}[1m]) + - record: node:mutex_raftnanos_bucket:rate1m + expr: sum(mutex_raftnanos_bucket:rate1m{app="cockroachdb"}) WITHOUT (store) + - record: cluster:mutex_raftnanos_bucket:rate1m + expr: 
sum(node:mutex_raftnanos_bucket:rate1m{app="cockroachdb"}) WITHOUT (instance) + - record: cluster:mutex_raftnanos:rate1m:quantile_50 + expr: histogram_quantile(0.5, cluster:mutex_raftnanos_bucket:rate1m) + - record: cluster:mutex_raftnanos:rate1m:quantile_75 + expr: histogram_quantile(0.75, cluster:mutex_raftnanos_bucket:rate1m) + - record: cluster:mutex_raftnanos:rate1m:quantile_90 + expr: histogram_quantile(0.9, cluster:mutex_raftnanos_bucket:rate1m) + - record: cluster:mutex_raftnanos:rate1m:quantile_95 + expr: histogram_quantile(0.95, cluster:mutex_raftnanos_bucket:rate1m) + - record: mutex_replicananos_bucket:rate1m + expr: rate(mutex_replicananos_bucket{app="cockroachdb"}[1m]) + - record: node:mutex_replicananos_bucket:rate1m + expr: sum(mutex_replicananos_bucket:rate1m{app="cockroachdb"}) WITHOUT (store) + - record: cluster:mutex_replicananos_bucket:rate1m + expr: sum(node:mutex_replicananos_bucket:rate1m{app="cockroachdb"}) WITHOUT (instance) + - record: cluster:mutex_replicananos:rate1m:quantile_50 + expr: histogram_quantile(0.5, cluster:mutex_replicananos_bucket:rate1m) + - record: cluster:mutex_replicananos:rate1m:quantile_75 + expr: histogram_quantile(0.75, cluster:mutex_replicananos_bucket:rate1m) + - record: cluster:mutex_replicananos:rate1m:quantile_90 + expr: histogram_quantile(0.9, cluster:mutex_replicananos_bucket:rate1m) + - record: cluster:mutex_replicananos:rate1m:quantile_95 + expr: histogram_quantile(0.95, cluster:mutex_replicananos_bucket:rate1m) + - record: mutex_commandqueuenanos_bucket:rate1m + expr: rate(mutex_commandqueuenanos_bucket{app="cockroachdb"}[1m]) + - record: node:mutex_commandqueuenanos_bucket:rate1m + expr: sum(mutex_commandqueuenanos_bucket:rate1m{app="cockroachdb"}) WITHOUT (store) + - record: cluster:mutex_commandqueuenanos_bucket:rate1m + expr: sum(node:mutex_commandqueuenanos_bucket:rate1m{app="cockroachdb"}) WITHOUT + (instance) + - record: cluster:mutex_commandqueuenanos:rate1m:quantile_50 + expr: 
histogram_quantile(0.5, cluster:mutex_commandqueuenanos_bucket:rate1m) + - record: cluster:mutex_commandqueuenanos:rate1m:quantile_75 + expr: histogram_quantile(0.75, cluster:mutex_commandqueuenanos_bucket:rate1m) + - record: cluster:mutex_commandqueuenanos:rate1m:quantile_90 + expr: histogram_quantile(0.9, cluster:mutex_commandqueuenanos_bucket:rate1m) + - record: cluster:mutex_commandqueuenanos:rate1m:quantile_95 + expr: histogram_quantile(0.95, cluster:mutex_commandqueuenanos_bucket:rate1m) + - record: mutex_schedulernanos_bucket:rate1m + expr: rate(mutex_schedulernanos_bucket{app="cockroachdb"}[1m]) + - record: node:mutex_schedulernanos_bucket:rate1m + expr: sum(mutex_schedulernanos_bucket:rate1m{app="cockroachdb"}) WITHOUT (store) + - record: cluster:mutex_schedulernanos_bucket:rate1m + expr: sum(node:mutex_schedulernanos_bucket:rate1m{app="cockroachdb"}) WITHOUT + (instance) + - record: cluster:mutex_schedulernanos:rate1m:quantile_50 + expr: histogram_quantile(0.5, cluster:mutex_schedulernanos_bucket:rate1m) + - record: cluster:mutex_schedulernanos:rate1m:quantile_75 + expr: histogram_quantile(0.75, cluster:mutex_schedulernanos_bucket:rate1m) + - record: cluster:mutex_schedulernanos:rate1m:quantile_90 + expr: histogram_quantile(0.9, cluster:mutex_schedulernanos_bucket:rate1m) + - record: cluster:mutex_schedulernanos:rate1m:quantile_95 + expr: histogram_quantile(0.95, cluster:mutex_schedulernanos_bucket:rate1m) + + # + # Alert rules + # + + # + # Alert on version mismatch. + # + # This alert is intentionally loose (30 minutes) to allow for rolling upgrades. + # This may need to be adjusted for large clusters. 
+ # + - alert: VersionMismatch + expr: count(count_values("version", build_timestamp{app="cockroachdb"}) BY (tag)) > 1 + for: 30m + labels: + severity: warning + notify: sre + annotations: + description: "CockroachDB: Cluster running {{ $value }} different versions" + summary: "CockroachDB: Binary version mismatch" + + # + # Alert on available capacity < 15% + # + - alert: StoreDiskLow + expr: capacity_available:ratio{app="cockroachdb"} < 0.15 + labels: + severity: warning + notify: sre + annotations: + description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Store {{ $labels.store }} on node {{ $labels.kubernetes_pod_node_name }} at {{ $value }} available disk fraction" + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: CockroachDB store has low disk" + + # + # Alert on low disk < 20% + # + - alert: ClusterDiskLow + expr: cluster:capacity_available:ratio{app="cockroachdb"} < 0.2 + labels: + severity: warning + notify: sre + annotations: + description: "CockroachDB cluster at {{ $value }} available disk fraction" + summary: "CockroachDB cluster at {{ $value }} available disk fraction" + + # + # Alert on zero SQL qps + # + - alert: ZeroSQLQps + expr: sql_conns{app="cockroachdb"} > 0 and rate(sql_query_count{app="cockroachdb"}[5m]) == 0 + for: 10m + labels: + severity: warning + notify: sre + annotations: + description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: CockroachDB pod has SQL connections but no queries" + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: CockroachDB pod has SQL connections but no queries" + + # + # Alert on certificate expiration (< 1 year) + # + - alert: CACertificateExpiresSoon + expr: (security_certificate_expiration_ca{app="cockroachdb"} > 0) and (security_certificate_expiration_ca{app="cockroachdb"} - time()) < 86400 * 366 + labels: + severity: warning + notify: sre + annotations: + description: "{{ 
$labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: CA certificate for CockroachDB pod expires in less than a year" + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: CA certificate for CockroachDB pod expires in less than a year" + + # + # Alert on certificate expiration (< 0.5 year) + # + - alert: NodeCertificateExpiresSoon + expr: (security_certificate_expiration_node{app="cockroachdb"} > 0) and (security_certificate_expiration_node{app="cockroachdb"} - time()) < 86400 * 183 + labels: + severity: warning + notify: sre + annotations: + description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Node certificate expires in less than 6 months" + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Node certificate expires in less than 6 months" + +{% endraw %} diff --git a/roles/prometheus/templates/alerts/common.conf b/roles/prometheus/templates/alerts/common.yml similarity index 93% rename from roles/prometheus/templates/alerts/common.conf rename to roles/prometheus/templates/alerts/common.yml index a1e5a59..c5f83e3 100644 --- a/roles/prometheus/templates/alerts/common.conf +++ b/roles/prometheus/templates/alerts/common.yml @@ -1,5 +1,3 @@ - ## alert.rules ## - # # Common rules of monitoring: # diff --git a/roles/prometheus/templates/alerts/deployment.conf b/roles/prometheus/templates/alerts/deployment.yml similarity index 100% rename from roles/prometheus/templates/alerts/deployment.conf rename to roles/prometheus/templates/alerts/deployment.yml index 39e8d38..378093e 100644 --- a/roles/prometheus/templates/alerts/deployment.conf +++ b/roles/prometheus/templates/alerts/deployment.yml @@ -12,8 +12,8 @@ or (kube_deployment_spec_replicas unless kube_deployment_status_replicas_unavailable) for: 5m labels: - notify: sre severity: warning + notify: sre annotations: summary: "{{ $labels.namespace }}/{{ $labels.deployment }}: Deployment is failed" description: "{{ 
$labels.namespace }}/{{ $labels.deployment }}: Deployment is failed - observed replicas != intended replicas" diff --git a/roles/prometheus/templates/alerts/ingress.conf b/roles/prometheus/templates/alerts/ingress.conf deleted file mode 100644 index 3bb89a4..0000000 --- a/roles/prometheus/templates/alerts/ingress.conf +++ /dev/null @@ -1,33 +0,0 @@ -{% raw %} - # - # Ingress alerts (from blackbox-exporter) - # - - name: ingress-alerts - rules: - # - # Alert on host (in ingress) is unreachable > 5 min - # - - alert: IngressIsUnreachable - expr: probe_success{job="kubernetes-ingresses"} == 0 - for: 5m - labels: - notify: sre - severity: warning - annotations: - summary: "{{ $labels.instance }}: Ingress is unreachable" - description: "{{ $labels.instance }}: Ingress is unreachable more than 5 minutes" - - # - # Alert on SSL certificate will be expired < 30 days - # - - alert: SSLCertExpiringSoon - expr: probe_ssl_earliest_cert_expiry{job="kubernetes-ingresses"} - time() < 86400 * 30 - for: 5m - labels: - notify: sre - severity: warning - annotations: - summary: "{{ $labels.instance }}: SSL certificate will be expired soon" - description: "{{ $labels.instance }}: SSL certificate will be expired less than 30 days" - -{% endraw %} diff --git a/roles/prometheus/templates/alerts/ingress.yml b/roles/prometheus/templates/alerts/ingress.yml new file mode 100644 index 0000000..353c322 --- /dev/null +++ b/roles/prometheus/templates/alerts/ingress.yml @@ -0,0 +1,33 @@ +{% raw %} + # + # Ingress alerts (from blackbox-exporter) + # + - name: ingress-alerts + rules: + # + # Alert on host (in ingress) is unreachable > 5 min + # + - alert: IngressIsUnreachable + expr: probe_success{job="kubernetes-ingresses"} == 0 or absent(probe_success{job="kubernetes-ingresses"} == 1) + for: 5m + labels: + severity: warning + notify: sre + annotations: + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_ingress_name }}: Ingress is unreachable" + description: "{{ 
$labels.kubernetes_namespace }}/{{ $labels.kubernetes_ingress_name }}: Ingress is unreachable more than 5 minutes (domain is {{ $labels.instance }})" + + # + # Alert on SSL certificate will be expired < 30 days + # + - alert: SSLCertExpiringSoon + expr: probe_ssl_earliest_cert_expiry{job="kubernetes-ingresses"} - time() < 86400 * 30 + for: 5m + labels: + severity: warning + notify: sre + annotations: + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_ingress_name }}: SSL certificate will be expired soon" + description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_ingress_name }}: SSL certificate will be expired less than 30 days (domain is {{ $labels.instance }})" + +{% endraw %} diff --git a/roles/prometheus/templates/alerts/kubernetes.yml b/roles/prometheus/templates/alerts/kubernetes.yml new file mode 100644 index 0000000..4460b88 --- /dev/null +++ b/roles/prometheus/templates/alerts/kubernetes.yml @@ -0,0 +1,103 @@ +{% raw %} + # + # Kubernetes API server alerts + # + - name: kube-api-server-alerts + rules: + # + # Alert that Kubernetes API server is down + # + - alert: K8SApiServerDown + expr: absent(up{job="kubernetes-apiservers"} == 1) + for: 5m + labels: + severity: critical + notify: sre + annotations: + summary: "Kubernetes API server unreachable" + description: "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery" + + # + # Alert that Kubernetes API has high latency + # + - alert: K8SApiServerLatency + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}) + WITHOUT (instance, resource)) / 1e+06 > 1 + for: 10m + labels: + severity: warning + notify: sre + annotations: + summary: "Kubernetes apiserver latency is high" + description: "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s" + + # + # Kubernetes kubelet alerts + # + - name: kubelet-alerts + 
rules: + # + # Alert that Kubernetes node isn't ready to work + # + - alert: K8SNodeNotReady + expr: kube_node_status_ready{condition="true"} == 0 + for: 1h + labels: + severity: warning + notify: sre + annotations: + summary: "Node status is NotReady" + description: "{{ $labels.kubernetes_pod_node_name }}: The Kubelet has not checked in with the API, or has set itself to NotReady, for more than an hour" + + # + # Alert that Kubernetes few nodes aren't ready to work + # + - alert: K8SManyNodesNotReady + expr: count(kube_node_status_ready{condition="true"} == 0) > 1 + and (count(kube_node_status_ready{condition="true"} == 0) / count(kube_node_status_ready{condition="true"})) > 0.2 + for: 1m + labels: + severity: critical + annotations: + summary: "Many Kubernetes nodes are Not Ready" + description: "{{ $value }} Kubernetes nodes (more than 20%) are in the NotReady state" + + # + # Alert that kubelet is down + # + #- alert: K8SKubeletDown + # expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 + # for: 1h + # labels: + # severity: warning + # notify: sre + # annotations: + # summary: "Many Kubelets cannot be scraped" + # description: "Prometheus failed to scrape {{ $value }}% of kubelets" + + # + # Alert that kubelet is down + # + #- alert: K8SKubeletDown + # expr: absent(up{job="kubelet"} == 1) + # or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1 + # for: 1h + # labels: + # severity: critical + # notify: sre + # annotations: + # summary: "Many Kubelets cannot be scraped" + # description: "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery" + + # + # Alert that kubelet can reach limit of running pods on one node + # + - alert: K8SKubeletTooManyPods + expr: kubelet_running_pod_count > 100 + labels: + severity: warning + notify: sre + annotations: + summary: "Kubelet is close to pod limit" + description: "{{$labels.kubernetes_node_name}}: Kubelet is running {{$value}} pods, 
close to the limit of 110" +{% endraw %} diff --git a/roles/prometheus/templates/alerts/node.conf b/roles/prometheus/templates/alerts/node.conf deleted file mode 100644 index 684552e..0000000 --- a/roles/prometheus/templates/alerts/node.conf +++ /dev/null @@ -1,82 +0,0 @@ -{% raw %} - # - # Node alerts (from kube-state-metrics / node-exporter) - # - # There are some alerts from https://github.com/kayrus/prometheus-kubernetes/tree/master/prometheus-rules - - name: node-alerts - rules: - - # - # Alert on CPU > 75% on all nodes - # - - alert: NodeCPUUsage - expr: (100 - (avg(irate(node_cpu{component="node-exporter",mode="idle"}[5m])) - BY (instance) * 100)) > 75 - for: 2m - labels: - notify: sre - severity: info - annotations: - summary: "{{ $labels.instance }}: High CPU usage is detected" - description: "{{ $labels.instance }}: CPU usage is above 75% (current value is: {{ $value }})" - - # - # Alert on load average > 1 - # - - alert: NodeLoadAverage - expr: ((node_load5 / count(node_cpu{mode="system"}) WITHOUT (cpu, mode)) > 1) - for: 5m - labels: - notify: sre - severity: info - annotations: - description: "{{ $labels.instance }}: Load average is high" - summary: "{{ $labels.instance }}: High LA detected" - - # - # Alert on swap > 75% - # - - alert: NodeSwapUsage - expr: (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal) - * 100) > 75 - for: 5m - labels: - notify: sre - severity: info - annotations: - summary: "{{ $labels.instance }}: Swap usage is detected" - description: "{{ $labels.instance }}: Swap usage usage is above 75% (current value is: {{ $value }})" - - # - # Alert on memory > 75% - # - - alert: NodeMemoryUsage - expr: (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / (node_memory_MemTotal) - * 100)) > 75 - for: 5m - labels: - notify: sre - severity: info - annotations: - summary: "{{ $labels.instance }}: High memory usage is detected" - description: "{{ $labels.instance }}: Memory usage is above 75% (current 
value is: {{ $value }})" - - # - # Disk is free < 25% - # - - alert: HighNodeDiskUsage - expr: (avg(node_filesystem_avail{device=~"^/dev/[sv]da1$"}) BY (instance)) / (avg(node_filesystem_size{device=~"^/dev/[sv]da1$"}) - BY (instance)) * 100 < 25 - for: 5m - labels: - notify: sre - severity: info - annotations: - summary: "{{$labels.instance}}: High disk usage is detected" - description: "{{$labels.instance}}: Disk usage is above 75% (current value is: {{ $value }})" - - # - # TODO: Disk will be busy after 4 hours - # something like: predict_linear((avg by (instance) (node_filesystem_avail{device=~"^/dev/[sv]da1$"})), 4*3600) < 0 - -{% endraw %} diff --git a/roles/prometheus/templates/alerts/node.yml b/roles/prometheus/templates/alerts/node.yml new file mode 100644 index 0000000..2fd1cda --- /dev/null +++ b/roles/prometheus/templates/alerts/node.yml @@ -0,0 +1,138 @@ +{% raw %} + # + # Node alerts (from kube-state-metrics / node-exporter) + # + # There are some alerts from https://github.com/kayrus/prometheus-kubernetes/tree/master/prometheus-rules + - name: node-alerts + rules: + + # + # Alert on CPU > 75% on all nodes + # + - alert: NodeCPUUsage + expr: (100 - (avg(irate(node_cpu{component="node-exporter",mode="idle"}[5m])) + BY (kubernetes_pod_node_name) * 100)) > 75 + for: 2m + labels: + severity: info + notify: sre + annotations: + summary: "{{ $labels.kubernetes_pod_node_name }}: High CPU usage is detected" + description: "{{ $labels.kubernetes_pod_node_name }}: CPU usage is above 75% (current value is: {{ $value }})" + + # + # Alert on load average > 1.5 + # + - alert: NodeLoadAverage + expr: ((node_load5 / count(node_cpu{mode="system"}) WITHOUT (cpu, mode)) > 1.5) + for: 5m + labels: + severity: info + notify: sre + annotations: + summary: "{{ $labels.kubernetes_pod_node_name }}: High LA detected" + description: "{{ $labels.kubernetes_pod_node_name }}: Load average is high (current value is: {{ $value }})" + + # + # Alert on swap > 75% + # + - alert: 
NodeSwapUsage + expr: (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal) * 100) > 75 + for: 5m + labels: + severity: info + notify: sre + annotations: + summary: "{{ $labels.kubernetes_pod_node_name }}: Swap usage is detected" + description: "{{ $labels.kubernetes_pod_node_name }}: Swap usage is above 75% (current value is: {{ $value }})" + + # + # Alert on memory > 75% + # + - alert: NodeMemoryUsage + expr: (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / (node_memory_MemTotal)* 100)) > 75 + for: 5m + labels: + severity: info + notify: sre + annotations: + summary: "{{ $labels.kubernetes_pod_node_name }}: High memory usage is detected" + description: "{{ $labels.kubernetes_pod_node_name }}: Memory usage is above 75% (current value is: {{ $value }})" + + # + # Free disk space < 10% + # + - alert: HighNodeDiskUsage + expr: (avg(node_filesystem_avail{device=~"^/dev/[sv]da[0-9]$"}) BY (kubernetes_pod_node_name)) / (avg(node_filesystem_size{device=~"^/dev/[sv]da[0-9]$"}) + BY (kubernetes_pod_node_name)) * 100 < 10 + for: 5m + labels: + severity: high + notify: sre + annotations: + summary: "{{$labels.kubernetes_pod_node_name}}: High disk usage is detected" + description: "{{$labels.kubernetes_pod_node_name}}: Free disk space is below 10% (current value is: {{ $value }})" + + # + # Alert for node that is unreachable for > 5 minutes + # + - alert: NodeIsDown + expr: up{job="kubernetes-nodes"} == 0 or absent(up{job="kubernetes-nodes"} == 1) + for: 5m + labels: + severity: warning + notify: sre + annotations: + summary: "{{ $labels.kubernetes_node_name }}: Node is down" + description: "{{ $labels.kubernetes_node_name }}: Node is down for more than 5 minutes" + + # + # Defines ratio of open to max file descriptors + # + - record: instance:fd_node_utilization + expr: process_open_fds{job="kubernetes-nodes"} / process_max_fds{job="kubernetes-nodes"} + + # + # Alert for node that uses 80% of available file descriptors + # + - 
alert: TooManyOpenFDNode + expr: 100 * instance:fd_node_utilization > 80 + for: 10m + labels: + severity: critical + notify: sre + annotations: + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_node_name }}: High usage of file descriptors by node" + description: '{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_node_name }}: Node is using {{ $value }}% of the available file/socket descriptors' + + # + # Alert when open file descriptors are predicted to hit the limit within 4 hours + # + - alert: FdNodeExhaustionClose + expr: predict_linear(instance:fd_node_utilization[1h], 3600 * 4) > 1 + for: 10m + labels: + severity: warning + notify: sre + annotations: + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_node_name }}: Expected high usage of file descriptors by node" + description: '{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_node_name }}: Node is predicted to run out of file/socket descriptors within 4 hours' + + # + # Alert when open file descriptors are predicted to hit the limit within 1 hour + # + - alert: FdNodeExhaustionTooClose + expr: predict_linear(instance:fd_node_utilization[10m], 3600) > 1 + for: 10m + labels: + severity: critical + notify: sre + annotations: + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_node_name }}: Expected high usage of file descriptors by node" + description: '{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_node_name }}: Node is predicted to run out of file/socket descriptors within 1 hour' + + # + # TODO: Disk will be full after 4 hours + # something like: predict_linear((avg by (instance) (node_filesystem_avail{device=~"^/dev/[sv]da[0-9]$"})), 4*3600) < 0 + +{% endraw %} diff --git a/roles/prometheus/templates/alerts/pods.conf b/roles/prometheus/templates/alerts/pods.conf deleted file mode 100644 index 313d6c0..0000000 --- a/roles/prometheus/templates/alerts/pods.conf +++ /dev/null @@ -1,35 +0,0 @@ -{% raw %} - # - # Pod alerts (from cAdvisor / kube-state-metrics) - # - - name: pod-alerts - rules: - 
- # - # Pod restarting count > 5 times per minute - # - - alert: PodRestartingTooMuch - expr: rate(kube_pod_container_status_restarts{job="kubernetes-service-endpoints"}[1m]) - > 1 / (5 * 60) - for: 1h - labels: - notify: tech - severity: warning - annotations: - summary: "{{ $labels.instance }}: Pod restarts too much" - description: "{{ $labels.instance }}: Pod restarts more than 5 times per minute" - - # - # Alert for any instance that is unreachable for > 5 minutes - # - - alert: InstanceDown - expr: up == 0 - for: 5m - labels: - notify: sre - severity: warning - annotations: - summary: "{{ $labels.instance }}: Pod is down" - description: "{{ $labels.instance }}: Pod is down for more than 5 minutes" - -{% endraw %} \ No newline at end of file diff --git a/roles/prometheus/templates/alerts/pods.yml b/roles/prometheus/templates/alerts/pods.yml new file mode 100644 index 0000000..d4b4dc7 --- /dev/null +++ b/roles/prometheus/templates/alerts/pods.yml @@ -0,0 +1,79 @@ +{% raw %} + # + # Pod alerts (from cAdvisor / kube-state-metrics) + # + - name: pod-alerts + rules: + + # + # Pod restart rate > 1 restart per 5 minutes + # + - alert: PodRestartingTooMuch + expr: rate(kube_pod_container_status_restarts[1m]) > 1 / (5 * 60) + for: 1h + labels: + severity: warning + notify: tech + annotations: + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.pod }}: Pod restarts too much" + description: "{{ $labels.kubernetes_namespace }}/{{ $labels.pod }}: Pod restarts more often than once per 5 minutes" + + # + # Alert for pod that is unreachable for > 5 minutes + # + - alert: PodIsDown + expr: up{job="kubernetes-service-endpoints"} == 0 or absent(up{job="kubernetes-service-endpoints"} == 1) + for: 5m + labels: + severity: warning + notify: sre + annotations: + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Pod is down" + description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Pod is down for more than 5 minutes" + + # + # 
Defines ratio of open to max file descriptors + # + - record: instance:fd_pod_utilization + expr: process_open_fds{job="kubernetes-service-endpoints"} / process_max_fds{job="kubernetes-service-endpoints"} + + # + # Alert for pod that uses 80% of available file descriptors + # + - alert: TooManyOpenFDPod + expr: 100 * instance:fd_pod_utilization > 80 + for: 10m + labels: + severity: critical + notify: sre + annotations: + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: High usage of file descriptors by pod" + description: '{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Pod is using {{ $value }}% of the available file/socket descriptors' + + # + # Alert when open file descriptors are predicted to hit the limit within 4 hours + # + - alert: FdPodExhaustionClose + expr: predict_linear(instance:fd_pod_utilization[1h], 3600 * 4) > 1 + for: 10m + labels: + severity: warning + notify: sre + annotations: + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Expected high usage of file descriptors by pod" + description: '{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Pod is predicted to run out of file/socket descriptors within 4 hours' + + # + # Alert when open file descriptors are predicted to hit the limit within 1 hour + # + - alert: FdPodExhaustionTooClose + expr: predict_linear(instance:fd_pod_utilization[10m], 3600) > 1 + for: 10m + labels: + severity: critical + notify: sre + annotations: + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Expected high usage of file descriptors by pod" + description: '{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_pod_name }}: Pod is predicted to run out of file/socket descriptors within 1 hour' + +{% endraw %} diff --git a/roles/prometheus/templates/alerts/prometheus.yml b/roles/prometheus/templates/alerts/prometheus.yml new file mode 100644 index 0000000..0aa84ed --- /dev/null +++ b/roles/prometheus/templates/alerts/prometheus.yml @@ -0,0 
+1,30 @@ +{% raw %} + # + # Applications alerts (from /metrics handler) + # + - name: prometheus-alerts + rules: + # + # Alert on failed config reload + # + - alert: FailedReload + expr: prometheus_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + notify: sre + annotations: + description: "Reloading Prometheus' configuration has failed" + summary: "Prometheus configuration reload has failed" + + # + # Alert to check pipeline + # + - alert: DeadMansSwitch + expr: vector(1) + labels: + severity: none + annotations: + summary: "Alerting DeadMansSwitch" + description: "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional" +{% endraw %} diff --git a/roles/prometheus/templates/alerts/service.conf b/roles/prometheus/templates/alerts/service.conf deleted file mode 100644 index 8d132b1..0000000 --- a/roles/prometheus/templates/alerts/service.conf +++ /dev/null @@ -1,21 +0,0 @@ -{% raw %} - # - # Service alerts (from blackbox-exporter) - # - - name: service-alerts - rules: - - # - # Alert on service is unreachable > 5 min - # - - alert: ServiceIsUnreachable - expr: probe_success{job="kubernetes-services"} == 0 - for: 5m - labels: - notify: sre - severity: warning - annotations: - description: "{{ $labels.instance }}: Service is unreachable" - summary: "{{ $labels.instance }}: Service is unreachable more than 5 minutes" - -{% endraw %} diff --git a/roles/prometheus/templates/alerts/service.yml b/roles/prometheus/templates/alerts/service.yml new file mode 100644 index 0000000..6a9173c --- /dev/null +++ b/roles/prometheus/templates/alerts/service.yml @@ -0,0 +1,21 @@ +{% raw %} + # + # Service alerts (from blackbox-exporter) + # + - name: service-alerts + rules: + + # + # Alert on service is unreachable > 5 min + # + - alert: ServiceIsUnreachable + expr: probe_success{job="kubernetes-services"} == 0 or absent(probe_success{job="kubernetes-services"} == 1) + for: 5m + labels: + severity: warning + notify: sre + 
annotations: + description: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_service_name }}: Service is unreachable" + summary: "{{ $labels.kubernetes_namespace }}/{{ $labels.kubernetes_service_name }}: Service is unreachable more than 5 minutes" + +{% endraw %} diff --git a/roles/prometheus/templates/blackbox-exporter.yaml b/roles/prometheus/templates/blackbox-exporter.yaml index 231485c..3fb92d0 100644 --- a/roles/prometheus/templates/blackbox-exporter.yaml +++ b/roles/prometheus/templates/blackbox-exporter.yaml @@ -13,17 +13,19 @@ data: modules: http_2xx: prober: http - timeout: 20s + timeout: 50s http: + preferred_ip_protocol: ip4 https_2xx: prober: http - timeout: 20s + timeout: 50s http: method: GET valid_status_codes: [200] no_follow_redirects: false fail_if_ssl: false fail_if_not_ssl: true + preferred_ip_protocol: ip4 --- @@ -88,7 +90,7 @@ metadata: namespace: {{ k8s_prometheus_namespace }} name: prometheus-blackbox-exporter spec: - replicas: 1 + replicas: 2 template: metadata: labels: @@ -135,7 +137,7 @@ spec: configMap: name: prometheus-blackbox-exporter -{% if k8s_prometheus_blackbox_name is defined and k8s_prometheus_blackbox_name != '' %} +{% if k8s_prometheus_blackbox_name != '' %} --- apiVersion: extensions/v1beta1 diff --git a/roles/prometheus/templates/config.yaml b/roles/prometheus/templates/config.yaml new file mode 100644 index 0000000..d28daa1 --- /dev/null +++ b/roles/prometheus/templates/config.yaml @@ -0,0 +1,60 @@ +--- + +apiVersion: v1 +kind: Namespace +metadata: + name: {{ k8s_prometheus_namespace }} + +--- + +apiVersion: v1 +kind: Secret +metadata: + name: tls-secret + namespace: {{ k8s_prometheus_namespace }} +data: + tls.crt: {{ k8s_services_cert | b64encode }} + tls.key: {{ k8s_services_cert_key | b64encode }} + +--- + +apiVersion: v1 +kind: Secret +metadata: + name: registry-pull-secret + namespace: {{ k8s_prometheus_namespace }} +type: kubernetes.io/dockerconfigjson +data: + .dockerconfigjson: {{ 
k8s_docker_registry_auth_token }} + +--- + +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: Role +metadata: + name: {{ k8s_prometheus_namespace }}-admin + namespace: {{ k8s_prometheus_namespace }} + labels: + basic.auth/role: {{ k8s_prometheus_namespace }}-admin +rules: +- apiGroups: + - '*' + resources: + - '*' + verbs: + - '*' + +--- + +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: {{ k8s_prometheus_namespace }} + namespace: {{ k8s_prometheus_namespace }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ k8s_prometheus_namespace }}-admin +subjects: +- kind: User + name: {{ k8s_release_username }} diff --git a/roles/prometheus/templates/deploy-prometheus.sh b/roles/prometheus/templates/deploy-prometheus.sh index 4001f1e..a880035 100755 --- a/roles/prometheus/templates/deploy-prometheus.sh +++ b/roles/prometheus/templates/deploy-prometheus.sh @@ -1,15 +1,27 @@ #!/bin/bash +function deploy_prometheus_config { + if kubectl get namespace {{ k8s_prometheus_namespace }} &> /dev/null; then # exact-name lookup; piping the full listing through grep would also match namespaces that merely contain this name + echo "Prometheus config already exists" + else + echo "Creating Prometheus config" + kubectl apply -f {{ k8s_prometheus_dir }}/config.yaml + fi + + echo +} + function deploy_prometheus { if kubectl get deploy -l app=prometheus -n {{ k8s_prometheus_namespace }} | grep prometheus &> /dev/null; then echo "Updating Prometheus (config-maps will be auto applied)" - kubectl apply -f {{ k8s_addons_dir }}/prometheus.yaml + kubectl apply -f {{ k8s_prometheus_dir }}/prometheus.yaml else echo "Creating Prometheus" - kubectl create -f {{ k8s_addons_dir }}/prometheus.yaml + kubectl create -f {{ k8s_prometheus_dir }}/prometheus.yaml fi echo } +deploy_prometheus_config deploy_prometheus diff --git a/roles/prometheus/templates/grafana-dashboards/all.yml b/roles/prometheus/templates/grafana-dashboards/all.yml new file mode 100644 index 0000000..23b2137 --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/all.yml @@ 
-0,0 +1,44 @@ + capacity-planning-dashboard.json: |- +{% include 'grafana-dashboards/capacity-planning.json' %} + + cluster-health-dashboard.json: |- +{% include 'grafana-dashboards/cluster-health.json' %} + + cluster-monitoring-dashboard.json: |- +{% include 'grafana-dashboards/cluster-monitoring.json' %} + + cluster-status-dashboard.json: |- +{% include 'grafana-dashboards/cluster-status.json' %} + + control-plane-status-dashboard.json: |- +{% include 'grafana-dashboards/control-plane-status.json' %} + + deployment-dashboard.json: |- +{% include 'grafana-dashboards/deployment.json' %} + + nodes-dashboard.json: |- +{% include 'grafana-dashboards/nodes.json' %} + + pods-dashboard.json: |- +{% include 'grafana-dashboards/pods.json' %} + + resource-requests-dashboard.json: |- +{% include 'grafana-dashboards/resource-requests.json' %} + +{% if k8s_prometheus_scrape_cockroachdb_metrics %} + + cockroachdb-replicas-dashboard.json: |- +{% include 'grafana-dashboards/cockroachdb/replicas.json' %} + + cockroachdb-runtime-dashboard.json: |- +{% include 'grafana-dashboards/cockroachdb/runtime.json' %} + + cockroachdb-sql-dashboard.json: |- +{% include 'grafana-dashboards/cockroachdb/sql.json' %} + + cockroachdb-storage-dashboard.json: |- +{% include 'grafana-dashboards/cockroachdb/storage.json' %} + + + +{% endif %} \ No newline at end of file diff --git a/roles/prometheus/templates/grafana-dashboards/cockroachdb/replicas.json b/roles/prometheus/templates/grafana-dashboards/cockroachdb/replicas.json new file mode 100644 index 0000000..ee325a4 --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/cockroachdb/replicas.json @@ -0,0 +1,3 @@ +{% raw %} + 
{"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"panel","id":"table","name":"Table","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.1"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Cockroach Replicas","tags":["cockroach"],"style":"dark","timezone":"utc","editable":true,"hideControls":false,"sharedCrosshair":true,"rows":[{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":16,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"unavailable","yaxis":2},{"alias":"raft leaders not lease holders","yaxis":2},{"alias":"under-replicated","yaxis":1}],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sum(ranges{cluster=\"$cluster\",instance=~\"$node\"}) by (instance))","intervalFactor":2,"legendFormat":"ranges","refId":"D","step":120},{"expr":"sum(sum(replicas_leaders{cluster=\"$cluster\",instance=~\"$node\"}) by (instance))","interval":"","intervalFactor":2,"legendFormat":"raft leaders","metric":"","refId":"A","step":120},{"expr":"sum(sum(replicas_leaders_not_leaseholders{cluster=\"$cluster\",instance=~\"$node\"}) by (instance))","interval":"","intervalFactor":2,"legendFormat":"raft leaders not lease 
holders","metric":"","refId":"C","step":120},{"expr":"sum(sum(ranges_unavailable{cluster=\"$cluster\",instance=~\"$node\"}) by (instance))","interval":"","intervalFactor":2,"legendFormat":"unavailable","metric":"","refId":"B","step":120},{"expr":"sum(sum(ranges_underreplicated{cluster=\"$cluster\",instance=~\"$node\"}) by (instance))","intervalFactor":2,"legendFormat":"under-replicated","refId":"E","step":120}],"timeFrom":null,"timeShift":null,"title":"Ranges: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":17,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":false},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"sum(ranges{cluster=\"$cluster\",instance=~\"$node\"}) by (instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}} - ranges","metric":"","refId":"A","step":240},{"expr":"sum(ranges_unavailable{cluster=\"$cluster\",instance=~\"$node\"}) by (instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}} - unavailable","metric":"","refId":"B","step":240}],"title":"Ranges: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 
0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":28,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(replicas{cluster=\"$cluster\",instance=~\"$node\"}) by (instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}}","metric":"","refId":"A","step":120}],"timeFrom":null,"timeShift":null,"title":"Replicas per node: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":29,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":false},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"sum(replicas{cluster=\"$cluster\",instance=~\"$node\"}) by (instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}}","metric":"","refId":"A","step":240}],"title":"Replicas: $node","transform":"timeseries_aggregations","type":"table"},{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":41,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(replicas_leaseholders{cluster=\"$cluster\",instance=~\"$node\"}) by (instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}}","metric":"","refId":"A","step":120}],"timeFrom":null,"timeShift":null,"title":"Replica leaseholders per node: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":40,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":false},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"sum(replicas_leaseholders{cluster=\"$cluster\",instance=~\"$node\"}) by (instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}}","metric":"","refId":"A","step":240}],"title":"Replica leaseholders: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 
112, 112, 0.22)"},"id":30,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sum(replicas{cluster=\"$cluster\",instance=~\"$node\"}) by (instance))","interval":"","intervalFactor":2,"legendFormat":"Replicas","refId":"A","step":120},{"expr":"sum(sum(replicas_quiescent{cluster=\"$cluster\",instance=~\"$node\"}) by (instance))","interval":"","intervalFactor":2,"legendFormat":"Quiescent","refId":"B","step":120},{"expr":"sum(sum(replicas{cluster=\"$cluster\",instance=~\"$node\"}) by (instance) - sum(replicas_quiescent{cluster=\"$cluster\",instance=~\"$node\"}) by (instance))","intervalFactor":2,"legendFormat":"Active","refId":"C","step":120}],"timeFrom":null,"timeShift":null,"title":"Replicas: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":31,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD HH:mm:ss","pattern":"Time","type":"date"},{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"sum(replicas_quiescent{cluster=\"$cluster\",instance=~\"$node\"}) by 
(instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}}","refId":"A","step":240}],"title":"Quiescent Replicas: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":9,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sum(rate(range_splits{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance))","interval":"","intervalFactor":2,"legendFormat":"splits","metric":"","refId":"A","step":120},{"expr":"sum(sum(rate(range_adds{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance))","interval":"","intervalFactor":2,"legendFormat":"adds","metric":"","refId":"B","step":120},{"expr":"sum(sum(rate(range_removes{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance))","interval":"","intervalFactor":2,"legendFormat":"removes","metric":"","refId":"C","step":120}],"timeFrom":null,"timeShift":null,"title":"Range Ops: 
$node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"ops","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":14,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD HH:mm:ss","pattern":"Time","type":"date"},{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"sum(rate(range_splits{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance) + sum(rate(range_adds{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance) + sum(rate(range_removes{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}}","metric":"","refId":"A","step":240}],"title":"Range Ops: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":7,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"Reserved 
Capacity","yaxis":2}],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sum(rate(range_snapshots_generated{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance))","interval":"","intervalFactor":2,"legendFormat":"Generated","metric":"","refId":"A","step":120},{"expr":"sum(sum(rate(range_snapshots_normal_applied{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance))","interval":"","intervalFactor":2,"legendFormat":"Normal-applied","metric":"","refId":"B","step":120},{"expr":"sum(sum(rate(range_snapshots_preemptive_applied{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance))","interval":"","intervalFactor":2,"legendFormat":"Preemptive-applied","refId":"C","step":120},{"expr":"sum(capacity_reserved{cluster=\"$cluster\",instance=~\"$node\"})","hide":false,"interval":"","intervalFactor":2,"legendFormat":"Reserved Capacity","refId":"D","step":120},{"expr":"sum(replicas_reserved{cluster=\"$cluster\",instance=~\"$node\"})","intervalFactor":2,"legendFormat":"Reservations","refId":"E","step":120}],"timeFrom":null,"timeShift":null,"title":"Snapshots: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"ops","label":"Snapshots","logBase":1,"max":null,"min":0,"show":true},{"format":"KBs","label":"Reservations","logBase":1,"max":null,"min":0,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","hideTimeOverride":false,"id":13,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"dateFormat":"YYYY-MM-DD 
HH:mm:ss","decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"sum(rate(range_snapshots_generated{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}} - generated","refId":"A","step":240},{"expr":"sum(rate(range_snapshots_normal_applied{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance) + sum(rate(range_snapshots_preemptive_applied{cluster=\"$cluster\",instance=~\"$node\"}[$rate_interval])) by (instance)","interval":"","intervalFactor":2,"legendFormat":"{{instance}} - applied","metric":"","refId":"B","step":240}],"timeFrom":null,"timeShift":null,"title":"Snapshots: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"}],"time":{"from":"now-24h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[{"current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":false,"label":"Cluster name","multi":false,"name":"cluster","options":[],"query":"sys_uptime","refresh":1,"regex":"/cluster=\"([^\"]+)\"/","type":"query"},{"allValue":".*","current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":true,"label":"Node","multi":false,"name":"node","options":[],"query":"label_values(sys_uptime{job=\"cockroach\",cluster=\"$cluster\"},instance)","refresh":1,"regex":"","type":"query"},{"current":{"selected":true,"text":"1m","value":"1m"},"datasource":null,"hide":0,"includeAll":false,"label":"Rate 
Interval","multi":false,"name":"rate_interval","options":[{"selected":false,"text":"30s","value":"30s"},{"selected":true,"text":"1m","value":"1m"},{"selected":false,"text":"5m","value":"5m"},{"selected":false,"text":"10m","value":"10m"},{"selected":false,"text":"30m","value":"30m"},{"selected":false,"text":"1h","value":"1h"},{"selected":false,"text":"6h","value":"6h"},{"selected":false,"text":"12h","value":"12h"},{"selected":false,"text":"1d","value":"1d"}],"query":"30s,1m,5m,10m,30m,1h,6h,12h,1d","refresh":0,"type":"interval"}]},"annotations":{"list":[]},"refresh":false,"schemaVersion":12,"version":7,"links":[{"asDropdown":true,"icon":"external link","includeVars":true,"keepTime":true,"tags":["cockroach"],"targetBlank":true,"title":"Dashboards","type":"dashboards"}],"gnetId":null},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} +{% endraw %} \ No newline at end of file diff --git a/roles/prometheus/templates/grafana-dashboards/cockroachdb/runtime.json b/roles/prometheus/templates/grafana-dashboards/cockroachdb/runtime.json new file mode 100644 index 0000000..74f728a --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/cockroachdb/runtime.json @@ -0,0 +1,3 @@ +{% raw %} + {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"panel","id":"table","name":"Table","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.1"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Cockroach 
Runtime","tags":["cockroach"],"style":"dark","timezone":"utc","editable":true,"hideControls":false,"sharedCrosshair":true,"rows":[{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":0,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":2,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"Live nodes","yaxis":1},{"alias":"All nodes","yaxis":1}],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"count(up{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"})","hide":false,"interval":"","intervalFactor":2,"legendFormat":"All nodes","metric":"","refId":"A","step":120},{"expr":"count(up{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"} == 1)","interval":"","intervalFactor":2,"legendFormat":"Live nodes","metric":"","refId":"B","step":120}],"timeFrom":null,"timeShift":null,"title":"Nodes: $node","tooltip":{"msResolution":true,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":"","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":"","logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"}],"editable":true,"error":false,"fontSize":"90%","id":19,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[],"targets":[{"expr":"build_timestamp{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}} - 
{{tag}}","metric":"","refId":"A","step":240}],"title":"Built Timestamp","transform":"timeseries_aggregations","type":"table"}],"showTitle":false,"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":12,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sys_uptime{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":120}],"timeFrom":null,"timeShift":null,"title":"Uptime: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"s","label":null,"logBase":10,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"}],"editable":true,"error":false,"fontSize":"90%","id":13,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":1,"desc":false},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD HH:mm:ss","pattern":"Current","type":"number","unit":"s"}],"targets":[{"expr":"sys_uptime{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","metric":"","refId":"A","step":240}],"title":"Uptime: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New 
row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":9,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sys_rss{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"})","interval":"","intervalFactor":2,"legendFormat":"RSS","metric":"sys_rss","refId":"A","step":120},{"expr":"sum(sys_go_allocbytes{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"})","interval":"","intervalFactor":2,"legendFormat":"Go Allocated","metric":"sys_rss","refId":"C","step":120},{"expr":"sum(sys_go_totalbytes{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"})","interval":"","intervalFactor":2,"legendFormat":"Go Total","metric":"sys_cgo","refId":"D","step":120},{"expr":"sum(sys_cgo_allocbytes{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"})","interval":"","intervalFactor":2,"legendFormat":"CGo Allocated","metric":"sys_rss","refId":"B","step":120},{"expr":"sum(sys_cgo_totalbytes{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"})","interval":"","intervalFactor":2,"legendFormat":"CGo Total","metric":"sys_rss","refId":"E","step":120}],"timeFrom":null,"timeShift":null,"title":"Memory: 
$node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","hideTimeOverride":false,"id":10,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":null,"desc":false},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":".*","thresholds":[],"type":"number","unit":"bytes"}],"targets":[{"expr":"sys_rss{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":240}],"title":"RSS: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":11,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sys_goroutines{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"})","interval":"","intervalFactor":2,"legendFormat":"Goroutines","metric":"sys","refId":"A","step":120}],"timeFrom":null,"timeShift":null,"title":"Goroutines: 
$node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":15,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":false},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"dateFormat":"YYYY-MM-DD HH:mm:ss","decimals":0,"pattern":"/.*/","thresholds":[],"type":"number","unit":"none"}],"targets":[{"expr":"sys_goroutines{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":240}],"title":"Goroutines: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":14,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"GC Pauses","yaxis":2}],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(rate(sys_gc_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"GC 
Runs","refId":"A","step":120},{"expr":"sum(rate(sys_gc_pause_ns{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"GC Pauses","metric":"","refId":"B","step":120}],"timeFrom":null,"timeShift":null,"title":"GC: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"ns","label":null,"logBase":1,"max":null,"min":0,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":16,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":false},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD HH:mm:ss","pattern":"Time","type":"date"},{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"ns"}],"targets":[{"expr":"rate(sys_gc_pause_ns{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval])","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":240}],"title":"GC Pauses: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":17,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as 
zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(rate(sys_cpu_user_ns{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"User","refId":"A","step":120},{"expr":"sum(rate(sys_cpu_sys_ns{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","intervalFactor":2,"legendFormat":"System","metric":"","refId":"B","step":120}],"timeFrom":null,"timeShift":null,"title":"CPU Time: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"ns","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":18,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"ns"}],"targets":[{"expr":"rate(sys_cpu_user_ns{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]) + rate(sys_cpu_sys_ns{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval])","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","metric":"","refId":"A","step":240}],"title":"CPU Time: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[],"title":"New 
row"}],"time":{"from":"now-24h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[{"current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":false,"label":"Cluster name","multi":false,"name":"cluster","options":[],"query":"sys_uptime","refresh":1,"regex":"/cluster=\"([^\"]+)\"/","type":"query"},{"allValue":".*","current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":true,"label":"Node","multi":false,"name":"node","options":[],"query":"label_values(sys_uptime{job=\"cockroach\",cluster=\"$cluster\"},kubernetes_pod_node_name)","refresh":1,"regex":"","type":"query"},{"auto":false,"current":{"selected":true,"text":"1m","value":"1m"},"datasource":null,"hide":0,"includeAll":false,"label":"Rate Interval","multi":false,"name":"rate_interval","options":[{"selected":false,"text":"30s","value":"30s"},{"selected":true,"text":"1m","value":"1m"},{"selected":false,"text":"5m","value":"5m"},{"selected":false,"text":"10m","value":"10m"},{"selected":false,"text":"30m","value":"30m"},{"selected":false,"text":"1h","value":"1h"},{"selected":false,"text":"6h","value":"6h"},{"selected":false,"text":"12h","value":"12h"},{"selected":false,"text":"1d","value":"1d"}],"query":"30s,1m,5m,10m,30m,1h,6h,12h,1d","refresh":0,"regex":"","type":"interval"}]},"annotations":{"list":[]},"refresh":"1m","schemaVersion":12,"version":62,"links":[{"asDropdown":true,"icon":"external link","includeVars":true,"keepTime":true,"tags":["cockroach"],"targetBlank":true,"title":"Dashboards","type":"dashboards"}],"gnetId":null},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} +{% endraw %} \ No newline at end of file diff --git a/roles/prometheus/templates/grafana-dashboards/cockroachdb/sql.json b/roles/prometheus/templates/grafana-dashboards/cockroachdb/sql.json new file mode 100644 index 
0000000..6189fbb --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/cockroachdb/sql.json @@ -0,0 +1,3 @@ +{% raw %} + {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"panel","id":"table","name":"Table","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.1"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Cockroach SQL","tags":["cockroach"],"style":"dark","timezone":"utc","editable":true,"hideControls":false,"sharedCrosshair":true,"rows":[{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":6,"isNew":true,"legend":{"alignAsTable":false,"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sql_conns{cluster=\"$cluster\",job=\"cockroach\",kubernetes_pod_node_name=~\"$node\"})","hide":false,"interval":"","intervalFactor":2,"legendFormat":"Connections","metric":"","refId":"A","step":120}],"timeFrom":null,"timeShift":null,"title":"SQL Connections: 
$node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":12,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":false},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD HH:mm:ss","pattern":"Time","type":"date"},{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":0,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"sql_conns{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":240}],"title":"Connections: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":7,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(rate(sql_bytesin{job=\"cockroach\", 
cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"In","metric":"","refId":"A","step":120},{"expr":"sum(rate(sql_bytesout{job=\"cockroach\", cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"Out","metric":"","refId":"B","step":120}],"timeFrom":null,"timeShift":null,"title":"Bytes: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"Bps","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","hideTimeOverride":false,"id":13,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"dateFormat":"YYYY-MM-DD HH:mm:ss","decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"Bps"}],"targets":[{"expr":"rate(sql_bytesin{job=\"cockroach\", cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval])","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}} - in","refId":"A","step":240},{"expr":"rate(sql_bytesout{job=\"cockroach\", cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval])","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}} - out","metric":"","refId":"B","step":240}],"timeFrom":null,"timeShift":null,"title":"Bytes in/out: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New 
row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":9,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(rate(sql_select_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"select","metric":"","refId":"A","step":120},{"expr":"sum(rate(sql_insert_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"insert","metric":"","refId":"B","step":120},{"expr":"sum(rate(sql_update_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"update","metric":"","refId":"C","step":120},{"expr":"sum(rate(sql_delete_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"delete","metric":"","refId":"D","step":120}],"timeFrom":null,"timeShift":null,"title":"Queries: 
$node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":14,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD HH:mm:ss","pattern":"Time","type":"date"},{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"rate(sql_select_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]) + rate(sql_insert_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]) + rate(sql_update_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]) + rate(sql_delete_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval])","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","metric":"","refId":"A","step":240}],"title":"Queries: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":10,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(rate(sql_txn_begin_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"begin","metric":"","refId":"A","step":120},{"expr":"sum(rate(sql_txn_commit_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"commit","refId":"B","step":120},{"expr":"sum(rate(sql_txn_abort_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"abort","refId":"C","step":120},{"expr":"sum(rate(sql_txn_rollback_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"rollback","refId":"D","step":120}],"timeFrom":null,"timeShift":null,"title":"Transactions: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":15,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD HH:mm:ss","pattern":"Time","type":"date"},{"colorMode":null,"colors":["rgba(245, 54, 54, 
0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"rate(sql_txn_begin_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]) + rate(sql_txn_commit_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]) + rate(sql_txn_abort_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]) + rate(sql_txn_rollback_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval])","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":240}],"title":"Transactions: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":11,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(rate(sql_ddl_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval]))","interval":"","intervalFactor":2,"legendFormat":"DDL","metric":"","refId":"A","step":120}],"timeFrom":null,"timeShift":null,"title":"Schema changes: 
$node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":16,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD HH:mm:ss","pattern":"Time","type":"date"},{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"rate(sql_ddl_count{job=\"cockroach\",cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}[$rate_interval])","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","metric":"","refId":"A","step":240}],"title":"Schema Changes: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"}],"time":{"from":"now-24h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[{"current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":false,"label":"Cluster 
name","multi":false,"name":"cluster","options":[],"query":"sys_uptime","refresh":1,"regex":"/cluster=\"([^\"]+)\"/","type":"query"},{"allValue":".*","current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":true,"label":"Node","multi":false,"name":"node","options":[],"query":"label_values(sys_uptime{job=\"cockroach\",cluster=\"$cluster\"},kubernetes_pod_node_name)","refresh":1,"regex":"","type":"query"},{"current":{"selected":true,"text":"1m","value":"1m"},"datasource":null,"hide":0,"includeAll":false,"label":"Rate Interval","multi":false,"name":"rate_interval","options":[{"selected":false,"text":"30s","value":"30s"},{"selected":true,"text":"1m","value":"1m"},{"selected":false,"text":"5m","value":"5m"},{"selected":false,"text":"10m","value":"10m"},{"selected":false,"text":"30m","value":"30m"},{"selected":false,"text":"1h","value":"1h"},{"selected":false,"text":"6h","value":"6h"},{"selected":false,"text":"12h","value":"12h"},{"selected":false,"text":"1d","value":"1d"}],"query":"30s,1m,5m,10m,30m,1h,6h,12h,1d","refresh":0,"type":"interval"}]},"annotations":{"list":[]},"refresh":false,"schemaVersion":12,"version":33,"links":[{"asDropdown":true,"icon":"external link","includeVars":true,"keepTime":true,"tags":["cockroach"],"targetBlank":true,"title":"Dashboards","type":"dashboards"}],"gnetId":null},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} +{% endraw %} \ No newline at end of file diff --git a/roles/prometheus/templates/grafana-dashboards/cockroachdb/storage.json b/roles/prometheus/templates/grafana-dashboards/cockroachdb/storage.json new file mode 100644 index 0000000..f2a1e82 --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/cockroachdb/storage.json @@ -0,0 +1,3 @@ +{% raw %} + 
{"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"Prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"panel","id":"table","name":"Table","version":""},{"type":"grafana","id":"grafana","name":"Grafana","version":"3.1.1"},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"}],"id":null,"title":"Cockroach Storage","tags":["cockroach"],"style":"dark","timezone":"utc","editable":true,"hideControls":false,"sharedCrosshair":true,"rows":[{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":16,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sum(capacity{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name)) - sum(sum(capacity_available{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name))","interval":"","intervalFactor":2,"legendFormat":"Used","refId":"A","step":120},{"expr":"sum(sum(capacity{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name))","interval":"","intervalFactor":2,"legendFormat":"Capacity","metric":"","refId":"B","step":120}],"timeFrom":null,"timeShift":null,"title":"Capacity: 
$node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":"","logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":17,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":false},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"percentunit"}],"targets":[{"expr":"(sum(capacity{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name) - sum(capacity_available{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name)) / sum(capacity{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name)","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":240}],"title":"Capacity Used: $node","transform":"timeseries_aggregations","type":"table"},{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":22,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"1 - sum(capacity_available{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by 
(kubernetes_pod_node_name) / sum(capacity{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name) ","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":120}],"timeFrom":null,"timeShift":null,"title":"Percentage of capacity used per node: All","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"percentunit","label":"","logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":23,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":false},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"percentunit"}],"targets":[{"expr":"1 - sum(capacity_available{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name) / sum(capacity{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name) ","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":240}],"title":"Percentage of capacity used per node: All","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 
0.22)"},"id":6,"isNew":true,"legend":{"alignAsTable":false,"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sum(livebytes{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name))","hide":false,"interval":"","intervalFactor":2,"legendFormat":"Live","metric":"","refId":"A","step":120},{"expr":"sum(sum(sysbytes{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name))","interval":"","intervalFactor":2,"legendFormat":"System","metric":"","refId":"C","step":120}],"timeFrom":null,"timeShift":null,"title":"Bytes: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":12,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":false},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"bytes"}],"targets":[{"expr":"sum(livebytes{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name)","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","metric":"","refId":"A","step":240}],"title":"Live Bytes: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New 
row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":9,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"avg(avg(rocksdb_read_amplification{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name))","interval":"","intervalFactor":2,"legendFormat":"Read Amplification","metric":"","refId":"A","step":120}],"timeFrom":null,"timeShift":null,"title":"Read Amplification: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":14,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD HH:mm:ss","pattern":"Time","type":"date"},{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"avg(rocksdb_read_amplification{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by 
(kubernetes_pod_node_name)","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","metric":"","refId":"A","step":240}],"title":"Read Amplification: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":18,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sum(rocksdb_num_sstables{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name))","interval":"","intervalFactor":2,"legendFormat":"SSTables","refId":"A","step":120}],"timeFrom":null,"timeShift":null,"title":"RocksDB SSTables: $node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":19,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"dateFormat":"YYYY-MM-DD HH:mm:ss","pattern":"Time","type":"date"},{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 
0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"short"}],"targets":[{"expr":"sum(rocksdb_num_sstables{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name)","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","metric":"","refId":"A","step":240}],"title":"RocksDB SSTables: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"},{"collapse":false,"editable":true,"height":"250px","panels":[{"aliasColors":{},"bars":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{"threshold1":null,"threshold1Color":"rgba(216, 200, 27, 0.27)","threshold2":null,"threshold2Color":"rgba(234, 112, 112, 0.22)"},"id":20,"isNew":true,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"null as zero","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"span":8,"stack":false,"steppedLine":false,"targets":[{"expr":"sum(sum(sys_fd_open{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name))","interval":"","intervalFactor":2,"legendFormat":"Open FDs","refId":"A","step":120},{"expr":"sum(sum(sys_fd_softlimit{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name))","intervalFactor":2,"legendFormat":"Limit","refId":"B","step":120}],"timeFrom":null,"timeShift":null,"title":"File Descriptors: 
$node","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"show":true},"yaxes":[{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"columns":[{"text":"Current","value":"current"},{"text":"Avg","value":"avg"},{"text":"Min","value":"min"},{"text":"Max","value":"max"}],"editable":true,"error":false,"fontSize":"90%","id":21,"isNew":true,"links":[],"pageSize":null,"scroll":true,"showHeader":true,"sort":{"col":0,"desc":true},"span":4,"styles":[{"colorMode":null,"colors":["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"],"decimals":2,"pattern":"/.*/","thresholds":[],"type":"number","unit":"percentunit"}],"targets":[{"expr":"sum(sys_fd_open{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name) / sum(sys_fd_softlimit{cluster=\"$cluster\",kubernetes_pod_node_name=~\"$node\"}) by (kubernetes_pod_node_name)","interval":"","intervalFactor":2,"legendFormat":"{{kubernetes_pod_node_name}}","refId":"A","step":240}],"title":"File descriptor usage: $node","transform":"timeseries_aggregations","type":"table"}],"title":"New row"}],"time":{"from":"now-24h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"templating":{"list":[{"current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":false,"label":"Cluster 
name","multi":false,"name":"cluster","options":[],"query":"sys_uptime","refresh":1,"regex":"/cluster=\"([^\"]+)\"/","type":"query"},{"allValue":".*","current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":true,"label":"Node","multi":false,"name":"node","options":[],"query":"label_values(sys_uptime{job=\"cockroach\",cluster=\"$cluster\"},kubernetes_pod_node_name)","refresh":1,"regex":"","type":"query"},{"current":{"selected":true,"text":"1m","value":"1m"},"datasource":null,"hide":0,"includeAll":false,"label":"Rate Interval","multi":false,"name":"rate_interval","options":[{"selected":false,"text":"30s","value":"30s"},{"selected":true,"text":"1m","value":"1m"},{"selected":false,"text":"5m","value":"5m"},{"selected":false,"text":"10m","value":"10m"},{"selected":false,"text":"30m","value":"30m"},{"selected":false,"text":"1h","value":"1h"},{"selected":false,"text":"6h","value":"6h"},{"selected":false,"text":"12h","value":"12h"},{"selected":false,"text":"1d","value":"1d"}],"query":"30s,1m,5m,10m,30m,1h,6h,12h,1d","refresh":0,"type":"interval"}]},"annotations":{"list":[]},"refresh":false,"schemaVersion":12,"version":15,"links":[{"asDropdown":true,"icon":"external link","includeVars":true,"keepTime":true,"tags":["cockroach"],"targetBlank":true,"title":"Dashboards","type":"dashboards"}],"gnetId":null},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} +{% endraw %} \ No newline at end of file diff --git a/roles/prometheus/templates/grafana-dashboards/http-service.json b/roles/prometheus/templates/grafana-dashboards/http-service.json new file mode 100644 index 0000000..d2fe2fc --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/http-service.json @@ -0,0 +1,249 @@ +{% raw %} + +{ + "dashboard": { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + 
"__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.4.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "id": 1, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(sum(rate(http_requests_total{job=\"kubernetes-service-endpoints\",status=~\"2..\",component=\"$component\"}[1m]))", + "interval": "1m", + "intervalFactor": 1, + "legendFormat": "2xx", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 15 + }, + { + "expr": "(sum(rate(http_requests_total{job=\"kubernetes-service-endpoints\",status=~\"5..\",component=\"$component\"}[1m])) / sum(rate(http_requests_total{job=\"kubernetes-service-endpoints\",component=\"$component\"}[1m]))) * 100", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "5xx", + "metric": "kube_pod_container_resource_requests_memory_bytes", + "refId": "B", + "step": 20 + }, + { + "expr": "sum(rate(http_requests_total{job=\"kubernetes-service-endpoints\",component=\"$component\"}[1m]))", + "interval": "1m", + 
"intervalFactor": 2, + "legendFormat": "total", + "metric": "kube_pod_container_resource_requests_memory_bytes", + "refId": "B", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Request / error rate ", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(http_request_duration_seconds_sum, kubernetes_namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "HTTP service", + "multi": false, + "name": "component", + "options": [], + "query": "label_values(http_request_duration_seconds_sum{kubernetes_namespace=~\"$namespace\"}, component)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + 
], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "HTTP service", + "version": 1 + }, + "overwrite": true, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "type": "datasource", + "pluginId": "prometheus", + "value": "prometheus" + } + ] +} +{% endraw %} diff --git a/roles/prometheus/templates/grafana-dashboards/nodes.json b/roles/prometheus/templates/grafana-dashboards/nodes.json index 8baef30..6a24def 100644 --- a/roles/prometheus/templates/grafana-dashboards/nodes.json +++ b/roles/prometheus/templates/grafana-dashboards/nodes.json @@ -1,3 +1,3 @@ {% raw %} - {"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"grafana","id":"grafana","name":"Grafana","version":"4.4.1"},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"},{"type":"panel","id":"singlestat","name":"Singlestat","version":""}],"annotations":{"list":[]},"description":"Dashboard to get an overview of one server","editable":true,"gnetId":22,"graphTooltip":0,"hideControls":false,"id":null,"links":[],"refresh":false,"rows":[{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":3,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 
100)","hide":false,"intervalFactor":10,"legendFormat":"{{cpu}}","refId":"A","step":50}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Idle cpu","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"percent","label":"cpu usage","logBase":1,"max":100,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":9,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"node_load1{instance=\"$server\"}","intervalFactor":4,"legendFormat":"load 1m","refId":"A","step":20,"target":""},{"expr":"node_load5{instance=\"$server\"}","intervalFactor":4,"legendFormat":"load 5m","refId":"B","step":20,"target":""},{"expr":"node_load15{instance=\"$server\"}","intervalFactor":4,"legendFormat":"load 15m","refId":"C","step":20,"target":""}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"System load","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"percentunit","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New 
row","titleSize":"h6"},{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":4,"legend":{"alignAsTable":false,"avg":false,"current":false,"hideEmpty":false,"hideZero":false,"max":false,"min":false,"rightSide":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}","yaxis":2}],"spaceLength":10,"span":9,"stack":true,"steppedLine":false,"targets":[{"expr":"node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}","hide":false,"interval":"","intervalFactor":2,"legendFormat":"memory used","metric":"","refId":"C","step":10},{"expr":"node_memory_Buffers{instance=\"$server\"}","interval":"","intervalFactor":2,"legendFormat":"memory buffers","metric":"","refId":"E","step":10},{"expr":"node_memory_Cached{instance=\"$server\"}","intervalFactor":2,"legendFormat":"memory cached","metric":"","refId":"F","step":10},{"expr":"node_memory_MemFree{instance=\"$server\"}","intervalFactor":2,"legendFormat":"memory free","metric":"","refId":"D","step":10}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Memory usage","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":"0","show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 
0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":5,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100","intervalFactor":2,"refId":"A","step":60,"target":""}],"thresholds":"80, 90","title":"Memory usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"},{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":6,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"read","yaxis":1},{"alias":"{instance=\"172.17.0.1:9100\"}","yaxis":2},{"alias":"io time","yaxis":2}],"spaceLength":10,"span":9,"stack":false,"steppedLine":false,"targets":[{"expr":"sum by (instance) 
(rate(node_disk_bytes_read{instance=\"$server\"}[2m]))","hide":false,"intervalFactor":4,"legendFormat":"read","refId":"A","step":20,"target":""},{"expr":"sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))","intervalFactor":4,"legendFormat":"written","refId":"B","step":20},{"expr":"sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))","intervalFactor":4,"legendFormat":"io time","refId":"C","step":20}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Disk I/O","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"ms","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percentunit","gauge":{"maxValue":1,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":7,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})","intervalFactor":2,"refId":"A","step":60,"target":""}],"thresholds":"0.75, 0.9","title":"Disk space 
usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"},{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":8,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"transmitted ","yaxis":2}],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])","hide":false,"intervalFactor":2,"legendFormat":"{{device}}","refId":"A","step":10,"target":""}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Network received","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":10,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"transmitted 
","yaxis":2}],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])","hide":false,"intervalFactor":2,"legendFormat":"{{device}}","refId":"B","step":10,"target":""}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Network transmitted","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"}],"schemaVersion":14,"style":"dark","tags":[],"templating":{"list":[{"allValue":null,"current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":false,"label":null,"multi":false,"name":"server","options":[],"query":"label_values(node_boot_time, instance)","refresh":1,"regex":"","sort":0,"tagValuesQuery":"","tags":[],"tagsQuery":"","type":"query","useTags":false}]},"time":{"from":"now-1h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"timezone":"browser","title":"Nodes","version":2},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} + 
{"dashboard":{"__inputs":[{"name":"DS_PROMETHEUS","label":"prometheus","description":"","type":"datasource","pluginId":"prometheus","pluginName":"Prometheus"}],"__requires":[{"type":"grafana","id":"grafana","name":"Grafana","version":"4.4.1"},{"type":"panel","id":"graph","name":"Graph","version":""},{"type":"datasource","id":"prometheus","name":"Prometheus","version":"1.0.0"},{"type":"panel","id":"singlestat","name":"Singlestat","version":""}],"annotations":{"list":[]},"description":"Dashboard to get an overview of one server","editable":true,"gnetId":22,"graphTooltip":0,"hideControls":false,"id":null,"links":[],"refresh":false,"rows":[{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":3,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", kubernetes_pod_node_name=\"$server\"}[5m])) * 100)","hide":false,"intervalFactor":10,"legendFormat":"{{cpu}}","refId":"A","step":50}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Idle cpu","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"percent","label":"cpu 
usage","logBase":1,"max":100,"min":0,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":9,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"node_load1{kubernetes_pod_node_name=\"$server\"}","intervalFactor":4,"legendFormat":"load 1m","refId":"A","step":20,"target":""},{"expr":"node_load5{kubernetes_pod_node_name=\"$server\"}","intervalFactor":4,"legendFormat":"load 5m","refId":"B","step":20,"target":""},{"expr":"node_load15{kubernetes_pod_node_name=\"$server\"}","intervalFactor":4,"legendFormat":"load 15m","refId":"C","step":20,"target":""}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"System load","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"percentunit","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New 
row","titleSize":"h6"},{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":4,"legend":{"alignAsTable":false,"avg":false,"current":false,"hideEmpty":false,"hideZero":false,"max":false,"min":false,"rightSide":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}","yaxis":2}],"spaceLength":10,"span":9,"stack":true,"steppedLine":false,"targets":[{"expr":"node_memory_MemTotal{kubernetes_pod_node_name=\"$server\"} - node_memory_MemFree{kubernetes_pod_node_name=\"$server\"} - node_memory_Buffers{kubernetes_pod_node_name=\"$server\"} - node_memory_Cached{kubernetes_pod_node_name=\"$server\"}","hide":false,"interval":"","intervalFactor":2,"legendFormat":"memory used","metric":"","refId":"C","step":10},{"expr":"node_memory_Buffers{kubernetes_pod_node_name=\"$server\"}","interval":"","intervalFactor":2,"legendFormat":"memory buffers","metric":"","refId":"E","step":10},{"expr":"node_memory_Cached{kubernetes_pod_node_name=\"$server\"}","intervalFactor":2,"legendFormat":"memory cached","metric":"","refId":"F","step":10},{"expr":"node_memory_MemFree{kubernetes_pod_node_name=\"$server\"}","intervalFactor":2,"legendFormat":"memory free","metric":"","refId":"D","step":10}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Memory 
usage","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"individual"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":"0","show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percent","gauge":{"maxValue":100,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":5,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"((node_memory_MemTotal{kubernetes_pod_node_name=\"$server\"} - node_memory_MemFree{kubernetes_pod_node_name=\"$server\"} - node_memory_Buffers{kubernetes_pod_node_name=\"$server\"} - node_memory_Cached{kubernetes_pod_node_name=\"$server\"}) / node_memory_MemTotal{kubernetes_pod_node_name=\"$server\"}) * 100","intervalFactor":2,"refId":"A","step":60,"target":""}],"thresholds":"80, 90","title":"Memory usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"avg"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New 
row","titleSize":"h6"},{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":6,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"read","yaxis":1},{"alias":"{instance=\"172.17.0.1:9100\"}","yaxis":2},{"alias":"io time","yaxis":2}],"spaceLength":10,"span":9,"stack":false,"steppedLine":false,"targets":[{"expr":"sum by (instance) (rate(node_disk_bytes_read{kubernetes_pod_node_name=\"$server\"}[2m]))","hide":false,"intervalFactor":4,"legendFormat":"read","refId":"A","step":20,"target":""},{"expr":"sum by (instance) (rate(node_disk_bytes_written{kubernetes_pod_node_name=\"$server\"}[2m]))","intervalFactor":4,"legendFormat":"written","refId":"B","step":20},{"expr":"sum by (instance) (rate(node_disk_io_time_ms{kubernetes_pod_node_name=\"$server\"}[2m]))","intervalFactor":4,"legendFormat":"io time","refId":"C","step":20}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Disk I/O","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"ms","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"cacheTimeout":null,"colorBackground":false,"colorValue":false,"colors":["rgba(50, 172, 45, 0.97)","rgba(237, 129, 40, 0.89)","rgba(245, 54, 54, 
0.9)"],"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"format":"percentunit","gauge":{"maxValue":1,"minValue":0,"show":true,"thresholdLabels":false,"thresholdMarkers":true},"id":7,"interval":null,"links":[],"mappingType":1,"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"maxDataPoints":100,"nullPointMode":"connected","nullText":null,"postfix":"","postfixFontSize":"50%","prefix":"","prefixFontSize":"50%","rangeMaps":[{"from":"null","text":"N/A","to":"null"}],"span":3,"sparkline":{"fillColor":"rgba(31, 118, 189, 0.18)","full":false,"lineColor":"rgb(31, 120, 193)","show":false},"tableColumn":"","targets":[{"expr":"(sum(node_filesystem_size{device!=\"rootfs\",kubernetes_pod_node_name=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",kubernetes_pod_node_name=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",kubernetes_pod_node_name=\"$server\"})","intervalFactor":2,"refId":"A","step":60,"target":""}],"thresholds":"0.75, 0.9","title":"Disk space usage","type":"singlestat","valueFontSize":"80%","valueMaps":[{"op":"=","text":"N/A","value":"null"}],"valueName":"current"}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New row","titleSize":"h6"},{"collapse":false,"height":"250px","panels":[{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":8,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"transmitted 
","yaxis":2}],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(node_network_receive_bytes{kubernetes_pod_node_name=\"$server\",device!~\"lo\"}[5m])","hide":false,"intervalFactor":2,"legendFormat":"{{device}}","refId":"A","step":10,"target":""}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Network received","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true}]},{"alerting":{},"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"${DS_PROMETHEUS}","editable":true,"error":false,"fill":1,"grid":{},"id":10,"legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":false,"values":false},"lines":true,"linewidth":2,"links":[],"nullPointMode":"connected","percentage":false,"pointradius":5,"points":false,"renderer":"flot","seriesOverrides":[{"alias":"transmitted ","yaxis":2}],"spaceLength":10,"span":6,"stack":false,"steppedLine":false,"targets":[{"expr":"rate(node_network_transmit_bytes{kubernetes_pod_node_name=\"$server\",device!~\"lo\"}[5m])","hide":false,"intervalFactor":2,"legendFormat":"{{device}}","refId":"B","step":10,"target":""}],"thresholds":[],"timeFrom":null,"timeShift":null,"title":"Network transmitted","tooltip":{"msResolution":false,"shared":true,"sort":0,"value_type":"cumulative"},"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true},{"format":"bytes","label":null,"logBase":1,"max":null,"min":null,"show":true}]}],"repeat":null,"repeatIteration":null,"repeatRowId":null,"showTitle":false,"title":"New 
row","titleSize":"h6"}],"schemaVersion":14,"style":"dark","tags":[],"templating":{"list":[{"allValue":null,"current":{},"datasource":"${DS_PROMETHEUS}","hide":0,"includeAll":false,"label":null,"multi":false,"name":"server","options":[],"query":"label_values(node_boot_time, kubernetes_pod_node_name)","refresh":1,"regex":"","sort":0,"tagValuesQuery":"","tags":[],"tagsQuery":"","type":"query","useTags":false}]},"time":{"from":"now-1h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"timezone":"browser","title":"Nodes","version":2},"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]} {% endraw %} diff --git a/roles/prometheus/templates/grafana-dashboards/template.json b/roles/prometheus/templates/grafana-dashboards/template.json new file mode 100644 index 0000000..822c008 --- /dev/null +++ b/roles/prometheus/templates/grafana-dashboards/template.json @@ -0,0 +1,15 @@ +{% raw %} +{ + "dashboard": { + ... ... 
+ }, + "overwrite": true, + "inputs": [ + { + "name": "DS_PROMETHEUS", + "type": "datasource", + "pluginId": "prometheus", + "value": "prometheus" + } + ] +}{% endraw %} \ No newline at end of file diff --git a/roles/prometheus/templates/grafana.yaml b/roles/prometheus/templates/grafana.yaml index 32e4281..eb315ab 100644 --- a/roles/prometheus/templates/grafana.yaml +++ b/roles/prometheus/templates/grafana.yaml @@ -9,32 +9,7 @@ metadata: namespace: {{ k8s_prometheus_namespace }} name: prometheus-grafana data: - capacity-planning-dashboard.json: |- -{% include 'grafana-dashboards/capacity-planning.json' %} - - cluster-health-dashboard.json: |- -{% include 'grafana-dashboards/cluster-health.json' %} - - cluster-monitoring-dashboard.json: |- -{% include 'grafana-dashboards/cluster-monitoring.json' %} - - cluster-status-dashboard.json: |- -{% include 'grafana-dashboards/cluster-status.json' %} - - control-plane-status-dashboard.json: |- -{% include 'grafana-dashboards/control-plane-status.json' %} - - deployment-dashboard.json: |- -{% include 'grafana-dashboards/deployment.json' %} - - nodes-dashboard.json: |- -{% include 'grafana-dashboards/nodes.json' %} - - pods-dashboard.json: |- -{% include 'grafana-dashboards/pods.json' %} - - resource-requests-dashboard.json: |- -{% include 'grafana-dashboards/resource-requests.json' %} +{% include 'grafana-dashboards/all.yml' %} prometheus-datasource.json: |- { @@ -133,7 +108,7 @@ spec: configMap: name: prometheus-grafana -{% if k8s_prometheus_grafana_name is defined and k8s_prometheus_grafana_name != '' %} +{% if k8s_prometheus_grafana_name != '' %} --- apiVersion: extensions/v1beta1 diff --git a/roles/prometheus/templates/scrape_configs/all.yml b/roles/prometheus/templates/scrape_configs/all.yml new file mode 100644 index 0000000..a397e59 --- /dev/null +++ b/roles/prometheus/templates/scrape_configs/all.yml @@ -0,0 +1,11 @@ +{% include 'scrape_configs/prometheus.yml' %} + +{% include 'scrape_configs/kubernetes.yml' %} + +{% 
if k8s_prometheus_scrape_istio_metrics %} +{% include 'scrape_configs/istio.yml' %} +{% endif %} + +{% if k8s_prometheus_scrape_cockroachdb_metrics %} +{% include 'scrape_configs/cockroachdb.yml' %} +{% endif %} diff --git a/roles/prometheus/templates/scrape_configs/cockroachdb.yml b/roles/prometheus/templates/scrape_configs/cockroachdb.yml new file mode 100644 index 0000000..c7727a8 --- /dev/null +++ b/roles/prometheus/templates/scrape_configs/cockroachdb.yml @@ -0,0 +1,44 @@ + - job_name: 'cockroach' + + kubernetes_sd_configs: + - role: endpoints + + tls_config: + insecure_skip_verify: true + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_name] + action: keep + regex: cockroachdb-(\d+) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: (.+)(?::\d+);(\d+) + replacement: $1:$2 + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: kubernetes_pod_node_name + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + - source_labels: [__address__] + action: replace + target_label: cluster + regex: (.+) + replacement: cockroachdb-cluster \ No newline at end of file diff --git a/roles/prometheus/templates/scrape_configs/kubernetes.yml b/roles/prometheus/templates/scrape_configs/kubernetes.yml index 46b30e1..08a5e8b 100644 --- 
a/roles/prometheus/templates/scrape_configs/kubernetes.yml +++ b/roles/prometheus/templates/scrape_configs/kubernetes.yml @@ -111,6 +111,9 @@ regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/${1}/proxy/metrics + - source_labels: [__meta_kubernetes_node_name] + action: replace + target_label: kubernetes_node_name # Scrape config for service endpoints. # @@ -128,10 +131,19 @@ kubernetes_sd_configs: - role: endpoints + tls_config: + insecure_skip_verify: true + relabel_configs: - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] action: keep regex: true +{% if k8s_prometheus_scrape_cockroachdb_metrics %} + + - source_labels: [__meta_kubernetes_pod_name] + action: drop + regex: cockroachdb-(\d+) +{% endif %} - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] action: replace target_label: __scheme__ @@ -145,17 +157,17 @@ target_label: __address__ regex: (.+)(?::\d+);(\d+) replacement: $1:$2 + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: kubernetes_pod_node_name - action: labelmap regex: __meta_kubernetes_service_label_(.+) - source_labels: [__meta_kubernetes_namespace] action: replace target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_service_name] - action: replace - target_label: kubernetes_name - - source_labels: [__meta_kubernetes_node_name] + - source_labels: [__meta_kubernetes_pod_name] action: replace - target_label: kubernetes_node_name + target_label: kubernetes_pod_name # Scrape config for pods # @@ -178,10 +190,8 @@ action: replace target_label: __metrics_path__ regex: (.+) - - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + - source_labels: [__meta_kubernetes_pod_node_name] action: replace - regex: (.+):(?:\d+);(\d+) - replacement: ${1}:${2} target_label: __address__ - action: labelmap regex: __meta_kubernetes_pod_label_(.+) @@ -199,6 +209,8 @@ # # * `prometheus.io/probe`: Only probe 
services that have a value of `true` - job_name: 'kubernetes-services' + scrape_interval: 60s + scrape_timeout: 50s metrics_path: /probe params: module: [http_2xx] @@ -219,7 +231,7 @@ - source_labels: [__meta_kubernetes_namespace] target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_service_name] - target_label: kubernetes_name + target_label: kubernetes_service_name # Scrape config for probing ingresses via the Blackbox Exporter. # @@ -228,7 +240,8 @@ # # * `prometheus.io/probe`: Only probe ingresses that have a value of `true` - job_name: 'kubernetes-ingresses' - + scrape_interval: 60s + scrape_timeout: 50s metrics_path: /probe params: module: [https_2xx] @@ -253,4 +266,4 @@ - source_labels: [__meta_kubernetes_namespace] target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_ingress_name] - target_label: kubernetes_name + target_label: kubernetes_ingress_name diff --git a/roles/prometheus/templates/scrape_configs/prometheus.yml b/roles/prometheus/templates/scrape_configs/prometheus.yml index 2ce66f3..802e3f9 100644 --- a/roles/prometheus/templates/scrape_configs/prometheus.yml +++ b/roles/prometheus/templates/scrape_configs/prometheus.yml @@ -1,4 +1,4 @@ - # A scrape configuration for monitoring itself + # A scrape configuration for short-lived jobs (ephemeral and batch jobs) # - job_name: prometheus static_configs: diff --git a/roles/prometheus/templates/server.yaml b/roles/prometheus/templates/server.yaml index cec390f..87ea667 100644 --- a/roles/prometheus/templates/server.yaml +++ b/roles/prometheus/templates/server.yaml @@ -10,19 +10,7 @@ metadata: name: prometheus-server data: alerts: |- -{% include 'alerts/common.conf' %} - groups: -{% include 'alerts/app.conf' %} - -{% include 'alerts/deployment.conf' %} - -{% include 'alerts/pods.conf' %} - -{% include 'alerts/node.conf' %} - -{% include 'alerts/ingress.conf' %} - -{% include 'alerts/service.conf' %} +{% include 'alerts/all.yml' %} prometheus.yml: |- alerting: @@ -37,11 
+25,7 @@ data: # A list of scrape configurations. scrape_configs: -{% include 'scrape_configs/prometheus.yml' %} - -{% include 'scrape_configs/kubernetes.yml' %} - -{% include 'scrape_configs/istio.yml' %} +{% include 'scrape_configs/all.yml' %} --- @@ -172,7 +156,7 @@ spec: args: - --config.file=/etc/config/prometheus.yml - --storage.tsdb.path=/data - - --storage.tsdb.retention=7d + - --storage.tsdb.retention={{ k8s_prometheus_retention_days }}d - --web.console.libraries=/etc/prometheus/console_libraries - --web.console.templates=/etc/prometheus/consoles ports: @@ -204,7 +188,7 @@ spec: - name: storage-volume emptyDir: {} -{% if k8s_prometheus_name is defined and k8s_prometheus_name != '' %} +{% if k8s_prometheus_name != '' %} --- apiVersion: extensions/v1beta1