From aa2d4e45ac0bc261733b2e2f033662605b89f4b7 Mon Sep 17 00:00:00 2001 From: mphanias <121247041+mphanias@users.noreply.github.com> Date: Mon, 11 Dec 2023 14:15:36 +0530 Subject: [PATCH] OM153 - Node exporter dashboard (#93) * OM153 - node exporter dashboard * OM153 - added node exporter alerts to docker-compose files added alert for secret-agent too --- .../dashboards/nodeexporter/hostview.json | 7291 +++++++++++++++++ config/prometheus/aerospike_rules.yml | 1 - .../aerospike_secret_agent_rules.yml | 12 + config/prometheus/node_exporter_alerts.yml | 274 + config/prometheus/prometheus.yml | 12 +- .../node_exporter_alerts_rules.template | 274 + .../templates/node_exporter_config_data.json | 44 + examples/docker/docker-compose.yml | 6 + examples/docker/easy-prometheus.yml | 2 + 9 files changed, 7914 insertions(+), 2 deletions(-) create mode 100644 config/grafana/dashboards/nodeexporter/hostview.json create mode 100644 config/prometheus/aerospike_secret_agent_rules.yml create mode 100644 config/prometheus/node_exporter_alerts.yml create mode 100644 config/prometheus/templates/node_exporter_alerts_rules.template create mode 100644 config/prometheus/templates/node_exporter_config_data.json diff --git a/config/grafana/dashboards/nodeexporter/hostview.json b/config/grafana/dashboards/nodeexporter/hostview.json new file mode 100644 index 0000000..3479896 --- /dev/null +++ b/config/grafana/dashboards/nodeexporter/hostview.json @@ -0,0 +1,7291 @@ +{ + "__requires": [ + { + "type": "panel", + "id": "bargauge", + "name": "Bar gauge", + "version": "" + }, + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.3.2" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + 
"version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "This dashboard helps visualise the details like CPU, memory, Network etc of all the nodes.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "000000004" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 261, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000004" + }, + "refId": "A" + } + ], + "title": "Overview ", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "Total number of CPU cores", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 14, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": " 
sum(group by (instance) (node_exporter_build_info{job=\"$job\",cluster_name=\"$cluster_name\"}))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Node Exporter UP (total)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "Busy state of all CPU cores together", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 14, + "x": 4, + "y": 1 + }, + "id": 360, + "links": [], + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.3.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": " min(sum by( instance) (irate(node_cpu_seconds_total{job=\"$job\", cluster_name=\"$cluster_name\",instance=~\"$node\", mode!=\"idle\"}[$__rate_interval])) / scalar(sum ( irate(node_cpu_seconds_total{job=\"$job\", cluster_name=\"$cluster_name\",instance=~\"$node\", }[$__rate_interval]) ))*100 ) ", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "Min ", + "range": false, + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": " avg(sum by( instance) 
(irate(node_cpu_seconds_total{job=\"$job\", cluster_name=\"$cluster_name\",instance=~\"$node\", mode!=\"idle\"}[$__rate_interval])) / scalar(sum ( irate(node_cpu_seconds_total{job=\"$job\", cluster_name=\"$cluster_name\",instance=~\"$node\", }[$__rate_interval]) ))*100 ) ", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "Avg", + "range": false, + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": " max(sum by( instance) (irate(node_cpu_seconds_total{job=\"$job\", cluster_name=\"$cluster_name\",instance=~\"$node\", mode!=\"idle\"}[$__rate_interval])) / scalar(sum ( irate(node_cpu_seconds_total{job=\"$job\", cluster_name=\"$cluster_name\",instance=~\"$node\", }[$__rate_interval]) ))*100 ) ", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "Max", + "range": false, + "refId": "C", + "step": 240 + } + ], + "title": "% CPU Utilization (rate)", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 358, + "links": [], + "options": { + "displayMode": "basic", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "valueMode": "color" + }, + "pluginVersion": "9.3.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": " 
min(node_filefd_allocated{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"})", + "format": "time_series", + "instant": false, + "intervalFactor": 4, + "legendFormat": "Min", + "range": true, + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": " avg(node_filefd_allocated{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"})", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 4, + "legendFormat": "Avg", + "range": true, + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": " max(node_filefd_allocated{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"})", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 4, + "legendFormat": "Max", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "File Descriptor (allocated)", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 7 + }, + "id": 345, + "links": [], + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": 
"sum(node_memory_MemTotal_bytes{job=\"$job\",cluster_name=\"$cluster_name\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Total", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Memory (total ) (bytes) ", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 7 + }, + "id": 305, + "links": [], + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(node_memory_MemFree_bytes{job=\"$job\",cluster_name=\"$cluster_name\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Unused", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Memory (Unused ) (bytes) ", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 7 + }, + "id": 320, + "links": [], + "options": { + "displayMode": "basic", + "minVizHeight": 10, + 
"minVizWidth": 0, + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "valueMode": "color" + }, + "pluginVersion": "9.3.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "min (node_memory_MemFree_bytes{job=\"$job\",cluster_name=\"$cluster_name\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Min ", + "range": true, + "refId": "min_free", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "avg (node_memory_MemFree_bytes{job=\"$job\",cluster_name=\"$cluster_name\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Avg", + "range": true, + "refId": "avg_free", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "max (node_memory_MemFree_bytes{job=\"$job\",cluster_name=\"$cluster_name\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Max", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Memory (Unused ) (bytes) ", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 7 + }, + "id": 355, + "links": [], + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + 
"displayMode": "table", + "placement": "right", + "showLegend": true, + "width": 300 + }, + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.3.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum (irate(node_netstat_Tcp_RetransSegs{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval] ))", + "format": "stat", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Network Traffic - retransmit (packets) (total) (rate)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "receive_packets_eth0" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "receive_packets_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 0, + "y": 12 + }, + "id": 60, + "links": [], + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "width": 300 + }, + "orientation": "vertical", + "reduceOptions": { + 
"calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.3.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (device) (irate(node_network_receive_bytes_total{job=\"$job\",cluster_name=\"$cluster_name\", instance=~\"$node\",}[$__rate_interval]))", + "format": "stat", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{device}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Network Traffic - Received (bytes) (rate)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "transmit_packets_eth0" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "transmit_packets_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 12 + }, + "id": 308, + "links": [], + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "width": 300 + }, + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "tooltip": { + "mode": "multi", + "sort": "none" + 
} + }, + "pluginVersion": "9.3.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (device) (irate(node_network_transmit_bytes_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval]))", + "format": "stat", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{device}}", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic - Transmit (bytes) (rate)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "transmit_packets_eth0" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "transmit_packets_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 12 + }, + "id": 354, + "links": [], + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "width": 300 + }, + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.3.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + 
"editorMode": "code", + "expr": "sum by (device) (irate(node_network_transmit_drop_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval] ))", + "format": "stat", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{device}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Network Traffic - drop (packets) (rate)", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 348, + "panels": [], + "title": "Configs comparision", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "Total number of CPU cores", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 0, + "y": 18 + }, + "id": 321, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum( count by (instance) (count by (cpu, instance) (node_cpu_seconds_total{job=\"$job\",cluster_name=\"$cluster_name\"}) ))", + "legendFormat": "Total CPU cores", + "range": true, + "refId": "A" + } + ], + "title": "CPU Cores (total)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "Total number of CPU 
cores", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 5, + "y": 18 + }, + "id": 356, + "links": [], + "maxDataPoints": 100, + "options": { + "displayMode": "lcd", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "valueMode": "text" + }, + "pluginVersion": "9.3.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": " min(count by (instance) (count by (cpu, instance) (node_cpu_seconds_total{}) ))", + "legendFormat": "Min", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": " avg(count by (instance) (count by (cpu, instance) (node_cpu_seconds_total{}) ))", + "hide": false, + "legendFormat": "Avg", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": " max(count by (instance) (count by (cpu, instance) (node_cpu_seconds_total{}) ))", + "hide": false, + "legendFormat": "Max", + "range": true, + "refId": "C" + } + ], + "title": "CPU Cores ", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, 
+ "w": 6, + "x": 10, + "y": 18 + }, + "id": 346, + "links": [], + "options": { + "displayMode": "basic", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "valueMode": "color" + }, + "pluginVersion": "9.3.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "min (node_memory_MemTotal_bytes{job=\"$job\",cluster_name=\"$cluster_name\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Min ", + "range": true, + "refId": "min_free", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "avg(node_memory_MemTotal_bytes{job=\"$job\",cluster_name=\"$cluster_name\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Avg", + "range": true, + "refId": "avg_free", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "max(node_memory_MemTotal_bytes{job=\"$job\",cluster_name=\"$cluster_name\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Max", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Total Memory (bytes) ", + "type": "bargauge" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "000000004" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 309, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "Total number of CPU cores", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 5, + "x": 0, + "y": 25 + }, + "id": 350, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(count by (cpu) (node_cpu_seconds_total{job=\"$job\", instance=~\"$node\",cluster_name=\"$cluster_name\"}) )", + "hide": false, + "legendFormat": "CPU cores {{instance}}", + "range": true, + "refId": "B" + } + ], + "title": "CPU Cores (total)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 5, + "x": 5, + "y": 25 + }, + "id": 352, + "links": [], + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "(node_memory_MemFree_bytes{job=\"$job\",instance=~\"$node\",cluster_name=\"$cluster_name\"})", + "format": "time_series", + "hide": 
false, + "intervalFactor": 1, + "legendFormat": "Unused {{instance}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Memory (Unused ) (bytes) ", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 5, + "x": 10, + "y": 25 + }, + "id": 353, + "links": [], + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "(node_memory_MemTotal_bytes{job=\"$job\",cluster_name=\"$cluster_name\", instance=~\"$node\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Total {{instance}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Memory (total ) (bytes) ", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 7, + "x": 15, + "y": 25 + }, + "id": 349, + "links": [], + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "vertical", + "reduceOptions": { + 
"calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "node_filefd_maximum{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 4, + "legendFormat": "Maximum ", + "range": true, + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "node_filefd_allocated{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 4, + "legendFormat": "Allocated", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "File Descriptor ", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "Busy state of all CPU cores together", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 319, + "links": [], + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.3.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "topk($topk_limit, (sum by(cpu, instance) (irate(node_cpu_seconds_total{instance=~\"$node\",job=\"$job\", mode!=\"idle\",cluster_name=\"$cluster_name\"}[$__rate_interval])) / on(instance) group_left sum by (instance)((irate(node_cpu_seconds_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])))) * 100)", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "Core {{cpu}}", + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "% CPU Utilization (topk) (rate)", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "Basic CPU info", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "percent" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Busy Iowait" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Idle" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] 
+ }, + { + "matcher": { + "id": "byName", + "options": "Busy Iowait" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Idle" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Busy System" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Busy User" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Busy Other" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 35 + }, + "id": 77, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "width": 200 + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by(instance) (irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"system\", cluster_name=\"$cluster_name\"}[$__rate_interval])) / on(instance) group_left sum by (instance)((irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\" ,cluster_name=\"$cluster_name\"}[$__rate_interval])))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Busy System", + "range": true, + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum 
by(instance) (irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"user\",cluster_name=\"$cluster_name\"}[$__rate_interval])) / on(instance) group_left sum by (instance)((irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Busy User", + "range": true, + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by(instance) (irate(node_cpu_seconds_total{instance=~\"$node\",job=\"$job\", mode=\"iowait\",cluster_name=\"$cluster_name\"}[$__rate_interval])) / on(instance) group_left sum by (instance)((irate(node_cpu_seconds_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Busy Iowait", + "range": true, + "refId": "C", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by(instance) (irate(node_cpu_seconds_total{instance=~\"$node\",job=\"$job\", mode=~\".*irq\",cluster_name=\"$cluster_name\"}[$__rate_interval])) / on(instance) group_left sum by (instance)((irate(node_cpu_seconds_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Busy IRQs", + "range": true, + "refId": "D", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by(instance) (irate(node_cpu_seconds_total{instance=~\"$node\",job=\"$job\", mode!='idle',mode!='user',mode!='system',mode!='iowait',mode!='irq',mode!='softirq',cluster_name=\"$cluster_name\"}[$__rate_interval])) / on(instance) group_left sum by 
(instance)((irate(node_cpu_seconds_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Busy Other", + "range": true, + "refId": "E", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by(instance) (irate(node_cpu_seconds_total{instance=~\"$node\",job=\"$job\", mode=\"idle\",cluster_name=\"$cluster_name\"}[$__rate_interval])) / on(instance) group_left sum by (instance)((irate(node_cpu_seconds_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Idle", + "range": true, + "refId": "F", + "step": 240 + } + ], + "title": "CPU Info (%) (rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Apps" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": 
"#629E51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RAM_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unused" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unused - Free memory unassigned" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 35 + }, + "id": 311, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "width": 200 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "(node_memory_MemTotal_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Total", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "(node_memory_MemFree_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Unused", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "(node_memory_MemTotal_bytes - 
node_memory_MemFree_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Used", + "refId": "C", + "step": 240 + } + ], + "title": "Memory Details (Bytes)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "faults", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Apps" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#629E51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + 
"options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#CFFAFF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RAM_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#806EB7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": 
"fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#2F575E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unused" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Pgfault - Page major and minor fault operations" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 41 + }, + "id": 175, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "irate(node_vmstat_pgfault{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pgfault", + "range": true, + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_vmstat_pgmajfault{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PgMajFault", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_vmstat_pgfault{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval]) - 
irate(node_vmstat_pgmajfault{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PgMinFault", + "refId": "C", + "step": 240 + } + ], + "title": "Memory Page Faults (rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "Apps- Memory used by user-space applications; PageTables- Memory used to map between virtual and physical memory addresses; SwapCache- Memory that keeps track of pages that have been fetched from swap but not yet been modified; Slab- Memory used by the kernel to cache data structures for its own use (caches like inode, dentry, etc.); Cache- Parked file data (file content) cache; Buffers- Block device (e.g. hard disk) cache; Unused- Free memory unassigned; Swap- Swap space used; Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Apps" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#629E51", +
"mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#CFFAFF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RAM_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab" + }, + "properties": [ + { + 
"id": "color", + "value": { + "fixedColor": "#806EB7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap - Swap memory usage" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#2F575E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unused" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unused - Free memory unassigned" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Hardware Corrupted - *./" + }, + "properties": [ + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 41 + }, + "id": 314, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": 
"prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "node_memory_MemTotal_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"} - node_memory_MemFree_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"} - node_memory_Buffers_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"} - node_memory_Cached_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"} - node_memory_Slab_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"} - node_memory_PageTables_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"} - node_memory_SwapCached_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Apps", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "node_memory_PageTables_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "PageTables", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "node_memory_SwapCached_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "SwapCache", + "refId": "C", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "node_memory_Slab_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Slab", + "refId": "D", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": 
"node_memory_Cached_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Cache", + "refId": "E", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "node_memory_Buffers_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Buffers", + "refId": "F", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "node_memory_MemFree_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Unused", + "refId": "G", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "(node_memory_SwapTotal_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"} - node_memory_SwapFree_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Swap", + "refId": "H", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "node_memory_HardwareCorrupted_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Hardware Corrupted", + "refId": "I", + "step": 240 + } + ], + "title": "Memory Stack (Bytes)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "pages out (-) / in (+)", + "axisPlacement": 
"auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 47 + }, + "id": 176, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_vmstat_pgpgin{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PagesIn", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_vmstat_pgpgout{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PagesOut", + "refId": "B", + "step": 240 + } + ], + "title": "Memory Pages In / Out (rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "fieldConfig": { + 
"defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "pages out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 47 + }, + "id": 22, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_vmstat_pswpin{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PgSwapIn", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_vmstat_pswpout{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PgSwapOut", + "refId": "B", + "step": 240 + 
} + ], + "title": "Memory Pages Swap In / Out (rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "The number (after merges) of I/O requests completed per second for the device", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "IO read (-) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "iops" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": 
"fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda2_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BA43A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda3_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F4D598", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#962D82", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#9AC48A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + 
"fixedColor": "#65C5DB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9934E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FCEACA", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9E2D2", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 53 + }, + "id": 9, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "width": 260 + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_disk_reads_completed_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "intervalFactor": 4, + "legendFormat": "{{device}} - Reads", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_disk_writes_completed_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "intervalFactor": 1, + "legendFormat": "{{device}} - Writes", + "refId": "B", + "step": 240 + } 
+ ], + "title": "Disk IOps Completed (rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "The number of bytes read from or written to the device per second", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes read (-) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, 
+ { + "matcher": { + "id": "byRegexp", + "options": "/.*sde_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda2_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BA43A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda3_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F4D598", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#962D82", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#9AC48A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#65C5DB", + 
"mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9934E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FCEACA", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9E2D2", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 53 + }, + "id": 33, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "width": 260 + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_disk_read_bytes_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "{{device}} - Read", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_disk_written_bytes_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Written", + "refId": 
"B", + "step": 240 + } + ], + "title": "Disk R/W Data (Bytes) (rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "The average time for requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "time. read (-) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd_.*/" + }, + 
"properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda2_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BA43A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda3_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F4D598", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#962D82", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#9AC48A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + 
"options": "/.*sdd1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#65C5DB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9934E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FCEACA", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9E2D2", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 59 + }, + "id": 313, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "width": 260 + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_disk_read_time_seconds_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval]) / irate(node_disk_reads_completed_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "hide": false, + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}} -Read", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_disk_write_time_seconds_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval]) / irate(node_disk_writes_completed_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{device}} -Write", + "refId": "B", + "step": 240 + } + ], + "title": "Disk Average Wait Time (seconds) (rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "The average queue length of the requests that were issued to the device", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "aqu-sz", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc_.*/" + }, + "properties": 
[ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda2_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BA43A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda3_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F4D598", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#962D82", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": 
"/.*sdc3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#9AC48A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#65C5DB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9934E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FCEACA", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9E2D2", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 59 + }, + "id": 35, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "width": 260 + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_disk_io_time_weighted_seconds_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}}", + "refId": "A", + "step": 240 + } + ], + "title": "Disk - Average Queue Size 
(rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "%util", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "io time" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byType", + "options": "time" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "hidden" + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 65 + }, + "id": 312, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "width": 260 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_disk_io_time_seconds_total{instance=~\"$node\",job=\"$job\",device=~\"$diskdevices\",cluster_name=\"$cluster_name\"} [$__rate_interval])", + "format": "time_series", + "hide": 
false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{device}}", + "refId": "A", + "step": 240 + } + ], + "title": "Disk - I/O Utilization (%) (rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "Percentage of elapsed time during which I/O requests were issued to the device (bandwidth utilization for the device). Device saturation occurs when this value is close to 100% for devices serving requests serially. But for devices serving requests in parallel, such as RAID arrays and modern SSDs, this number does not reflect their performance limits.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "%util", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc_.*/" + }, + "properties": [ + { + "id": "color", + 
"value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda2_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BA43A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sda3_.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F4D598", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdb3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#962D82", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc3.*/" + }, + 
"properties": [ + { + "id": "color", + "value": { + "fixedColor": "#9AC48A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#65C5DB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9934E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd2.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FCEACA", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F9E2D2", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 65 + }, + "id": 36, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "width": 260 + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_disk_io_time_seconds_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}} - IO", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_disk_discard_time_seconds_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}} - discard", + "refId": "B", + "step": 240 + } + ], + "title": "Disk - Time Spent Doing I/Os (%) (rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 71 + }, + "id": 43, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "node_filesystem_avail_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\",device!~'rootfs'}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": 
"{{mountpoint}}", + "metric": "", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "node_filesystem_free_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\",device!~'rootfs'}", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "legendFormat": "{{mountpoint}} - Free", + "refId": "B", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "node_filesystem_size_bytes{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\",device!~'rootfs'}", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "legendFormat": "{{mountpoint}} - Size", + "refId": "C", + "step": 240 + } + ], + "title": "Filesystem space available (Bytes)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "files", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 71 + }, + "id": 315, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], 
+ "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "node_filefd_maximum{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "Max ", + "range": true, + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "node_filefd_allocated{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Open files", + "refId": "B", + "step": 240 + } + ], + "title": "File Descriptor", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "InSegs - Segments received, including those received in error. 
This count includes segments received on currently established connections; OutSegs - Segments sent, including those on current connections but excluding those containing only retransmitted octets", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "datagrams out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Snd.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 77 + }, + "id": 318, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_netstat_Tcp_InSegs{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + 
"interval": "", + "intervalFactor": 1, + "legendFormat": "InSegs", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_netstat_Tcp_OutSegs{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "OutSegs", + "refId": "B", + "step": 240 + } + ], + "title": "TCP In / Out (rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "description": "CurrEstab - TCP connections for which the current state is either ESTABLISHED or CLOSE- WAIT", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "connections", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*MaxConn *./" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 77 + }, + "id": 85, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + 
"displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "node_netstat_Tcp_CurrEstab{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "TCP connections", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "node_netstat_Tcp_MaxConn{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "MaxConn - Limit on the total number of TCP connections the entity can support (Dynamic is \"-1\")", + "refId": "B", + "step": 240 + } + ], + "title": "TCP Connections", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": 
"byName", + "options": "receive_packets_eth0" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "receive_packets_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "transmit_packets_eth0" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "transmit_packets_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Trans.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 83 + }, + "id": 316, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_network_receive_packets_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_network_transmit_packets_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": 
"{{device}} - Transmit", + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic (packets) (rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "KBs" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "receive_packets_eth0" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "receive_packets_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "transmit_packets_eth0" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "transmit_packets_lo" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Trans.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + 
] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 83 + }, + "id": 317, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_network_receive_bytes_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_network_transmit_bytes_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{device}} - Transmit", + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic by Bytes (rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": 
[], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Trans.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 89 + }, + "id": 146, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_network_receive_multicast_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive", + "refId": "A", + "step": 240 + } + ], + "title": "Network Traffic Multicast (packets) (rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + 
{ + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Trans.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 89 + }, + "id": 142, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_network_receive_errs_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_network_transmit_errs_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Transmit", + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Errors (packets) (rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + 
"scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Trans.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 95 + }, + "id": 143, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_network_receive_drop_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "expr": "irate(node_network_transmit_drop_total{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Transmit", + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Drop (packets) (rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets out (-) / in 
(+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Trans.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 95 + }, + "id": 361, + "links": [], + "options": { + "legend": { + "calcs": [ + "last", + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": " (irate(node_netstat_Tcp_RetransSegs{instance=~\"$node\",job=\"$job\",cluster_name=\"$cluster_name\"}[$__rate_interval] ))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "range": true, + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Transmit", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Retransmit (packets) (rate)", + "type": "timeseries" + } + ], + "repeat": "node", + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "000000004" + }, + "refId": "A" + } + ], + "title": "$node", + "type": "row" + } + ], + "refresh": "", + "revision": 1, + "schemaVersion": 37, + "style": "dark", + "tags": [ + "linux", + "Monitoring" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Aerospike Prometheus", + "value": "Aerospike Prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "datasource", + "multi": false, + "name": "DS_AEROSPIKE_PROMETHEUS", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "definition": "", + "hide": 0, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "options": [], + "query": { + "query": "label_values(node_uname_info, job)", + "refId": "Prometheus-job-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "definition": "label_values(node_uname_info{job=~\"$job\"}, cluster_name)", + "hide": 0, + "includeAll": false, + "label": "cluster_name", + "multi": false, + "name": "cluster_name", + "options": [], + "query": { + "query": "label_values(node_uname_info{job=~\"$job\"}, cluster_name)", + "refId": "Prometheus-cluster-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_AEROSPIKE_PROMETHEUS}" + }, + "definition": "label_values(node_uname_info{job=\"$job\", cluster_name=\"$cluster_name\"}, instance)", + "hide": 0, + "includeAll": true, 
+ "label": "Nodes", + "multi": true, + "name": "node", + "options": [], + "query": { + "query": "label_values(node_uname_info{job=\"$job\", cluster_name=\"$cluster_name\"}, instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "10", + "value": "10" + }, + "hide": 0, + "label": "Top-K", + "name": "topk_limit", + "options": [ + { + "selected": true, + "text": "2", + "value": "2" + } + ], + "query": "10", + "skipUrlSync": false, + "type": "textbox" + }, + { + "current": { + "selected": false, + "text": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", + "value": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+" + }, + "hide": 2, + "includeAll": false, + "multi": false, + "name": "diskdevices", + "options": [ + { + "selected": true, + "text": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", + "value": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+" + } + ], + "query": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Aerospike Host View", + "uid": "U5PWhoNIZ", + "version": 4, + "weekStart": "" +} \ No newline at end of file diff --git a/config/prometheus/aerospike_rules.yml b/config/prometheus/aerospike_rules.yml index 4da8a53..0f813d7 100644 --- a/config/prometheus/aerospike_rules.yml +++ b/config/prometheus/aerospike_rules.yml @@ -19,7 +19,6 @@ groups: summary: "Node {{ $labels.instance }} down" description: "{{ $labels.instance }} node is down." 
- - name: aerospike_aerospike.rules > NAMESPACE rules: - alert: NamespaceStopWrites diff --git a/config/prometheus/aerospike_secret_agent_rules.yml b/config/prometheus/aerospike_secret_agent_rules.yml new file mode 100644 index 0000000..0dbcb09 --- /dev/null +++ b/config/prometheus/aerospike_secret_agent_rules.yml @@ -0,0 +1,12 @@ +groups: +- name: secret_agent.rules + rules: + - alert: AerospikeSecretAgentDown + expr: absent(aerospike_sa_connections_active{job="aerospike_secret_agent"}) == 1 + for: 1m + labels: + severity: critical + annotations: + summary: "Aerospike Secret Agent instance {{ $labels.instance }} in cluster {{$labels.cluster_name}} has been down for more than 1m." + description: "Aerospike Secret Agent instance {{ $labels.instance }} in cluster {{$labels.cluster_name}} has been down for more than 1m." + diff --git a/config/prometheus/node_exporter_alerts.yml b/config/prometheus/node_exporter_alerts.yml new file mode 100644 index 0000000..f6373c2 --- /dev/null +++ b/config/prometheus/node_exporter_alerts.yml @@ -0,0 +1,274 @@ +groups: +- name: node_exporter_alerts + rules: + - alert: HostNodeExporterDownCritical + expr: up{job="node-exporter"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host ({{ $labels.instance }}) is down in cluster {{ $labels.cluster_name }} " + description: "Failed to scrape {{ $labels.job }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} for more than 1 minute. node-exporter seems down."
+ + - alert: HostMemoryFillingUpWarn + expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100 ) > 70 + for: 1m + labels: + severity: warn + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Memory is filling up (> 70%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} " + + - alert: HostMemoryFillingUpCritical + expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100 ) > 90 + for: 1m + labels: + severity: critical + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Node memory is filling up (> 90%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} " + + - alert: HostDiskSpaceFillingUpWarn + expr: 100 - (node_filesystem_avail_bytes{job="node-exporter"} * 100) / node_filesystem_size_bytes{job="node-exporter"} > 70 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is crossing (> 70% ) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} " + + - alert: HostDiskSpaceFillingUpCritical + expr: 100 - (node_filesystem_avail_bytes{job="node-exporter"} * 100) / node_filesystem_size_bytes{job="node-exporter"} > 90 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is crossing (> 90% ) on host {{ $labels.instance }} of cluster {{
$labels.cluster_name }}. VALUE = {{ $value }} " + + - alert: HostInodesFillingUpWarn + expr: 100 - node_filesystem_files_free{job="node-exporter"} / node_filesystem_files{job="node-exporter"} * 100 > 70 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host inodes filling Up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 70%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostInodesFillingUpCritical + expr: 100 - node_filesystem_files_free{job="node-exporter"} / node_filesystem_files{job="node-exporter"} * 100 > 90 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host inodes filling Up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 90%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadLatencyWarn + expr: (node_disk_read_time_seconds_total{job="node-exporter"}) / (node_disk_reads_completed_total{job="node-exporter"}) > 0.1 and (node_disk_reads_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read latency is increasing (read operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadLatencyCritical + expr: (node_disk_read_time_seconds_total{job="node-exporter"}) / (node_disk_reads_completed_total{job="node-exporter"}) > 0.5 and (node_disk_reads_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read latency is increasing (read operations > 0.5s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} " + + - alert: HostUnusualDiskWriteLatencyWarn + expr: (node_disk_write_time_seconds_total{job="node-exporter"}) / (node_disk_writes_completed_total{job="node-exporter"}) > 0.1 and (node_disk_writes_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteLatencyCritical + expr: (node_disk_write_time_seconds_total{job="node-exporter"}) / (node_disk_writes_completed_total{job="node-exporter"}) > 0.5 and (node_disk_writes_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk write latency ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.5s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationWarn(Host) + expr: sum by (instance) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 70 + for: 30s + labels: + severity: warn + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 70%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationCritical(Host) + expr: sum by (instance) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 90 + for: 30s + labels: + severity: critical + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 90%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationWarn(Core) + expr: sum by (instance, cpu) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 70 + for: 30s + labels: + severity: warn + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 70%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationCritical(Core) + expr: sum by (instance, cpu) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 90 + for: 30s + labels: + severity: critical + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 90%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostCpuStealWarn(Host) + expr: sum by (instance)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 3% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. 
" + + - alert: HostCpuStealCritical(Host) + expr: sum by (instance)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is (> 5%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. " + + - alert: HostCpuStealWarn(Core) + expr: sum by (instance, cpu)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is (> 3%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. " + + - alert: HostCpuStealCritical(Core) + expr: sum by (instance ,cpu)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is (> 5%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. 
" + + - alert: HostNetworkReceiveErrorsWarn + expr: (rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m])) * 100 > 3 + for: 1m + labels: + severity: warn + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance interface has encountered {{ $value }} receive errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} " + + - alert: HostNetworkReceiveErrorsCritical + expr: (rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m])) * 100 > 5 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance interface has encountered {{ $value }} receive errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} " + + - alert: HostNetworkTransmitErrorsWarn + expr: (rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m])) * 100 > 3 + for: 1m + labels: + severity: warn + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has encountered {{ $value }} transmit errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }}" + + - alert: HostNetworkTransmitErrorsCritical + expr: (rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m])) * 100 > 5 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has encountered {{ $value }} transmit errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedWarn + expr: (rate(node_network_receive_bytes_total{job="node-exporter"}[2m]) + rate(node_network_transmit_bytes_total{job="node-exporter"}[2m])) / (node_network_speed_bytes{job="node-exporter"}) > 0.8 + for: 1m + labels: + severity: warn + annotations: + summary: "Host Network Interface Saturated ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface is getting overloaded (> 0.8) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} {{ $value }}. VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedCritical + expr: (rate(node_network_receive_bytes_total{job="node-exporter"}[2m]) + rate(node_network_transmit_bytes_total{job="node-exporter"}[2m])) / (node_network_speed_bytes{job="node-exporter"}) > 0.9 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Interface Saturated on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface is getting overloaded (> 0.9) {{ $value }} on host {{ $labels.instance }}:{{ $labels.device }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }}" + + - alert: HostClockNotSynchronisingWarn + expr: min_over_time(node_timex_sync_status{job="node-exporter"}[2m]) == 0 and node_timex_maxerror_seconds{job="node-exporter"} >= 16 + for: 2m + labels: + severity: warn + annotations: + summary: "Host clock not synchronising on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Clock not synchronising on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostSwapInWarn + expr: increase(node_vmstat_pswpin{job="node-exporter"}[2m]) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host PageSwapIn value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn(data from swap space on disk back into the physical memory (RAM)) value exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostSwapInCritical + expr: increase(node_vmstat_pswpin{job="node-exporter"}[2m]) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "Host PageSwapIn value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn(data from swap space on disk back into the physical memory (RAM)) value exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." 
+ + - alert: HostSwapOutWarn + expr: increase(node_vmstat_pswpout{job="node-exporter"}[2m]) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host PageSwapOut value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut(move data from RAM to swap space on disk to free up space in memory) value exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." 
+ + - alert: HostSwapOutCritical + expr: increase(node_vmstat_pswpout{job="node-exporter"}[2m]) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "Host PageSwapOut value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut(move data from RAM to swap space on disk to free up space in physical memory) value exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + diff --git a/config/prometheus/prometheus.yml b/config/prometheus/prometheus.yml index a5f3f17..076c486 100644 --- a/config/prometheus/prometheus.yml +++ b/config/prometheus/prometheus.yml @@ -21,9 +21,11 @@ rule_files: # new severities are info, warn, error, critical in the increasing order of severity # these priority are updates in aerospike_rules.yml # - - "/etc/prometheus/deprecated_aerospike_rules.yml" + # - "/etc/prometheus/deprecated_aerospike_rules.yml" - "/etc/prometheus/aerospike_rules.yml" - "/etc/prometheus/aerospike_connector_rules.yml" + - "/etc/prometheus/node_exporter_alerts.yml" + - "/etc/prometheus/aerospike_secret_agent_rules.yml" # A scrape configuration containing exactly one endpoint to scrape: # Here it's Prometheus itself. @@ -36,3 +38,11 @@ scrape_configs: static_configs: - targets: ['172.17.0.2:9145', '172.17.0.3:9145', '172.17.0.4:9145'] + + - job_name: 'node-exporter' + + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. 
+ + static_configs: + - targets: ['172.17.0.2:9100', '172.17.0.3:9100', '172.17.0.4:9100'] diff --git a/config/prometheus/templates/node_exporter_alerts_rules.template b/config/prometheus/templates/node_exporter_alerts_rules.template new file mode 100644 index 0000000..a3907d8 --- /dev/null +++ b/config/prometheus/templates/node_exporter_alerts_rules.template @@ -0,0 +1,274 @@ +groups: +- name: node_exporter_alerts + rules: + - alert: HostNodeExporterDownCritical + expr: up{job="node-exporter"} == 0 + for: {{ node_down_critical_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host ({{ $labels.instance }}) is down in cluster {{ $labels.cluster_name }} {% endraw %}" + description: "Failed to scrape{% raw %} ({{ $labels.job }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} {% endraw %} for more than {{ node_down_critical_duration }} and node-exporter seems down." + + - alert: HostMemoryFillingUpWarn + expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100 ) > {{ node_warn_memory_pct }} + for: {{ node_warn_outofmemory_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Memory is filling up (> {{ node_warn_memory_pct }}%) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %} " + + - alert: HostMemoryFillingUpCritical + expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100 ) > {{ node_critical_memory_pct }} + for: {{ node_critical_outofmemory_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Node memory is filling up (> {{ node_critical_memory_pct 
}}%) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}{% endraw %} " + + - alert: HostDiskSpaceFillingUpWarn + expr: 100 - (node_filesystem_avail_bytes{job="node-exporter"} * 100) / node_filesystem_size_bytes{job="node-exporter"} > {{ node_warn_OutOfdisk_pct }} and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: {{ node_warn_outOfDisk_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host disk space is filling up on ({{ $labels.instance }})of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk is crossing (> {{ node_warn_OutOfdisk_pct }}% ) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %} " + + - alert: HostDiskSpaceFillingUpCritical + expr: 100 - (node_filesystem_avail_bytes{job="node-exporter"} * 100) / node_filesystem_size_bytes{job="node-exporter"} > {{ node_critical_OutOfdisk_pct }} and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: {{ node_critical_outOfDisk_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host disk space is filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk is crossing (> {{ node_critical_OutOfdisk_pct }}% ) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }}{% endraw %} " + + - alert: HostInodesFillingUpWarn + expr: 100 - node_filesystem_files_free{job="node-exporter"} / node_filesystem_files{job="node-exporter"} * 100 > {{ node_warn_OutOfInodes_pct }} and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: {{ node_warn_outOfInodes_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host inodes filling Up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk is running out of available inodes (> {{ node_warn_OutOfInodes_pct }}%) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} {% endraw %}" + + - alert: HostInodesFillingUpCritical + expr: 100 - node_filesystem_files_free{job="node-exporter"} / node_filesystem_files{job="node-exporter"} * 100 > {{ node_critical_OutOfInodes_pct }} and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: {{ node_critical_outOfInodes_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host inodes filling Up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk is running out of available inodes (> {{ node_critical_OutOfInodes_pct }}%) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }} {% endraw %}" + + - alert: HostUnusualDiskReadLatencyWarn + expr: rate(node_disk_read_time_seconds_total{job="node-exporter"}[2m]) / rate(node_disk_reads_completed_total{job="node-exporter"}[2m]) > {{ node_warn_unusual_disklatency_time }} and rate(node_disk_reads_completed_total{job="node-exporter"}[2m]) > 0 + for: {{ node_warn_unusual_diskRead_latency_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk read latency is increasing (read operations > {{ node_warn_unusual_disklatency_time }}s) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}{% endraw %} " + + - alert: HostUnusualDiskReadLatencyCritical + expr: rate(node_disk_read_time_seconds_total{job="node-exporter"}[2m]) / rate(node_disk_reads_completed_total{job="node-exporter"}[2m]) > {{ node_critical_unusual_disklatency_time }} and rate(node_disk_reads_completed_total{job="node-exporter"}[2m]) > 0 + for: {{ node_critical_unusual_diskRead_latency_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk read latency is increasing (read operations > {{ node_critical_unusual_disklatency_time }}s) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }}{% endraw %} " + + - alert: HostUnusualDiskWriteLatencyWarn + expr: rate(node_disk_write_time_seconds_total{job="node-exporter"}[2m]) / rate(node_disk_writes_completed_total{job="node-exporter"}[2m]) > {{ node_warn_unusual_disklatency_time }} and rate(node_disk_writes_completed_total{job="node-exporter"}[2m]) > 0 + for: {{ node_warn_unusual_diskWrite_latency_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk write latency is increasing (write operations > {{ node_warn_unusual_disklatency_time }}s) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}{% endraw %}" + + - alert: HostUnusualDiskWriteLatencyCritical + expr: rate(node_disk_write_time_seconds_total{job="node-exporter"}[2m]) / rate(node_disk_writes_completed_total{job="node-exporter"}[2m]) > {{ node_critical_unusual_disklatency_time }} and rate(node_disk_writes_completed_total{job="node-exporter"}[2m]) > 0 + for: {{ node_critical_unusual_diskWrite_latency_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host unusual disk write latency ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "Disk write latency is increasing (write operations > {{ node_critical_unusual_disklatency_time }}s) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }}{% endraw %}" + + - alert: HostHighCpuUtilizationWarn(Host) + expr: sum by (instance) (rate(node_cpu_seconds_total{mode!="idle", job="node-exporter"}[2m])) / on(instance) group_left sum by (instance)(rate(node_cpu_seconds_total{job="node-exporter"}[2m])) * 100 > {{ node_warn_highCPU_pct }} + for: {{ node_high_cpuload_threshold_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "CPU utilization is crossing (> {{ node_warn_highCPU_pct }}%) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}{% endraw %}" + + - alert: HostHighCpuUtilizationCritical(Host) + expr: sum by (instance) (rate(node_cpu_seconds_total{mode!="idle", job="node-exporter"}[2m])) / on(instance) group_left sum by (instance)(rate(node_cpu_seconds_total{job="node-exporter"}[2m])) * 100 > {{ node_critical_highCPU_pct }} + for: {{ node_high_cpuload_threshold_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "CPU utilization is crossing (> {{ node_critical_highCPU_pct }}%) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }}{% endraw %}" + + - alert: HostHighCpuUtilizationWarn(Core) + expr: sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!="idle", job="node-exporter"}[2m])) / on(instance, cpu) sum by (instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter"}[2m])) * 100 > {{ node_warn_highCPU_pct }} + for: {{ node_high_cpuload_threshold_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "CPU utilization is crossing (> {{ node_warn_highCPU_pct }}%) {% raw %}for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}{% endraw %}" + + - alert: HostHighCpuUtilizationCritical(Core) + expr: sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!="idle", job="node-exporter"}[2m])) / on(instance, cpu) sum by (instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter"}[2m])) * 100 > {{ node_critical_highCPU_pct }} + for: {{ node_high_cpuload_threshold_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "CPU utilization is crossing (> {{ node_critical_highCPU_pct }}%) {% raw %}for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }}{% endraw %}" + + - alert: HostCpuStealWarn(Host) + expr: sum by (instance)(rate(node_cpu_seconds_total{mode="steal", job="node-exporter"}[2m])) / on(instance) group_left sum by (instance)(rate(node_cpu_seconds_total{job="node-exporter"}[2m])) * 100 > {{ node_warn_cpu_steal_pct }} + for: {{ node_cpu_steal_threshold_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "CPU steal is > {{ node_warn_cpu_steal_pct }}% {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}{% endraw %}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. " + + - alert: HostCpuStealCritical(Host) + expr: sum by (instance)(rate(node_cpu_seconds_total{mode="steal", job="node-exporter"}[2m])) / on(instance) group_left sum by (instance)(rate(node_cpu_seconds_total{job="node-exporter"}[2m])) * 100 > {{ node_critical_cpu_steal_pct }} + for: {{ node_cpu_steal_threshold_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "CPU steal is (> {{ node_critical_cpu_steal_pct }}%) {% raw %}on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}{% endraw %}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. 
" + + - alert: HostCpuStealWarn(Core) + expr: sum by (instance, cpu)(rate(node_cpu_seconds_total{mode="steal", job="node-exporter"}[2m])) / on(instance, cpu) sum by (instance, cpu)(rate(node_cpu_seconds_total{job="node-exporter"}[2m])) * 100 > {{ node_warn_cpu_steal_pct }} + for: {{ node_cpu_steal_threshold_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "CPU steal is (> {{ node_warn_cpu_steal_pct }}%) {% raw %}for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}{% endraw %}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. " + + - alert: HostCpuStealCritical(Core) + expr: sum by (instance, cpu)(rate(node_cpu_seconds_total{mode="steal", job="node-exporter"}[2m])) / on(instance, cpu) sum by (instance, cpu)(rate(node_cpu_seconds_total{job="node-exporter"}[2m])) * 100 > {{ node_critical_cpu_steal_pct }} + for: {{ node_cpu_steal_threshold_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "CPU steal is (> {{ node_critical_cpu_steal_pct }}%) {% raw %}for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}{% endraw %}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. 
" + + - alert: HostNetworkReceiveErrorsWarn + expr: (rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m])) * 100 > {{ node_warn_network_err }} + for: {{ node_network_receiveError_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "{% raw %}Instance interface has encountered {{ $value }} receive errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} {% endraw %}" + + - alert: HostNetworkReceiveErrorsCritical + expr: (rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m])) * 100 > {{ node_critical_network_err }} + for: {{ node_network_receiveError_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "{% raw %}Instance interface has encountered {{ $value }} receive errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} {% endraw %}" + + - alert: HostNetworkTransmitErrorsWarn + expr: (rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m])) * 100 > {{ node_warn_network_err }} + for: {{ node_network_transmitError_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "{% raw %}Instance has encountered {{ $value }} transmit errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }}{% endraw %}" + + - alert: HostNetworkTransmitErrorsCritical + expr: (rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m])) * 100 > {{ node_critical_network_err }} + for: {{ node_network_transmitError_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "{% raw %}Instance has encountered {{ $value }} transmit errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}{% endraw %}" + + - alert: HostNetworkInterfaceSaturatedWarn + expr: (rate(node_network_receive_bytes_total{job="node-exporter"}[2m]) + rate(node_network_transmit_bytes_total{job="node-exporter"}[2m])) / (node_network_speed_bytes{job="node-exporter"}) > {{ node_warn_network_interface_saturation }} + for: {{ node_network_interface_saturated_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host Network Interface Saturated ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "The network interface is getting overloaded (> {{ node_warn_network_interface_saturation }}) on host {% raw %} {{ $labels.instance }} of cluster {{ $labels.cluster_name }} {{ $value }}. 
VALUE = {{ $value }}{% endraw %}" + + - alert: HostNetworkInterfaceSaturatedCritical + expr: (rate(node_network_receive_bytes_total{job="node-exporter"}[2m]) + rate(node_network_transmit_bytes_total{job="node-exporter"}[2m])) / (node_network_speed_bytes{job="node-exporter"}) > {{ node_critical_network_interface_saturation }} + for: {{ node_network_interface_saturated_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host Network Interface Saturated on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "The network interface is getting overloaded (> {{ node_critical_network_interface_saturation }}) {% raw %}{{ $value }} on host {{ $labels.instance }}:{{ $labels.device }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}{% endraw %}" + + - alert: HostClockNotSynchronisingWarn + expr: min_over_time(node_timex_sync_status{job="node-exporter"}[{{ node_clock_notsync_duration }}]) == 0 and node_timex_maxerror_seconds{job="node-exporter"} >= {{ node_warn_clock_duration }} + for: {{ node_clock_notsync_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host clock not synchronising on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "{% raw %}Clock not synchronising on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }}{% endraw %}" + + - alert: HostSwapInWarn + expr: increase(node_vmstat_pswpin{job="node-exporter"}[2m]) > {{ node_warn_swapPages_count }} + for: {{ node_warn_swap_in_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host PageSwapIn value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "PageSwapIn(data from swap space on disk back into the physical memory (RAM)) value exceeds {{ node_warn_swapPages_count }} on host {% raw %}{{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
Current value is {{ $value }}.{% endraw %}" + + - alert: HostSwapInCritical + expr: increase(node_vmstat_pswpin{job="node-exporter"}[2m]) > {{ node_critical_swapPages_count }} + for: {{ node_critical_swap_in_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host PageSwapIn value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "PageSwapIn(data from swap space on disk back into the physical memory (RAM)) value exceeds {{ node_critical_swapPages_count }} on host {% raw %}{{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}.{% endraw %}" + + - alert: HostSwapOutWarn + expr: increase(node_vmstat_pswpout{job="node-exporter"}[2m]) > {{ node_warn_swapPages_count }} + for: {{ node_warn_swap_out_duration }} + labels: + severity: warn + annotations: + summary: "{% raw %}Host PageSwapOut value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "PageSwapOut(move data from RAM to swap space on disk to free up space in memory) value exceeds {{ node_warn_swapPages_count }} on host {% raw %}{{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}.{% endraw %}" + + - alert: HostSwapOutCritical + expr: increase(node_vmstat_pswpout{job="node-exporter"}[2m]) > {{ node_critical_swapPages_count }} + for: {{ node_critical_swap_out_duration }} + labels: + severity: critical + annotations: + summary: "{% raw %}Host PageSwapOut value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}{% endraw %}" + description: "PageSwapOut(move data from RAM to swap space on disk to free up space in physical memory) value exceeds {{ node_critical_swapPages_count }} on host {% raw %}{{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
Current value is {{ $value }}.{% endraw %}" + + diff --git a/config/prometheus/templates/node_exporter_config_data.json b/config/prometheus/templates/node_exporter_config_data.json new file mode 100644 index 0000000..29c61ad --- /dev/null +++ b/config/prometheus/templates/node_exporter_config_data.json @@ -0,0 +1,44 @@ +{ + "node_down_critical_duration": "1m", + "node_warn_memory_pct": "70", + "node_warn_outofmemory_duration": "1m", + "node_warn_outOfDisk_duration": "1m", + "node_warn_OutOfdisk_pct": "70", + "node_warn_OutOfInodes_pct": "70", + "node_warn_outOfInodes_duration": "1m", + "node_warn_unusual_disklatency_time": "0.1", + "node_warn_unusual_diskWrite_latency_duration": "1m", + "node_warn_unusual_diskRead_latency_duration": "1m", + "node_high_cpuload_threshold_duration": "30s", + "node_warn_highCPU_pct": "70", + "node_cpu_steal_threshold_duration": "30s", + "node_warn_cpu_steal_pct": "3", + "node_network_receiveError_duration": "1m", + "node_warn_network_err": "3", + "node_network_transmitError_duration": "1m", + "node_warn_network_interface_saturation": "0.8", + "node_network_interface_saturated_duration": "1m", + "node_clock_notsync_duration": "2m", + "node_warn_clock_duration": "16", + "node_warn_swapPages_count": "5", + "node_warn_swap_in_duration": "1m", + "node_warn_swap_out_duration": "1m", + + "node_critical_memory_pct": "90", + "node_critical_outofmemory_duration": "1m", + "node_critical_outOfDisk_duration": "1m", + "node_critical_OutOfdisk_pct": "90", + "node_critical_OutOfInodes_pct": "90", + "node_critical_outOfInodes_duration": "1m", + "node_critical_unusual_disklatency_time": "0.5", + "node_critical_unusual_diskWrite_latency_duration": "1m", + "node_critical_unusual_diskRead_latency_duration": "1m", + "node_critical_highCPU_pct": "90", + "node_critical_cpu_steal_pct": "5", + "node_critical_network_err": "5", + "node_critical_network_interface_saturation": "0.9", + "node_critical_swapPages_count": "10", + "node_critical_swap_in_duration": 
"1m", + "node_critical_swap_out_duration": "1m" + +} \ No newline at end of file diff --git a/examples/docker/docker-compose.yml b/examples/docker/docker-compose.yml index d6c7d82..bcbcec8 100644 --- a/examples/docker/docker-compose.yml +++ b/examples/docker/docker-compose.yml @@ -53,6 +53,12 @@ services: - source: ./config/prometheus/aerospike_connector_rules.yml target: /etc/prometheus/aerospike_connector_rules.yml type: bind + - source: ./config/prometheus/node_exporter_alerts.yml + target: /etc/prometheus/node_exporter_alerts.yml + type: bind + - source: ./config/prometheus/aerospike_secret_agent_rules.yml + target: /etc/prometheus/aerospike_secret_agent_rules.yml + type: bind depends_on: - exporter1 command: diff --git a/examples/docker/easy-prometheus.yml b/examples/docker/easy-prometheus.yml index 304237d..7a4dce6 100755 --- a/examples/docker/easy-prometheus.yml +++ b/examples/docker/easy-prometheus.yml @@ -18,6 +18,8 @@ rule_files: - "/etc/prometheus/aerospike_rules.yml" - "/etc/prometheus/deprecated_aerospike_rules.yml" - "/etc/prometheus/aerospike_connector_rules.yml" + - "/etc/prometheus/node_exporter_alerts.yml" + - "/etc/prometheus/aerospike_secret_agent_rules.yml" # A scrape configuration containing exactly one endpoint to scrape: # Here it's Prometheus itself.