Skip to content

Commit

Permalink
OM153 - Node exporter dashboard (#93)
Browse files Browse the repository at this point in the history
* OM153 - node exporter dashboard
* OM153 - added node exporter alerts to docker-compose files

added alert for secret-agent too
  • Loading branch information
mphanias authored Dec 11, 2023
1 parent 3a14f6b commit aa2d4e4
Show file tree
Hide file tree
Showing 9 changed files with 7,914 additions and 2 deletions.
7,291 changes: 7,291 additions & 0 deletions config/grafana/dashboards/nodeexporter/hostview.json

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion config/prometheus/aerospike_rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ groups:
summary: "Node {{ $labels.instance }} down"
description: "{{ $labels.instance }} node is down."


- name: aerospike_aerospike.rules > NAMESPACE
rules:
- alert: NamespaceStopWrites
Expand Down
12 changes: 12 additions & 0 deletions config/prometheus/aerospike_secret_agent_rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Alerting rules for the Aerospike Secret Agent exporter.
groups:
- name: secret_agent.rules
  rules:
  # Fires when no aerospike_sa_connections_active series exists for the
  # aerospike_secret_agent job for a full minute.
  - alert: AerospikeSecretAgentDown
    # absent() yields a single series whose labels come only from the equality
    # matchers in the selector, so only the `job` label is available to the
    # annotation templates; `instance` and `cluster_name` would render empty.
    expr: absent(aerospike_sa_connections_active{job="aerospike_secret_agent"}) == 1
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Aerospike Secret Agent (job {{ $labels.job }}) has been down for more than 1m."
      description: "No aerospike_sa_connections_active metric has been reported for job {{ $labels.job }} for more than 1m; the Aerospike Secret Agent is down or is not being scraped."

274 changes: 274 additions & 0 deletions config/prometheus/node_exporter_alerts.yml

Large diffs are not rendered by default.

12 changes: 11 additions & 1 deletion config/prometheus/prometheus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,11 @@ rule_files:
# new severities are info, warn, error, critical in increasing order of severity
# these severities are updated in aerospike_rules.yml
#
- "/etc/prometheus/deprecated_aerospike_rules.yml"
# - "/etc/prometheus/deprecated_aerospike_rules.yml"
- "/etc/prometheus/aerospike_rules.yml"
- "/etc/prometheus/aerospike_connector_rules.yml"
- "/etc/prometheus/node_exporter_alerts.yml"
- "/etc/prometheus/aerospike_secret_agent_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
Expand All @@ -36,3 +38,11 @@ scrape_configs:

static_configs:
- targets: ['172.17.0.2:9145', '172.17.0.3:9145', '172.17.0.4:9145']

- job_name: 'node-exporter'

# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.

static_configs:
- targets: ['172.17.0.2:9100', '172.17.0.3:9100', '172.17.0.4:9100']
274 changes: 274 additions & 0 deletions config/prometheus/templates/node_exporter_alerts_rules.template

Large diffs are not rendered by default.

44 changes: 44 additions & 0 deletions config/prometheus/templates/node_exporter_config_data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"node_down_critical_duration": "1m",
"node_warn_memory_pct": "70",
"node_warn_outofmemory_duration": "1m",
"node_warn_outOfDisk_duration": "1m",
"node_warn_OutOfdisk_pct": "70",
"node_warn_OutOfInodes_pct": "70",
"node_warn_outOfInodes_duration": "1m",
"node_warn_unusual_disklatency_time": "0.1",
"node_warn_unusual_diskWrite_latency_duration": "1m",
"node_warn_unusual_diskRead_latency_duration": "1m",
"node_high_cpuload_threshold_duration": "30s",
"node_warn_highCPU_pct": "70",
"node_cpu_steal_threshold_duration": "30s",
"node_warn_cpu_steal_pct": "3",
"node_network_receiveError_duration": "1m",
"node_warn_network_err": "3",
"node_network_transmitError_duration": "1m",
"node_warn_network_interface_saturation": "0.8",
"node_network_interface_saturated_duration": "1m",
"node_clock_notsync_duration": "2m",
"node_warn_clock_duration": "16",
"node_warn_swapPages_count": "5",
"node_warn_swap_in_duration": "1m",
"node_warn_swap_out_duration": "1m",

"node_critical_memory_pct": "90",
"node_critical_outofmemory_duration": "1m",
"node_critical_outOfDisk_duration": "1m",
"node_critical_OutOfdisk_pct": "90",
"node_critical_OutOfInodes_pct": "90",
"node_critical_outOfInodes_duration": "1m",
"node_critical_unusual_disklatency_time": "0.5",
"node_critical_unusual_diskWrite_latency_duration": "1m",
"node_critical_unusual_diskRead_latency_duration": "1m",
"node_critical_highCPU_pct": "90",
"node_critical_cpu_steal_pct": "5",
"node_critical_network_err": "5",
"node_critical_network_interface_saturation": "0.9",
"node_critical_swapPages_count": "10",
"node_critical_swap_in_duration": "1m",
"node_critical_swap_out_duration": "1m"

}
6 changes: 6 additions & 0 deletions examples/docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,12 @@ services:
- source: ./config/prometheus/aerospike_connector_rules.yml
target: /etc/prometheus/aerospike_connector_rules.yml
type: bind
- source: ./config/prometheus/node_exporter_alerts.yml
target: /etc/prometheus/node_exporter_alerts.yml
type: bind
- source: ./config/prometheus/aerospike_secret_agent_rules.yml
target: /etc/prometheus/aerospike_secret_agent_rules.yml
type: bind
depends_on:
- exporter1
command:
Expand Down
2 changes: 2 additions & 0 deletions examples/docker/easy-prometheus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ rule_files:
- "/etc/prometheus/aerospike_rules.yml"
- "/etc/prometheus/deprecated_aerospike_rules.yml"
- "/etc/prometheus/aerospike_connector_rules.yml"
- "/etc/prometheus/node_exporter_alerts.yml"
- "/etc/prometheus/aerospike_secret_agent_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
Expand Down

0 comments on commit aa2d4e4

Please sign in to comment.