# Alert-rule updates under dist/rules:
#   * node-exporter: add HostFilesystemDeviceError (node_filesystem_device_error == 1).
#   * loki: LokiProcessTooManyRestarts matches any job whose name contains "loki".
#   * patroni: new rule file containing PatroniHasNoLeader.
#   * prometheus: rename PrometheusTimeserieCardinality to PrometheusTimeseriesCardinality.
#   * redis: RedisDisconnectedSlaves compares against 0 instead of 1.
#
# The postgres-exporter.yml hunk that replaced the PostgresqlTooManyConnections
# expression with an empty string is intentionally omitted: Prometheus rejects a
# rule whose `expr` is empty when it loads the rule file, so the existing
# expression is left untouched.

diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml
index 6e108d050..791d89379 100644
--- a/dist/rules/host-and-hardware/node-exporter.yml
+++ b/dist/rules/host-and-hardware/node-exporter.yml
@@ -94,6 +94,15 @@ groups:
         summary: Host out of inodes (instance {{ $labels.instance }})
         description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
+    - alert: HostFilesystemDeviceError
+      expr: 'node_filesystem_device_error == 1'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Host filesystem device error (instance {{ $labels.instance }})
+        description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
     - alert: HostInodesWillFillIn24Hours
       expr: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
       for: 2m
diff --git a/dist/rules/loki/embedded-exporter.yml b/dist/rules/loki/embedded-exporter.yml
index fe6a97fd3..077036a49 100644
--- a/dist/rules/loki/embedded-exporter.yml
+++ b/dist/rules/loki/embedded-exporter.yml
@@ -5,7 +5,7 @@ groups:
   rules:
 
     - alert: LokiProcessTooManyRestarts
-      expr: 'changes(process_start_time_seconds{job=~"loki"}[15m]) > 2'
+      expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2'
       for: 0m
       labels:
         severity: warning
diff --git a/dist/rules/patroni/embedded-exporter-patroni.yml b/dist/rules/patroni/embedded-exporter-patroni.yml
new file mode 100644
index 000000000..561f12f12
--- /dev/null
+++ b/dist/rules/patroni/embedded-exporter-patroni.yml
@@ -0,0 +1,14 @@
+groups:
+
+- name: EmbeddedExporterPatroni
+
+  rules:
+
+    - alert: PatroniHasNoLeader
+      expr: '(max by (scope) (patroni_master) < 1) and (max by (scope) (patroni_standby_leader) < 1)'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Patroni has no Leader (instance {{ $labels.instance }})
+        description: "A leader node (neither primary nor standby) cannot be found inside the cluster {{ $labels.scope }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
index 9ec212019..65bfd8278 100644
--- a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
+++ b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
@@ -247,11 +247,11 @@ groups:
         summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
         description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
-    - alert: PrometheusTimeserieCardinality
+    - alert: PrometheusTimeseriesCardinality
       expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
       for: 0m
       labels:
         severity: warning
       annotations:
-        summary: Prometheus timeserie cardinality (instance {{ $labels.instance }})
-        description: "The \"{{ $labels.name }}\" timeserie cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+        summary: Prometheus timeseries cardinality (instance {{ $labels.instance }})
+        description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/dist/rules/redis/oliver006-redis-exporter.yml b/dist/rules/redis/oliver006-redis-exporter.yml
index 6afd17bb7..08cdf23a3 100644
--- a/dist/rules/redis/oliver006-redis-exporter.yml
+++ b/dist/rules/redis/oliver006-redis-exporter.yml
@@ -32,7 +32,7 @@ groups:
         description: "Redis cluster has too many nodes marked as master.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
     - alert: RedisDisconnectedSlaves
-      expr: 'count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 1'
+      expr: 'count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 0'
       for: 0m
       labels:
         severity: critical