From 4aa89260a2dd59fa3810934b17add8123b451ee6 Mon Sep 17 00:00:00 2001 From: Amnon Heiman Date: Sun, 21 Apr 2024 09:13:24 +0300 Subject: [PATCH 1/2] prometheus.yml.template: take the repair progress from the manager --- prometheus/prom_rules/prometheus.latency.rules.yml | 5 ----- prometheus/prometheus.yml.template | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/prometheus/prom_rules/prometheus.latency.rules.yml b/prometheus/prom_rules/prometheus.latency.rules.yml index 33f437614e..ba80829925 100644 --- a/prometheus/prom_rules/prometheus.latency.rules.yml +++ b/prometheus/prom_rules/prometheus.latency.rules.yml @@ -45,11 +45,6 @@ groups: expr: timestamp(sum(changes(scylla_manager_scheduler_run_total{status="ERROR",type="backup"}[2m])) by (cluster) > 0) or on(cluster) manager:backup_fail_ts - record: manager:repair_progress expr: (max(scylla_manager_scheduler_run_indicator{type="repair"}) by (cluster) >bool 0)*((max(scylla_manager_repair_token_ranges_total) by(cluster)<= 0)*0 or on(cluster) (sum(scylla_manager_repair_token_ranges_success>=0) by (cluster) + sum(scylla_manager_repair_token_ranges_error>=0) by (cluster))/sum(scylla_manager_repair_token_ranges_total>=0) by (cluster)) - - record: scylla_manager_repair_progress - expr: sum(manager:repair_progress) by (cluster) - labels: - dd: "1" - by: "cluster" - record: manager:backup_progress expr: (max(scylla_manager_scheduler_run_indicator{type="backup"}) by (cluster) >bool 0)*((max(scylla_manager_backup_files_size_bytes) by(cluster)<= 0)*0 or on(cluster) (sum(scylla_manager_backup_files_uploaded_bytes) by (cluster) + sum(scylla_manager_backup_files_skipped_bytes) by (cluster) + sum(scylla_manager_backup_files_failed_bytes)by(cluster))/sum(scylla_manager_backup_files_size_bytes>=0) by (cluster)) - record: scylla_manager_backup_progress diff --git a/prometheus/prometheus.yml.template b/prometheus/prometheus.yml.template index df4aac15ac..d360b7f6ee 100644 --- a/prometheus/prometheus.yml.template +++ b/prometheus/prometheus.yml.template @@ -196,11 +196,11 @@ scrape_configs: - source_labels: [host] target_label: instance - source_labels: [__name__] - regex: '(scylla_manager_scheduler_last_duration)' + regex: '(scylla_manager_scheduler_last_duration|scylla_manager_repair_progress)' target_label: dd replacement: '1' - source_labels: [__name__] - regex: '(scylla_manager_scheduler_last_duration)' + regex: '(scylla_manager_scheduler_last_duration|scylla_manager_repair_progress)' target_label: by replacement: 'cluster' From e9005b28acb46bb5f1fc920632ef9435b654a712 Mon Sep 17 00:00:00 2001 From: Amnon Heiman Date: Sun, 21 Apr 2024 09:23:08 +0300 Subject: [PATCH 2/2] prometheus.latency.rules.yml: Mark cluster wide manager metrics with labels --- docs/source/procedures/datadog/datadog.rules.yml | 12 ++++++++++++ prometheus/prom_rules/prometheus.latency.rules.yml | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/docs/source/procedures/datadog/datadog.rules.yml b/docs/source/procedures/datadog/datadog.rules.yml index 7bdc8b2c59..73651eed6f 100644 --- a/docs/source/procedures/datadog/datadog.rules.yml +++ b/docs/source/procedures/datadog/datadog.rules.yml @@ -481,3 +481,15 @@ groups: labels: by: "cluster" dd: "1" + - record: scylla_manager_healthcheck_cql_status_ag + expr: sum(scylla_manager_healthcheck_cql_status) by(cluster) + labels: + by: "cluster" + status: "1" + dd: "1" + - record: scylla_manager_healthcheck_cql_status_ag + expr: count(scylla_manager_healthcheck_cql_status==0) by(cluster) + labels: + by: "cluster" + status: "0" + dd: "1" diff --git a/prometheus/prom_rules/prometheus.latency.rules.yml b/prometheus/prom_rules/prometheus.latency.rules.yml index ba80829925..972b360c1a 100644 --- a/prometheus/prom_rules/prometheus.latency.rules.yml +++ b/prometheus/prom_rules/prometheus.latency.rules.yml @@ -37,12 +37,24 @@ groups: expr: errors:local_failed + errors:operation_unavailable - record: manager:repair_done_ts expr: timestamp(sum(changes(scylla_manager_scheduler_run_total{status="DONE",type="repair"}[2m])) by (cluster) > 0) or on(cluster) manager:repair_done_ts + labels: + dd: "1" + by: "cluster" - record: manager:backup_done_ts expr: timestamp(sum(changes(scylla_manager_scheduler_run_total{status="DONE",type="backup"}[2m])) by (cluster) > 0) or on(cluster) manager:backup_done_ts + labels: + dd: "1" + by: "cluster" - record: manager:repair_fail_ts expr: timestamp(sum(changes(scylla_manager_scheduler_run_total{status="ERROR",type="repair"}[2m])) by (cluster) > 0) or on(cluster) manager:repair_fail_ts + labels: + dd: "1" + by: "cluster" - record: manager:backup_fail_ts expr: timestamp(sum(changes(scylla_manager_scheduler_run_total{status="ERROR",type="backup"}[2m])) by (cluster) > 0) or on(cluster) manager:backup_fail_ts + labels: + dd: "1" + by: "cluster" - record: manager:repair_progress expr: (max(scylla_manager_scheduler_run_indicator{type="repair"}) by (cluster) >bool 0)*((max(scylla_manager_repair_token_ranges_total) by(cluster)<= 0)*0 or on(cluster) (sum(scylla_manager_repair_token_ranges_success>=0) by (cluster) + sum(scylla_manager_repair_token_ranges_error>=0) by (cluster))/sum(scylla_manager_repair_token_ranges_total>=0) by (cluster)) - record: manager:backup_progress