# NOTE(review): this is a unified git diff whose line breaks and indentation
# appear to have been collapsed. The summary below is read from the hunk text
# as it stands; confirm it against the original patch before relying on it.
#
# The patch touches three files:
#
# 1. docs/source/procedures/datadog/datadog.rules.yml
#    Adds two recording rules, both recorded as
#    scylla_manager_healthcheck_cql_status_ag with labels by="cluster" and
#    dd="1". They differ in expression and in the "status" label:
#      - status="1": sum(scylla_manager_healthcheck_cql_status) by(cluster)
#      - status="0": count(scylla_manager_healthcheck_cql_status==0) by(cluster)
#    NOTE(review): one rule sums raw values while the other counts series
#    equal to 0; whether that asymmetry is intended depends on the value
#    range of the metric, which this patch does not show - TODO confirm.
#
# 2. prometheus/prom_rules/prometheus.latency.rules.yml
#    Adds dd="1" / by="cluster" labels to manager:repair_done_ts,
#    manager:backup_done_ts and manager:repair_fail_ts.
#    Removes the scylla_manager_repair_progress recording rule
#    (sum(manager:repair_progress) by (cluster)) and re-adds
#    manager:repair_progress, with an unchanged expression, after the
#    labels block that the removed rule used to carry. Those context labels
#    presumably end up attached to manager:backup_fail_ts - TODO confirm
#    against the real indentation.
#
# 3. prometheus/prometheus.yml.template
#    Widens the regex of the two entries keyed on source_labels: [__name__]
#    so that scylla_manager_repair_progress is matched in addition to
#    scylla_manager_scheduler_last_duration; matching series get
#    target labels dd='1' and by='cluster'.
diff --git a/docs/source/procedures/datadog/datadog.rules.yml b/docs/source/procedures/datadog/datadog.rules.yml index 7bdc8b2c5..73651eed6 100644 --- a/docs/source/procedures/datadog/datadog.rules.yml +++ b/docs/source/procedures/datadog/datadog.rules.yml @@ -481,3 +481,15 @@ groups: labels: by: "cluster" dd: "1" + - record: scylla_manager_healthcheck_cql_status_ag + expr: sum(scylla_manager_healthcheck_cql_status) by(cluster) + labels: + by: "cluster" + status: "1" + dd: "1" + - record: scylla_manager_healthcheck_cql_status_ag + expr: count(scylla_manager_healthcheck_cql_status==0) by(cluster) + labels: + by: "cluster" + status: "0" + dd: "1" diff --git a/prometheus/prom_rules/prometheus.latency.rules.yml b/prometheus/prom_rules/prometheus.latency.rules.yml index 33f437614..972b360c1 100644 --- a/prometheus/prom_rules/prometheus.latency.rules.yml +++ b/prometheus/prom_rules/prometheus.latency.rules.yml @@ -37,19 +37,26 @@ groups: expr: errors:local_failed + errors:operation_unavailable - record: manager:repair_done_ts expr: timestamp(sum(changes(scylla_manager_scheduler_run_total{status="DONE",type="repair"}[2m])) by (cluster) > 0) or on(cluster) manager:repair_done_ts + labels: + dd: "1" + by: "cluster" - record: manager:backup_done_ts expr: timestamp(sum(changes(scylla_manager_scheduler_run_total{status="DONE",type="backup"}[2m])) by (cluster) > 0) or on(cluster) manager:backup_done_ts + labels: + dd: "1" + by: "cluster" - record: manager:repair_fail_ts expr: timestamp(sum(changes(scylla_manager_scheduler_run_total{status="ERROR",type="repair"}[2m])) by (cluster) > 0) or on(cluster) manager:repair_fail_ts + labels: + dd: "1" + by: "cluster" - record: manager:backup_fail_ts expr: timestamp(sum(changes(scylla_manager_scheduler_run_total{status="ERROR",type="backup"}[2m])) by (cluster) > 0) or on(cluster) manager:backup_fail_ts - - record: manager:repair_progress - expr: (max(scylla_manager_scheduler_run_indicator{type="repair"}) by (cluster) >bool 
0)*((max(scylla_manager_repair_token_ranges_total) by(cluster)<= 0)*0 or on(cluster) (sum(scylla_manager_repair_token_ranges_success>=0) by (cluster) + sum(scylla_manager_repair_token_ranges_error>=0) by (cluster))/sum(scylla_manager_repair_token_ranges_total>=0) by (cluster)) - - record: scylla_manager_repair_progress - expr: sum(manager:repair_progress) by (cluster) labels: dd: "1" by: "cluster" + - record: manager:repair_progress + expr: (max(scylla_manager_scheduler_run_indicator{type="repair"}) by (cluster) >bool 0)*((max(scylla_manager_repair_token_ranges_total) by(cluster)<= 0)*0 or on(cluster) (sum(scylla_manager_repair_token_ranges_success>=0) by (cluster) + sum(scylla_manager_repair_token_ranges_error>=0) by (cluster))/sum(scylla_manager_repair_token_ranges_total>=0) by (cluster)) - record: manager:backup_progress expr: (max(scylla_manager_scheduler_run_indicator{type="backup"}) by (cluster) >bool 0)*((max(scylla_manager_backup_files_size_bytes) by(cluster)<= 0)*0 or on(cluster) (sum(scylla_manager_backup_files_uploaded_bytes) by (cluster) + sum(scylla_manager_backup_files_skipped_bytes) by (cluster) + sum(scylla_manager_backup_files_failed_bytes)by(cluster))/sum(scylla_manager_backup_files_size_bytes>=0) by (cluster)) - record: scylla_manager_backup_progress diff --git a/prometheus/prometheus.yml.template b/prometheus/prometheus.yml.template index df4aac15a..d360b7f6e 100644 --- a/prometheus/prometheus.yml.template +++ b/prometheus/prometheus.yml.template @@ -196,11 +196,11 @@ scrape_configs: - source_labels: [host] target_label: instance - source_labels: [__name__] - regex: '(scylla_manager_scheduler_last_duration)' + regex: '(scylla_manager_scheduler_last_duration|scylla_manager_repair_progress)' target_label: dd replacement: '1' - source_labels: [__name__] - regex: '(scylla_manager_scheduler_last_duration)' + regex: '(scylla_manager_scheduler_last_duration|scylla_manager_repair_progress)' target_label: by replacement: 'cluster'