From 79cb589235f09e282cd097461d46a0a52811164a Mon Sep 17 00:00:00 2001 From: Zach Leslie Date: Mon, 7 Nov 2022 17:32:43 +0000 Subject: [PATCH] Delete TempoRequestErrors alert from mixin (#1810) * Delete TempoRequestErrors alert from mixin * Update changelog --- CHANGELOG.md | 2 ++ operations/tempo-mixin-compiled/alerts.yaml | 13 ------------- operations/tempo-mixin/alerts.libsonnet | 19 ------------------- operations/tempo-mixin/runbook.md | 5 ++--- 4 files changed, 4 insertions(+), 35 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 806f2146338..ed20c252b67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,8 @@ overrides: Old config will still work but will be removed in a future release. [#1735](https://github.com/grafana/tempo/pull/1735) (@mapno) * [CHANGE] Add GOMEMLIMIT variable to compactor jsonnet and set the value to equal compactor memory limit. [#1758](https://github.com/grafana/tempo/pull/1758/files) (@ie-pham) * [CHANGE] Update alpine image version to 3.16. [#1784](https://github.com/grafana/tempo/pull/1784) (@zalegrala) +* [CHANGE] Delete TempoRequestErrors alert from mixin [#1810](https://github.com/grafana/tempo/pull/1810) (@zalegrala) + - **BREAKING CHANGE** Any jsonnet users relying on this alert should copy this into their own environment. * [FEATURE] Add capability to configure the used S3 Storage Class [#1697](https://github.com/grafana/tempo/pull/1714) (@amitsetty) * [ENHANCEMENT] cache: expose username and sentinel_username redis configuration options for ACL-based Redis Auth support [#1708](https://github.com/grafana/tempo/pull/1708) (@jsievenpiper) * [ENHANCEMENT] metrics-generator: expose span size as a metric [#1662](https://github.com/grafana/tempo/pull/1662) (@ie-pham) diff --git a/operations/tempo-mixin-compiled/alerts.yaml b/operations/tempo-mixin-compiled/alerts.yaml index 871b6492f35..707495dd43c 100644 --- a/operations/tempo-mixin-compiled/alerts.yaml +++ b/operations/tempo-mixin-compiled/alerts.yaml @@ -1,19 +1,6 @@ "groups": - "name": "tempo_alerts" "rules": - - "alert": "TempoRequestErrors" - "annotations": - "message": | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - "runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoRequestErrors" - "expr": | - 100 * sum(rate(tempo_request_duration_seconds_count{status_code=~"5.."}[1m])) by (cluster, namespace, job, route) - / - sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) - > 10 - "for": "15m" - "labels": - "severity": "critical" - "alert": "TempoRequestLatency" "annotations": "message": | diff --git a/operations/tempo-mixin/alerts.libsonnet b/operations/tempo-mixin/alerts.libsonnet index 24acd6e5480..124e6bc4d53 100644 --- a/operations/tempo-mixin/alerts.libsonnet +++ b/operations/tempo-mixin/alerts.libsonnet @@ -4,25 +4,6 @@ { name: 'tempo_alerts', rules: [ - { - alert: 'TempoRequestErrors', - expr: ||| - 100 * sum(rate(tempo_request_duration_seconds_count{status_code=~"5.."}[1m])) by (%(group_by_job)s, route) - / - sum(rate(tempo_request_duration_seconds_count[1m])) by (%(group_by_job)s, route) - > 10 - ||| % $._config, - 'for': '15m', - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - |||, - runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoRequestErrors', - }, - }, { alert: 'TempoRequestLatency', expr: ||| diff --git a/operations/tempo-mixin/runbook.md b/operations/tempo-mixin/runbook.md index 2e33f1e9309..17908eab6aa 100644 --- a/operations/tempo-mixin/runbook.md +++ b/operations/tempo-mixin/runbook.md @@ -1,8 +1,7 @@ # Runbook -This document should help with remediating operational issues in Tempo. +This document should help with remediation of operational issues in Tempo. -## TempoRequestErrors ## TempoRequestLatency Aside from obvious errors in the logs the only real lever you can pull here is scaling. Use the Reads or Writes dashboard @@ -281,4 +280,4 @@ The error "Unexpected error reloading meta for local block. Ignoring and continu meta.json. Repair the meta.json and then restart the ingester to successfully recover the block. Or if it is not able to be repaired then the block files can be simply deleted as the ingester has already started without it. As long as the replication factor is 2 or higher, then there will be no data loss as the -same data was also written to another ingester. \ No newline at end of file +same data was also written to another ingester.