Commit
feat: alert metrics posthog
anirudhpillai committed Nov 25, 2024
1 parent d1f1d04 commit 13e139c
Showing 1 changed file with 67 additions and 3 deletions.
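Note: the commit adds PostHog product analytics events alongside the existing Prometheus metrics. Below is a minimal sketch of the capture pattern the diff repeats, assuming the posthog Python client's capture(distinct_id, event, properties) signature (which the repo-internal posthoganalytics module also exposes); the helper name and its arguments are illustrative, not part of the commit:

import posthoganalytics

def report_alert_check(user_email: str, alert_id: str) -> None:
    # Attribute the event to the user who created the alert and tag it
    # with the alert id so checks can be counted per alert.
    posthoganalytics.capture(
        user_email,     # distinct_id the event is attributed to
        "alert check",  # event name, matching the diff below
        properties={"alert_id": alert_id},
    )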
posthog/tasks/alerts/checks.py (67 additions, 3 deletions)
@@ -36,6 +36,7 @@
alert_calculation_interval_to_relativedelta,
)
from posthog.tasks.alerts.trends import check_trends_alert
+import posthoganalytics


logger = structlog.get_logger(__name__)
@@ -100,6 +101,15 @@ def alerts_backlog_task() -> None:

HOURLY_ALERTS_BACKLOG_GAUGE.set(hourly_alerts_breaching_sla)

+    posthoganalytics.capture(
+        "[email protected]",
+        "alert check backlog",
+        properties={
+            "alert_check_frequency": AlertCalculationInterval.HOURLY,
+            "backlog": hourly_alerts_breaching_sla,
+        },
+    )

now = datetime.now(UTC)

daily_alerts_breaching_sla = AlertConfiguration.objects.filter(
@@ -112,6 +122,15 @@

DAILY_ALERTS_BACKLOG_GAUGE.set(daily_alerts_breaching_sla)

+    posthoganalytics.capture(
+        "[email protected]",
+        "alert check backlog",
+        properties={
+            "alert_check_frequency": AlertCalculationInterval.DAILY,
+            "backlog": daily_alerts_breaching_sla,
+        },
+    )

# sleeping 30s for prometheus to pick up the metrics sent during task
time.sleep(30)

@@ -244,6 +263,16 @@ def check_alert(alert_id: str) -> None:
except Exception as err:
ALERT_CHECK_ERROR_COUNTER.inc()

+        posthoganalytics.capture(
+            alert.created_by.email,
+            "alert check failed",
+            properties={
+                "alert_id": alert.id,
+                "error": f"AlertCheckError: {err}",
+                "traceback": traceback.format_exc(),
+            },
+        )

logger.exception(AlertCheckException(err))
capture_exception(
AlertCheckException(err),
@@ -276,6 +305,16 @@ def check_alert_and_notify_atomically(alert: AlertConfiguration) -> None:
set_tag("alert_config_id", alert.id)

ALERT_COMPUTED_COUNTER.inc()

+    # Event to count alert checks
+    posthoganalytics.capture(
+        alert.created_by.email,
+        "alert check",
+        properties={
+            "alert_id": alert.id,
+        },
+    )

value = breaches = error = None

# 1. Evaluate insight and get alert value
@@ -288,8 +327,21 @@ def check_alert_and_notify_atomically(alert: AlertConfiguration) -> None:
# as celery task can be retried according to config
raise
except Exception as err:
-        logger.exception(f"Alert id = {alert.id}, failed to evaluate", exc_info=err)
-        set_tag("evaluation_error_message", traceback.format_exc())
+        error_message = f"Alert id = {alert.id}, failed to evaluate"
+        evaluation_error_message = traceback.format_exc()
+
+        posthoganalytics.capture(
+            alert.created_by.email,
+            "alert check failed",
+            properties={
+                "alert_id": alert.id,
+                "error": error_message,
+                "traceback": evaluation_error_message,
+            },
+        )
+
+        logger.exception(error_message, exc_info=err)
+        set_tag("evaluation_error_message", evaluation_error_message)
capture_exception(AlertCheckException(err))

# error can be on user side (incorrectly configured insight/alert)
@@ -316,9 +368,21 @@ def check_alert_and_notify_atomically(alert: AlertConfiguration) -> None:
send_notifications_for_breaches(alert, breaches)
except Exception as err:
error_message = f"AlertCheckError: error sending notifications for alert_id = {alert.id}"
+        evaluation_error_message = traceback.format_exc()
+
+        posthoganalytics.capture(
+            alert.created_by.email,
+            "alert check failed",
+            properties={
+                "alert_id": alert.id,
+                "error": error_message,
+                "traceback": evaluation_error_message,
+            },
+        )

logger.exception(error_message, exc_info=err)

-        set_tag("evaluation_error_message", traceback.format_exc())
+        set_tag("evaluation_error_message", evaluation_error_message)
capture_exception(Exception(error_message))

# don't want alert state to be updated (so that it's retried as next_check_at won't be updated)
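Note: the same "alert check failed" capture block now appears three times in this file. A hypothetical helper (not part of this commit) that would deduplicate those call sites, assuming it is invoked from inside an except block so traceback.format_exc() sees the active exception:

import traceback
import posthoganalytics

def capture_alert_check_failure(user_email: str, alert_id: str, error_message: str) -> None:
    # Report a failed alert check, attaching the active exception's traceback.
    posthoganalytics.capture(
        user_email,
        "alert check failed",
        properties={
            "alert_id": alert_id,
            "error": error_message,
            "traceback": traceback.format_exc(),
        },
    )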
