Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(alerts): check alerts earlier #26843

Merged
merged 5 commits into from
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 2 additions & 37 deletions posthog/tasks/alerts/checks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import time
import traceback

from datetime import datetime, timedelta, UTC
Expand All @@ -25,16 +24,15 @@
AlertState,
)
from posthog.utils import get_from_dict_or_attr
from prometheus_client import Counter, Gauge
from django.db.models import Q, F
from collections import defaultdict
from posthog.tasks.alerts.utils import (
AlertEvaluationResult,
calculation_interval_to_order,
next_check_time,
send_notifications_for_breaches,
send_notifications_for_errors,
WRAPPER_NODE_KINDS,
alert_calculation_interval_to_relativedelta,
)
from posthog.tasks.alerts.trends import check_trends_alert
from posthog.ph_client import ph_us_client
Expand All @@ -54,26 +52,6 @@ def __init__(self, err: Exception):
self.__traceback__ = err.__traceback__


HOURLY_ALERTS_BACKLOG_GAUGE = Gauge(
"hourly_alerts_backlog",
"Number of hourly alerts that are not being checked in the last hour.",
)

DAILY_ALERTS_BACKLOG_GAUGE = Gauge(
"daily_alerts_backlog",
"Number of daily alerts that are not being checked in the last 24 hours.",
)

ALERT_CHECK_ERROR_COUNTER = Counter(
"alerts_check_failures",
"Number of alert check errors that don't notify the user",
)

ALERT_COMPUTED_COUNTER = Counter(
"alerts_computed",
"Number of alerts we calculated",
)

ANIRUDH_DISTINCT_ID = "wcPbDRs08GtNzrNIXfzHvYAkwUaekW7UrAo4y3coznT"


Expand Down Expand Up @@ -102,8 +80,6 @@ def alerts_backlog_task() -> None:
)
).count()

HOURLY_ALERTS_BACKLOG_GAUGE.set(hourly_alerts_breaching_sla)

now = datetime.now(UTC)

daily_alerts_breaching_sla = AlertConfiguration.objects.filter(
Expand All @@ -114,8 +90,6 @@ def alerts_backlog_task() -> None:
)
).count()

DAILY_ALERTS_BACKLOG_GAUGE.set(daily_alerts_breaching_sla)

with ph_us_client() as capture_ph_event:
capture_ph_event(
ANIRUDH_DISTINCT_ID,
Expand All @@ -135,9 +109,6 @@ def alerts_backlog_task() -> None:
},
)

# sleeping 30s for prometheus to pick up the metrics sent during task
time.sleep(30)


@shared_task(
ignore_result=True,
Expand Down Expand Up @@ -266,7 +237,6 @@ def check_alert(alert_id: str, capture_ph_event: Callable = lambda *args, **kwar
try:
check_alert_and_notify_atomically(alert, capture_ph_event)
except Exception as err:
ALERT_CHECK_ERROR_COUNTER.inc()
user = cast(User, alert.created_by)

capture_ph_event(
Expand Down Expand Up @@ -309,9 +279,6 @@ def check_alert_and_notify_atomically(alert: AlertConfiguration, capture_ph_even
so we can retry notification without re-computing insight.
"""
set_tag("alert_config_id", alert.id)

ALERT_COMPUTED_COUNTER.inc()

user = cast(User, alert.created_by)

# Event to count alert checks
Expand Down Expand Up @@ -426,9 +393,7 @@ def add_alert_check(

# IMPORTANT: update next_check_at according to interval
# ensure we don't recheck alert until the next interval is due
alert.next_check_at = (alert.next_check_at or now) + alert_calculation_interval_to_relativedelta(
cast(AlertCalculationInterval, alert.calculation_interval)
)
alert.next_check_at = next_check_time(alert)

if notify:
alert.last_notified_at = now
Expand Down
130 changes: 111 additions & 19 deletions posthog/tasks/alerts/test/test_trends_absolute_alerts.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from typing import Optional, Any
from unittest.mock import ANY, MagicMock, patch
import dateutil

from freezegun import freeze_time

import dateutil
import pytz
import datetime

from posthog.models.alert import AlertCheck
from posthog.models.instance_setting import set_instance_setting
from posthog.tasks.alerts.checks import check_alert
Expand Down Expand Up @@ -41,7 +43,12 @@ def setUp(self) -> None:
self.dashboard_api = DashboardAPI(self.client, self.team, self.assertEqual)

def create_alert(
self, insight: dict, series_index: int, lower: Optional[int] = None, upper: Optional[int] = None
self,
insight: dict,
series_index: int,
lower: Optional[int] = None,
upper: Optional[int] = None,
calculation_interval: AlertCalculationInterval = AlertCalculationInterval.DAILY,
) -> dict:
alert = self.client.post(
f"/api/projects/{self.team.id}/alerts",
Expand All @@ -54,7 +61,7 @@ def create_alert(
"series_index": series_index,
},
"condition": {"type": "absolute_value"},
"calculation_interval": AlertCalculationInterval.DAILY,
"calculation_interval": calculation_interval,
"threshold": {"configuration": {"type": "absolute", "bounds": {"lower": lower, "upper": upper}}},
},
).json()
Expand Down Expand Up @@ -134,7 +141,11 @@ def test_alert_lower_threshold_breached(self, mock_send_breaches: MagicMock, moc
assert updated_alert.state == AlertState.FIRING
assert updated_alert.last_checked_at == FROZEN_TIME
assert updated_alert.last_notified_at == FROZEN_TIME
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 0
Expand Down Expand Up @@ -168,7 +179,11 @@ def test_trend_high_threshold_breached(self, mock_send_breaches: MagicMock, mock

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 2
Expand All @@ -181,9 +196,11 @@ def test_trend_high_threshold_breached(self, mock_send_breaches: MagicMock, mock

def test_trend_no_threshold_breached(self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock) -> None:
insight = self.create_time_series_trend_insight()
alert = self.create_alert(insight, series_index=0, lower=0, upper=2)
alert = self.create_alert(
insight, series_index=0, lower=0, upper=2, calculation_interval=AlertCalculationInterval.MONTHLY
)

with freeze_time(FROZEN_TIME - dateutil.relativedelta.relativedelta(days=1)):
with freeze_time(FROZEN_TIME):
_create_event(
team=self.team,
event="signed_up",
Expand All @@ -196,10 +213,49 @@ def test_trend_no_threshold_breached(self, mock_send_breaches: MagicMock, mock_s

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.NOT_FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = datetime.datetime(2024, 7, 1, 4, 0, tzinfo=pytz.UTC)
# first day of next month at around 4 AM
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 1
assert alert_check.calculated_value == 0
assert alert_check.state == AlertState.NOT_FIRING
assert alert_check.error is None

def test_trend_no_threshold_breached_weekly(
self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock
) -> None:
insight = self.create_time_series_trend_insight()
alert = self.create_alert(
insight, series_index=0, lower=0, upper=2, calculation_interval=AlertCalculationInterval.WEEKLY
)

with freeze_time(FROZEN_TIME):
_create_event(
team=self.team,
event="signed_up",
distinct_id="1",
properties={"$browser": "Chrome"},
)
flush_persons_and_events()

check_alert(alert["id"])

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.NOT_FIRING

next_check = (
FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1, weekday=dateutil.relativedelta.MO(1))
).replace(hour=3, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 0
assert alert_check.state == AlertState.NOT_FIRING
assert alert_check.error is None

Expand Down Expand Up @@ -234,7 +290,11 @@ def test_trend_breakdown_high_threshold_breached(

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 2
Expand Down Expand Up @@ -276,7 +336,11 @@ def test_trend_breakdown_low_threshold_breached(

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 1
Expand Down Expand Up @@ -318,7 +382,11 @@ def test_trend_breakdown_no_threshold_breached(

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.NOT_FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value is None
Expand Down Expand Up @@ -358,7 +426,11 @@ def test_aggregate_trend_high_threshold_breached(

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 3
Expand Down Expand Up @@ -400,7 +472,11 @@ def test_aggregate_trend_with_breakdown_high_threshold_breached(

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 2
Expand Down Expand Up @@ -437,7 +513,11 @@ def test_trend_current_interval_high_threshold_breached(

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 2
Expand Down Expand Up @@ -484,7 +564,11 @@ def test_trend_current_interval_fallback_to_previous_high_threshold_breached(

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 2
Expand Down Expand Up @@ -524,7 +608,11 @@ def test_trend_current_interval_no_threshold_breached(

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.NOT_FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 0
Expand All @@ -551,7 +639,11 @@ def test_trend_current_interval_low_threshold_breached(

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
# will be 0 even thought for current day it's 1
Expand Down
Loading
Loading