Skip to content

Commit

Permalink
fix(alerts): alert monitoring + trendlines only for absolute alerts (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
anirudhpillai authored Oct 18, 2024
1 parent 76c3f68 commit 14a8b5c
Show file tree
Hide file tree
Showing 7 changed files with 117 additions and 68 deletions.
3 changes: 2 additions & 1 deletion frontend/src/lib/components/Alerts/insightAlertsLogic.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { loaders } from 'kea-loaders'
import api from 'lib/api'
import { insightVizDataLogic } from 'scenes/insights/insightVizDataLogic'

import { GoalLine, InsightThresholdType } from '~/queries/schema'
import { AlertConditionType, GoalLine, InsightThresholdType } from '~/queries/schema'
import { getBreakdown, isInsightVizNode, isTrendsQuery } from '~/queries/utils'
import { InsightLogicProps } from '~/types'

Expand Down Expand Up @@ -67,6 +67,7 @@ export const insightAlertsLogic = kea<insightAlertsLogicType>([
alerts.flatMap((alert) => {
if (
alert.threshold.configuration.type !== InsightThresholdType.ABSOLUTE ||
alert.condition.type !== AlertConditionType.ABSOLUTE_VALUE ||
!alert.threshold.configuration.bounds
) {
return []
Expand Down
4 changes: 3 additions & 1 deletion posthog/api/alert.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,9 @@ def update(self, instance, validated_data):
else:
# always store snoozed_until as UTC time
# as we look at current UTC time to check when to run alerts
snoozed_until = relative_date_parse(snoozed_until_param, ZoneInfo("UTC"), increase=True)
snoozed_until = relative_date_parse(
snoozed_until_param, ZoneInfo("UTC"), increase=True, always_truncate=True
)
instance.state = AlertState.SNOOZED
instance.snoozed_until = snoozed_until

Expand Down
39 changes: 36 additions & 3 deletions posthog/tasks/alerts/checks.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import math
import time
import traceback

from datetime import datetime, timedelta, UTC
from typing import cast
from dateutil.relativedelta import relativedelta
import traceback

from celery import shared_task
from celery.canvas import chain
from django.conf import settings
from django.db import transaction
import structlog
from sentry_sdk import capture_exception
Expand Down Expand Up @@ -39,7 +43,15 @@
logger = structlog.get_logger(__name__)


class AlertCheckException(Exception): ...
class AlertCheckException(Exception):
"""
Required for custom exceptions to pass stack trace to sentry.
Subclassing through other ways doesn't transfer the traceback.
https://stackoverflow.com/a/69963663/5540417
"""

def __init__(self, err: Exception):
self.__traceback__ = err.__traceback__


HOURLY_ALERTS_BACKLOG_GAUGE = Gauge(
Expand Down Expand Up @@ -102,6 +114,9 @@ def alerts_backlog_task() -> None:

DAILY_ALERTS_BACKLOG_GAUGE.set(daily_alerts_breaching_sla)

# sleeping 30s for prometheus to pick up the metrics sent during task
time.sleep(30)


@shared_task(
ignore_result=True,
Expand Down Expand Up @@ -158,6 +173,8 @@ def check_alert_task(alert_id: str) -> None:


def check_alert(alert_id: str) -> None:
task_start_time = time.time()

try:
alert = AlertConfiguration.objects.get(id=alert_id, enabled=True)
except AlertConfiguration.DoesNotExist:
Expand Down Expand Up @@ -199,8 +216,15 @@ def check_alert(alert_id: str) -> None:
check_alert_and_notify_atomically(alert)
except Exception as err:
ALERT_CHECK_ERROR_COUNTER.inc()

logger.exception(AlertCheckException(err))
capture_exception(AlertCheckException(err))
capture_exception(
AlertCheckException(err),
tags={
"alert_configuration_id": alert_id,
},
)

# raise again so alert check is retried depending on error type
raise
finally:
Expand All @@ -209,6 +233,15 @@ def check_alert(alert_id: str) -> None:
alert.is_calculating = False
alert.save()

# only in PROD
if not settings.DEBUG and not settings.TEST:
task_duration = time.time() - task_start_time

# Ensure task runs at least 40s
# for prometheus to pick up the metrics sent during task
time_left_to_run = 40 - math.floor(task_duration)
time.sleep(time_left_to_run)


@transaction.atomic
def check_alert_and_notify_atomically(alert: AlertConfiguration) -> None:
Expand Down
6 changes: 3 additions & 3 deletions posthog/tasks/alerts/test/test_alert_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def test_alert_is_triggered_for_values_above_higher_threshold(

anomalies_descriptions = self.get_breach_description(mock_send_notifications_for_breaches, call_index=0)
assert len(anomalies_descriptions) == 1
assert "The trend value (1) is above the upper threshold (0.0)" in anomalies_descriptions[0]
assert "The insight value for previous day is (1) more than upper threshold (0.0)" in anomalies_descriptions[0]

def test_alert_is_not_triggered_for_events_beyond_interval(
self, mock_send_notifications_for_breaches: MagicMock, mock_send_errors: MagicMock
Expand Down Expand Up @@ -127,7 +127,7 @@ def test_alert_is_triggered_for_value_below_lower_threshold(

assert mock_send_notifications_for_breaches.call_count == 1
anomalies = self.get_breach_description(mock_send_notifications_for_breaches, call_index=0)
assert "The trend value (0) is below the lower threshold (1.0)" in anomalies
assert "The insight value for previous day is (0) less than lower threshold (1.0)" in anomalies

def test_alert_triggers_but_does_not_send_notification_during_firing(
self, mock_send_notifications_for_breaches: MagicMock, mock_send_errors: MagicMock
Expand Down Expand Up @@ -315,7 +315,7 @@ def test_alert_with_insight_with_filter(

assert mock_send_notifications_for_breaches.call_count == 1
anomalies = self.get_breach_description(mock_send_notifications_for_breaches, call_index=0)
assert "The trend value (0) is below the lower threshold (1.0)" in anomalies
assert "The insight value for previous day is (0) less than lower threshold (1.0)" in anomalies

@patch("posthog.tasks.alerts.utils.EmailMessage")
def test_send_emails(
Expand Down
61 changes: 13 additions & 48 deletions posthog/tasks/alerts/test/test_trends_absolute_alerts.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from typing import Optional, Any
from unittest.mock import MagicMock, patch
from unittest.mock import ANY, MagicMock, patch
import dateutil


from freezegun import freeze_time

from posthog.models.alert import AlertCheck
Expand All @@ -29,7 +28,7 @@
FROZEN_TIME = dateutil.parser.parse("2024-06-02T08:55:00.000Z")


@freeze_time("2024-06-02T08:55:00.000Z")
@freeze_time(FROZEN_TIME)
@patch("posthog.tasks.alerts.checks.send_notifications_for_errors")
@patch("posthog.tasks.alerts.checks.send_notifications_for_breaches")
class TestTimeSeriesTrendsAbsoluteAlerts(APIBaseTest, ClickhouseDestroyTablesMixin):
Expand Down Expand Up @@ -97,7 +96,7 @@ def create_time_series_trend_insight(self, breakdown: Optional[BreakdownFilter]

return insight

def test_alert_properties(self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock) -> None:
def test_alert_lower_threshold_breached(self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock) -> None:
insight = self.create_time_series_trend_insight()
alert = self.create_alert(insight, series_index=0, lower=1)

Expand All @@ -119,11 +118,15 @@ def test_alert_properties(self, mock_send_breaches: MagicMock, mock_send_errors:
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None

mock_send_breaches.assert_called_once_with(
ANY, ["The insight value for previous week is (0) less than lower threshold (1.0)"]
)

def test_trend_high_threshold_breached(self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock) -> None:
insight = self.create_time_series_trend_insight()
alert = self.create_alert(insight, series_index=0, upper=1)

with freeze_time("2024-06-02T07:55:00.000Z"):
with freeze_time(FROZEN_TIME - dateutil.relativedelta.relativedelta(days=1)):
_create_event(
team=self.team,
event="signed_up",
Expand All @@ -149,11 +152,15 @@ def test_trend_high_threshold_breached(self, mock_send_breaches: MagicMock, mock
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None

mock_send_breaches.assert_called_once_with(
ANY, ["The insight value for previous week is (2) more than upper threshold (1.0)"]
)

def test_trend_no_threshold_breached(self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock) -> None:
insight = self.create_time_series_trend_insight()
alert = self.create_alert(insight, series_index=0, lower=0, upper=2)

with freeze_time("2024-06-02T07:55:00.000Z"):
with freeze_time(FROZEN_TIME - dateutil.relativedelta.relativedelta(days=1)):
_create_event(
team=self.team,
event="signed_up",
Expand All @@ -172,45 +179,3 @@ def test_trend_no_threshold_breached(self, mock_send_breaches: MagicMock, mock_s
assert alert_check.calculated_value == 1
assert alert_check.state == AlertState.NOT_FIRING
assert alert_check.error is None

# TODO: support breakdowns
def test_trend_with_single_breakdown_threshold_breached(
self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock
) -> None:
"""
An absolute-threshold alert on a trends insight with a single event-property
breakdown ($browser) should fire when the aggregated value for the alerted
series breaches the upper bound (upper=1).

NOTE(review): this test was deleted in this commit (see the "TODO: support
breakdowns" marker) — presumably breakdown support is not yet implemented
for alerts; confirm against the alert-check implementation before reviving it.
"""
insight = self.create_time_series_trend_insight(
breakdown=BreakdownFilter(breakdown_type="event", breakdown="$browser")
)
# Bounds lower=0, upper=1: two matching events is enough to breach the upper threshold.
alert = self.create_alert(insight, series_index=0, lower=0, upper=1)

# Events are created one hour before FROZEN_TIME so they fall inside the
# alert's evaluation interval.
with freeze_time("2024-06-02T07:55:00.000Z"):
_create_event(
team=self.team,
event="signed_up",
distinct_id="1",
properties={"$browser": "Chrome"},
)
_create_event(
team=self.team,
event="signed_up",
distinct_id="2",
properties={"$browser": "Chrome"},
)
# A Firefox event that should NOT count toward the alerted (Chrome) series.
_create_event(
team=self.team,
event="signed_up",
distinct_id="1",
properties={"$browser": "Firefox"},
)
flush_persons_and_events()

check_alert(alert["id"])

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.FIRING
# Next check is scheduled one day after the frozen "now".
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
# calculated value should only be from browser = Chrome
assert alert_check.calculated_value == 2
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None
30 changes: 29 additions & 1 deletion posthog/tasks/alerts/test/test_trends_relative_alerts.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Optional, Any
from unittest.mock import MagicMock, patch
from unittest.mock import ANY, MagicMock, patch
import dateutil


Expand Down Expand Up @@ -184,6 +184,10 @@ def test_relative_increase_absolute_upper_threshold_breached(
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None

mock_send_breaches.assert_called_once_with(
ANY, ["The insight value for previous week increased (2) more than upper threshold (1.0)"]
)

def test_relative_increase_upper_threshold_breached(
self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock
) -> None:
Expand Down Expand Up @@ -341,6 +345,10 @@ def test_relative_increase_lower_threshold_breached_1(
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None

mock_send_breaches.assert_called_once_with(
ANY, ["The insight value for previous week increased (-1) less than lower threshold (2.0)"]
)

# check percentage alert
check_alert(percentage_alert["id"])

Expand All @@ -354,6 +362,10 @@ def test_relative_increase_lower_threshold_breached_1(
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None

mock_send_breaches.assert_called_with(
ANY, ["The insight value for previous week increased (-50.00%) less than lower threshold (50.00%)"]
)

def test_relative_increase_lower_threshold_breached_2(
self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock
) -> None:
Expand Down Expand Up @@ -511,6 +523,10 @@ def test_relative_decrease_upper_threshold_breached(
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None

mock_send_breaches.assert_called_once_with(
ANY, ["The insight value for previous week decreased (2) more than upper threshold (1.0)"]
)

check_alert(percentage_alert["id"])

updated_alert = AlertConfiguration.objects.get(pk=percentage_alert["id"])
Expand All @@ -523,6 +539,10 @@ def test_relative_decrease_upper_threshold_breached(
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None

mock_send_breaches.assert_called_with(
ANY, ["The insight value for previous week decreased (66.67%) more than upper threshold (20.00%)"]
)

def test_relative_decrease_lower_threshold_breached(
self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock
) -> None:
Expand Down Expand Up @@ -592,6 +612,10 @@ def test_relative_decrease_lower_threshold_breached(
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None

mock_send_breaches.assert_called_once_with(
ANY, ["The insight value for previous week decreased (1) less than lower threshold (2.0)"]
)

check_alert(percentage_alert["id"])

updated_alert = AlertConfiguration.objects.get(pk=percentage_alert["id"])
Expand All @@ -604,6 +628,10 @@ def test_relative_decrease_lower_threshold_breached(
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None

mock_send_breaches.assert_called_with(
ANY, ["The insight value for previous week decreased (50.00%) less than lower threshold (80.00%)"]
)

def test_relative_increase_no_threshold_breached(
self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock
) -> None:
Expand Down
Loading

0 comments on commit 14a8b5c

Please sign in to comment.