diff --git a/posthog/tasks/scheduled.py b/posthog/tasks/scheduled.py index 2a43d1dd834a8..a3b7e441f018c 100644 --- a/posthog/tasks/scheduled.py +++ b/posthog/tasks/scheduled.py @@ -47,6 +47,7 @@ update_quota_limiting, verify_persons_data_in_sync, update_survey_iteration, + invalid_web_replays, ) from posthog.utils import get_crontab @@ -199,6 +200,7 @@ def setup_periodic_tasks(sender: Celery, **kwargs: Any) -> None: pg_plugin_server_query_timing.s(), name="PG plugin server query timing", ) + add_periodic_task_with_expiry( sender, 60, @@ -311,3 +313,10 @@ def setup_periodic_tasks(sender: Celery, **kwargs: Any) -> None: check_data_import_row_limits.s(), name="check external data rows synced", ) + + add_periodic_task_with_expiry( + sender, + 3600, + invalid_web_replays.s(), + name="Invalid web replays count", + ) diff --git a/posthog/tasks/tasks.py b/posthog/tasks/tasks.py index a9cf873fdbd9b..0964e00171708 100644 --- a/posthog/tasks/tasks.py +++ b/posthog/tasks/tasks.py @@ -228,6 +228,42 @@ def ingestion_lag() -> None: pass +@shared_task(ignore_result=True) +def invalid_web_replays() -> None: + from posthog.client import sync_execute + + # ultimately I want to observe values by team id, but at the moment that would be lots of series, let's reduce the value first + query = """ + select + --team_id, + count() + from ( + select any(team_id) as team_id, argMinMerge(first_url) as first_url, argMinMerge(snapshot_source) as snapshot_source + from session_replay_events + where min_first_timestamp >= now() - interval 1 hour + and min_first_timestamp <= now() + group by session_id + having first_url is null and snapshot_source = 'web' + ) + --group by team_id + """ + + try: + results = sync_execute( + query, + ) + with pushed_metrics_registry("celery_replay_tracking") as registry: + gauge = Gauge( + "replay_tracking_web_replay_with_missing_first_url", + "Acts as a proxy for replay sessions which haven't received a full snapshot", + registry=registry, + ) + count = results[0][0] + gauge.set(count) + except: + pass + + KNOWN_CELERY_TASK_IDENTIFIERS = { "pluginJob", "runEveryHour",