From 78c9184231d494b46fce35c76022301f0d3165de Mon Sep 17 00:00:00 2001 From: ted kaemming <65315+tkaemming@users.noreply.github.com> Date: Mon, 18 Mar 2024 13:26:43 -0700 Subject: [PATCH] feat(persons-on-events): Support backfilling multiple teams, and store backfill state (#20886) --- .../backfill_distinct_id_overrides.py | 49 ++++++++++++++++--- .../test_backfill_distinct_id_overrides.py | 4 +- 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/posthog/management/commands/backfill_distinct_id_overrides.py b/posthog/management/commands/backfill_distinct_id_overrides.py index 35133ef4addbf..507e744a93d0e 100644 --- a/posthog/management/commands/backfill_distinct_id_overrides.py +++ b/posthog/management/commands/backfill_distinct_id_overrides.py @@ -2,6 +2,7 @@ import logging from dataclasses import dataclass +from typing import Sequence import structlog from django.core.management.base import BaseCommand, CommandError @@ -14,10 +15,12 @@ @dataclass -class BackfillQuery: +class Backfill: team_id: int def execute(self, dry_run: bool = False) -> None: + logger.info("Starting %r...", self) + query = """ SELECT team_id, @@ -51,20 +54,54 @@ def execute(self, dry_run: bool = False) -> None: parameters, ) + # XXX: The RETURNING set isn't really useful here, but this QuerySet + # needs to be iterated over to force execution, so we might as well + # return something... + updated_teams = list( + Team.objects.raw( + """ + UPDATE posthog_team + SET extra_settings = COALESCE(extra_settings, '{}'::jsonb) || '{"distinct_id_overrides_backfilled": true}'::jsonb + WHERE id = %s + RETURNING * + """, + [self.team_id], + ) + ) + assert not len(updated_teams) > 1 + + logger.info("Completed %r!", self) + class Command(BaseCommand): help = "Backfill person_distinct_id_overrides records." def add_arguments(self, parser): - parser.add_argument("--team-id", required=True, type=int, help="team to backfill for") + parser.add_argument( + "--team-id", + required=False, + type=int, + dest="team_id_list", + action="append", + help="team(s) to backfill (defaults to all un-backfilled teams)", + ) parser.add_argument( "--live-run", action="store_true", help="actually execute INSERT queries (default is dry-run)" ) - def handle(self, *, live_run: bool, team_id: int, **options): + def handle(self, *, live_run: bool, team_id_list: Sequence[int] | None, **options): logger.setLevel(logging.INFO) - if not Team.objects.filter(id=team_id).exists(): - raise CommandError(f"Team with id={team_id!r} does not exist") + if team_id_list is not None: + team_ids = set(team_id_list) + existing_team_ids = set(Team.objects.filter(id__in=team_ids).values_list("id", flat=True)) + if existing_team_ids != team_ids: + raise CommandError(f"Teams with ids {team_ids - existing_team_ids!r} do not exist") + else: + team_ids = set( + Team.objects.exclude(extra_settings__distinct_id_overrides_backfilled=True).values_list("id", flat=True) + ) - BackfillQuery(team_id).execute(dry_run=not live_run) + logger.info("Starting backfill for %s teams...", len(team_ids)) + for team_id in team_ids: + Backfill(team_id).execute(dry_run=not live_run) diff --git a/posthog/management/commands/test/test_backfill_distinct_id_overrides.py b/posthog/management/commands/test/test_backfill_distinct_id_overrides.py index 6a161d6205d18..55f7c76116936 100644 --- a/posthog/management/commands/test/test_backfill_distinct_id_overrides.py +++ b/posthog/management/commands/test/test_backfill_distinct_id_overrides.py @@ -2,7 +2,7 @@ import uuid from posthog.clickhouse.client.execute import sync_execute -from posthog.management.commands.backfill_distinct_id_overrides import BackfillQuery +from posthog.management.commands.backfill_distinct_id_overrides import Backfill from posthog.test.base import BaseTest, ClickhouseTestMixin @@ -32,7 +32,7 @@ def __run_test_backfill(self, dry_run: bool) -> None: {"team_id": self.team.id}, ) == [(0,)] - BackfillQuery(self.team.id).execute(dry_run=dry_run) + Backfill(self.team.id).execute(dry_run=dry_run) read_columns = ["team_id", "distinct_id", "person_id", "version"] distinct_id_override_rows = sync_execute(