From 397a009da5cd9a2a06c6a9de6e0787ee137e578f Mon Sep 17 00:00:00 2001 From: Oliver Browne Date: Thu, 17 Oct 2024 17:11:24 +0300 Subject: [PATCH 1/5] add management command --- .../delete_persons_with_no_distinct_ids.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 posthog/management/commands/delete_persons_with_no_distinct_ids.py diff --git a/posthog/management/commands/delete_persons_with_no_distinct_ids.py b/posthog/management/commands/delete_persons_with_no_distinct_ids.py new file mode 100644 index 0000000000000..d93362226840d --- /dev/null +++ b/posthog/management/commands/delete_persons_with_no_distinct_ids.py @@ -0,0 +1,32 @@ +from django.core.management.base import BaseCommand, CommandError + +from posthog.models import Team, Person, PersonDistinctId + + +class Command(BaseCommand): + help = "Delete person rows that have no associated persondistinctid rows, by team" + + def add_arguments(self, parser): + parser.add_argument("--team-id", default=None, type=int, help="Team ID to migrate from (on this instance)") + + def handle(self, **options): + team_id = options["team_id"] + + if not team_id: + raise CommandError("source Team ID is required") + + team = Team.objects.get(pk=team_id) + + print("Deleting persons with no distinct ids for team", team_id) # noqa: T201 + + # There's a relationship from persondistinctid to person, but not in the other + # direction, so we have to iterate over the entire person set to find the ones + # that have no distinct ids + people = Person.objects.filter(team=team) + + # Delete persons with no distinct ids + deleted = 0 + for p in people: + if not PersonDistinctId.objects.filter(person=p).exists(): + p.delete() + deleted += 1 From 4e5ab65f698b8f14f9cae762b7d2f3397bcdfe83 Mon Sep 17 00:00:00 2001 From: Oliver Browne Date: Thu, 17 Oct 2024 17:14:04 +0300 Subject: [PATCH 2/5] more print --- .../management/commands/delete_persons_with_no_distinct_ids.py | 3 +++ 1 file changed, 3 insertions(+) diff 
--git a/posthog/management/commands/delete_persons_with_no_distinct_ids.py b/posthog/management/commands/delete_persons_with_no_distinct_ids.py index d93362226840d..3deabc8d7ea0a 100644 --- a/posthog/management/commands/delete_persons_with_no_distinct_ids.py +++ b/posthog/management/commands/delete_persons_with_no_distinct_ids.py @@ -28,5 +28,8 @@ def handle(self, **options): deleted = 0 for p in people: if not PersonDistinctId.objects.filter(person=p).exists(): + print(f"Deleting person {p} with no distinct ids") # noqa: T201 p.delete() deleted += 1 + + print(f"Deleted {deleted} persons with no distinct ids") # noqa: T201 From b7a12537d97131269d2bb9d462b3cb26e3330493 Mon Sep 17 00:00:00 2001 From: Oliver Browne Date: Thu, 17 Oct 2024 17:14:49 +0300 Subject: [PATCH 3/5] dry run --- .../commands/delete_persons_with_no_distinct_ids.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/posthog/management/commands/delete_persons_with_no_distinct_ids.py b/posthog/management/commands/delete_persons_with_no_distinct_ids.py index 3deabc8d7ea0a..b7faffd8637b6 100644 --- a/posthog/management/commands/delete_persons_with_no_distinct_ids.py +++ b/posthog/management/commands/delete_persons_with_no_distinct_ids.py @@ -8,9 +8,12 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument("--team-id", default=None, type=int, help="Team ID to migrate from (on this instance)") + # Make it dry-runnable + parser.add_argument("--dry-run", action="store_true", help="Dry run") def handle(self, **options): team_id = options["team_id"] + dry_run = options["dry_run"] if not team_id: raise CommandError("source Team ID is required") @@ -29,7 +32,8 @@ def handle(self, **options): for p in people: if not PersonDistinctId.objects.filter(person=p).exists(): print(f"Deleting person {p} with no distinct ids") # noqa: T201 - p.delete() + if not dry_run: + p.delete() deleted += 1 print(f"Deleted {deleted} persons with no distinct ids") # noqa: T201 From 
79015e6cb2257bc48de8eb859ef53962d481c98f Mon Sep 17 00:00:00 2001 From: Oliver Browne Date: Thu, 17 Oct 2024 17:16:18 +0300 Subject: [PATCH 4/5] dry run default on --- .../management/commands/delete_persons_with_no_distinct_ids.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/posthog/management/commands/delete_persons_with_no_distinct_ids.py b/posthog/management/commands/delete_persons_with_no_distinct_ids.py index b7faffd8637b6..9b7f4f7907fb8 100644 --- a/posthog/management/commands/delete_persons_with_no_distinct_ids.py +++ b/posthog/management/commands/delete_persons_with_no_distinct_ids.py @@ -8,8 +8,7 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument("--team-id", default=None, type=int, help="Team ID to migrate from (on this instance)") - # Make it dry-runnable - parser.add_argument("--dry-run", action="store_true", help="Dry run") + parser.add_argument("--dry-run", action="store_false", help="Dry run (default: true)") def handle(self, **options): team_id = options["team_id"] From 0553dfd1ff6745dc53fd6ab2602100124bf53741 Mon Sep 17 00:00:00 2001 From: Oliver Browne Date: Thu, 17 Oct 2024 18:36:18 +0300 Subject: [PATCH 5/5] just use sql --- .../delete_persons_with_no_distinct_ids.py | 70 ++++++++++++++----- 1 file changed, 51 insertions(+), 19 deletions(-) diff --git a/posthog/management/commands/delete_persons_with_no_distinct_ids.py b/posthog/management/commands/delete_persons_with_no_distinct_ids.py index 9b7f4f7907fb8..80e52f6e1bde6 100644 --- a/posthog/management/commands/delete_persons_with_no_distinct_ids.py +++ b/posthog/management/commands/delete_persons_with_no_distinct_ids.py @@ -1,6 +1,5 @@ from django.core.management.base import BaseCommand, CommandError - -from posthog.models import Team, Person, PersonDistinctId +from django.db import connection class Command(BaseCommand): @@ -17,22 +16,55 @@ def handle(self, **options): if not team_id: raise CommandError("source Team ID is required") - 
team = Team.objects.get(pk=team_id) - print("Deleting persons with no distinct ids for team", team_id) # noqa: T201 - # There's a relationship from persondistinctid to person, but not in the other - # direction, so we have to iterate over the entire person set to find the ones - # that have no distinct ids - people = Person.objects.filter(team=team) - - # Delete persons with no distinct ids - deleted = 0 - for p in people: - if not PersonDistinctId.objects.filter(person=p).exists(): - print(f"Deleting person {p} with no distinct ids") # noqa: T201 - if not dry_run: - p.delete() - deleted += 1 - - print(f"Deleted {deleted} persons with no distinct ids") # noqa: T201 + if dry_run: + delete_persons_without_distinct_ids_raw_sql_dry_run(team_id) + else: + delete_persons_without_distinct_ids_raw_sql(team_id) + + +def delete_persons_without_distinct_ids_raw_sql(team_id): + with connection.cursor() as cursor: + cursor.execute( + """ + WITH persons_to_delete AS ( + SELECT p.id + FROM posthog_person p + LEFT JOIN posthog_persondistinctid pd ON p.id = pd.person_id AND p.team_id = pd.team_id + WHERE p.team_id = %s AND pd.id IS NULL + ) + DELETE FROM posthog_person + WHERE id IN (SELECT id FROM persons_to_delete) + RETURNING id; + """, + [team_id], + ) + + deleted_ids = cursor.fetchall() + deleted_count = len(deleted_ids) + + print(f"Deleted {deleted_count} Person objects with no PersonDistinctIds for team {team_id}.") # noqa: T201 + return deleted_count + + +def delete_persons_without_distinct_ids_raw_sql_dry_run(team_id): + with connection.cursor() as cursor: + cursor.execute( + """ + WITH persons_to_delete AS ( + SELECT p.id + FROM posthog_person p + LEFT JOIN posthog_persondistinctid pd ON p.id = pd.person_id AND p.team_id = pd.team_id + WHERE p.team_id = %s AND pd.id IS NULL + ) + SELECT COUNT(*) FROM persons_to_delete; + """, + [team_id], + ) + + deleted_count = cursor.fetchone() + deleted_count = deleted_count[0] if deleted_count else 0 + + print(f"Would have deleted 
{deleted_count} Person objects with no PersonDistinctIds for team {team_id}.") # noqa: T201 + return deleted_count