Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(persons): limit to maximum of 2500 distinct_ids for cross-db join #18414

Merged
merged 10 commits into from
Nov 8, 2023
5 changes: 2 additions & 3 deletions posthog/hogql_queries/events_query_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from posthog.hogql_queries.query_runner import QueryRunner
from posthog.models import Action, Person, Team
from posthog.models.element import chain_to_elements
from posthog.models.person.person import get_distinct_ids_for_subquery
from posthog.models.person.util import get_persons_by_distinct_ids
from posthog.schema import EventsQuery, EventsQueryResponse
from posthog.utils import relative_date_parse
Expand Down Expand Up @@ -118,12 +119,10 @@ def to_query(self) -> ast.SelectQuery:
person: Optional[Person] = get_pk_or_uuid(
Person.objects.filter(team=self.team), self.query.personId
).first()
distinct_ids = person.distinct_ids if person is not None else []
ids_list = list(map(str, distinct_ids))
where_exprs.append(
parse_expr(
"distinct_id in {list}",
{"list": ast.Constant(value=ids_list)},
{"list": ast.Constant(value=get_distinct_ids_for_subquery(person, self.team))},
timings=self.timings,
)
)
Expand Down
4 changes: 2 additions & 2 deletions posthog/models/event/query_event_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
SELECT_EVENT_BY_TEAM_AND_CONDITIONS_FILTERS_SQL,
SELECT_EVENT_BY_TEAM_AND_CONDITIONS_SQL,
)
from posthog.models.person.person import get_distinct_ids_for_subquery
from posthog.models.property.util import parse_prop_grouped_clauses
from posthog.queries.insight import insight_query_with_columns
from posthog.utils import relative_date_parse
Expand Down Expand Up @@ -45,8 +46,7 @@ def determine_event_conditions(
elif k == "person_id":
result += """AND distinct_id IN (%(distinct_ids)s) """
person = get_pk_or_uuid(Person.objects.filter(team=team), v).first()
distinct_ids = person.distinct_ids if person is not None else []
params.update({"distinct_ids": list(map(str, distinct_ids))})
params.update({"distinct_ids": get_distinct_ids_for_subquery(person, team)})
elif k == "distinct_id":
result += "AND distinct_id = %(distinct_id)s "
params.update({"distinct_id": v})
Expand Down
41 changes: 41 additions & 0 deletions posthog/models/person/person.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@

from posthog.models.utils import UUIDT

from ..team import Team

MAX_LIMIT_DISTINCT_IDS = 2500


class PersonManager(models.Manager):
def create(self, *args: Any, **kwargs: Any):
Expand Down Expand Up @@ -173,3 +177,40 @@ class Meta:

oldest_event: models.DateTimeField = models.DateTimeField()
version: models.BigIntegerField = models.BigIntegerField(null=True, blank=True)


def get_distinct_ids_for_subquery(person: Person | None, team: Team) -> List[str]:
"""_summary_
Fetching distinct_ids for a person from CH is slow, so we
fetch them from PG for certain queries. Therfore we need
to inline the ids in a `distinct_ids IN (...)` clause.

This can cause the query to explode for persons with many
ids. Thus we need to limit the amount of distinct_ids we
pass through.

The first distinct_ids should contain the real distinct_ids
for a person and later ones should be associated with current
events. Therefore we union from both sides.

Many ids are usually a sign of instrumentation issues
on the customer side.
"""
first_ids_limit = 100
last_ids_limit = MAX_LIMIT_DISTINCT_IDS - first_ids_limit

if person is not None:
first_ids = (
PersonDistinctId.objects.filter(person=person, team=team)
thmsobrmlr marked this conversation as resolved.
Show resolved Hide resolved
.order_by("id")
.values_list("distinct_id", flat=True)[:first_ids_limit]
)
last_ids = (
PersonDistinctId.objects.filter(person=person, team=team)
.order_by("-id")
.values_list("distinct_id", flat=True)[:last_ids_limit]
)
distinct_ids = first_ids.union(last_ids)
else:
distinct_ids = [] # type: ignore
return list(map(str, distinct_ids))
Loading