diff --git a/posthog/hogql_queries/events_query_runner.py b/posthog/hogql_queries/events_query_runner.py index 6b168a3f7dd98..a1b6973995668 100644 --- a/posthog/hogql_queries/events_query_runner.py +++ b/posthog/hogql_queries/events_query_runner.py @@ -17,6 +17,7 @@ from posthog.hogql_queries.query_runner import QueryRunner from posthog.models import Action, Person, Team from posthog.models.element import chain_to_elements +from posthog.models.person.person import get_distinct_ids_for_subquery from posthog.models.person.util import get_persons_by_distinct_ids from posthog.schema import EventsQuery, EventsQueryResponse from posthog.utils import relative_date_parse @@ -118,12 +119,10 @@ def to_query(self) -> ast.SelectQuery: person: Optional[Person] = get_pk_or_uuid( Person.objects.filter(team=self.team), self.query.personId ).first() - distinct_ids = person.distinct_ids if person is not None else [] - ids_list = list(map(str, distinct_ids)) where_exprs.append( parse_expr( "distinct_id in {list}", - {"list": ast.Constant(value=ids_list)}, + {"list": ast.Constant(value=get_distinct_ids_for_subquery(person, self.team))}, timings=self.timings, ) ) diff --git a/posthog/models/event/query_event_list.py b/posthog/models/event/query_event_list.py index 66fc02c8ba55a..9608fdd8e2330 100644 --- a/posthog/models/event/query_event_list.py +++ b/posthog/models/event/query_event_list.py @@ -15,6 +15,7 @@ SELECT_EVENT_BY_TEAM_AND_CONDITIONS_FILTERS_SQL, SELECT_EVENT_BY_TEAM_AND_CONDITIONS_SQL, ) +from posthog.models.person.person import get_distinct_ids_for_subquery from posthog.models.property.util import parse_prop_grouped_clauses from posthog.queries.insight import insight_query_with_columns from posthog.utils import relative_date_parse @@ -45,8 +46,7 @@ def determine_event_conditions( elif k == "person_id": result += """AND distinct_id IN (%(distinct_ids)s) """ person = get_pk_or_uuid(Person.objects.filter(team=team), v).first() - distinct_ids = person.distinct_ids if person is 
not None else [] - params.update({"distinct_ids": list(map(str, distinct_ids))}) + params.update({"distinct_ids": get_distinct_ids_for_subquery(person, team)}) elif k == "distinct_id": result += "AND distinct_id = %(distinct_id)s " params.update({"distinct_id": v}) diff --git a/posthog/models/person/person.py b/posthog/models/person/person.py index cae5e450fa766..92b6103f2c8b6 100644 --- a/posthog/models/person/person.py +++ b/posthog/models/person/person.py @@ -5,6 +5,10 @@ from posthog.models.utils import UUIDT +from ..team import Team + +MAX_LIMIT_DISTINCT_IDS = 2500 + class PersonManager(models.Manager): def create(self, *args: Any, **kwargs: Any): @@ -173,3 +177,40 @@ class Meta: oldest_event: models.DateTimeField = models.DateTimeField() version: models.BigIntegerField = models.BigIntegerField(null=True, blank=True) + + +def get_distinct_ids_for_subquery(person: Person | None, team: Team) -> List[str]: + """Return a capped list of a person's distinct_ids (empty when person is None). + Fetching distinct_ids for a person from CH is slow, so we + fetch them from PG for certain queries. Therefore we need + to inline the ids in a `distinct_id IN (...)` clause. + + This can cause the query to explode for persons with many + ids. Thus we need to limit the number of distinct_ids we + pass through. + + The first distinct_ids should contain the real distinct_ids + for a person and later ones should be associated with current + events. Therefore we union from both sides. + + Many ids are usually a sign of instrumentation issues + on the customer side. 
+ """ + first_ids_limit = 100 + last_ids_limit = MAX_LIMIT_DISTINCT_IDS - first_ids_limit + + if person is not None: + first_ids = ( + PersonDistinctId.objects.filter(person=person, team=team) + .order_by("id") + .values_list("distinct_id", flat=True)[:first_ids_limit] + ) + last_ids = ( + PersonDistinctId.objects.filter(person=person, team=team) + .order_by("-id") + .values_list("distinct_id", flat=True)[:last_ids_limit] + ) + distinct_ids = first_ids.union(last_ids) + else: + distinct_ids = [] # type: ignore + return list(map(str, distinct_ids))