Skip to content

Commit

Permalink
fix: persons api performance (#24351)
Browse files Browse the repository at this point in the history
Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
aspicer and github-actions[bot] authored Aug 14, 2024
1 parent cea6ac6 commit 57793ef
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 32 deletions.
4 changes: 2 additions & 2 deletions ee/clickhouse/queries/related_actors_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
SerializedGroup,
SerializedPerson,
get_groups,
get_people,
get_serialized_people,
)
from posthog.queries.person_distinct_id_query import get_team_distinct_ids_query

Expand Down Expand Up @@ -69,7 +69,7 @@ def _query_related_people(self) -> list[SerializedPerson]:
)
)

_, serialized_people = get_people(self.team, person_ids)
serialized_people = get_serialized_people(self.team, person_ids)
return serialized_people

def _query_related_groups(self, group_type_index: GroupTypeIndex) -> list[SerializedGroup]:
Expand Down
19 changes: 9 additions & 10 deletions mypy-baseline.txt
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,6 @@ posthog/hogql/printer.py:0: error: "FieldOrTable" has no attribute "name" [attr
posthog/hogql/printer.py:0: error: "FieldOrTable" has no attribute "name" [attr-defined]
posthog/hogql/printer.py:0: error: Argument 2 to "_get_materialized_column" of "_Printer" has incompatible type "str | int"; expected "str" [arg-type]
posthog/hogql/printer.py:0: error: Argument 1 to "_print_identifier" of "_Printer" has incompatible type "str | None"; expected "str" [arg-type]
posthog/api/plugin.py:0: error: Item "None" of "IO[Any] | None" has no attribute "read" [union-attr]
posthog/api/plugin.py:0: error: Item "None" of "IO[Any] | None" has no attribute "read" [union-attr]
posthog/api/action.py:0: error: Argument 1 to <tuple> has incompatible type "*tuple[str, ...]"; expected "type[BaseRenderer]" [arg-type]
posthog/test/base.py:0: error: Module has no attribute "configure" [attr-defined]
posthog/test/base.py:0: error: Incompatible types in assignment (expression has type "None", variable has type "Organization") [assignment]
Expand Down Expand Up @@ -285,7 +283,6 @@ posthog/queries/paths/paths_actors.py:0: error: Incompatible types in assignment
ee/clickhouse/queries/funnels/funnel_correlation.py:0: error: Statement is unreachable [unreachable]
posthog/api/insight.py:0: error: Argument 1 to <tuple> has incompatible type "*tuple[str, ...]"; expected "type[BaseRenderer]" [arg-type]
posthog/api/dashboards/dashboard.py:0: error: Argument 1 to "dashboard_queryset" of "DashboardTile" has incompatible type "DashboardTile_RelatedManager"; expected "_QuerySet[Any, Any]" [arg-type]
posthog/api/person.py:0: error: Incompatible return value type (got "int", expected "str") [return-value]
posthog/api/person.py:0: error: Argument 1 to <tuple> has incompatible type "*tuple[str, ...]"; expected "type[BaseRenderer]" [arg-type]
posthog/api/person.py:0: error: Argument 1 to "loads" has incompatible type "str | None"; expected "str | bytes | bytearray" [arg-type]
posthog/api/person.py:0: error: Argument "user" to "log_activity" has incompatible type "User | AnonymousUser"; expected "User | None" [arg-type]
Expand Down Expand Up @@ -349,9 +346,6 @@ posthog/tasks/exports/test/test_export_utils.py:0: error: Function is missing a
posthog/tasks/exports/test/test_csv_exporter_url_sanitising.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
posthog/tasks/exports/test/test_csv_exporter_url_sanitising.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
posthog/tasks/exports/test/test_csv_exporter_renders.py:0: error: Function is missing a type annotation [no-untyped-def]
posthog/queries/app_metrics/test/test_app_metrics.py:0: error: Argument 3 to "AppMetricsErrorDetailsQuery" has incompatible type "AppMetricsRequestSerializer"; expected "AppMetricsErrorsRequestSerializer" [arg-type]
posthog/queries/app_metrics/test/test_app_metrics.py:0: error: Argument 3 to "AppMetricsErrorDetailsQuery" has incompatible type "AppMetricsRequestSerializer"; expected "AppMetricsErrorsRequestSerializer" [arg-type]
posthog/queries/app_metrics/test/test_app_metrics.py:0: error: Argument 3 to "AppMetricsErrorDetailsQuery" has incompatible type "AppMetricsRequestSerializer"; expected "AppMetricsErrorsRequestSerializer" [arg-type]
posthog/hogql_queries/test/test_query_runner.py:0: error: Variable "TestQueryRunner" is not valid as a type [valid-type]
posthog/hogql_queries/test/test_query_runner.py:0: note: See https://mypy.readthedocs.io/en/stable/common_issues.html#variables-vs-type-aliases
posthog/hogql_queries/test/test_query_runner.py:0: error: Invalid base class "TestQueryRunner" [misc]
Expand Down Expand Up @@ -568,12 +562,13 @@ posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "typ
posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
posthog/session_recordings/session_recording_api.py:0: error: Argument "team_id" to "get_realtime_snapshots" has incompatible type "int"; expected "str" [arg-type]
posthog/queries/app_metrics/test/test_app_metrics.py:0: error: Argument 3 to "AppMetricsErrorDetailsQuery" has incompatible type "AppMetricsRequestSerializer"; expected "AppMetricsErrorsRequestSerializer" [arg-type]
posthog/queries/app_metrics/test/test_app_metrics.py:0: error: Argument 3 to "AppMetricsErrorDetailsQuery" has incompatible type "AppMetricsRequestSerializer"; expected "AppMetricsErrorsRequestSerializer" [arg-type]
posthog/queries/app_metrics/test/test_app_metrics.py:0: error: Argument 3 to "AppMetricsErrorDetailsQuery" has incompatible type "AppMetricsRequestSerializer"; expected "AppMetricsErrorsRequestSerializer" [arg-type]
posthog/api/test/test_decide.py:0: error: Item "None" of "Any | None" has no attribute "toolbar_mode" [union-attr]
posthog/api/test/test_decide.py:0: error: Item "None" of "Any | None" has no attribute "save" [union-attr]
posthog/api/plugin_log_entry.py:0: error: Name "timezone.datetime" is not defined [name-defined]
posthog/api/plugin_log_entry.py:0: error: Module "django.utils.timezone" does not explicitly export attribute "datetime" [attr-defined]
posthog/api/plugin_log_entry.py:0: error: Name "timezone.datetime" is not defined [name-defined]
posthog/api/plugin_log_entry.py:0: error: Module "django.utils.timezone" does not explicitly export attribute "datetime" [attr-defined]
posthog/api/plugin.py:0: error: Item "None" of "IO[Any] | None" has no attribute "read" [union-attr]
posthog/api/plugin.py:0: error: Item "None" of "IO[Any] | None" has no attribute "read" [union-attr]
posthog/warehouse/api/external_data_source.py:0: error: Incompatible return value type (got "tuple[ExternalDataSource, dict[str, list[tuple[str, str]]]]", expected "tuple[ExternalDataSource, list[Any]]") [return-value]
posthog/warehouse/api/external_data_source.py:0: error: Incompatible return value type (got "tuple[ExternalDataSource, dict[str, list[tuple[str, str]]]]", expected "tuple[ExternalDataSource, list[Any]]") [return-value]
posthog/warehouse/api/table.py:0: error: Unused "type: ignore" comment [unused-ignore]
Expand All @@ -585,6 +580,10 @@ posthog/temporal/tests/batch_exports/test_s3_batch_export_workflow.py:0: error:
posthog/temporal/tests/batch_exports/test_s3_batch_export_workflow.py:0: error: "tuple[Any, ...]" has no attribute "upload_state" [attr-defined]
posthog/migrations/0237_remove_timezone_from_teams.py:0: error: Argument 2 to "RunPython" has incompatible type "Callable[[Migration, Any], None]"; expected "_CodeCallable | None" [arg-type]
posthog/migrations/0228_fix_tile_layouts.py:0: error: Argument 2 to "RunPython" has incompatible type "Callable[[Migration, Any], None]"; expected "_CodeCallable | None" [arg-type]
posthog/api/plugin_log_entry.py:0: error: Name "timezone.datetime" is not defined [name-defined]
posthog/api/plugin_log_entry.py:0: error: Module "django.utils.timezone" does not explicitly export attribute "datetime" [attr-defined]
posthog/api/plugin_log_entry.py:0: error: Name "timezone.datetime" is not defined [name-defined]
posthog/api/plugin_log_entry.py:0: error: Module "django.utils.timezone" does not explicitly export attribute "datetime" [attr-defined]
posthog/warehouse/external_data_source/source.py:0: error: Incompatible types in assignment (expression has type "int", target has type "str") [assignment]
posthog/warehouse/external_data_source/source.py:0: error: Incompatible types in assignment (expression has type "int", target has type "str") [assignment]
posthog/warehouse/external_data_source/source.py:0: error: Incompatible types in assignment (expression has type "dict[str, Collection[str]]", variable has type "StripeSourcePayload") [assignment]
Expand Down
4 changes: 2 additions & 2 deletions posthog/api/cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
)
from posthog.queries.actor_base_query import (
ActorBaseQuery,
get_people,
get_serialized_people,
)
from posthog.queries.paths import PathsActors
from posthog.queries.person_query import PersonQuery
Expand Down Expand Up @@ -357,7 +357,7 @@ def persons(self, request: Request, **kwargs) -> Response:
workload=Workload.OFFLINE, # this endpoint is only used by external API requests
)
actor_ids = [row[0] for row in raw_result]
actors, serialized_actors = get_people(team, actor_ids, distinct_id_limit=10)
serialized_actors = get_serialized_people(team, actor_ids, distinct_id_limit=10)

_should_paginate = len(actor_ids) >= filter.limit

Expand Down
33 changes: 20 additions & 13 deletions posthog/api/person.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
from posthog.models.person.util import delete_person
from posthog.queries.actor_base_query import (
ActorBaseQuery,
get_people,
get_serialized_people,
)
from posthog.queries.funnels import ClickhouseFunnelActors, ClickhouseFunnelTrendsActors
from posthog.queries.funnels.funnel_strict_persons import ClickhouseFunnelStrictActors
Expand Down Expand Up @@ -89,6 +89,7 @@
from prometheus_client import Counter
from posthog.metrics import LABEL_TEAM_ID
from loginas.utils import is_impersonated_session
import builtins

DEFAULT_PAGE_LIMIT = 100
# Sync with .../lib/constants.tsx and .../ingestion/hooks.ts
Expand Down Expand Up @@ -133,19 +134,23 @@ def get_paginated_response_schema(self, schema):


def get_person_name(team: Team, person: Person) -> str:
if display_name := get_person_display_name(person, team):
return display_name
if len(person.distinct_ids) > 0:
# Prefer non-UUID distinct IDs (presumably from user identification) over UUIDs
return sorted(person.distinct_ids, key=is_anonymous_id)[0]
return person.pk
return get_person_name_helper(person.pk, person.properties, person.distinct_ids, team)


def get_person_display_name(person: Person, team: Team) -> str | None:
def get_person_name_helper(
person_pk: int, person_properties: dict[str, str], distinct_ids: list[str], team: Team
) -> str:
display_name = None
for property in team.person_display_name_properties or PERSON_DEFAULT_DISPLAY_NAME_PROPERTIES:
if person.properties and person.properties.get(property):
return person.properties.get(property)
return None
if person_properties and person_properties.get(property):
display_name = person_properties.get(property)
break
if display_name:
return display_name
if len(distinct_ids) > 0:
# Prefer non-UUID distinct IDs (presumably from user identification) over UUIDs
return sorted(distinct_ids, key=is_anonymous_id)[0]
return str(person_pk)


class PersonsThrottle(ClickHouseSustainedRateThrottle):
Expand Down Expand Up @@ -303,7 +308,7 @@ def list(self, request: request.Request, *args: Any, **kwargs: Any) -> response.
workload=Workload.OFFLINE, # this endpoint is only used by external API requests
)
actor_ids = [row[0] for row in raw_paginated_result]
_, serialized_actors = get_people(team, actor_ids)
serialized_actors = get_serialized_people(team, actor_ids)
_should_paginate = len(actor_ids) >= filter.limit

# If the undocumented include_total param is set to true, we'll return the total count of people
Expand Down Expand Up @@ -643,7 +648,9 @@ def _set_properties(self, properties, user):
)

# PRAGMA: Methods for getting Persons via clickhouse queries
def _respond_with_cached_results(self, results_package: dict[str, tuple[List, Optional[str], Optional[str], int]]): # noqa: UP006
def _respond_with_cached_results(
self, results_package: dict[str, tuple[builtins.list, Optional[str], Optional[str], int]]
): # noqa: UP006
if not results_package:
return response.Response(data=[])

Expand Down
13 changes: 8 additions & 5 deletions posthog/hogql_queries/actor_strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,18 @@ class PersonStrategy(ActorStrategy):
origin_id = "id"

# This is hand written instead of using the ORM because the ORM was blowing up the memory on exports and taking forever
def get_actors(self, actor_ids) -> dict[str, dict]:
def get_actors(self, actor_ids, order_by: str = "") -> dict[str, dict]:
# If actor queries start quietly dying again, this might need batching at some point
# but currently works with 800,000 persondistinctid entries (May 24, 2024)
with connection.cursor() as cursor:
cursor.execute(
"""SELECT posthog_person.id, posthog_person.uuid, posthog_person.properties, posthog_person.is_identified, posthog_person.created_at
persons_query = """SELECT posthog_person.id, posthog_person.uuid, posthog_person.properties, posthog_person.is_identified, posthog_person.created_at
FROM posthog_person
WHERE posthog_person.uuid = ANY(%(uuids)s)
AND posthog_person.team_id = %(team_id)s""",
AND posthog_person.team_id = %(team_id)s"""
if order_by:
persons_query += f" ORDER BY {order_by}"
with connection.cursor() as cursor:
cursor.execute(
persons_query,
{"uuids": list(actor_ids), "team_id": self.team.pk},
)
people = cursor.fetchall()
Expand Down
33 changes: 33 additions & 0 deletions posthog/queries/actor_base_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@
from django.db.models.query import Prefetch, QuerySet

from posthog.constants import INSIGHT_FUNNELS, INSIGHT_PATHS, INSIGHT_TRENDS
from posthog.hogql_queries.actor_strategies import PersonStrategy
from posthog.hogql_queries.insights.paginators import HogQLHasMorePaginator
from posthog.models import Entity, Filter, PersonDistinctId, SessionRecording, Team
from posthog.models.filters.mixins.utils import cached_property
from posthog.models.filters.retention_filter import RetentionFilter
from posthog.models.filters.stickiness_filter import StickinessFilter
from posthog.models.group import Group
from posthog.models.person import Person
from posthog.queries.insight import insight_sync_execute
from posthog.schema import ActorsQuery


class EventInfoForRecording(TypedDict):
Expand Down Expand Up @@ -288,6 +291,36 @@ def get_people(
return persons, serialize_people(team, persons, value_per_actor_id)


# A faster get_people if you don't need the Person objects
def get_serialized_people(
team: Team, people_ids: list[Any], value_per_actor_id: Optional[dict[str, float]] = None, distinct_id_limit=1000
) -> list[SerializedPerson]:
persons_dict = PersonStrategy(team, ActorsQuery(), HogQLHasMorePaginator()).get_actors(
people_ids, order_by="created_at DESC, uuid"
)
from posthog.api.person import get_person_name_helper

return [
SerializedPerson(
type="person",
id=uuid,
uuid=uuid,
created_at=person_dict["created_at"],
properties=person_dict["properties"],
is_identified=person_dict["is_identified"],
name=get_person_name_helper(
person_dict["id"], person_dict["properties"], person_dict["distinct_ids"], team
),
distinct_ids=person_dict["distinct_ids"]
if distinct_id_limit is None
else person_dict["distinct_ids"][:distinct_id_limit],
matched_recordings=[],
value_at_data_point=value_per_actor_id[str(uuid)] if value_per_actor_id else None,
)
for uuid, person_dict in persons_dict.items()
]


def serialize_people(
team: Team,
data: Union[QuerySet[Person], list[Person]],
Expand Down

0 comments on commit 57793ef

Please sign in to comment.