Skip to content

Commit

Permalink
perf: Speed up selecting from persons table (#25824)
Browse files Browse the repository at this point in the history
Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
timgl and github-actions[bot] authored Oct 25, 2024
1 parent 0ad1b5f commit dff2676
Show file tree
Hide file tree
Showing 5 changed files with 235 additions and 12 deletions.
2 changes: 1 addition & 1 deletion posthog/api/test/__snapshots__/test_api_docs.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,8 @@
'/home/runner/work/posthog/posthog/posthog/api/survey.py: Warning [SurveyViewSet > SurveySerializer]: unable to resolve type hint for function "get_conditions". Consider using a type hint or @extend_schema_field. Defaulting to string.',
'/home/runner/work/posthog/posthog/posthog/api/web_experiment.py: Warning [WebExperimentViewSet]: could not derive type of path parameter "project_id" because model "posthog.models.web_experiment.WebExperiment" contained no such field. Consider annotating parameter with @extend_schema. Defaulting to "string".',
'Warning: encountered multiple names for the same choice set (HrefMatchingEnum). This may be unwanted even though the generated schema is technically correct. Add an entry to ENUM_NAME_OVERRIDES to fix the naming.',
'Warning: enum naming encountered a non-optimally resolvable collision for fields named "kind". The same name has been used for multiple choice sets in multiple components. The collision was resolved with "Kind069Enum". add an entry to ENUM_NAME_OVERRIDES to fix the naming.',
'Warning: enum naming encountered a non-optimally resolvable collision for fields named "kind". The same name has been used for multiple choice sets in multiple components. The collision was resolved with "KindCfaEnum". add an entry to ENUM_NAME_OVERRIDES to fix the naming.',
'Warning: enum naming encountered a non-optimally resolvable collision for fields named "kind". The same name has been used for multiple choice sets in multiple components. The collision was resolved with "Kind069Enum". add an entry to ENUM_NAME_OVERRIDES to fix the naming.',
'Warning: enum naming encountered a non-optimally resolvable collision for fields named "type". The same name has been used for multiple choice sets in multiple components. The collision was resolved with "TypeF73Enum". add an entry to ENUM_NAME_OVERRIDES to fix the naming.',
'Warning: encountered multiple names for the same choice set (EffectivePrivilegeLevelEnum). This may be unwanted even though the generated schema is technically correct. Add an entry to ENUM_NAME_OVERRIDES to fix the naming.',
'Warning: encountered multiple names for the same choice set (MembershipLevelEnum). This may be unwanted even though the generated schema is technically correct. Add an entry to ENUM_NAME_OVERRIDES to fix the naming.',
Expand Down
44 changes: 41 additions & 3 deletions posthog/hogql/database/schema/persons.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from typing import cast, Optional, Self
from posthog.hogql.parser import parse_select
import posthoganalytics

from posthog.hogql.ast import SelectQuery, And, CompareOperation, CompareOperationOp, Field, JoinExpr
from posthog.hogql.base import Expr
from posthog.hogql.constants import HogQLQuerySettings
from posthog.hogql.context import HogQLContext
from posthog.hogql import ast
from posthog.hogql.database.argmax import argmax_select
from posthog.hogql.database.models import (
BooleanDatabaseField,
Expand Down Expand Up @@ -56,10 +58,46 @@ def select_from_persons_table(
version = PersonsArgMaxVersion.V2
break

if version == PersonsArgMaxVersion.V2:
from posthog.hogql import ast
from posthog.hogql.parser import parse_select
and_conditions = []

if filter is not None:
and_conditions.append(filter)

# For now, only do this optimization for directly querying the persons table (without joins or as part of a subquery) to avoid knock-on effects to insight queries
if (
node.select_from
and node.select_from.type
and hasattr(node.select_from.type, "table")
and node.select_from.type.table
and isinstance(node.select_from.type.table, PersonsTable)
):
extractor = WhereClauseExtractor(context)
extractor.add_local_tables(join_or_table)
where = extractor.get_inner_where(node)
if where:
select = argmax_select(
table_name="raw_persons",
select_fields=join_or_table.fields_accessed,
group_fields=["id"],
argmax_field="version",
deleted_field="is_deleted",
timestamp_field_to_clamp="created_at",
)
inner_select = cast(
ast.SelectQuery,
parse_select(
"""
SELECT id FROM raw_persons as where_optimization
"""
),
)
inner_select.where = where
select.where = ast.CompareOperation(
left=ast.Field(chain=["id"]), right=inner_select, op=ast.CompareOperationOp.In
)
return select

if version == PersonsArgMaxVersion.V2:
select = cast(
ast.SelectQuery,
parse_select(
Expand Down
58 changes: 58 additions & 0 deletions posthog/hogql/database/schema/test/__snapshots__/test_persons.ambr
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# serializer version: 1
# name: TestPersonOptimization.test_joins_are_left_alone_for_now
'''
SELECT events.uuid AS uuid
FROM events
INNER JOIN
(SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS events__pdi___person_id,
argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id,
person_distinct_id2.distinct_id AS distinct_id
FROM person_distinct_id2
WHERE equals(person_distinct_id2.team_id, 2)
GROUP BY person_distinct_id2.distinct_id
HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0) SETTINGS optimize_aggregation_in_order=1) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id)
INNER JOIN
(SELECT person.id AS id,
replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, '$some_prop'), ''), 'null'), '^"|"$', '') AS `properties___$some_prop`
FROM person
WHERE and(equals(person.team_id, 2), ifNull(in(tuple(person.id, person.version),
(SELECT person.id AS id, max(person.version) AS version
FROM person
WHERE equals(person.team_id, 2)
GROUP BY person.id
HAVING and(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(toTimeZone(person.created_at, 'UTC'), person.version), plus(now64(6, 'UTC'), toIntervalDay(1))), 0)))), 0)) SETTINGS optimize_aggregation_in_order=1) AS events__pdi__person ON equals(events__pdi.events__pdi___person_id, events__pdi__person.id)
WHERE and(equals(events.team_id, 2), ifNull(equals(events__pdi__person.`properties___$some_prop`, 'something'), 0))
LIMIT 100 SETTINGS readonly=2,
max_execution_time=60,
allow_experimental_object_type=1,
format_csv_allow_double_quotes=0,
max_ast_elements=4000000,
max_expanded_ast_elements=4000000,
max_bytes_before_external_group_by=0
'''
# ---
# name: TestPersonOptimization.test_simple_filter
'''
SELECT persons.id AS id,
persons.properties AS properties
FROM
(SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, '$some_prop'), ''), 'null'), '^"|"$', ''), person.version) AS `properties___$some_prop`,
argMax(person.properties, person.version) AS properties,
person.id AS id
FROM person
WHERE and(equals(person.team_id, 2), in(id,
(SELECT where_optimization.id AS id
FROM person AS where_optimization
WHERE and(equals(where_optimization.team_id, 2), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(where_optimization.properties, '$some_prop'), ''), 'null'), '^"|"$', ''), 'something'), 0)))))
GROUP BY person.id
HAVING and(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(toTimeZone(person.created_at, 'UTC'), person.version), plus(now64(6, 'UTC'), toIntervalDay(1))), 0))) AS persons
WHERE ifNull(equals(persons.`properties___$some_prop`, 'something'), 0)
LIMIT 100 SETTINGS readonly=2,
max_execution_time=60,
allow_experimental_object_type=1,
format_csv_allow_double_quotes=0,
max_ast_elements=4000000,
max_expanded_ast_elements=4000000,
max_bytes_before_external_group_by=0
'''
# ---
128 changes: 128 additions & 0 deletions posthog/hogql/database/schema/test/test_persons.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
from posthog.hogql.parser import parse_select
from posthog.schema import (
PersonsOnEventsMode,
InsightActorsQuery,
TrendsQuery,
ActorsQuery,
EventsNode,
InsightDateRange,
)
from posthog.hogql_queries.actors_query_runner import ActorsQueryRunner
from posthog.hogql.modifiers import create_default_modifiers_for_team
from posthog.hogql.query import execute_hogql_query
from posthog.test.base import (
APIBaseTest,
ClickhouseTestMixin,
_create_person,
_create_event,
snapshot_clickhouse_queries,
)
from posthog.models.person.util import create_person
from datetime import datetime

from unittest.mock import patch, Mock


@patch("posthoganalytics.feature_enabled", new=Mock(return_value=True)) # for persons-inner-where-optimization
class TestPersonOptimization(ClickhouseTestMixin, APIBaseTest):
"""
Mostly tests for the optimization of pre-filtering before aggregating. See https://github.com/PostHog/posthog/pull/25604
"""

def setUp(self):
super().setUp()
self.first_person = _create_person(
team_id=self.team.pk,
distinct_ids=["1"],
properties={"$some_prop": "something", "$another_prop": "something1"},
created_at=datetime(2024, 1, 1, 12),
)
self.second_person = _create_person(
team_id=self.team.pk,
properties={"$some_prop": "ifwematcholdversionsthiswillmatch", "$another_prop": "something2"},
distinct_ids=["2"],
version=1,
created_at=datetime(2024, 1, 1, 13),
)
# update second_person with the correct prop
create_person(
team_id=self.team.pk,
uuid=str(self.second_person.uuid),
properties={"$some_prop": "something", "$another_prop": "something2"},
created_at=datetime(2024, 1, 1, 13),
version=2,
)
self.third_person = _create_person(
team_id=self.team.pk,
distinct_ids=["3"],
properties={"$some_prop": "not something", "$another_prop": "something3"},
created_at=datetime(2024, 1, 1, 14),
)
# deleted
self.deleted_person = _create_person(
team_id=self.team.pk,
properties={"$some_prop": "ifwematcholdversionsthiswillmatch", "$another_prop": "something2"},
distinct_ids=["deleted"],
created_at=datetime(2024, 1, 1, 13),
version=1,
)
create_person(team_id=self.team.pk, uuid=str(self.deleted_person.uuid), version=2, is_deleted=True)
_create_event(event="$pageview", distinct_id="1", team=self.team)
_create_event(event="$pageview", distinct_id="2", team=self.team)
_create_event(event="$pageview", distinct_id="3", team=self.team)
self.modifiers = create_default_modifiers_for_team(self.team)
self.modifiers.personsOnEventsMode = PersonsOnEventsMode.DISABLED
# self.modifiers.optimizeJoinedFilters = True
# self.modifiers.personsArgMaxVersion = PersonsArgMaxVersion.V1

@snapshot_clickhouse_queries
def test_simple_filter(self):
response = execute_hogql_query(
parse_select("select id, properties from persons where properties.$some_prop = 'something'"),
self.team,
modifiers=self.modifiers,
)
assert len(response.results) == 2
assert response.clickhouse
self.assertIn("where_optimization", response.clickhouse)
self.assertNotIn("in(tuple(person.id, person.version)", response.clickhouse)

@snapshot_clickhouse_queries
def test_joins_are_left_alone_for_now(self):
response = execute_hogql_query(
parse_select("select uuid from events where person.properties.$some_prop = 'something'"),
self.team,
modifiers=self.modifiers,
)
assert len(response.results) == 2
assert response.clickhouse
self.assertIn("in(tuple(person.id, person.version)", response.clickhouse)
self.assertNotIn("where_optimization", response.clickhouse)

def test_person_modal_not_optimized_yet(self):
source_query = TrendsQuery(
series=[EventsNode(event="$pageview")],
dateRange=InsightDateRange(date_from="2024-01-01", date_to="2024-01-07"),
# breakdownFilter=BreakdownFilter(breakdown="$", breakdown_type=BreakdownType.PERSON),
)
insight_actors_query = InsightActorsQuery(
source=source_query,
day="2024-01-01",
modifiers=self.modifiers,
)
actors_query = ActorsQuery(
source=insight_actors_query,
offset=0,
select=[
"actor",
"created_at",
"event_count",
# "matched_recordings",
],
orderBy=["event_count DESC"],
modifiers=self.modifiers,
)
query_runner = ActorsQueryRunner(query=actors_query, team=self.team)
response = execute_hogql_query(query_runner.to_query(), self.team, modifiers=self.modifiers)
assert response.clickhouse
self.assertNotIn("where_optimization", response.clickhouse)
15 changes: 7 additions & 8 deletions posthog/hogql/test/__snapshots__/test_query.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -459,16 +459,15 @@

SELECT DISTINCT persons.properties___sneaky_mail AS sneaky_mail
FROM (
SELECT person.id AS id, replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_0)s), ''), 'null'), '^"|"$', '') AS properties___sneaky_mail, replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_1)s), ''), 'null'), '^"|"$', '') AS properties___random_uuid
SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_0)s), ''), 'null'), '^"|"$', ''), person.version) AS properties___sneaky_mail, argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_1)s), ''), 'null'), '^"|"$', ''), person.version) AS properties___random_uuid, person.id AS id
FROM person
WHERE and(equals(person.team_id, 420), ifNull(in(tuple(person.id, person.version), (
SELECT person.id AS id, max(person.version) AS version
FROM person
WHERE equals(person.team_id, 420)
WHERE and(equals(person.team_id, 420), in(id, (
SELECT where_optimization.id AS id
FROM person AS where_optimization
WHERE and(equals(where_optimization.team_id, 420), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(where_optimization.properties, %(hogql_val_2)s), ''), 'null'), '^"|"$', ''), %(hogql_val_3)s), 0)))))
GROUP BY person.id
HAVING and(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(toTimeZone(person.created_at, %(hogql_val_2)s), person.version), plus(now64(6, %(hogql_val_3)s), toIntervalDay(1))), 0)))), 0))
SETTINGS optimize_aggregation_in_order=1) AS persons
WHERE ifNull(equals(persons.properties___random_uuid, %(hogql_val_4)s), 0)
HAVING and(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(toTimeZone(person.created_at, %(hogql_val_4)s), person.version), plus(now64(6, %(hogql_val_5)s), toIntervalDay(1))), 0))) AS persons
WHERE ifNull(equals(persons.properties___random_uuid, %(hogql_val_6)s), 0)
LIMIT 100
SETTINGS readonly=2, max_execution_time=60, allow_experimental_object_type=1, format_csv_allow_double_quotes=0, max_ast_elements=4000000, max_expanded_ast_elements=4000000, max_bytes_before_external_group_by=0

Expand Down

0 comments on commit dff2676

Please sign in to comment.