Skip to content

Commit

Permalink
perf: Speed up filtering persons (#25604)
Browse files Browse the repository at this point in the history
Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Tomás Farías Santana <[email protected]>
Co-authored-by: Marius Andra <[email protected]>
Co-authored-by: Georgiy Tarasov <[email protected]>
Co-authored-by: David Newell <[email protected]>
Co-authored-by: PostHog Bot <[email protected]>
Co-authored-by: Richard Borcsik <[email protected]>
Co-authored-by: Michael Matloka <[email protected]>
Co-authored-by: Eric Duong <[email protected]>
Co-authored-by: Paul D'Ambra <[email protected]>
Co-authored-by: Juraj Majerik <[email protected]>
Co-authored-by: Phani Raj <[email protected]>
Co-authored-by: Tom Owers <[email protected]>
Co-authored-by: Dylan Martin <[email protected]>
Co-authored-by: Oliver Browne <[email protected]>
Co-authored-by: Marcus Hof <[email protected]>
Co-authored-by: Michael Matloka <[email protected]>
Co-authored-by: Anirudh Pillai <[email protected]>
Co-authored-by: Sandy Spicer <[email protected]>
Co-authored-by: Neil Kakkar <[email protected]>
  • Loading branch information
1 parent f31abb5 commit 3f75806
Show file tree
Hide file tree
Showing 45 changed files with 3,339 additions and 3,015 deletions.

Large diffs are not rendered by default.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
7 changes: 0 additions & 7 deletions frontend/src/queries/schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -6533,13 +6533,6 @@
"enum": ["auto", "legacy_null_as_string", "legacy_null_as_null", "disabled"],
"type": "string"
},
"optimizeJoinedFilters": {
"type": "boolean"
},
"personsArgMaxVersion": {
"enum": ["auto", "v1", "v2"],
"type": "string"
},
"personsJoinMode": {
"enum": ["inner", "left"],
"type": "string"
Expand Down
2 changes: 0 additions & 2 deletions frontend/src/queries/schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -227,10 +227,8 @@ export interface HogQLQueryModifiers {
| 'person_id_no_override_properties_on_events'
| 'person_id_override_properties_on_events'
| 'person_id_override_properties_joined'
personsArgMaxVersion?: 'auto' | 'v1' | 'v2'
inCohortVia?: 'auto' | 'leftjoin' | 'subquery' | 'leftjoin_conjoined'
materializationMode?: 'auto' | 'legacy_null_as_string' | 'legacy_null_as_null' | 'disabled'
optimizeJoinedFilters?: boolean
dataWarehouseEventsModifiers?: DataWarehouseEventsModifier[]
debug?: boolean
s3TableUseInvalidColumns?: boolean
Expand Down
33 changes: 1 addition & 32 deletions frontend/src/scenes/debug/Modifiers.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -47,22 +47,7 @@ export function Modifiers<Q extends { response?: Record<string, any>; modifiers?
value={query.modifiers?.personsOnEventsMode ?? response?.modifiers?.personsOnEventsMode}
/>
</LemonLabel>
<LemonLabel className={labelClassName}>
<div>Persons ArgMax:</div>
<LemonSelect
options={[
{ value: 'v1', label: 'V1' },
{ value: 'v2', label: 'V2' },
]}
onChange={(value) =>
setQuery({
...query,
modifiers: { ...query.modifiers, personsArgMaxVersion: value },
})
}
value={query.modifiers?.personsArgMaxVersion ?? response?.modifiers?.personsArgMaxVersion}
/>
</LemonLabel>

<LemonLabel className={labelClassName}>
<div>In Cohort Via:</div>
<LemonSelect
Expand Down Expand Up @@ -99,22 +84,6 @@ export function Modifiers<Q extends { response?: Record<string, any>; modifiers?
value={query.modifiers?.materializationMode ?? response?.modifiers?.materializationMode}
/>
</LemonLabel>
<LemonLabel className={labelClassName}>
<div>Optimize joined filters:</div>
<LemonSelect
options={[
{ value: true, label: 'true' },
{ value: false, label: 'false' },
]}
onChange={(value) =>
setQuery({
...query,
modifiers: { ...query.modifiers, optimizeJoinedFilters: value },
})
}
value={query.modifiers?.optimizeJoinedFilters ?? response?.modifiers?.optimizeJoinedFilters}
/>
</LemonLabel>
<LemonLabel className={labelClassName}>
<div>Property Groups:</div>
<LemonSelect
Expand Down
5 changes: 0 additions & 5 deletions mypy-baseline.txt
Original file line number Diff line number Diff line change
Expand Up @@ -527,11 +527,6 @@ posthog/hogql/test/test_modifiers.py:0: error: Unsupported right operand type fo
posthog/hogql/test/test_modifiers.py:0: error: Unsupported right operand type for in ("str | None") [operator]
posthog/hogql/test/test_modifiers.py:0: error: Unsupported right operand type for in ("str | None") [operator]
posthog/hogql/test/test_modifiers.py:0: error: Unsupported right operand type for in ("str | None") [operator]
posthog/hogql/test/test_modifiers.py:0: error: Unsupported right operand type for in ("str | None") [operator]
posthog/hogql/test/test_modifiers.py:0: error: Unsupported right operand type for in ("str | None") [operator]
posthog/hogql/test/test_modifiers.py:0: error: Unsupported right operand type for in ("str | None") [operator]
posthog/hogql/test/test_modifiers.py:0: error: Unsupported right operand type for in ("str | None") [operator]
posthog/hogql/test/test_modifiers.py:0: error: Unsupported right operand type for in ("str | None") [operator]
posthog/hogql/test/_test_parser.py:0: error: Invalid base class [misc]
posthog/hogql/test/_test_parser.py:0: error: Argument "table" to "JoinExpr" has incompatible type "Placeholder"; expected "SelectQuery | SelectUnionQuery | Field | None" [arg-type]
posthog/hogql/test/_test_parser.py:0: error: Item "None" of "JoinExpr | None" has no attribute "table" [union-attr]
Expand Down
34 changes: 20 additions & 14 deletions posthog/api/test/__snapshots__/test_query.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -442,14 +442,17 @@
HAVING ifNull(equals(argMax(person_distinct_id_overrides.is_deleted, person_distinct_id_overrides.version), 0), 0) SETTINGS optimize_aggregation_in_order=1) AS events__override ON equals(events.distinct_id, events__override.distinct_id)
LEFT JOIN
(SELECT person.id AS id,
replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, 'email'), ''), 'null'), '^"|"$', '') AS properties___email
argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, 'email'), ''), 'null'), '^"|"$', ''), person.version) AS properties___email
FROM person
WHERE and(equals(person.team_id, 2), ifNull(in(tuple(person.id, person.version),
(SELECT person.id AS id, max(person.version) AS version
FROM person
WHERE equals(person.team_id, 2)
GROUP BY person.id
HAVING and(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(toTimeZone(person.created_at, 'UTC'), person.version), plus(now64(6, 'UTC'), toIntervalDay(1))), 0)))), 0)) SETTINGS optimize_aggregation_in_order=1) AS events__person ON equals(if(not(empty(events__override.distinct_id)), events__override.person_id, events.person_id), events__person.id)
WHERE and(equals(person.team_id, 2), in(person.id,
(SELECT person.id AS id
FROM person
WHERE and(equals(person.team_id, 2), notIn(person.id,
(SELECT person.id AS id
FROM person
WHERE and(equals(person.team_id, 2), equals(person.is_deleted, 1)))), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, 'email'), ''), 'null'), '^"|"$', ''), '[email protected]'), 0)))), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, 'email'), ''), 'null'), '^"|"$', ''), '[email protected]'), 0))
GROUP BY person.id
HAVING and(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(toTimeZone(person.created_at, 'UTC'), person.version), plus(now64(6, 'UTC'), toIntervalDay(1))), 0)) SETTINGS optimize_aggregation_in_order=1) AS events__person ON equals(if(not(empty(events__override.distinct_id)), events__override.person_id, events.person_id), events__person.id)
WHERE and(equals(events.team_id, 2), ifNull(equals(events__person.properties___email, '[email protected]'), 0), less(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-01-10 12:14:05.000000', 6, 'UTC')), greater(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-01-09 12:14:00.000000', 6, 'UTC')))
ORDER BY events.event ASC
LIMIT 101
Expand Down Expand Up @@ -480,14 +483,17 @@
HAVING ifNull(equals(argMax(person_distinct_id_overrides.is_deleted, person_distinct_id_overrides.version), 0), 0) SETTINGS optimize_aggregation_in_order=1) AS events__override ON equals(events.distinct_id, events__override.distinct_id)
LEFT JOIN
(SELECT person.id AS id,
nullIf(nullIf(person.pmat_email, ''), 'null') AS properties___email
argMax(nullIf(nullIf(person.pmat_email, ''), 'null'), person.version) AS properties___email
FROM person
WHERE and(equals(person.team_id, 2), ifNull(in(tuple(person.id, person.version),
(SELECT person.id AS id, max(person.version) AS version
FROM person
WHERE equals(person.team_id, 2)
GROUP BY person.id
HAVING and(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(toTimeZone(person.created_at, 'UTC'), person.version), plus(now64(6, 'UTC'), toIntervalDay(1))), 0)))), 0)) SETTINGS optimize_aggregation_in_order=1) AS events__person ON equals(if(not(empty(events__override.distinct_id)), events__override.person_id, events.person_id), events__person.id)
WHERE and(equals(person.team_id, 2), in(person.id,
(SELECT person.id AS id
FROM person
WHERE and(equals(person.team_id, 2), notIn(person.id,
(SELECT person.id AS id
FROM person
WHERE and(equals(person.team_id, 2), equals(person.is_deleted, 1)))), ifNull(equals(nullIf(nullIf(person.pmat_email, ''), 'null'), '[email protected]'), 0)))), ifNull(equals(nullIf(nullIf(person.pmat_email, ''), 'null'), '[email protected]'), 0))
GROUP BY person.id
HAVING and(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(toTimeZone(person.created_at, 'UTC'), person.version), plus(now64(6, 'UTC'), toIntervalDay(1))), 0)) SETTINGS optimize_aggregation_in_order=1) AS events__person ON equals(if(not(empty(events__override.distinct_id)), events__override.person_id, events.person_id), events__person.id)
WHERE and(equals(events.team_id, 2), ifNull(equals(events__person.properties___email, '[email protected]'), 0), less(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-01-10 12:14:05.000000', 6, 'UTC')), greater(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-01-09 12:14:00.000000', 6, 'UTC')))
ORDER BY events.event ASC
LIMIT 101
Expand Down
12 changes: 12 additions & 0 deletions posthog/hogql/ast.py
Original file line number Diff line number Diff line change
Expand Up @@ -631,6 +631,18 @@ class CompareOperationOp(StrEnum):
NotIRegex = "!~*"


NEGATED_COMPARE_OPS: list[CompareOperationOp] = [
CompareOperationOp.NotEq,
CompareOperationOp.NotLike,
CompareOperationOp.NotILike,
CompareOperationOp.NotIn,
CompareOperationOp.GlobalNotIn,
CompareOperationOp.NotInCohort,
CompareOperationOp.NotRegex,
CompareOperationOp.NotIRegex,
]


@dataclass(kw_only=True)
class CompareOperation(Expr):
left: Expr
Expand Down
130 changes: 68 additions & 62 deletions posthog/hogql/database/schema/persons.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from posthog.hogql.base import Expr
from posthog.hogql.constants import HogQLQuerySettings
from posthog.hogql.context import HogQLContext
from posthog.hogql.database.argmax import argmax_select
from posthog.hogql.database.models import (
BooleanDatabaseField,
DateTimeDatabaseField,
Expand All @@ -24,7 +23,6 @@
from posthog.hogql.errors import ResolutionError
from posthog.hogql.visitor import clone_expr
from posthog.models.organization import Organization
from posthog.schema import PersonsArgMaxVersion

PERSONS_FIELDS: dict[str, FieldOrTable] = {
"id": StringDatabaseField(name="id"),
Expand All @@ -46,72 +44,73 @@ def select_from_persons_table(
node: SelectQuery,
*,
filter: Optional[Expr] = None,
is_join: Optional[bool] = None,
):
version = context.modifiers.personsArgMaxVersion
if version == PersonsArgMaxVersion.AUTO:
version = PersonsArgMaxVersion.V1
# If selecting properties, use the faster v2 query. Otherwise, v1 is faster.
for field_chain in join_or_table.fields_accessed.values():
if field_chain[0] == "properties":
version = PersonsArgMaxVersion.V2
break

if version == PersonsArgMaxVersion.V2:
from posthog.hogql import ast
from posthog.hogql.parser import parse_select

select = cast(
from posthog.hogql import ast
from posthog.hogql.parser import parse_select

select = cast(
ast.SelectQuery,
parse_select(
"""
SELECT id
FROM raw_persons
GROUP BY id
HAVING equals(argMax(raw_persons.is_deleted, raw_persons.version), 0)
AND argMax(raw_persons.created_at, raw_persons.version) < now() + interval 1 day
"""
),
)
select.settings = HogQLQuerySettings(optimize_aggregation_in_order=True)

# This bit optimizes the query by first selecting all IDs for all persons (regardless of whether it's the latest version), and only then aggregating the results
# We only do this if there are where clauses, _and_ WhereClauseExtractor can extract them
if node.where:
inner_select = cast(
ast.SelectQuery,
parse_select(
"""
SELECT id FROM raw_persons WHERE (id, version) IN (
SELECT id, max(version) as version
FROM raw_persons
GROUP BY id
HAVING equals(argMax(raw_persons.is_deleted, raw_persons.version), 0)
AND argMax(raw_persons.created_at, raw_persons.version) < now() + interval 1 day
)
SELECT id
FROM raw_persons
WHERE
-- Much faster to pre-select out any deleted persons than doing it in aggregation
-- This is correct because there are no instances where we'd un-delete a person (ie there are no cases where one version has is_deleted=1 and a later version has is_deleted = 0)
id NOT IN (select id from raw_persons where is_deleted = 1)
"""
),
)
select.settings = HogQLQuerySettings(optimize_aggregation_in_order=True)
if filter is not None:
cast(ast.SelectQuery, cast(ast.CompareOperation, select.where).right).where = filter

for field_name, field_chain in join_or_table.fields_accessed.items():
# We need to always select the 'id' field for the join constraint. The field name here is likely to
# be "persons__id" if anything, but just in case, let's avoid duplicates.
if field_name != "id":
select.select.append(
ast.Alias(
alias=field_name,
expr=ast.Field(chain=field_chain),
)
)
else:
select = argmax_select(
table_name="raw_persons",
select_fields=join_or_table.fields_accessed,
group_fields=["id"],
argmax_field="version",
deleted_field="is_deleted",
timestamp_field_to_clamp="created_at",
)
select.settings = HogQLQuerySettings(optimize_aggregation_in_order=True)
if filter is not None:
if select.where:
select.where = And(exprs=[select.where, filter])
else:
select.where = filter

if context.modifiers.optimizeJoinedFilters:
extractor = WhereClauseExtractor(context)
extractor = WhereClauseExtractor(context, is_join=is_join)
extractor.add_local_tables(join_or_table)
where = extractor.get_inner_where(node)
if where and select.where:
select.where = And(exprs=[select.where, where])
elif where:
select.where = where

if where and inner_select.where:
inner_select.where = ast.And(exprs=[inner_select.where, where])
select.where = ast.And(
exprs=[
ast.CompareOperation(
left=ast.Field(chain=["id"]), right=inner_select, op=ast.CompareOperationOp.In
),
where, # Technically, adding the where clause here is duplicative, because the outer node filters this out _again_. However, if you're trying to debug the results stay consistent throughout the query (otherwise old versions might pop up again in this subquery)
]
)
if filter is not None:
if select.where:
cast(ast.SelectQuery, cast(ast.CompareOperation, select.where).right).where = ast.And(
exprs=[select.where, filter]
)
else:
select.where = filter

for field_name, field_chain in join_or_table.fields_accessed.items():
# We need to always select the 'id' field for the join constraint. The field name here is likely to
# be "persons__id" if anything, but just in case, let's avoid duplicates.
if field_name != "id":
select.select.append(
ast.Alias(
alias=field_name,
expr=ast.Call(name="argMax", args=[ast.Field(chain=field_chain), ast.Field(chain=["version"])]),
)
)

return select

Expand All @@ -125,7 +124,7 @@ def join_with_persons_table(

if not join_to_add.fields_accessed:
raise ResolutionError("No fields requested from persons table")
join_expr = ast.JoinExpr(table=select_from_persons_table(join_to_add, context, node))
join_expr = ast.JoinExpr(table=select_from_persons_table(join_to_add, context, node, is_join=True))

organization: Organization = context.team.organization if context.team else None
# TODO: @raquelmsmith: Remove flag check and use left join for all once deletes are caught up
Expand Down Expand Up @@ -224,9 +223,16 @@ def create_new_table_with_filter(self, join: JoinExpr) -> Self:
return self

def lazy_select(self, table_to_add: LazyTableToAdd, context, node):
# assume that if the select_from is not persons table we're doing a join
try:
is_join = not isinstance(node.select_from.type.table, PersonsTable)
except AttributeError:
is_join = False
if self.filter is not None:
return select_from_persons_table(table_to_add, context, node, filter=clone_expr(self.filter, True))
return select_from_persons_table(table_to_add, context, node)
return select_from_persons_table(
table_to_add, context, node, filter=clone_expr(self.filter, True), is_join=is_join
)
return select_from_persons_table(table_to_add, context, node, is_join=is_join)

def to_printed_clickhouse(self, context):
return "person"
Expand Down
Loading

0 comments on commit 3f75806

Please sign in to comment.