Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

revert: perf: Speed up filtering persons #25715

Merged
merged 1 commit into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
7 changes: 7 additions & 0 deletions frontend/src/queries/schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -6533,6 +6533,13 @@
"enum": ["auto", "legacy_null_as_string", "legacy_null_as_null", "disabled"],
"type": "string"
},
"optimizeJoinedFilters": {
"type": "boolean"
},
"personsArgMaxVersion": {
"enum": ["auto", "v1", "v2"],
"type": "string"
},
"personsJoinMode": {
"enum": ["inner", "left"],
"type": "string"
Expand Down
2 changes: 2 additions & 0 deletions frontend/src/queries/schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -227,8 +227,10 @@ export interface HogQLQueryModifiers {
| 'person_id_no_override_properties_on_events'
| 'person_id_override_properties_on_events'
| 'person_id_override_properties_joined'
personsArgMaxVersion?: 'auto' | 'v1' | 'v2'
inCohortVia?: 'auto' | 'leftjoin' | 'subquery' | 'leftjoin_conjoined'
materializationMode?: 'auto' | 'legacy_null_as_string' | 'legacy_null_as_null' | 'disabled'
optimizeJoinedFilters?: boolean
dataWarehouseEventsModifiers?: DataWarehouseEventsModifier[]
debug?: boolean
s3TableUseInvalidColumns?: boolean
Expand Down
33 changes: 32 additions & 1 deletion frontend/src/scenes/debug/Modifiers.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,22 @@ export function Modifiers<Q extends { response?: Record<string, any>; modifiers?
value={query.modifiers?.personsOnEventsMode ?? response?.modifiers?.personsOnEventsMode}
/>
</LemonLabel>

<LemonLabel className={labelClassName}>
<div>Persons ArgMax:</div>
<LemonSelect
options={[
{ value: 'v1', label: 'V1' },
{ value: 'v2', label: 'V2' },
]}
onChange={(value) =>
setQuery({
...query,
modifiers: { ...query.modifiers, personsArgMaxVersion: value },
})
}
value={query.modifiers?.personsArgMaxVersion ?? response?.modifiers?.personsArgMaxVersion}
/>
</LemonLabel>
<LemonLabel className={labelClassName}>
<div>In Cohort Via:</div>
<LemonSelect
Expand Down Expand Up @@ -84,6 +99,22 @@ export function Modifiers<Q extends { response?: Record<string, any>; modifiers?
value={query.modifiers?.materializationMode ?? response?.modifiers?.materializationMode}
/>
</LemonLabel>
<LemonLabel className={labelClassName}>
<div>Optimize joined filters:</div>
<LemonSelect
options={[
{ value: true, label: 'true' },
{ value: false, label: 'false' },
]}
onChange={(value) =>
setQuery({
...query,
modifiers: { ...query.modifiers, optimizeJoinedFilters: value },
})
}
value={query.modifiers?.optimizeJoinedFilters ?? response?.modifiers?.optimizeJoinedFilters}
/>
</LemonLabel>
<LemonLabel className={labelClassName}>
<div>Property Groups:</div>
<LemonSelect
Expand Down
5 changes: 5 additions & 0 deletions mypy-baseline.txt
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,11 @@ posthog/hogql/test/test_modifiers.py:0: error: Unsupported right operand type fo
posthog/hogql/test/test_modifiers.py:0: error: Unsupported right operand type for in ("str | None") [operator]
posthog/hogql/test/test_modifiers.py:0: error: Unsupported right operand type for in ("str | None") [operator]
posthog/hogql/test/test_modifiers.py:0: error: Unsupported right operand type for in ("str | None") [operator]
posthog/hogql/test/test_modifiers.py:0: error: Unsupported right operand type for in ("str | None") [operator]
posthog/hogql/test/test_modifiers.py:0: error: Unsupported right operand type for in ("str | None") [operator]
posthog/hogql/test/test_modifiers.py:0: error: Unsupported right operand type for in ("str | None") [operator]
posthog/hogql/test/test_modifiers.py:0: error: Unsupported right operand type for in ("str | None") [operator]
posthog/hogql/test/test_modifiers.py:0: error: Unsupported right operand type for in ("str | None") [operator]
posthog/hogql/test/_test_parser.py:0: error: Invalid base class [misc]
posthog/hogql/test/_test_parser.py:0: error: Argument "table" to "JoinExpr" has incompatible type "Placeholder"; expected "SelectQuery | SelectUnionQuery | Field | None" [arg-type]
posthog/hogql/test/_test_parser.py:0: error: Item "None" of "JoinExpr | None" has no attribute "table" [union-attr]
Expand Down
34 changes: 14 additions & 20 deletions posthog/api/test/__snapshots__/test_query.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -442,17 +442,14 @@
HAVING ifNull(equals(argMax(person_distinct_id_overrides.is_deleted, person_distinct_id_overrides.version), 0), 0) SETTINGS optimize_aggregation_in_order=1) AS events__override ON equals(events.distinct_id, events__override.distinct_id)
LEFT JOIN
(SELECT person.id AS id,
argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, 'email'), ''), 'null'), '^"|"$', ''), person.version) AS properties___email
replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, 'email'), ''), 'null'), '^"|"$', '') AS properties___email
FROM person
WHERE and(equals(person.team_id, 2), in(person.id,
(SELECT person.id AS id
FROM person
WHERE and(equals(person.team_id, 2), notIn(person.id,
(SELECT person.id AS id
FROM person
WHERE and(equals(person.team_id, 2), equals(person.is_deleted, 1)))), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, 'email'), ''), 'null'), '^"|"$', ''), '[email protected]'), 0)))), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, 'email'), ''), 'null'), '^"|"$', ''), '[email protected]'), 0))
GROUP BY person.id
HAVING and(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(toTimeZone(person.created_at, 'UTC'), person.version), plus(now64(6, 'UTC'), toIntervalDay(1))), 0)) SETTINGS optimize_aggregation_in_order=1) AS events__person ON equals(if(not(empty(events__override.distinct_id)), events__override.person_id, events.person_id), events__person.id)
WHERE and(equals(person.team_id, 2), ifNull(in(tuple(person.id, person.version),
(SELECT person.id AS id, max(person.version) AS version
FROM person
WHERE equals(person.team_id, 2)
GROUP BY person.id
HAVING and(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(toTimeZone(person.created_at, 'UTC'), person.version), plus(now64(6, 'UTC'), toIntervalDay(1))), 0)))), 0)) SETTINGS optimize_aggregation_in_order=1) AS events__person ON equals(if(not(empty(events__override.distinct_id)), events__override.person_id, events.person_id), events__person.id)
WHERE and(equals(events.team_id, 2), ifNull(equals(events__person.properties___email, '[email protected]'), 0), less(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-01-10 12:14:05.000000', 6, 'UTC')), greater(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-01-09 12:14:00.000000', 6, 'UTC')))
ORDER BY events.event ASC
LIMIT 101
Expand Down Expand Up @@ -483,17 +480,14 @@
HAVING ifNull(equals(argMax(person_distinct_id_overrides.is_deleted, person_distinct_id_overrides.version), 0), 0) SETTINGS optimize_aggregation_in_order=1) AS events__override ON equals(events.distinct_id, events__override.distinct_id)
LEFT JOIN
(SELECT person.id AS id,
argMax(nullIf(nullIf(person.pmat_email, ''), 'null'), person.version) AS properties___email
nullIf(nullIf(person.pmat_email, ''), 'null') AS properties___email
FROM person
WHERE and(equals(person.team_id, 2), in(person.id,
(SELECT person.id AS id
FROM person
WHERE and(equals(person.team_id, 2), notIn(person.id,
(SELECT person.id AS id
FROM person
WHERE and(equals(person.team_id, 2), equals(person.is_deleted, 1)))), ifNull(equals(nullIf(nullIf(person.pmat_email, ''), 'null'), '[email protected]'), 0)))), ifNull(equals(nullIf(nullIf(person.pmat_email, ''), 'null'), '[email protected]'), 0))
GROUP BY person.id
HAVING and(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(toTimeZone(person.created_at, 'UTC'), person.version), plus(now64(6, 'UTC'), toIntervalDay(1))), 0)) SETTINGS optimize_aggregation_in_order=1) AS events__person ON equals(if(not(empty(events__override.distinct_id)), events__override.person_id, events.person_id), events__person.id)
WHERE and(equals(person.team_id, 2), ifNull(in(tuple(person.id, person.version),
(SELECT person.id AS id, max(person.version) AS version
FROM person
WHERE equals(person.team_id, 2)
GROUP BY person.id
HAVING and(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(toTimeZone(person.created_at, 'UTC'), person.version), plus(now64(6, 'UTC'), toIntervalDay(1))), 0)))), 0)) SETTINGS optimize_aggregation_in_order=1) AS events__person ON equals(if(not(empty(events__override.distinct_id)), events__override.person_id, events.person_id), events__person.id)
WHERE and(equals(events.team_id, 2), ifNull(equals(events__person.properties___email, '[email protected]'), 0), less(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-01-10 12:14:05.000000', 6, 'UTC')), greater(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-01-09 12:14:00.000000', 6, 'UTC')))
ORDER BY events.event ASC
LIMIT 101
Expand Down
12 changes: 0 additions & 12 deletions posthog/hogql/ast.py
Original file line number Diff line number Diff line change
Expand Up @@ -631,18 +631,6 @@ class CompareOperationOp(StrEnum):
NotIRegex = "!~*"


NEGATED_COMPARE_OPS: list[CompareOperationOp] = [
CompareOperationOp.NotEq,
CompareOperationOp.NotLike,
CompareOperationOp.NotILike,
CompareOperationOp.NotIn,
CompareOperationOp.GlobalNotIn,
CompareOperationOp.NotInCohort,
CompareOperationOp.NotRegex,
CompareOperationOp.NotIRegex,
]


@dataclass(kw_only=True)
class CompareOperation(Expr):
left: Expr
Expand Down
130 changes: 62 additions & 68 deletions posthog/hogql/database/schema/persons.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from posthog.hogql.base import Expr
from posthog.hogql.constants import HogQLQuerySettings
from posthog.hogql.context import HogQLContext
from posthog.hogql.database.argmax import argmax_select
from posthog.hogql.database.models import (
BooleanDatabaseField,
DateTimeDatabaseField,
Expand All @@ -23,6 +24,7 @@
from posthog.hogql.errors import ResolutionError
from posthog.hogql.visitor import clone_expr
from posthog.models.organization import Organization
from posthog.schema import PersonsArgMaxVersion

PERSONS_FIELDS: dict[str, FieldOrTable] = {
"id": StringDatabaseField(name="id"),
Expand All @@ -44,73 +46,72 @@ def select_from_persons_table(
node: SelectQuery,
*,
filter: Optional[Expr] = None,
is_join: Optional[bool] = None,
):
from posthog.hogql import ast
from posthog.hogql.parser import parse_select

select = cast(
ast.SelectQuery,
parse_select(
"""
SELECT id
FROM raw_persons
GROUP BY id
HAVING equals(argMax(raw_persons.is_deleted, raw_persons.version), 0)
AND argMax(raw_persons.created_at, raw_persons.version) < now() + interval 1 day
"""
),
)
select.settings = HogQLQuerySettings(optimize_aggregation_in_order=True)

# This bit optimizes the query by first selecting all IDs for all persons (regardless of whether it's the latest version), and only then aggregating the results
# We only do this if there are where clauses, _and_ WhereClauseExtractor can extract them
if node.where:
inner_select = cast(
version = context.modifiers.personsArgMaxVersion
if version == PersonsArgMaxVersion.AUTO:
version = PersonsArgMaxVersion.V1
# If selecting properties, use the faster v2 query. Otherwise, v1 is faster.
for field_chain in join_or_table.fields_accessed.values():
if field_chain[0] == "properties":
version = PersonsArgMaxVersion.V2
break

if version == PersonsArgMaxVersion.V2:
from posthog.hogql import ast
from posthog.hogql.parser import parse_select

select = cast(
ast.SelectQuery,
parse_select(
"""
SELECT id
FROM raw_persons
WHERE
-- Much faster to pre-select out any deleted persons than doing it in aggregation
-- This is correct because there are no instances where we'd un-delete a person (ie there are no cases where one version has is_deleted=1 and a later version has is_deleted = 0)
id NOT IN (select id from raw_persons where is_deleted = 1)
SELECT id FROM raw_persons WHERE (id, version) IN (
SELECT id, max(version) as version
FROM raw_persons
GROUP BY id
HAVING equals(argMax(raw_persons.is_deleted, raw_persons.version), 0)
AND argMax(raw_persons.created_at, raw_persons.version) < now() + interval 1 day
)
"""
),
)
extractor = WhereClauseExtractor(context, is_join=is_join)
select.settings = HogQLQuerySettings(optimize_aggregation_in_order=True)
if filter is not None:
cast(ast.SelectQuery, cast(ast.CompareOperation, select.where).right).where = filter

for field_name, field_chain in join_or_table.fields_accessed.items():
# We need to always select the 'id' field for the join constraint. The field name here is likely to
# be "persons__id" if anything, but just in case, let's avoid duplicates.
if field_name != "id":
select.select.append(
ast.Alias(
alias=field_name,
expr=ast.Field(chain=field_chain),
)
)
else:
select = argmax_select(
table_name="raw_persons",
select_fields=join_or_table.fields_accessed,
group_fields=["id"],
argmax_field="version",
deleted_field="is_deleted",
timestamp_field_to_clamp="created_at",
)
select.settings = HogQLQuerySettings(optimize_aggregation_in_order=True)
if filter is not None:
if select.where:
select.where = And(exprs=[select.where, filter])
else:
select.where = filter

if context.modifiers.optimizeJoinedFilters:
extractor = WhereClauseExtractor(context)
extractor.add_local_tables(join_or_table)
where = extractor.get_inner_where(node)

if where and inner_select.where:
inner_select.where = ast.And(exprs=[inner_select.where, where])
select.where = ast.And(
exprs=[
ast.CompareOperation(
left=ast.Field(chain=["id"]), right=inner_select, op=ast.CompareOperationOp.In
),
where, # Technically, adding the where clause here is duplicative, because the outer node filters this out _again_. However, if you're trying to debug the results stay consistent throughout the query (otherwise old versions might pop up again in this subquery)
]
)
if filter is not None:
if select.where:
cast(ast.SelectQuery, cast(ast.CompareOperation, select.where).right).where = ast.And(
exprs=[select.where, filter]
)
else:
select.where = filter

for field_name, field_chain in join_or_table.fields_accessed.items():
# We need to always select the 'id' field for the join constraint. The field name here is likely to
# be "persons__id" if anything, but just in case, let's avoid duplicates.
if field_name != "id":
select.select.append(
ast.Alias(
alias=field_name,
expr=ast.Call(name="argMax", args=[ast.Field(chain=field_chain), ast.Field(chain=["version"])]),
)
)
if where and select.where:
select.where = And(exprs=[select.where, where])
elif where:
select.where = where

return select

Expand All @@ -124,7 +125,7 @@ def join_with_persons_table(

if not join_to_add.fields_accessed:
raise ResolutionError("No fields requested from persons table")
join_expr = ast.JoinExpr(table=select_from_persons_table(join_to_add, context, node, is_join=True))
join_expr = ast.JoinExpr(table=select_from_persons_table(join_to_add, context, node))

organization: Organization = context.team.organization if context.team else None
# TODO: @raquelmsmith: Remove flag check and use left join for all once deletes are caught up
Expand Down Expand Up @@ -223,16 +224,9 @@ def create_new_table_with_filter(self, join: JoinExpr) -> Self:
return self

def lazy_select(self, table_to_add: LazyTableToAdd, context, node):
# assume that if the select_from is not persons table we're doing a join
try:
is_join = not isinstance(node.select_from.type.table, PersonsTable)
except AttributeError:
is_join = False
if self.filter is not None:
return select_from_persons_table(
table_to_add, context, node, filter=clone_expr(self.filter, True), is_join=is_join
)
return select_from_persons_table(table_to_add, context, node, is_join=is_join)
return select_from_persons_table(table_to_add, context, node, filter=clone_expr(self.filter, True))
return select_from_persons_table(table_to_add, context, node)

def to_printed_clickhouse(self, context):
return "person"
Expand Down
Loading
Loading