Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(hogql): Add basic support for PoE v3 (distinct ID overrides) #21059

Merged
merged 12 commits into from
Mar 22, 2024
1 change: 1 addition & 0 deletions frontend/src/scenes/debug/HogQLDebug.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ export function HogQLDebug({ query, setQuery, queryKey }: HogQLDebugProps): JSX.
{ value: 'v1_enabled', label: 'V1 Enabled' },
{ value: 'v1_mixed', label: 'V1 Mixed' },
{ value: 'v2_enabled', label: 'V2 Enabled' },
{ value: 'v3_enabled', label: 'V3 Enabled (Join)' },
]}
onChange={(value) =>
setQuery({
Expand Down
25 changes: 25 additions & 0 deletions posthog/hogql/database/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@
from posthog.hogql.database.schema.events import EventsTable
from posthog.hogql.database.schema.groups import GroupsTable, RawGroupsTable
from posthog.hogql.database.schema.numbers import NumbersTable
from posthog.hogql.database.schema.person_distinct_id_overrides import (
PersonDistinctIdOverridesTable,
RawPersonDistinctIdOverridesTable,
join_with_person_distinct_id_overrides_table,
)
from posthog.hogql.database.schema.person_distinct_ids import (
PersonDistinctIdsTable,
RawPersonDistinctIdsTable,
Expand Down Expand Up @@ -66,6 +71,7 @@
groups: GroupsTable = GroupsTable()
persons: PersonsTable = PersonsTable()
person_distinct_ids: PersonDistinctIdsTable = PersonDistinctIdsTable()
person_distinct_id_overrides: PersonDistinctIdOverridesTable = PersonDistinctIdOverridesTable()
person_overrides: PersonOverridesTable = PersonOverridesTable()

session_replay_events: SessionReplayEventsTable = SessionReplayEventsTable()
Expand All @@ -81,6 +87,7 @@
raw_persons: RawPersonsTable = RawPersonsTable()
raw_groups: RawGroupsTable = RawGroupsTable()
raw_cohort_people: RawCohortPeople = RawCohortPeople()
raw_person_distinct_id_overrides: RawPersonDistinctIdOverridesTable = RawPersonDistinctIdOverridesTable()
raw_person_overrides: RawPersonOverridesTable = RawPersonOverridesTable()
raw_sessions: RawSessionsTable = RawSessionsTable()

Expand Down Expand Up @@ -186,6 +193,24 @@
database.events.fields["poe"].fields["id"] = database.events.fields["person_id"]
database.events.fields["person"] = FieldTraverser(chain=["poe"])

elif modifiers.personsOnEventsMode == PersonsOnEventsMode.v3_enabled:
database.events.fields["event_person_id"] = StringDatabaseField(name="person_id")
database.events.fields["override"] = LazyJoin(
from_field=["distinct_id"], # ???
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is defined on 182 above as well, but it wasn't obvious to me at a glance where this gets used (I didn't do a ton of digging either though to be fair.) I figured this would be used for the LHS of the join comparator, but it seems like that's defined on the JoinExpr itself (i.e. the return value of join_with_person_distinct_id_overrides_table)?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's essentially for this:

# Make sure we also add fields we will use for the join's "ON" condition into the list of fields accessed.
# Without this "pdi.person.id" won't work if you did not ALSO select "pdi.person_id" explicitly for the join.

join_table=PersonDistinctIdOverridesTable(),
join_function=join_with_person_distinct_id_overrides_table,
)
database.events.fields["person_id"] = ExpressionField(
name="person_id",
expr=parse_expr(
# NOTE: assumes `join_use_nulls = 0` (the default), as ``override.distinct_id`` is not Nullable
"if(not(empty(override.distinct_id)), override.person_id, event_person_id)",
start=None,
),
)
database.events.fields["poe"].fields["id"] = database.events.fields["person_id"]

Check failure on line 211 in posthog/hogql/database/database.py

View workflow job for this annotation

GitHub Actions / Python code quality checks

"FieldOrTable" has no attribute "fields"

Check failure on line 211 in posthog/hogql/database/database.py

View workflow job for this annotation

GitHub Actions / Python code quality checks

"FieldOrTable" has no attribute "fields"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some mypy errors here

database.events.fields["person"] = FieldTraverser(chain=["poe"])

database.persons.fields["$virt_initial_referring_domain_type"] = create_initial_domain_type(
"$virt_initial_referring_domain_type"
)
Expand Down
92 changes: 92 additions & 0 deletions posthog/hogql/database/schema/person_distinct_id_overrides.py
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is largely copy/pasted from the non-_overrides suffixed version, because the table itself is basically the same, just with fewer rows.

Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
from typing import Dict, List
from posthog.hogql.ast import SelectQuery
from posthog.hogql.context import HogQLContext

from posthog.hogql.database.argmax import argmax_select
from posthog.hogql.database.models import (
Table,
IntegerDatabaseField,
StringDatabaseField,
BooleanDatabaseField,
LazyJoin,
LazyTable,
FieldOrTable,
)
from posthog.hogql.database.schema.persons import join_with_persons_table
from posthog.hogql.errors import HogQLException
from posthog.schema import HogQLQueryModifiers

PERSON_DISTINCT_ID_OVERRIDES_FIELDS = {
"team_id": IntegerDatabaseField(name="team_id"),
"distinct_id": StringDatabaseField(name="distinct_id"),
"person_id": StringDatabaseField(name="person_id"),
"person": LazyJoin(
from_field=["person_id"],
join_table="persons",
join_function=join_with_persons_table,
),
}


def select_from_person_distinct_id_overrides_table(requested_fields: Dict[str, List[str | int]]):
# Always include "person_id", as it's the key we use to make further joins, and it'd be great if it's available
if "person_id" not in requested_fields:
requested_fields = {**requested_fields, "person_id": ["person_id"]}
return argmax_select(
table_name="raw_person_distinct_id_overrides",
select_fields=requested_fields,
group_fields=["distinct_id"],
argmax_field="version",
deleted_field="is_deleted",
)


def join_with_person_distinct_id_overrides_table(
from_table: str,
to_table: str,
requested_fields: Dict[str, List[str]],
context: HogQLContext,
node: SelectQuery,
):
from posthog.hogql import ast

if not requested_fields:
raise HogQLException("No fields requested from person_distinct_id_overrides")
join_expr = ast.JoinExpr(table=select_from_person_distinct_id_overrides_table(requested_fields))

Check failure on line 55 in posthog/hogql/database/schema/person_distinct_id_overrides.py

View workflow job for this annotation

GitHub Actions / Python code quality checks

Argument 1 to "select_from_person_distinct_id_overrides_table" has incompatible type "dict[str, list[str]]"; expected "dict[str, list[str | int]]"

Check failure on line 55 in posthog/hogql/database/schema/person_distinct_id_overrides.py

View workflow job for this annotation

GitHub Actions / Python code quality checks

Argument 1 to "select_from_person_distinct_id_overrides_table" has incompatible type "dict[str, list[str]]"; expected "dict[str, list[str | int]]"
join_expr.join_type = "LEFT OUTER JOIN"
Comment on lines +55 to +56
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assuming we are able to stay reasonably on top of merges, something that could be interesting to check out here would be using an ANY merge in place of the argMax to see if that has any performance impact. I suspect it wouldn't be worth it since it'd increase cardinality of the RHS table and has the potential to create nondeterminism headaches if there are lots of rows with duplicate keys, but hard to say that for sure without trying.

join_expr.alias = to_table
join_expr.constraint = ast.JoinConstraint(
expr=ast.CompareOperation(
op=ast.CompareOperationOp.Eq,
left=ast.Field(chain=[from_table, "distinct_id"]),
right=ast.Field(chain=[to_table, "distinct_id"]),
)
)
return join_expr


class RawPersonDistinctIdOverridesTable(Table):
fields: Dict[str, FieldOrTable] = {
**PERSON_DISTINCT_ID_OVERRIDES_FIELDS,
"is_deleted": BooleanDatabaseField(name="is_deleted"),
"version": IntegerDatabaseField(name="version"),
}

def to_printed_clickhouse(self, context):
return "person_distinct_id_overrides"

def to_printed_hogql(self):
return "raw_person_distinct_id_overrides"


class PersonDistinctIdOverridesTable(LazyTable):
fields: Dict[str, FieldOrTable] = PERSON_DISTINCT_ID_OVERRIDES_FIELDS

def lazy_select(self, requested_fields: Dict[str, List[str | int]], modifiers: HogQLQueryModifiers):
return select_from_person_distinct_id_overrides_table(requested_fields)

def to_printed_clickhouse(self, context):
return "person_distinct_id_overrides"

def to_printed_hogql(self):
return "person_distinct_id_overrides"
7 changes: 7 additions & 0 deletions posthog/hogql/test/test_modifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,13 @@ def test_modifiers_persons_on_events_mode_mapping(self):
"events.person_properties AS properties",
"toTimeZone(events.person_created_at, %(hogql_val_1)s) AS created_at",
),
(
PersonsOnEventsMode.v3_enabled,
"events.event AS event",
"if(not(empty(events__override.distinct_id)), events__override.person_id, events.person_id) AS id",
"events.person_properties AS properties",
"toTimeZone(events.person_created_at, %(hogql_val_0)s) AS created_at",
),
]

for mode, *expected in test_cases:
Expand Down
1 change: 1 addition & 0 deletions posthog/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,7 @@ class PersonsOnEventsMode(str, Enum):
v1_enabled = "v1_enabled"
v1_mixed = "v1_mixed"
v2_enabled = "v2_enabled"
v3_enabled = "v3_enabled"


class HogQLQueryModifiers(BaseModel):
Expand Down
Loading