Skip to content

Commit

Permalink
feat(hogql): intelligent argmax on persons table (#17975)
Browse files Browse the repository at this point in the history
  • Loading branch information
mariusandra authored Oct 13, 2023
1 parent 51a2657 commit 5813380
Show file tree
Hide file tree
Showing 8 changed files with 90 additions and 28 deletions.
2 changes: 1 addition & 1 deletion frontend/src/queries/schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -1236,7 +1236,7 @@
"description": "HogQL Query Options are automatically set per team. However, they can be overriden in the query.",
"properties": {
"personsArgMaxVersion": {
"enum": ["v1", "v2"],
"enum": ["auto", "v1", "v2"],
"type": "string"
},
"personsOnEventsMode": {
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/queries/schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ export interface DataNode extends Node {
/** HogQL Query Options are automatically set per team. However, they can be overriden in the query. */
export interface HogQLQueryModifiers {
personsOnEventsMode?: 'disabled' | 'v1_enabled' | 'v2_enabled'
personsArgMaxVersion?: 'v1' | 'v2'
personsArgMaxVersion?: 'auto' | 'v1' | 'v2'
}

export interface HogQLQueryResponse {
Expand Down
26 changes: 16 additions & 10 deletions posthog/api/test/__snapshots__/test_query.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -354,12 +354,15 @@
GROUP BY person_distinct_id2.distinct_id
HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id)
INNER JOIN
(SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, 'email'), ''), 'null'), '^"|"$', ''), person.version) AS properties___email,
person.id AS id
(SELECT person.id,
replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, 'email'), ''), 'null'), '^"|"$', '') AS properties___email
FROM person
WHERE equals(person.team_id, 2)
GROUP BY person.id
HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0) SETTINGS optimize_aggregation_in_order=1) AS events__pdi__person ON equals(events__pdi.person_id, events__pdi__person.id)
WHERE and(equals(person.team_id, 2), ifNull(in(tuple(person.id, person.version),
(SELECT person.id, max(person.version) AS version
FROM person
WHERE equals(person.team_id, 2)
GROUP BY person.id
HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0))), 0)) SETTINGS optimize_aggregation_in_order=1) AS events__pdi__person ON equals(events__pdi.person_id, events__pdi__person.id)
WHERE and(equals(events.team_id, 2), ifNull(equals(events__pdi__person.properties___email, '[email protected]'), 0), less(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-01-10 12:14:05.000000', 6, 'UTC')), greater(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-01-09 12:14:00.000000', 6, 'UTC')))
ORDER BY events.event ASC
LIMIT 101
Expand All @@ -385,12 +388,15 @@
GROUP BY person_distinct_id2.distinct_id
HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id)
INNER JOIN
(SELECT argMax(nullIf(nullIf(person.pmat_email, ''), 'null'), person.version) AS properties___email,
person.id AS id
(SELECT person.id,
nullIf(nullIf(person.pmat_email, ''), 'null') AS properties___email
FROM person
WHERE equals(person.team_id, 2)
GROUP BY person.id
HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0) SETTINGS optimize_aggregation_in_order=1) AS events__pdi__person ON equals(events__pdi.person_id, events__pdi__person.id)
WHERE and(equals(person.team_id, 2), ifNull(in(tuple(person.id, person.version),
(SELECT person.id, max(person.version) AS version
FROM person
WHERE equals(person.team_id, 2)
GROUP BY person.id
HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0))), 0)) SETTINGS optimize_aggregation_in_order=1) AS events__pdi__person ON equals(events__pdi.person_id, events__pdi__person.id)
WHERE and(equals(events.team_id, 2), ifNull(equals(events__pdi__person.properties___email, '[email protected]'), 0), less(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-01-10 12:14:05.000000', 6, 'UTC')), greater(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-01-09 12:14:00.000000', 6, 'UTC')))
ORDER BY events.event ASC
LIMIT 101
Expand Down
11 changes: 10 additions & 1 deletion posthog/hogql/database/schema/persons.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,16 @@


def select_from_persons_table(requested_fields: Dict[str, List[str]], modifiers: HogQLQueryModifiers):
if modifiers.personsArgMaxVersion == PersonsArgMaxVersion.v2:
version = modifiers.personsArgMaxVersion
if version == PersonsArgMaxVersion.auto:
version = PersonsArgMaxVersion.v1
# If selecting properties, use the faster v2 query. Otherwise v1 is faster.
for field_chain in requested_fields.values():
if field_chain[0] == "properties":
version = PersonsArgMaxVersion.v2
break

if version == PersonsArgMaxVersion.v2:
from posthog.hogql.parser import parse_select
from posthog.hogql import ast

Expand Down
2 changes: 1 addition & 1 deletion posthog/hogql/modifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,6 @@ def create_default_modifiers_for_team(
modifiers.personsOnEventsMode = team.person_on_events_mode or PersonOnEventsMode.DISABLED

if modifiers.personsArgMaxVersion is None:
modifiers.personsArgMaxVersion = "v1"
modifiers.personsArgMaxVersion = "auto"

return modifiers
49 changes: 35 additions & 14 deletions posthog/hogql/test/__snapshots__/test_query.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,14 @@

SELECT DISTINCT persons.properties___sneaky_mail
FROM (
SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_0)s), ''), 'null'), '^"|"$', ''), person.version) AS properties___sneaky_mail, argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_1)s), ''), 'null'), '^"|"$', ''), person.version) AS properties___random_uuid, person.id AS id
SELECT person.id, replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_0)s), ''), 'null'), '^"|"$', '') AS properties___sneaky_mail, replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_1)s), ''), 'null'), '^"|"$', '') AS properties___random_uuid
FROM person
WHERE and(equals(person.team_id, 420), ifNull(in(tuple(person.id, person.version), (
SELECT person.id, max(person.version) AS version
FROM person
WHERE equals(person.team_id, 420)
GROUP BY person.id
HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0)
HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0))), 0))
SETTINGS optimize_aggregation_in_order=1) AS persons
WHERE ifNull(equals(persons.properties___random_uuid, %(hogql_val_2)s), 0)
LIMIT 100
Expand Down Expand Up @@ -105,11 +108,14 @@
WHERE equals(person_distinct_id2.team_id, 420)
GROUP BY person_distinct_id2.distinct_id
HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS e__pdi ON equals(e.distinct_id, e__pdi.distinct_id) INNER JOIN (
SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_0)s), ''), 'null'), '^"|"$', ''), person.version) AS properties___sneaky_mail, person.id AS id
SELECT person.id, replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_0)s), ''), 'null'), '^"|"$', '') AS properties___sneaky_mail
FROM person
WHERE and(equals(person.team_id, 420), ifNull(in(tuple(person.id, person.version), (
SELECT person.id, max(person.version) AS version
FROM person
WHERE equals(person.team_id, 420)
GROUP BY person.id
HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0)
HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0))), 0))
SETTINGS optimize_aggregation_in_order=1) AS e__pdi__person ON equals(e__pdi.person_id, e__pdi__person.id)
WHERE equals(e.team_id, 420)
LIMIT 10
Expand Down Expand Up @@ -147,11 +153,14 @@
WHERE equals(person_distinct_id2.team_id, 420)
GROUP BY person_distinct_id2.distinct_id
HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) INNER JOIN (
SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_0)s), ''), 'null'), '^"|"$', ''), person.version) AS properties___sneaky_mail, person.id AS id
SELECT person.id, replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_0)s), ''), 'null'), '^"|"$', '') AS properties___sneaky_mail
FROM person
WHERE and(equals(person.team_id, 420), ifNull(in(tuple(person.id, person.version), (
SELECT person.id, max(person.version) AS version
FROM person
WHERE equals(person.team_id, 420)
GROUP BY person.id
HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0)
HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0))), 0))
SETTINGS optimize_aggregation_in_order=1) AS events__pdi__person ON equals(events__pdi.person_id, events__pdi__person.id)
WHERE equals(events.team_id, 420)
LIMIT 10
Expand All @@ -168,11 +177,14 @@
WHERE equals(person_distinct_id2.team_id, 420)
GROUP BY person_distinct_id2.distinct_id
HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS e__pdi ON equals(e.distinct_id, e__pdi.distinct_id) INNER JOIN (
SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_0)s), ''), 'null'), '^"|"$', ''), person.version) AS properties___sneaky_mail, person.id AS id
SELECT person.id, replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_0)s), ''), 'null'), '^"|"$', '') AS properties___sneaky_mail
FROM person
WHERE and(equals(person.team_id, 420), ifNull(in(tuple(person.id, person.version), (
SELECT person.id, max(person.version) AS version
FROM person
WHERE equals(person.team_id, 420)
GROUP BY person.id
HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0)
HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0))), 0))
SETTINGS optimize_aggregation_in_order=1) AS e__pdi__person ON equals(e__pdi.person_id, e__pdi__person.id)
WHERE equals(e.team_id, 420)
LIMIT 10
Expand All @@ -189,11 +201,14 @@
WHERE equals(person_distinct_id2.team_id, 420)
GROUP BY person_distinct_id2.distinct_id
HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS s__pdi ON equals(s.distinct_id, s__pdi.distinct_id) INNER JOIN (
SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_0)s), ''), 'null'), '^"|"$', ''), person.version) AS properties___sneaky_mail, person.id AS id
SELECT person.id, replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_0)s), ''), 'null'), '^"|"$', '') AS properties___sneaky_mail
FROM person
WHERE and(equals(person.team_id, 420), ifNull(in(tuple(person.id, person.version), (
SELECT person.id, max(person.version) AS version
FROM person
WHERE equals(person.team_id, 420)
GROUP BY person.id
HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0)
HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0))), 0))
SETTINGS optimize_aggregation_in_order=1) AS s__pdi__person ON equals(s__pdi.person_id, s__pdi__person.id)
WHERE equals(s.team_id, 420)
GROUP BY s__pdi__person.properties___sneaky_mail
Expand Down Expand Up @@ -221,11 +236,14 @@

SELECT pdi.distinct_id, pdi__person.properties___sneaky_mail
FROM person_distinct_id2 AS pdi INNER JOIN (
SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_0)s), ''), 'null'), '^"|"$', ''), person.version) AS properties___sneaky_mail, person.id AS id
SELECT person.id, replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_0)s), ''), 'null'), '^"|"$', '') AS properties___sneaky_mail
FROM person
WHERE and(equals(person.team_id, 420), ifNull(in(tuple(person.id, person.version), (
SELECT person.id, max(person.version) AS version
FROM person
WHERE equals(person.team_id, 420)
GROUP BY person.id
HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0)
HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0))), 0))
SETTINGS optimize_aggregation_in_order=1) AS pdi__person ON equals(pdi.person_id, pdi__person.id)
WHERE equals(pdi.team_id, 420)
LIMIT 10
Expand Down Expand Up @@ -282,11 +300,14 @@
WHERE equals(person_distinct_id2.team_id, 420)
GROUP BY person_distinct_id2.distinct_id
HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) INNER JOIN (
SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_0)s), ''), 'null'), '^"|"$', ''), person.version) AS properties___sneaky_mail, person.id AS id
SELECT person.id, replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, %(hogql_val_0)s), ''), 'null'), '^"|"$', '') AS properties___sneaky_mail
FROM person
WHERE and(equals(person.team_id, 420), ifNull(in(tuple(person.id, person.version), (
SELECT person.id, max(person.version) AS version
FROM person
WHERE equals(person.team_id, 420)
GROUP BY person.id
HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0)
HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0))), 0))
SETTINGS optimize_aggregation_in_order=1) AS events__pdi__person ON equals(events__pdi.person_id, events__pdi__person.id)
WHERE equals(events.team_id, 420)
LIMIT 10
Expand Down
25 changes: 25 additions & 0 deletions posthog/hogql/test/test_modifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,28 @@ def test_modifiers_persons_argmax_version_v2(self):
# Test (v2)
response = execute_hogql_query(query, team=self.team, modifiers=HogQLQueryModifiers(personsArgMaxVersion="v2"))
assert "in(tuple(person.id, person.version)" in response.clickhouse

def test_modifiers_persons_argmax_version_auto(self):
# Use the v2 query when selecting properties.x
response = execute_hogql_query(
"SELECT id, properties.$browser, is_identified FROM persons",
team=self.team,
modifiers=HogQLQueryModifiers(personsArgMaxVersion="auto"),
)
assert "in(tuple(person.id, person.version)" in response.clickhouse

# Use the v2 query when selecting properties
response = execute_hogql_query(
"SELECT id, properties FROM persons",
team=self.team,
modifiers=HogQLQueryModifiers(personsArgMaxVersion="auto"),
)
assert "in(tuple(person.id, person.version)" in response.clickhouse

# Use the v1 query when not selecting any properties
response = execute_hogql_query(
"SELECT id, is_identified FROM persons",
team=self.team,
modifiers=HogQLQueryModifiers(personsArgMaxVersion="auto"),
)
assert "in(tuple(person.id, person.version)" not in response.clickhouse
1 change: 1 addition & 0 deletions posthog/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ class HogQLNotice(BaseModel):


class PersonsArgMaxVersion(str, Enum):
auto = "auto"
v1 = "v1"
v2 = "v2"

Expand Down

0 comments on commit 5813380

Please sign in to comment.