Skip to content

Commit

Permalink
feat(hogql): Added some ordering dependency to lazy tables (#24685)
Browse files Browse the repository at this point in the history
Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
Gilbert09 and github-actions[bot] authored Aug 30, 2024
1 parent 57c22c3 commit 279f276
Show file tree
Hide file tree
Showing 9 changed files with 185 additions and 100 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,16 @@ def test_session_breakdown(self):
multiIf(and(ifNull(greaterOrEquals(e__session.`$session_duration`, 2.0), 0), ifNull(less(e__session.`$session_duration`, 4.5), 0)), %(hogql_val_10)s, and(ifNull(greaterOrEquals(e__session.`$session_duration`, 4.5), 0), ifNull(less(e__session.`$session_duration`, 27.0), 0)), %(hogql_val_11)s, and(ifNull(greaterOrEquals(e__session.`$session_duration`, 27.0), 0), ifNull(less(e__session.`$session_duration`, 44.0), 0)), %(hogql_val_12)s, and(ifNull(greaterOrEquals(e__session.`$session_duration`, 44.0), 0), ifNull(less(e__session.`$session_duration`, 48.0), 0)), %(hogql_val_13)s, and(ifNull(greaterOrEquals(e__session.`$session_duration`, 48.0), 0), ifNull(less(e__session.`$session_duration`, 57.5), 0)), %(hogql_val_14)s, and(ifNull(greaterOrEquals(e__session.`$session_duration`, 57.5), 0), ifNull(less(e__session.`$session_duration`, 61.0), 0)), %(hogql_val_15)s, and(ifNull(greaterOrEquals(e__session.`$session_duration`, 61.0), 0), ifNull(less(e__session.`$session_duration`, 74.0), 0)), %(hogql_val_16)s, and(ifNull(greaterOrEquals(e__session.`$session_duration`, 74.0), 0), ifNull(less(e__session.`$session_duration`, 90.0), 0)), %(hogql_val_17)s, and(ifNull(greaterOrEquals(e__session.`$session_duration`, 90.0), 0), ifNull(less(e__session.`$session_duration`, 98.5), 0)), %(hogql_val_18)s, and(ifNull(greaterOrEquals(e__session.`$session_duration`, 98.5), 0), ifNull(less(e__session.`$session_duration`, 167.01), 0)), %(hogql_val_19)s, %(hogql_val_20)s) AS breakdown_value
FROM
events AS e SAMPLE 1
LEFT JOIN (SELECT
dateDiff(%(hogql_val_0)s, min(toTimeZone(sessions.min_timestamp, %(hogql_val_1)s)), max(toTimeZone(sessions.max_timestamp, %(hogql_val_2)s))) AS `$session_duration`,
sessions.session_id AS session_id
FROM
sessions
WHERE
and(equals(sessions.team_id, {self.team.id}), ifNull(greaterOrEquals(plus(toTimeZone(sessions.min_timestamp, %(hogql_val_3)s), toIntervalDay(3)), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_4)s, 6, %(hogql_val_5)s)))), 0), ifNull(lessOrEquals(minus(toTimeZone(sessions.min_timestamp, %(hogql_val_6)s), toIntervalDay(3)), assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_7)s, 6, %(hogql_val_8)s))), 0))
GROUP BY
sessions.session_id,
sessions.session_id) AS e__session ON equals(e.`$session_id`, e__session.session_id)
INNER JOIN (SELECT
argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id,
person_distinct_id2.distinct_id AS distinct_id
Expand All @@ -477,16 +487,6 @@ def test_session_breakdown(self):
HAVING
ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)
SETTINGS optimize_aggregation_in_order=1) AS e__pdi ON equals(e.distinct_id, e__pdi.distinct_id)
LEFT JOIN (SELECT
dateDiff(%(hogql_val_0)s, min(toTimeZone(sessions.min_timestamp, %(hogql_val_1)s)), max(toTimeZone(sessions.max_timestamp, %(hogql_val_2)s))) AS `$session_duration`,
sessions.session_id AS session_id
FROM
sessions
WHERE
and(equals(sessions.team_id, {self.team.id}), ifNull(greaterOrEquals(plus(toTimeZone(sessions.min_timestamp, %(hogql_val_3)s), toIntervalDay(3)), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_4)s, 6, %(hogql_val_5)s)))), 0), ifNull(lessOrEquals(minus(toTimeZone(sessions.min_timestamp, %(hogql_val_6)s), toIntervalDay(3)), assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_7)s, 6, %(hogql_val_8)s))), 0))
GROUP BY
sessions.session_id,
sessions.session_id) AS e__session ON equals(e.`$session_id`, e__session.session_id)
WHERE
and(equals(e.team_id, {self.team.id}), and(greaterOrEquals(toTimeZone(e.timestamp, %(hogql_val_21)s), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_22)s, 6, %(hogql_val_23)s)))), lessOrEquals(toTimeZone(e.timestamp, %(hogql_val_24)s), assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_25)s, 6, %(hogql_val_26)s))), equals(e.event, %(hogql_val_27)s), ifNull(in(e__pdi.person_id, (SELECT
cohortpeople.person_id AS person_id
Expand Down
35 changes: 33 additions & 2 deletions posthog/hogql/transforms/lazy_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,17 @@ def visit_field(self, node: ast.Field):
node.chain = [constraint.table_name, constraint.alias]


class FieldFinder(TraversingVisitor):
field_chains: list[list[str | int]] = []

def __init__(self) -> None:
super().__init__()
self.field_chains = []

def visit_field(self, node: ast.Field):
self.field_chains.append(node.chain)


class LazyFinder(TraversingVisitor):
found_lazy: bool = False
max_type_visits: int = 1
Expand Down Expand Up @@ -337,6 +348,9 @@ def create_override(table_name: str, field_chain: list[str | int]) -> None:
break
join_ptr = join_ptr.next_join

# Store all the constraint fields we've seen for each join to decide where in the order of joins the next join should be added
field_chain_finder = FieldFinder()

# For all the collected joins, create the join subqueries, and add them to the table.
for to_table, join_scope in joins_to_add.items():
join_to_add: ast.JoinExpr = join_scope.lazy_join.join_function(
Expand Down Expand Up @@ -366,14 +380,31 @@ def create_override(table_name: str, field_chain: list[str | int]) -> None:
if join_to_add.type is not None:
select_type.tables[to_table] = join_to_add.type

field_chain_finder.visit(join_to_add.constraint)

select_from_alias: str | int | None = None
if node.select_from and node.select_from.alias:
select_from_alias = node.select_from.alias
else:
if node.select_from and node.select_from.table and isinstance(node.select_from.table, ast.Field):
select_from_alias = node.select_from.table.chain[0]

constraint_tables = [x[0] for x in field_chain_finder.field_chains if x[0] != select_from_alias]

join_ptr = node.select_from
added = False
while join_ptr:
if join_scope.from_table == join_ptr.alias or (
isinstance(join_ptr.table, ast.Field) and join_scope.from_table == join_ptr.table.chain[0]
):
join_to_add.next_join = join_ptr.next_join
join_ptr.next_join = join_to_add
# If the `join_to_add` is reliant on the existing `next_join`, then just append after instead of before
if join_ptr.next_join and join_ptr.next_join.alias in constraint_tables:
if join_ptr.next_join.next_join:
join_to_add.next_join = join_ptr.next_join.next_join
join_ptr.next_join.next_join = join_to_add
else:
join_to_add.next_join = join_ptr.next_join
join_ptr.next_join = join_to_add
added = True
break
if join_ptr.next_join:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,19 +93,19 @@
'''

SELECT if(not(empty(events__override.distinct_id)), events__override.person_id, events.person_id) AS person_id, events__person.properties AS properties
FROM events LEFT JOIN (
SELECT argMax(person.properties, person.version) AS properties, person.id AS id
FROM person
WHERE equals(person.team_id, 420)
GROUP BY person.id
HAVING and(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(toTimeZone(person.created_at, %(hogql_val_0)s), person.version), plus(now64(6, %(hogql_val_1)s), toIntervalDay(1))), 0))
SETTINGS optimize_aggregation_in_order=1) AS events__person ON equals(if(not(empty(events__override.distinct_id)), events__override.person_id, events.person_id), events__person.id) LEFT OUTER JOIN (
FROM events LEFT OUTER JOIN (
SELECT argMax(person_distinct_id_overrides.person_id, person_distinct_id_overrides.version) AS person_id, person_distinct_id_overrides.distinct_id AS distinct_id
FROM person_distinct_id_overrides
WHERE equals(person_distinct_id_overrides.team_id, 420)
GROUP BY person_distinct_id_overrides.distinct_id
HAVING ifNull(equals(argMax(person_distinct_id_overrides.is_deleted, person_distinct_id_overrides.version), 0), 0)
SETTINGS optimize_aggregation_in_order=1) AS events__override ON equals(events.distinct_id, events__override.distinct_id)
SETTINGS optimize_aggregation_in_order=1) AS events__override ON equals(events.distinct_id, events__override.distinct_id) LEFT JOIN (
SELECT argMax(person.properties, person.version) AS properties, person.id AS id
FROM person
WHERE equals(person.team_id, 420)
GROUP BY person.id
HAVING and(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(toTimeZone(person.created_at, %(hogql_val_0)s), person.version), plus(now64(6, %(hogql_val_1)s), toIntervalDay(1))), 0))
SETTINGS optimize_aggregation_in_order=1) AS events__person ON equals(if(not(empty(events__override.distinct_id)), events__override.person_id, events.person_id), events__person.id)
WHERE equals(events.team_id, 420)
LIMIT 50000
'''
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1687,13 +1687,6 @@
prop_basic AS prop,
argMinIf(prop, timestamp, isNotNull(prop)) OVER (PARTITION BY aggregation_target) AS prop_vals
FROM events AS e
INNER JOIN
(SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id,
person_distinct_id2.distinct_id AS distinct_id
FROM person_distinct_id2
WHERE equals(person_distinct_id2.team_id, 2)
GROUP BY person_distinct_id2.distinct_id
HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0) SETTINGS optimize_aggregation_in_order=1) AS e__pdi ON equals(e.distinct_id, e__pdi.distinct_id)
LEFT JOIN
(SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(groups.group_properties, 'industry'), ''), 'null'), '^"|"$', ''), toTimeZone(groups._timestamp, 'UTC')) AS properties___industry,
groups.group_type_index AS index,
Expand All @@ -1702,6 +1695,13 @@
WHERE and(equals(groups.team_id, 2), ifNull(equals(index, 0), 0))
GROUP BY groups.group_type_index,
groups.group_key) AS e__group_0 ON equals(e.`$group_0`, e__group_0.key)
INNER JOIN
(SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id,
person_distinct_id2.distinct_id AS distinct_id
FROM person_distinct_id2
WHERE equals(person_distinct_id2.team_id, 2)
GROUP BY person_distinct_id2.distinct_id
HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0) SETTINGS optimize_aggregation_in_order=1) AS e__pdi ON equals(e.distinct_id, e__pdi.distinct_id)
WHERE and(equals(e.team_id, 2), and(and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2020-01-01 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2020-01-08 23:59:59.999999', 6, 'UTC'))), in(e.event, tuple('buy', 'play movie', 'sign up'))), or(ifNull(equals(step_0, 1), 0), ifNull(equals(step_1, 1), 0), ifNull(equals(step_2, 1), 0))))))))
WHERE ifNull(equals(step_0, 1), 0)))
GROUP BY aggregation_target,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -612,13 +612,6 @@
prop_basic AS prop,
argMinIf(prop, timestamp, isNotNull(prop)) OVER (PARTITION BY aggregation_target) AS prop_vals
FROM events AS e
INNER JOIN
(SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id,
person_distinct_id2.distinct_id AS distinct_id
FROM person_distinct_id2
WHERE equals(person_distinct_id2.team_id, 2)
GROUP BY person_distinct_id2.distinct_id
HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0) SETTINGS optimize_aggregation_in_order=1) AS e__pdi ON equals(e.distinct_id, e__pdi.distinct_id)
LEFT JOIN
(SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(groups.group_properties, 'industry'), ''), 'null'), '^"|"$', ''), toTimeZone(groups._timestamp, 'UTC')) AS properties___industry,
groups.group_type_index AS index,
Expand All @@ -627,6 +620,13 @@
WHERE and(equals(groups.team_id, 2), ifNull(equals(index, 0), 0))
GROUP BY groups.group_type_index,
groups.group_key) AS e__group_0 ON equals(e.`$group_0`, e__group_0.key)
INNER JOIN
(SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id,
person_distinct_id2.distinct_id AS distinct_id
FROM person_distinct_id2
WHERE equals(person_distinct_id2.team_id, 2)
GROUP BY person_distinct_id2.distinct_id
HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0) SETTINGS optimize_aggregation_in_order=1) AS e__pdi ON equals(e.distinct_id, e__pdi.distinct_id)
WHERE and(equals(e.team_id, 2), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2020-01-01 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2020-01-08 23:59:59.999999', 6, 'UTC')))))))
WHERE ifNull(equals(step_0, 1), 0)))
GROUP BY aggregation_target,
Expand Down
Loading

0 comments on commit 279f276

Please sign in to comment.