From b33d7e25275def79b31d093c3e53b180bd3204df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1s=20Far=C3=ADas=20Santana?=
Date: Thu, 25 Jul 2024 10:42:15 +0200
Subject: [PATCH] fix(batchexports): Optimize person batch exports query for memory usage (#23843)

---
 posthog/batch_exports/sql.py                  | 32 +++++++++----------
 ...e_person_batch_exports_query_for_memory.py | 11 +++++++
 .../temporal/batch_exports/batch_exports.py   |  3 ++
 3 files changed, 30 insertions(+), 16 deletions(-)
 create mode 100644 posthog/clickhouse/migrations/0073_update_person_batch_exports_query_for_memory.py

diff --git a/posthog/batch_exports/sql.py b/posthog/batch_exports/sql.py
index ddab2497d228c..b61bd1d448006 100644
--- a/posthog/batch_exports/sql.py
+++ b/posthog/batch_exports/sql.py
@@ -21,38 +21,38 @@
     FROM (
         SELECT
             team_id,
-            distinct_id,
+            id,
             max(version) AS version,
-            argMax(person_id, person_distinct_id2.version) AS person_id,
-            argMax(_timestamp, person_distinct_id2.version) AS _timestamp
+            argMax(properties, person.version) AS properties,
+            argMax(_timestamp, person.version) AS _timestamp
         FROM
-            person_distinct_id2
-        WHERE
+            person
+        PREWHERE
             team_id = {{team_id:Int64}}
         GROUP BY
             team_id,
-            distinct_id
-    ) AS pd
+            id
+    ) AS p
     INNER JOIN (
         SELECT
             team_id,
-            id,
+            distinct_id,
             max(version) AS version,
-            argMax(properties, person.version) AS properties,
-            argMax(_timestamp, person.version) AS _timestamp
+            argMax(person_id, person_distinct_id2.version) AS person_id,
+            argMax(_timestamp, person_distinct_id2.version) AS _timestamp
         FROM
-            person
-        WHERE
+            person_distinct_id2
+        PREWHERE
             team_id = {{team_id:Int64}}
         GROUP BY
             team_id,
-            id
-    ) AS p ON p.id = pd.person_id AND p.team_id = pd.team_id
+            distinct_id
+    ) AS pd ON p.id = pd.person_id AND p.team_id = pd.team_id
     WHERE
         pd.team_id = {{team_id:Int64}}
         AND p.team_id = {{team_id:Int64}}
-        AND (pd._timestamp >= {{interval_start:DateTime64}} AND pd._timestamp < {{interval_end:DateTime64}})
-        OR (p._timestamp >= {{interval_start:DateTime64}} AND p._timestamp < {{interval_end:DateTime64}})
+        AND ((pd._timestamp >= {{interval_start:DateTime64}} AND pd._timestamp < {{interval_end:DateTime64}})
+        OR (p._timestamp >= {{interval_start:DateTime64}} AND p._timestamp < {{interval_end:DateTime64}}))
     ORDER BY
         _inserted_at
 )
diff --git a/posthog/clickhouse/migrations/0073_update_person_batch_exports_query_for_memory.py b/posthog/clickhouse/migrations/0073_update_person_batch_exports_query_for_memory.py
new file mode 100644
index 0000000000000..5e9369110bc51
--- /dev/null
+++ b/posthog/clickhouse/migrations/0073_update_person_batch_exports_query_for_memory.py
@@ -0,0 +1,11 @@
+from posthog.batch_exports.sql import (
+    CREATE_PERSONS_BATCH_EXPORT_VIEW,
+)
+from posthog.clickhouse.client.migration_tools import run_sql_with_exceptions
+
+operations = map(
+    run_sql_with_exceptions,
+    [
+        CREATE_PERSONS_BATCH_EXPORT_VIEW,
+    ],
+)
diff --git a/posthog/temporal/batch_exports/batch_exports.py b/posthog/temporal/batch_exports/batch_exports.py
index e0d629c54e35c..5dd306cd85ec9 100644
--- a/posthog/temporal/batch_exports/batch_exports.py
+++ b/posthog/temporal/batch_exports/batch_exports.py
@@ -48,6 +48,9 @@
     interval_end={interval_end}
 ) AS persons
 FORMAT ArrowStream
+-- This is half of configured MAX_MEMORY_USAGE for batch exports.
+-- TODO: Make the setting available to all queries.
+SETTINGS max_bytes_before_external_group_by=50000000000
 """
 
 SELECT_FROM_EVENTS_VIEW = Template("""