From 962756b5bc453149aabbb789fee8f8f48e023e3f Mon Sep 17 00:00:00 2001 From: Paul D'Ambra Date: Mon, 18 Dec 2023 19:33:14 +0000 Subject: [PATCH] fix: correct materialized view definition for session replay events snapshot source (#19389) * fix: correct materialized view definition for session replay events snapshot source * Update query snapshots --------- Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com> --- .../0051_session_replay_source_mv_fix.py | 15 ++++++++++++ .../test/__snapshots__/test_schema.ambr | 14 ++++++++++- .../sql/session_replay_event_sql.py | 24 +++++++++++++++---- 3 files changed, 47 insertions(+), 6 deletions(-) create mode 100644 posthog/clickhouse/migrations/0051_session_replay_source_mv_fix.py diff --git a/posthog/clickhouse/migrations/0051_session_replay_source_mv_fix.py b/posthog/clickhouse/migrations/0051_session_replay_source_mv_fix.py new file mode 100644 index 0000000000000..4ac35e2535f92 --- /dev/null +++ b/posthog/clickhouse/migrations/0051_session_replay_source_mv_fix.py @@ -0,0 +1,15 @@ +from posthog.clickhouse.client.migration_tools import run_sql_with_exceptions +from posthog.session_recordings.sql.session_replay_event_migrations_sql import ( + DROP_SESSION_REPLAY_EVENTS_TABLE_MV_SQL, +) +from posthog.session_recordings.sql.session_replay_event_sql import ( + SESSION_REPLAY_EVENTS_TABLE_MV_SQL, +) + +operations = [ + # we have to drop materialized view because 0050 created it incorrectly + run_sql_with_exceptions(DROP_SESSION_REPLAY_EVENTS_TABLE_MV_SQL()), + # now we can recreate it with explicit column definitions + # that correctly identifies snapshot source as LowCardinality(Nullable(String)) + run_sql_with_exceptions(SESSION_REPLAY_EVENTS_TABLE_MV_SQL()), +] diff --git a/posthog/clickhouse/test/__snapshots__/test_schema.ambr b/posthog/clickhouse/test/__snapshots__/test_schema.ambr index 6e4344727633b..d337f3d55201e 100644 --- a/posthog/clickhouse/test/__snapshots__/test_schema.ambr +++ b/posthog/clickhouse/test/__snapshots__/test_schema.ambr @@ -1495,7 +1495,19 @@ ' CREATE MATERIALIZED VIEW IF NOT EXISTS session_replay_events_mv ON CLUSTER 'posthog' - TO posthog_test.writable_session_replay_events + TO posthog_test.writable_session_replay_events ( + `session_id` String, `team_id` Int64, `distinct_id` String, + `min_first_timestamp` DateTime64(6, 'UTC'), + `max_last_timestamp` DateTime64(6, 'UTC'), + `first_url` AggregateFunction(argMin, Nullable(String), DateTime64(6, 'UTC')), + `click_count` Int64, `keypress_count` Int64, + `mouse_activity_count` Int64, `active_milliseconds` Int64, + `console_log_count` Int64, `console_warn_count` Int64, + `console_error_count` Int64, `size` Int64, `message_count` Int64, + `event_count` Int64, + `snapshot_source` AggregateFunction(argMin, LowCardinality(Nullable(String)), DateTime64(6, 'UTC')), + `_timestamp` Nullable(DateTime) + ) AS SELECT session_id, team_id, diff --git a/posthog/session_recordings/sql/session_replay_event_sql.py b/posthog/session_recordings/sql/session_replay_event_sql.py index 6c9a1e76bf252..ccec36c2b9f39 100644 --- a/posthog/session_recordings/sql/session_replay_event_sql.py +++ b/posthog/session_recordings/sql/session_replay_event_sql.py @@ -38,6 +38,8 @@ ) ENGINE = {engine} """ +# if updating these column definitions +# you'll need to update the explicit column definitions in the materialized view creation statement below SESSION_REPLAY_EVENTS_TABLE_BASE_SQL = """ CREATE TABLE IF NOT EXISTS {table_name} ON CLUSTER '{cluster}' ( @@ -107,11 +109,10 @@ engine=kafka_engine(topic=KAFKA_CLICKHOUSE_SESSION_REPLAY_EVENTS), ) - SESSION_REPLAY_EVENTS_TABLE_MV_SQL = ( lambda: """ CREATE MATERIALIZED VIEW IF NOT EXISTS session_replay_events_mv ON CLUSTER '{cluster}' -TO {database}.{target_table} +TO {database}.{target_table} {explictly_specify_columns} AS SELECT session_id, team_id, @@ -147,10 +148,25 @@ target_table="writable_session_replay_events", cluster=settings.CLICKHOUSE_CLUSTER, database=settings.CLICKHOUSE_DATABASE, + # ClickHouse is incorrectly expanding the type of the snapshot source column + # Despite it being a LowCardinality(Nullable(String)) in writable_session_replay_events + # The column expansion picks only Nullable(String) and so we can't select it + explictly_specify_columns="""( +`session_id` String, `team_id` Int64, `distinct_id` String, +`min_first_timestamp` DateTime64(6, 'UTC'), +`max_last_timestamp` DateTime64(6, 'UTC'), +`first_url` AggregateFunction(argMin, Nullable(String), DateTime64(6, 'UTC')), +`click_count` Int64, `keypress_count` Int64, +`mouse_activity_count` Int64, `active_milliseconds` Int64, +`console_log_count` Int64, `console_warn_count` Int64, +`console_error_count` Int64, `size` Int64, `message_count` Int64, +`event_count` Int64, +`snapshot_source` AggregateFunction(argMin, LowCardinality(Nullable(String)), DateTime64(6, 'UTC')), +`_timestamp` Nullable(DateTime) +)""", ) ) - # Distributed engine tables are only created if CLICKHOUSE_REPLICATED # This table is responsible for writing to sharded_session_replay_events based on a sharding key. @@ -163,7 +179,6 @@ ), ) - # This table is responsible for reading from session_replay_events on a cluster setting DISTRIBUTED_SESSION_REPLAY_EVENTS_TABLE_SQL = lambda: SESSION_REPLAY_EVENTS_TABLE_BASE_SQL.format( table_name="session_replay_events", @@ -174,7 +189,6 @@ ), ) - DROP_SESSION_REPLAY_EVENTS_TABLE_SQL = lambda: ( f"DROP TABLE IF EXISTS {SESSION_REPLAY_EVENTS_DATA_TABLE()} ON CLUSTER '{settings.CLICKHOUSE_CLUSTER}'" )