From fd7467196487519902710b3a1e0e784a028b741e Mon Sep 17 00:00:00 2001
From: Robbie Coomber
Date: Tue, 20 Aug 2024 10:16:27 +0100
Subject: [PATCH] Fix column order difference between creation and migration

---
 .../test/test_raw_sessions_model.py           | 52 +++++++++++++++++++
 posthog/models/raw_sessions/migrations.py     |  2 +
 posthog/models/raw_sessions/sql.py            | 30 +++++++----
 3 files changed, 75 insertions(+), 9 deletions(-)
 create mode 100644 posthog/clickhouse/test/test_raw_sessions_model.py

diff --git a/posthog/clickhouse/test/test_raw_sessions_model.py b/posthog/clickhouse/test/test_raw_sessions_model.py
new file mode 100644
index 00000000000000..37c1824f0dd69d
--- /dev/null
+++ b/posthog/clickhouse/test/test_raw_sessions_model.py
@@ -0,0 +1,52 @@
+from posthog.clickhouse.client import sync_execute
+from posthog.models.raw_sessions.sql import RAW_SESSION_TABLE_BACKFILL_SELECT_SQL
+from posthog.models.utils import uuid7
+from posthog.test.base import (
+    _create_event,
+    ClickhouseTestMixin,
+    BaseTest,
+)
+
+# module-level counters, so each helper call returns a value not handed out before
+distinct_id_counter = 0
+session_id_counter = 0
+
+
+def create_distinct_id():
+    """Return a fresh distinct id ("d1", "d2", and so on)."""
+    global distinct_id_counter
+    distinct_id_counter += 1
+    return f"d{distinct_id_counter}"
+
+
+def create_session_id():
+    """Return a fresh session id string, built from an incrementing counter via uuid7(random=...)."""
+    global session_id_counter
+    session_id_counter += 1
+    return str(uuid7(random=session_id_counter))
+
+
+class TestRawSessionsModel(ClickhouseTestMixin, BaseTest):
+    """Smoke tests for the raw_sessions SQL."""
+
+    def test_backfill_sql(self):
+        """The backfill SELECT can be used as the source of an INSERT into raw_sessions."""
+        distinct_id = create_distinct_id()
+        session_id = create_session_id()
+        _create_event(
+            team=self.team,
+            event="$pageview",
+            distinct_id=distinct_id,
+            properties={"$current_url": "/", "$session_id": session_id},
+            timestamp="2024-03-08",
+        )
+
+        # just test that the backfill SQL can be run without error (nothing is asserted on the rows)
+        # The SELECT's last line ends in a `--` comment, so the pieces are joined with explicit
+        # newlines: the team filter must start on its own line or it would be commented out.
+        sync_execute(
+            "INSERT INTO raw_sessions\n"
+            + RAW_SESSION_TABLE_BACKFILL_SELECT_SQL()
+            + "\nAND team_id = %(team_id)s",
+            {"team_id": self.team.id},
+        )
diff --git a/posthog/models/raw_sessions/migrations.py b/posthog/models/raw_sessions/migrations.py
index
ffc998c3a3d89d..c437974e62601f 100644 --- a/posthog/models/raw_sessions/migrations.py +++ b/posthog/models/raw_sessions/migrations.py @@ -2,11 +2,13 @@ from posthog.models.raw_sessions.sql import RAW_SESSIONS_DATA_TABLE, TABLE_BASE_NAME +# perf column; AFTER keeps a migrated table's column order the same as the CREATE TABLE in sql.py ADD_PAGEVIEW_AUTOCAPTURE_SCREEN_UP_TO_2_COLUMN_SQL = """ ALTER TABLE {table_name} on CLUSTER '{cluster}' ADD COLUMN IF NOT EXISTS page_screen_autocapture_uniq_up_to AggregateFunction(uniqUpTo(1), Nullable(UUID)) +AFTER maybe_has_session_replay """ BASE_RAW_SESSIONS_ADD_PAGEVIEW_AUTOCAPTURE_SCREEN_UP_TO_2_COLUMN_SQL = ( diff --git a/posthog/models/raw_sessions/sql.py b/posthog/models/raw_sessions/sql.py index f2f2c03cfd1b79..79f99aa1c2d564 100644 --- a/posthog/models/raw_sessions/sql.py +++ b/posthog/models/raw_sessions/sql.py @@ -94,11 +94,12 @@ autocapture_uniq AggregateFunction(uniq, Nullable(UUID)), screen_count SimpleAggregateFunction(sum, Int64), screen_uniq AggregateFunction(uniq, Nullable(UUID)), - -- as a performance optimisation, also keep track of the uniq events for all of these combined, a bounce is a session with <2 of these - page_screen_autocapture_uniq_up_to AggregateFunction(uniqUpTo(1), Nullable(UUID)), -- replay - maybe_has_session_replay SimpleAggregateFunction(max, Bool) -- will be written False to by the events table mv and True to by the replay table mv + maybe_has_session_replay SimpleAggregateFunction(max, Bool), -- will be written False to by the events table mv and True to by the replay table mv + + -- as a performance optimisation, also keep track of the uniq events for all of these combined, a bounce is a session with <2 of these + page_screen_autocapture_uniq_up_to AggregateFunction(uniqUpTo(1), Nullable(UUID)) ) ENGINE = {engine} """ @@ -153,6 +154,11 @@ def source_int_column(column_name: str) -> str: return f"JSONExtractInt(properties, '{column_name}')" +def source_nullable_float_column(column_name: str) -> str: + # this is what we do in queries, but it seems pretty awful: read the raw JSON value, turn '' and 'null' into NULL, strip a leading/trailing double quote, then cast to Float64 (NULL when the cast fails) + return 
f"""accurateCastOrNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(properties, '{column_name}'), ''), 'null'), '^"|"$', ''), 'Float64')""" + + RAW_SESSION_TABLE_BACKFILL_SELECT_SQL = ( lambda: """ SELECT @@ -214,10 +220,12 @@ def source_int_column(column_name: str) -> str: initializeAggregation('uniqState', if(event='autocapture', uuid, NULL)) as autocapture_uniq, if(event='$screen', 1, 0) as screen_count, initializeAggregation('uniqState', if(event='screen', uuid, NULL)) as screen_uniq, - initializeAggregation('uniqUpToState(1)', if(event='$pageview' OR event='$screen' OR event='$autocapture', uuid, NULL)) as page_screen_autocapture_uniq_up_to, -- replay - false as maybe_has_session_replay + false as maybe_has_session_replay, + + -- perf + initializeAggregation('uniqUpToState(1)', if(event='$pageview' OR event='$screen' OR event='$autocapture', uuid, NULL)) as page_screen_autocapture_uniq_up_to FROM {database}.events WHERE bitAnd(bitShiftRight(toUInt128(accurateCastOrNull(`$session_id`, 'UUID')), 76), 0xF) == 7 -- has a session id and is valid uuidv7 """.format( @@ -321,10 +329,12 @@ def source_int_column(column_name: str) -> str: uniqState(if(event='$autocapture', uuid, NULL)) as autocapture_uniq, sumIf(1, event='$screen') as screen_count, uniqState(if(event='$screen', uuid, NULL)) as screen_uniq, - uniqUpToState(1)(if(event='$pageview' OR event='$screen' OR event='$autocapture', uuid, NULL)) as page_screen_autocapture_uniq_up_to, -- replay - false as maybe_has_session_replay + false as maybe_has_session_replay, + + -- perf + uniqUpToState(1)(if(event='$pageview' OR event='$screen' OR event='$autocapture', uuid, NULL)) as page_screen_autocapture_uniq_up_to FROM {database}.sharded_events WHERE bitAnd(bitShiftRight(toUInt128(accurateCastOrNull(`$session_id`, 'UUID')), 76), 0xF) == 7 -- has a session id and is valid uuidv7) GROUP BY @@ -429,7 +439,7 @@ def source_int_column(column_name: str) -> str: session_id_v7, 
fromUnixTimestamp(intDiv(toUInt64(bitShiftRight(session_id_v7, 80)), 1000)) as session_timestamp, team_id, - any(distinct_id) as distinct_id, + argMaxMerge(distinct_id) as distinct_id, min(min_timestamp) as min_timestamp, max(max_timestamp) as max_timestamp, @@ -483,7 +493,9 @@ def source_int_column(column_name: str) -> str: sum(screen_count) as screen_count, uniqMerge(screen_uniq) as screen_uniq, - max(maybe_has_session_replay) as maybe_has_session_replay + max(maybe_has_session_replay) as maybe_has_session_replay, + + uniqUpToMerge(1)(page_screen_autocapture_uniq_up_to) as page_screen_autocapture_uniq_up_to FROM {TABLE_BASE_NAME} GROUP BY session_id_v7, team_id """