diff --git a/posthog/clickhouse/test/__snapshots__/test_schema.ambr b/posthog/clickhouse/test/__snapshots__/test_schema.ambr index 62444a51946d6b..5be965379a7214 100644 --- a/posthog/clickhouse/test/__snapshots__/test_schema.ambr +++ b/posthog/clickhouse/test/__snapshots__/test_schema.ambr @@ -1595,6 +1595,118 @@ _offset FROM posthog_test.kafka_plugin_log_entries + ''' +# --- +# name: test_create_table_query[raw_sessions] + ''' + + CREATE TABLE IF NOT EXISTS raw_sessions ON CLUSTER 'posthog' + ( + team_id Int64, + session_id_v7 UInt128, -- integer representation of a uuidv7 + + -- ClickHouse will pick the latest value of distinct_id for the session + -- this is fine since even if the distinct_id changes during a session + distinct_id AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + + min_timestamp SimpleAggregateFunction(min, DateTime64(6, 'UTC')), + max_timestamp SimpleAggregateFunction(max, DateTime64(6, 'UTC')), + + urls SimpleAggregateFunction(groupUniqArrayArray, Array(String)), + entry_url AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + end_url AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + + initial_referring_domain AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_campaign AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_medium AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_term AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_content AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gad_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclsrc AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_dclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + 
initial_wbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_fbclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_msclkid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_twclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_li_fat_id AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_mc_cid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_igshid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_ttclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + + -- Count pageview, autocapture, and screen events for providing totals. + -- It's unclear if we can use the counts as they are not idempotent, and we had a bug on EU where events were + -- double-counted, so the counts were wrong. To get around this, also keep track of the unique uuids. This will be + -- slower and more expensive to store, but will be correct even if events are double-counted, so can be used to + -- verify correctness and as a backup. Ideally we will be able to delete the uniq columns in the future when we're + -- satisfied that counts are accurate. 
+ pageview_count SimpleAggregateFunction(sum, Int64), + pageview_uniq AggregateFunction(uniq, Nullable(UUID)), + autocapture_count SimpleAggregateFunction(sum, Int64), + autocapture_uniq AggregateFunction(uniq, Nullable(UUID)), + screen_count SimpleAggregateFunction(sum, Int64), + screen_uniq AggregateFunction(uniq, Nullable(UUID)), + + has_session_replay SimpleAggregateFunction(max, Bool) -- we're not writing True values to this column anywhere yet + ) ENGINE = Distributed('posthog', 'posthog_test', 'sharded_raw_sessions', cityHash64(session_id_v7)) + + ''' +# --- +# name: test_create_table_query[raw_sessions_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS raw_sessions_mv ON CLUSTER 'posthog' + TO posthog_test.writable_raw_sessions + AS + + SELECT + team_id, + toUInt128(toUUID(`$session_id`)) as session_id_v7, + + argMaxState(distinct_id, timestamp) as distinct_id, + + min(timestamp) AS min_timestamp, + max(timestamp) AS max_timestamp, + + groupUniqArray(nullIf(JSONExtractString(properties, '$current_url'), '')) AS urls, + argMinState(JSONExtractString(properties, '$current_url'), timestamp) as entry_url, + argMaxState(JSONExtractString(properties, '$current_url'), timestamp) as end_url, + + argMinState(JSONExtractString(properties, '$referring_domain'), timestamp) as initial_referring_domain, + argMinState(JSONExtractString(properties, 'utm_source'), timestamp) as initial_utm_source, + argMinState(JSONExtractString(properties, 'utm_campaign'), timestamp) as initial_utm_campaign, + argMinState(JSONExtractString(properties, 'utm_medium'), timestamp) as initial_utm_medium, + argMinState(JSONExtractString(properties, 'utm_term'), timestamp) as initial_utm_term, + argMinState(JSONExtractString(properties, 'utm_content'), timestamp) as initial_utm_content, + argMinState(JSONExtractString(properties, 'gclid'), timestamp) as initial_gclid, + argMinState(JSONExtractString(properties, 'gad_source'), timestamp) as initial_gad_source, +
argMinState(JSONExtractString(properties, 'gclsrc'), timestamp) as initial_gclsrc, + argMinState(JSONExtractString(properties, 'dclid'), timestamp) as initial_dclid, + argMinState(JSONExtractString(properties, 'gbraid'), timestamp) as initial_gbraid, + argMinState(JSONExtractString(properties, 'wbraid'), timestamp) as initial_wbraid, + argMinState(JSONExtractString(properties, 'fbclid'), timestamp) as initial_fbclid, + argMinState(JSONExtractString(properties, 'msclkid'), timestamp) as initial_msclkid, + argMinState(JSONExtractString(properties, 'twclid'), timestamp) as initial_twclid, + argMinState(JSONExtractString(properties, 'li_fat_id'), timestamp) as initial_li_fat_id, + argMinState(JSONExtractString(properties, 'mc_cid'), timestamp) as initial_mc_cid, + argMinState(JSONExtractString(properties, 'igshid'), timestamp) as initial_igshid, + argMinState(JSONExtractString(properties, 'ttclid'), timestamp) as initial_ttclid, + + sumIf(1, event='$pageview') as pageview_count, + uniqState(if(event='$pageview', uuid, NULL)) as pageview_uniq, + sumIf(1, event='$autocapture') as autocapture_count, + uniqState(if(event='$autocapture', uuid, NULL)) as autocapture_uniq, + sumIf(1, event='$screen') as screen_count, + uniqState(if(event='$screen', uuid, NULL)) as screen_uniq, + + false as has_session_replay + FROM posthog_test.sharded_events + WHERE bitAnd(bitShiftRight(toUInt128(accurateCastOrNull(`$session_id`, 'UUID')), 76), 0xF) == 7 -- has a session id and is valid uuidv7 + GROUP BY session_id_v7, team_id + + ''' # --- # name: test_create_table_query[session_recording_events] @@ -2048,6 +2160,72 @@ + ''' +# --- +# name: test_create_table_query[sharded_raw_sessions] + ''' + + CREATE TABLE IF NOT EXISTS sharded_raw_sessions ON CLUSTER 'posthog' + ( + team_id Int64, + session_id_v7 UInt128, -- integer representation of a uuidv7 + + -- ClickHouse will pick the latest value of distinct_id for the session + -- this is fine since even if the distinct_id changes during a
session + distinct_id AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + + min_timestamp SimpleAggregateFunction(min, DateTime64(6, 'UTC')), + max_timestamp SimpleAggregateFunction(max, DateTime64(6, 'UTC')), + + urls SimpleAggregateFunction(groupUniqArrayArray, Array(String)), + entry_url AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + end_url AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + + initial_referring_domain AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_campaign AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_medium AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_term AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_content AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gad_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclsrc AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_dclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_wbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_fbclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_msclkid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_twclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_li_fat_id AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_mc_cid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_igshid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_ttclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + + -- Count pageview, autocapture, and screen events for providing totals. 
+ -- It's unclear if we can use the counts as they are not idempotent, and we had a bug on EU where events were + -- double-counted, so the counts were wrong. To get around this, also keep track of the unique uuids. This will be + -- slower and more expensive to store, but will be correct even if events are double-counted, so can be used to + -- verify correctness and as a backup. Ideally we will be able to delete the uniq columns in the future when we're + -- satisfied that counts are accurate. + pageview_count SimpleAggregateFunction(sum, Int64), + pageview_uniq AggregateFunction(uniq, Nullable(UUID)), + autocapture_count SimpleAggregateFunction(sum, Int64), + autocapture_uniq AggregateFunction(uniq, Nullable(UUID)), + screen_count SimpleAggregateFunction(sum, Int64), + screen_uniq AggregateFunction(uniq, Nullable(UUID)), + + has_session_replay SimpleAggregateFunction(max, Bool) -- we're not writing True values to this column anywhere yet + ) ENGINE = ReplicatedAggregatingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.raw_sessions', '{replica}') + + PARTITION BY toYYYYMM(fromUnixTimestamp(intDiv(toUInt64(bitShiftRight(session_id_v7, 80)), 1000))) + ORDER BY ( + team_id, + toStartOfHour(fromUnixTimestamp(intDiv(toUInt64(bitShiftRight(session_id_v7, 80)), 1000))), + cityHash64(session_id_v7), + session_id_v7 + ) + SAMPLE BY cityHash64(session_id_v7) + ''' # --- # name: test_create_table_query[sharded_session_recording_events] @@ -2262,6 +2440,63 @@ ''' # --- +# name: test_create_table_query[writable_raw_sessions] + ''' + + CREATE TABLE IF NOT EXISTS writable_raw_sessions ON CLUSTER 'posthog' + ( + team_id Int64, + session_id_v7 UInt128, -- integer representation of a uuidv7 + + -- ClickHouse will pick the latest value of distinct_id for the session + -- this is fine since even if the distinct_id changes during a session + distinct_id AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + + min_timestamp 
SimpleAggregateFunction(min, DateTime64(6, 'UTC')), + max_timestamp SimpleAggregateFunction(max, DateTime64(6, 'UTC')), + + urls SimpleAggregateFunction(groupUniqArrayArray, Array(String)), + entry_url AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + end_url AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + + initial_referring_domain AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_campaign AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_medium AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_term AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_content AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gad_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclsrc AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_dclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_wbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_fbclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_msclkid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_twclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_li_fat_id AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_mc_cid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_igshid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_ttclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + + -- Count pageview, autocapture, and screen events for providing totals. 
+ -- It's unclear if we can use the counts as they are not idempotent, and we had a bug on EU where events were + -- double-counted, so the counts were wrong. To get around this, also keep track of the unique uuids. This will be + -- slower and more expensive to store, but will be correct even if events are double-counted, so can be used to + -- verify correctness and as a backup. Ideally we will be able to delete the uniq columns in the future when we're + -- satisfied that counts are accurate. + pageview_count SimpleAggregateFunction(sum, Int64), + pageview_uniq AggregateFunction(uniq, Nullable(UUID)), + autocapture_count SimpleAggregateFunction(sum, Int64), + autocapture_uniq AggregateFunction(uniq, Nullable(UUID)), + screen_count SimpleAggregateFunction(sum, Int64), + screen_uniq AggregateFunction(uniq, Nullable(UUID)), + + has_session_replay SimpleAggregateFunction(max, Bool) -- we're not writing True values to this column anywhere yet + ) ENGINE = Distributed('posthog', 'posthog_test', 'sharded_raw_sessions', cityHash64(session_id_v7)) + + ''' +# --- # name: test_create_table_query[writable_session_recording_events] ''' @@ -2942,6 +3177,72 @@ ''' # --- +# name: test_create_table_query_replicated_and_storage[sharded_raw_sessions] + ''' + + CREATE TABLE IF NOT EXISTS sharded_raw_sessions ON CLUSTER 'posthog' + ( + team_id Int64, + session_id_v7 UInt128, -- integer representation of a uuidv7 + + -- ClickHouse will pick the latest value of distinct_id for the session + -- this is fine since even if the distinct_id changes during a session + distinct_id AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + + min_timestamp SimpleAggregateFunction(min, DateTime64(6, 'UTC')), + max_timestamp SimpleAggregateFunction(max, DateTime64(6, 'UTC')), + + urls SimpleAggregateFunction(groupUniqArrayArray, Array(String)), + entry_url AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + end_url AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + + 
initial_referring_domain AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_campaign AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_medium AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_term AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_content AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gad_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclsrc AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_dclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_wbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_fbclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_msclkid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_twclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_li_fat_id AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_mc_cid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_igshid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_ttclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + + -- Count pageview, autocapture, and screen events for providing totals. + -- It's unclear if we can use the counts as they are not idempotent, and we had a bug on EU where events were + -- double-counted, so the counts were wrong. To get around this, also keep track of the unique uuids. This will be + -- slower and more expensive to store, but will be correct even if events are double-counted, so can be used to + -- verify correctness and as a backup. 
Ideally we will be able to delete the uniq columns in the future when we're + -- satisfied that counts are accurate. + pageview_count SimpleAggregateFunction(sum, Int64), + pageview_uniq AggregateFunction(uniq, Nullable(UUID)), + autocapture_count SimpleAggregateFunction(sum, Int64), + autocapture_uniq AggregateFunction(uniq, Nullable(UUID)), + screen_count SimpleAggregateFunction(sum, Int64), + screen_uniq AggregateFunction(uniq, Nullable(UUID)), + + has_session_replay SimpleAggregateFunction(max, Bool) -- we're not writing True values to this column anywhere yet + ) ENGINE = ReplicatedAggregatingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.raw_sessions', '{replica}') + + PARTITION BY toYYYYMM(fromUnixTimestamp(intDiv(toUInt64(bitShiftRight(session_id_v7, 80)), 1000))) + ORDER BY ( + team_id, + toStartOfHour(fromUnixTimestamp(intDiv(toUInt64(bitShiftRight(session_id_v7, 80)), 1000))), + cityHash64(session_id_v7), + session_id_v7 + ) + SAMPLE BY cityHash64(session_id_v7) + + ''' +# --- # name: test_create_table_query_replicated_and_storage[sharded_session_recording_events] '''