diff --git a/ee/clickhouse/queries/test/__snapshots__/test_groups_join_query.ambr b/ee/clickhouse/queries/test/__snapshots__/test_groups_join_query.ambr new file mode 100644 index 0000000000000..85b77e616249e --- /dev/null +++ b/ee/clickhouse/queries/test/__snapshots__/test_groups_join_query.ambr @@ -0,0 +1,55 @@ +# serializer version: 1 +# name: test_groups_join_query_filtering + tuple( + ''' + + LEFT JOIN ( + SELECT + group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = %(team_id)s AND group_type_index = %(group_index_0)s + GROUP BY group_key + ) groups_0 + ON "$group_0" == groups_0.group_key + + ''', + dict({ + 'group_index_0': 0, + 'team_id': 2, + }), + ) +# --- +# name: test_groups_join_query_filtering_with_custom_key_names + tuple( + ''' + + LEFT JOIN ( + SELECT + group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = %(team_id)s AND group_type_index = %(group_index_0)s + GROUP BY group_key + ) groups_0 + ON call_me_industry == groups_0.group_key + + + LEFT JOIN ( + SELECT + group_key, + argMax(group_properties, _timestamp) AS group_properties_2 + FROM groups + WHERE team_id = %(team_id)s AND group_type_index = %(group_index_2)s + GROUP BY group_key + ) groups_2 + ON call_me_industry == groups_2.group_key + + ''', + dict({ + 'group_index_0': 0, + 'group_index_2': 2, + 'team_id': 2, + }), + ) +# --- diff --git a/ee/clickhouse/views/test/__snapshots__/test_clickhouse_experiments.ambr b/ee/clickhouse/views/test/__snapshots__/test_clickhouse_experiments.ambr index 9a9ce8a732bf7..9f0ae5a1b9290 100644 --- a/ee/clickhouse/views/test/__snapshots__/test_clickhouse_experiments.ambr +++ b/ee/clickhouse/views/test/__snapshots__/test_clickhouse_experiments.ambr @@ -213,6 +213,113 @@ GROUP BY prop ''' # --- +# name: ClickhouseTestFunnelExperimentResults.test_experiment_flow_with_event_results_for_three_test_variants + ''' + /* user_id:0 celery:posthog.tasks.tasks.sync_insight_caching_state */ + SELECT team_id, + date_diff('second', max(timestamp), now()) AS age + FROM events + WHERE timestamp > date_sub(DAY, 3, now()) + AND timestamp < now() + GROUP BY team_id + ORDER BY age; + ''' +# --- +# name: ClickhouseTestFunnelExperimentResults.test_experiment_flow_with_event_results_for_three_test_variants.1 + ''' + /* user_id:0 request:_snapshot_ */ + SELECT array(replaceRegexpAll(JSONExtractRaw(properties, '$feature/a-b-test'), '^"|"$', '')) AS value, + count(*) as count + FROM events e + WHERE team_id = 2 + AND event IN ['$pageleave', '$pageview'] + AND toTimeZone(timestamp, 'UTC') >= toDateTime('2020-01-01 00:00:00', 'UTC') + AND toTimeZone(timestamp, 'UTC') <= toDateTime('2020-01-06 00:00:00', 'UTC') + GROUP BY value + ORDER BY count DESC, value DESC + LIMIT 26 + OFFSET 0 + ''' +# --- +# name: ClickhouseTestFunnelExperimentResults.test_experiment_flow_with_event_results_for_three_test_variants.2 + ''' + /* user_id:0 request:_snapshot_ */ + SELECT countIf(steps = 1) step_1, + countIf(steps = 2) step_2, + avg(step_1_average_conversion_time_inner) step_1_average_conversion_time, + median(step_1_median_conversion_time_inner) step_1_median_conversion_time, + prop + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner , + prop + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target, + prop) as max_steps, + step_1_conversion_time , + prop + 
FROM + (SELECT *, + if(latest_0 <= latest_1 + AND latest_1 <= latest_0 + INTERVAL 14 DAY, 2, 1) AS steps , + if(isNotNull(latest_1) + AND latest_1 <= latest_0 + INTERVAL 14 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time, + prop + FROM + (SELECT aggregation_target, timestamp, step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1 , + if(has([[''], ['test_1'], ['test'], ['control'], ['unknown_3'], ['unknown_2'], ['unknown_1'], ['test_2']], prop), prop, ['Other']) as prop + FROM + (SELECT *, + if(notEmpty(arrayFilter(x -> notEmpty(x), prop_vals)), prop_vals, ['']) as prop + FROM + (SELECT e.timestamp as timestamp, + if(notEmpty(pdi.distinct_id), pdi.person_id, e.person_id) as aggregation_target, + if(notEmpty(pdi.distinct_id), pdi.person_id, e.person_id) as person_id, + if(event = '$pageview', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = '$pageleave', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + array(replaceRegexpAll(JSONExtractRaw(properties, '$feature/a-b-test'), '^"|"$', '')) AS prop_basic, + prop_basic as prop, + argMinIf(prop, timestamp, notEmpty(arrayFilter(x -> notEmpty(x), prop))) over (PARTITION by aggregation_target) as prop_vals + FROM events e + LEFT OUTER JOIN + (SELECT distinct_id, + argMax(person_id, version) as person_id + FROM person_distinct_id2 + WHERE team_id = 2 + AND distinct_id IN + (SELECT distinct_id + FROM events + WHERE team_id = 2 + AND event IN ['$pageleave', '$pageview'] + AND toTimeZone(timestamp, 'UTC') >= toDateTime('2020-01-01 00:00:00', 'UTC') + AND toTimeZone(timestamp, 'UTC') <= toDateTime('2020-01-06 00:00:00', 'UTC') ) + GROUP BY distinct_id + HAVING argMax(is_deleted, version) = 0) AS pdi ON e.distinct_id = pdi.distinct_id + WHERE team_id = 2 + AND event IN ['$pageleave', '$pageview'] + AND toTimeZone(timestamp, 'UTC') >= toDateTime('2020-01-01 00:00:00', 'UTC') + AND toTimeZone(timestamp, 'UTC') <= toDateTime('2020-01-06 00:00:00', 'UTC') + AND (step_0 = 1 + OR step_1 = 1) ))) + WHERE step_0 = 1 )) + GROUP BY aggregation_target, + steps, + prop + HAVING steps = max(max_steps)) + GROUP BY prop + ''' +# --- # name: ClickhouseTestFunnelExperimentResults.test_experiment_flow_with_event_results_with_hogql_aggregation ''' /* user_id:0 celery:posthog.tasks.tasks.sync_insight_caching_state */ diff --git a/posthog/api/test/__snapshots__/test_feature_flag.ambr b/posthog/api/test/__snapshots__/test_feature_flag.ambr index 46bb514eb2d18..db27b28fd8ee2 100644 --- a/posthog/api/test/__snapshots__/test_feature_flag.ambr +++ b/posthog/api/test/__snapshots__/test_feature_flag.ambr @@ -2055,3 +2055,211 @@ OFFSET 10000 ''' # --- +# name: TestResiliency.test_feature_flags_v3_with_experience_continuity_working_slow_db + ''' + WITH target_person_ids AS + (SELECT team_id, + person_id + FROM posthog_persondistinctid + WHERE team_id = 2 + AND distinct_id = ANY('{example_id,random}') ), + existing_overrides AS + (SELECT team_id, + person_id, + feature_flag_key, + hash_key + FROM posthog_featureflaghashkeyoverride + WHERE team_id = 2 + AND person_id IN + (SELECT person_id + FROM target_person_ids) ) + SELECT key + FROM posthog_featureflag + WHERE team_id = 2 + AND ensure_experience_continuity = TRUE + AND active = TRUE + AND deleted = FALSE + AND key NOT IN + (SELECT feature_flag_key + FROM existing_overrides) + ''' +# --- +# name: 
TestResiliency.test_feature_flags_v3_with_experience_continuity_working_slow_db.1 + ''' + WITH target_person_ids AS + (SELECT team_id, + person_id + FROM posthog_persondistinctid + WHERE team_id = 2 + AND distinct_id = ANY('{example_id,random}') ), + existing_overrides AS + (SELECT team_id, + person_id, + feature_flag_key, + hash_key + FROM posthog_featureflaghashkeyoverride + WHERE team_id = 2 + AND person_id IN + (SELECT person_id + FROM target_person_ids) ), + flags_to_override AS + (SELECT key + FROM posthog_featureflag + WHERE team_id = 2 + AND ensure_experience_continuity = TRUE + AND active = TRUE + AND deleted = FALSE + AND key NOT IN + (SELECT feature_flag_key + FROM existing_overrides) ) + INSERT INTO posthog_featureflaghashkeyoverride (team_id, person_id, feature_flag_key, hash_key) + SELECT team_id, + person_id, + key, + 'random' + FROM flags_to_override, + target_person_ids + WHERE EXISTS + (SELECT 1 + FROM posthog_person + WHERE id = person_id + AND team_id = 2) ON CONFLICT DO NOTHING + ''' +# --- +# name: TestResiliency.test_feature_flags_v3_with_experience_continuity_working_slow_db.2 + ''' + SELECT "posthog_persondistinctid"."person_id", + "posthog_persondistinctid"."distinct_id" + FROM "posthog_persondistinctid" + WHERE ("posthog_persondistinctid"."distinct_id" IN ('example_id', + 'random') + AND "posthog_persondistinctid"."team_id" = 2) + ''' +# --- +# name: TestResiliency.test_feature_flags_v3_with_experience_continuity_working_slow_db.3 + ''' + SELECT "posthog_featureflaghashkeyoverride"."feature_flag_key", + "posthog_featureflaghashkeyoverride"."hash_key", + "posthog_featureflaghashkeyoverride"."person_id" + FROM "posthog_featureflaghashkeyoverride" + WHERE ("posthog_featureflaghashkeyoverride"."person_id" IN (1, + 2, + 3, + 4, + 5 /* ... */) + AND "posthog_featureflaghashkeyoverride"."team_id" = 2) + ''' +# --- +# name: TestResiliency.test_feature_flags_v3_with_experience_continuity_working_slow_db.4 + ''' + SELECT (("posthog_person"."properties" -> 'email') = '"tim@posthog.com"'::jsonb + AND "posthog_person"."properties" ? 
'email' + AND NOT (("posthog_person"."properties" -> 'email') = 'null'::jsonb)) AS "flag_X_condition_0", + (true) AS "flag_X_condition_0" + FROM "posthog_person" + INNER JOIN "posthog_persondistinctid" ON ("posthog_person"."id" = "posthog_persondistinctid"."person_id") + WHERE ("posthog_persondistinctid"."distinct_id" = 'example_id' + AND "posthog_persondistinctid"."team_id" = 2 + AND "posthog_person"."team_id" = 2) + ''' +# --- +# name: TestResiliency.test_feature_flags_v3_with_experience_continuity_working_slow_db.5 + ''' + WITH target_person_ids AS + (SELECT team_id, + person_id + FROM posthog_persondistinctid + WHERE team_id = 2 + AND distinct_id = ANY('{example_id,random}') ), + existing_overrides AS + (SELECT team_id, + person_id, + feature_flag_key, + hash_key + FROM posthog_featureflaghashkeyoverride + WHERE team_id = 2 + AND person_id IN + (SELECT person_id + FROM target_person_ids) ) + SELECT key + FROM posthog_featureflag + WHERE team_id = 2 + AND ensure_experience_continuity = TRUE + AND active = TRUE + AND deleted = FALSE + AND key NOT IN + (SELECT feature_flag_key + FROM existing_overrides) + ''' +# --- +# name: TestResiliency.test_feature_flags_v3_with_experience_continuity_working_slow_db.6 + ''' + SELECT "posthog_persondistinctid"."person_id", + "posthog_persondistinctid"."distinct_id" + FROM "posthog_persondistinctid" + WHERE ("posthog_persondistinctid"."distinct_id" IN ('example_id', + 'random') + AND "posthog_persondistinctid"."team_id" = 2) + ''' +# --- +# name: TestResiliency.test_feature_flags_v3_with_experience_continuity_working_slow_db.7 + ''' + SELECT "posthog_persondistinctid"."person_id", + "posthog_persondistinctid"."distinct_id" + FROM "posthog_persondistinctid" + WHERE ("posthog_persondistinctid"."distinct_id" IN ('random') + AND "posthog_persondistinctid"."team_id" = 2) + ''' +# --- +# name: TestResiliency.test_feature_flags_v3_with_group_properties_and_slow_db + ''' + SELECT "posthog_grouptypemapping"."id", + "posthog_grouptypemapping"."team_id", + "posthog_grouptypemapping"."group_type", + "posthog_grouptypemapping"."group_type_index", + "posthog_grouptypemapping"."name_singular", + "posthog_grouptypemapping"."name_plural" + FROM "posthog_grouptypemapping" + WHERE "posthog_grouptypemapping"."team_id" = 2 + ''' +# --- +# name: TestResiliency.test_feature_flags_v3_with_group_properties_and_slow_db.1 + ''' + SELECT "posthog_grouptypemapping"."id", + "posthog_grouptypemapping"."team_id", + "posthog_grouptypemapping"."group_type", + "posthog_grouptypemapping"."group_type_index", + "posthog_grouptypemapping"."name_singular", + "posthog_grouptypemapping"."name_plural" + FROM "posthog_grouptypemapping" + WHERE "posthog_grouptypemapping"."team_id" = 2 + ''' +# --- +# name: TestResiliency.test_feature_flags_v3_with_group_properties_and_slow_db.2 + ''' + SELECT "posthog_grouptypemapping"."id", + "posthog_grouptypemapping"."team_id", + "posthog_grouptypemapping"."group_type", + "posthog_grouptypemapping"."group_type_index", + "posthog_grouptypemapping"."name_singular", + "posthog_grouptypemapping"."name_plural" + FROM "posthog_grouptypemapping" + WHERE "posthog_grouptypemapping"."team_id" = 2 + ''' +# --- +# name: TestResiliency.test_feature_flags_v3_with_slow_db_doesnt_try_to_compute_conditions_again + ''' + SELECT (("posthog_person"."properties" -> 'email') = '"tim@posthog.com"'::jsonb + AND "posthog_person"."properties" ? 
'email' + AND NOT (("posthog_person"."properties" -> 'email') = 'null'::jsonb)) AS "flag_X_condition_0", + (("posthog_person"."properties" -> 'email') = '"tim@posthog.com"'::jsonb + AND "posthog_person"."properties" ? 'email' + AND NOT (("posthog_person"."properties" -> 'email') = 'null'::jsonb)) AS "flag_X_condition_0", + (true) AS "flag_X_condition_0" + FROM "posthog_person" + INNER JOIN "posthog_persondistinctid" ON ("posthog_person"."id" = "posthog_persondistinctid"."person_id") + WHERE ("posthog_persondistinctid"."distinct_id" = 'example_id' + AND "posthog_persondistinctid"."team_id" = 2 + AND "posthog_person"."team_id" = 2) + ''' +# --- diff --git a/posthog/clickhouse/test/__snapshots__/test_schema.ambr b/posthog/clickhouse/test/__snapshots__/test_schema.ambr new file mode 100644 index 0000000000000..ae394429f94b6 --- /dev/null +++ b/posthog/clickhouse/test/__snapshots__/test_schema.ambr @@ -0,0 +1,3721 @@ +# serializer version: 1 +# name: test_create_kafka_events_with_disabled_protobuf + ''' + + CREATE TABLE IF NOT EXISTS kafka_events_json ON CLUSTER 'posthog' + ( + uuid UUID, + event VARCHAR, + properties VARCHAR CODEC(ZSTD(3)), + timestamp DateTime64(6, 'UTC'), + team_id Int64, + distinct_id VARCHAR, + elements_chain VARCHAR, + created_at DateTime64(6, 'UTC'), + person_id UUID, + person_created_at DateTime64, + person_properties VARCHAR Codec(ZSTD(3)), + group0_properties VARCHAR Codec(ZSTD(3)), + group1_properties VARCHAR Codec(ZSTD(3)), + group2_properties VARCHAR Codec(ZSTD(3)), + group3_properties VARCHAR Codec(ZSTD(3)), + group4_properties VARCHAR Codec(ZSTD(3)), + group0_created_at DateTime64, + group1_created_at DateTime64, + group2_created_at DateTime64, + group3_created_at DateTime64, + group4_created_at DateTime64, + person_mode Enum8('full' = 0, 'propertyless' = 1, 'force_upgrade' = 2) + + + + ) ENGINE = Kafka('kafka:9092', 'clickhouse_events_json_test', 'group1', 'JSONEachRow') + + SETTINGS kafka_skip_broken_messages = 100 + + ''' +# --- +# name: test_create_kafka_table_with_different_kafka_host[kafka_app_metrics2] + ''' + + CREATE TABLE IF NOT EXISTS kafka_app_metrics2 ON CLUSTER 'posthog' + ( + team_id Int64, + timestamp DateTime64(6, 'UTC'), + app_source LowCardinality(String), + app_source_id String, + instance_id String, + metric_kind String, + metric_name String, + count Int64 + ) + ENGINE=Kafka('test.kafka.broker:9092', 'clickhouse_app_metrics2_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_kafka_table_with_different_kafka_host[kafka_app_metrics] + ''' + + CREATE TABLE IF NOT EXISTS kafka_app_metrics ON CLUSTER 'posthog' + ( + team_id Int64, + timestamp DateTime64(6, 'UTC'), + plugin_config_id Int64, + category LowCardinality(String), + job_id String, + successes Int64, + successes_on_retry Int64, + failures Int64, + error_uuid UUID, + error_type String, + error_details String CODEC(ZSTD(3)) + ) + ENGINE=Kafka('test.kafka.broker:9092', 'clickhouse_app_metrics_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_kafka_table_with_different_kafka_host[kafka_events_dead_letter_queue] + ''' + + CREATE TABLE IF NOT EXISTS kafka_events_dead_letter_queue ON CLUSTER 'posthog' + ( + id UUID, + event_uuid UUID, + event VARCHAR, + properties VARCHAR, + distinct_id VARCHAR, + team_id Int64, + elements_chain VARCHAR, + created_at DateTime64(6, 'UTC'), + ip VARCHAR, + site_url VARCHAR, + now DateTime64(6, 'UTC'), + raw_payload VARCHAR, + error_timestamp DateTime64(6, 'UTC'), + error_location VARCHAR, + error VARCHAR, + tags Array(VARCHAR) + 
+ ) ENGINE = Kafka('test.kafka.broker:9092', 'events_dead_letter_queue_test', 'group1', 'JSONEachRow') + SETTINGS kafka_skip_broken_messages=1000 + ''' +# --- +# name: test_create_kafka_table_with_different_kafka_host[kafka_events_json] + ''' + + CREATE TABLE IF NOT EXISTS kafka_events_json ON CLUSTER 'posthog' + ( + uuid UUID, + event VARCHAR, + properties VARCHAR CODEC(ZSTD(3)), + timestamp DateTime64(6, 'UTC'), + team_id Int64, + distinct_id VARCHAR, + elements_chain VARCHAR, + created_at DateTime64(6, 'UTC'), + person_id UUID, + person_created_at DateTime64, + person_properties VARCHAR Codec(ZSTD(3)), + group0_properties VARCHAR Codec(ZSTD(3)), + group1_properties VARCHAR Codec(ZSTD(3)), + group2_properties VARCHAR Codec(ZSTD(3)), + group3_properties VARCHAR Codec(ZSTD(3)), + group4_properties VARCHAR Codec(ZSTD(3)), + group0_created_at DateTime64, + group1_created_at DateTime64, + group2_created_at DateTime64, + group3_created_at DateTime64, + group4_created_at DateTime64, + person_mode Enum8('full' = 0, 'propertyless' = 1, 'force_upgrade' = 2) + + + + ) ENGINE = Kafka('test.kafka.broker:9092', 'clickhouse_events_json_test', 'group1', 'JSONEachRow') + + SETTINGS kafka_skip_broken_messages = 100 + + ''' +# --- +# name: test_create_kafka_table_with_different_kafka_host[kafka_groups] + ''' + + CREATE TABLE IF NOT EXISTS kafka_groups ON CLUSTER 'posthog' + ( + group_type_index UInt8, + group_key VARCHAR, + created_at DateTime64, + team_id Int64, + group_properties VARCHAR + + ) ENGINE = Kafka('test.kafka.broker:9092', 'clickhouse_groups_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_kafka_table_with_different_kafka_host[kafka_heatmaps] + ''' + + CREATE TABLE IF NOT EXISTS kafka_heatmaps ON CLUSTER 'posthog' + ( + session_id VARCHAR, + team_id Int64, + distinct_id VARCHAR, + timestamp DateTime64(6, 'UTC'), + -- x is the x with resolution applied, the resolution converts high fidelity mouse positions into an NxN grid + x Int16, + -- y is the y with resolution applied, the resolution converts high fidelity mouse positions into an NxN grid + y Int16, + -- stored so that in future we can support other resolutions + scale_factor Int16, + viewport_width Int16, + viewport_height Int16, + -- some elements move when the page scrolls, others do not + pointer_target_fixed Bool, + current_url VARCHAR, + type LowCardinality(String) + ) ENGINE = Kafka('test.kafka.broker:9092', 'clickhouse_heatmap_events_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_kafka_table_with_different_kafka_host[kafka_ingestion_warnings] + ''' + + CREATE TABLE IF NOT EXISTS kafka_ingestion_warnings ON CLUSTER 'posthog' + ( + team_id Int64, + source LowCardinality(VARCHAR), + type VARCHAR, + details VARCHAR CODEC(ZSTD(3)), + timestamp DateTime64(6, 'UTC') + + ) ENGINE = Kafka('test.kafka.broker:9092', 'clickhouse_ingestion_warnings_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_kafka_table_with_different_kafka_host[kafka_log_entries] + ''' + + CREATE TABLE IF NOT EXISTS kafka_log_entries ON CLUSTER 'posthog' + ( + team_id UInt64, + -- The name of the service or product that generated the logs. + -- Examples: batch_exports + log_source LowCardinality(String), + -- An id for the log source. + -- Set log_source to avoid collision with ids from other log sources if the id generation is not safe. + -- Examples: A batch export id, a cronjob id, a plugin id. + log_source_id String, + -- A secondary id e.g. for the instance of log_source that generated this log. 
+ -- This may be ommitted if log_source is a singleton. + -- Examples: A batch export run id, a plugin_config id, a thread id, a process id, a machine id. + instance_id String, + -- Timestamp indicating when the log was generated. + timestamp DateTime64(6, 'UTC'), + -- The log level. + -- Examples: INFO, WARNING, DEBUG, ERROR. + level LowCardinality(String), + -- The actual log message. + message String + + ) ENGINE = Kafka('test.kafka.broker:9092', 'log_entries_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_kafka_table_with_different_kafka_host[kafka_performance_events] + ''' + + CREATE TABLE IF NOT EXISTS kafka_performance_events ON CLUSTER 'posthog' + ( + uuid UUID, + session_id String, + window_id String, + pageview_id String, + distinct_id String, + timestamp DateTime64, + time_origin DateTime64(3, 'UTC'), + entry_type LowCardinality(String), + name String, + team_id Int64, + current_url String, + start_time Float64, + duration Float64, + redirect_start Float64, + redirect_end Float64, + worker_start Float64, + fetch_start Float64, + domain_lookup_start Float64, + domain_lookup_end Float64, + connect_start Float64, + secure_connection_start Float64, + connect_end Float64, + request_start Float64, + response_start Float64, + response_end Float64, + decoded_body_size Int64, + encoded_body_size Int64, + initiator_type LowCardinality(String), + next_hop_protocol LowCardinality(String), + render_blocking_status LowCardinality(String), + response_status Int64, + transfer_size Int64, + largest_contentful_paint_element String, + largest_contentful_paint_render_time Float64, + largest_contentful_paint_load_time Float64, + largest_contentful_paint_size Float64, + largest_contentful_paint_id String, + largest_contentful_paint_url String, + dom_complete Float64, + dom_content_loaded_event Float64, + dom_interactive Float64, + load_event_end Float64, + load_event_start Float64, + redirect_count Int64, + navigation_type LowCardinality(String), + unload_event_end Float64, + unload_event_start Float64 + + ) ENGINE = Kafka('test.kafka.broker:9092', 'clickhouse_performance_events_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_kafka_table_with_different_kafka_host[kafka_person] + ''' + + CREATE TABLE IF NOT EXISTS kafka_person ON CLUSTER 'posthog' + ( + id UUID, + created_at DateTime64, + team_id Int64, + properties VARCHAR, + is_identified Int8, + is_deleted Int8, + version UInt64 + + ) ENGINE = Kafka('test.kafka.broker:9092', 'clickhouse_person_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_kafka_table_with_different_kafka_host[kafka_person_distinct_id2] + ''' + + CREATE TABLE IF NOT EXISTS kafka_person_distinct_id2 ON CLUSTER 'posthog' + ( + team_id Int64, + distinct_id VARCHAR, + person_id UUID, + is_deleted Int8, + version Int64 + + ) ENGINE = Kafka('test.kafka.broker:9092', 'clickhouse_person_distinct_id_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_kafka_table_with_different_kafka_host[kafka_person_distinct_id] + ''' + + CREATE TABLE IF NOT EXISTS kafka_person_distinct_id ON CLUSTER 'posthog' + ( + distinct_id VARCHAR, + person_id UUID, + team_id Int64, + _sign Nullable(Int8), + is_deleted Nullable(Int8) + ) ENGINE = Kafka('test.kafka.broker:9092', 'clickhouse_person_unique_id_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_kafka_table_with_different_kafka_host[kafka_person_distinct_id_overrides] + ''' + + CREATE TABLE IF NOT EXISTS kafka_person_distinct_id_overrides ON CLUSTER 'posthog' + ( + 
team_id Int64, + distinct_id VARCHAR, + person_id UUID, + is_deleted Int8, + version Int64 + + ) ENGINE = Kafka('test.kafka.broker:9092', 'clickhouse_person_distinct_id_test', 'clickhouse-person-distinct-id-overrides', 'JSONEachRow') + + ''' +# --- +# name: test_create_kafka_table_with_different_kafka_host[kafka_person_overrides] + ''' + + CREATE TABLE IF NOT EXISTS `posthog_test`.`kafka_person_overrides` + ON CLUSTER 'posthog' + + ENGINE = Kafka( + 'kafka:9092', -- Kafka hosts + 'clickhouse_person_override_test', -- Kafka topic + 'clickhouse-person-overrides', -- Kafka consumer group id + 'JSONEachRow' -- Specify that we should pass Kafka messages as JSON + ) + + -- Take the types from the `person_overrides` table, except for the + -- `created_at`, which we want to use the DEFAULT now() from the + -- `person_overrides` definition. See + -- https://github.com/ClickHouse/ClickHouse/pull/38272 for details of `EMPTY + -- AS SELECT` + EMPTY AS SELECT + team_id, + old_person_id, + override_person_id, + merged_at, + oldest_event, + -- We don't want to insert this column via Kafka, as it's + -- set as a default value in the `person_overrides` table. + -- created_at, + version + FROM `posthog_test`.`person_overrides` + + ''' +# --- +# name: test_create_kafka_table_with_different_kafka_host[kafka_plugin_log_entries] + ''' + + CREATE TABLE IF NOT EXISTS kafka_plugin_log_entries ON CLUSTER 'posthog' + ( + id UUID, + team_id Int64, + plugin_id Int64, + plugin_config_id Int64, + timestamp DateTime64(6, 'UTC'), + source VARCHAR, + type VARCHAR, + message VARCHAR, + instance_id UUID + + ) ENGINE = Kafka('test.kafka.broker:9092', 'plugin_log_entries_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_kafka_table_with_different_kafka_host[kafka_session_recording_events] + ''' + + CREATE TABLE IF NOT EXISTS kafka_session_recording_events ON CLUSTER 'posthog' + ( + uuid UUID, + timestamp DateTime64(6, 'UTC'), + team_id Int64, + distinct_id VARCHAR, + session_id VARCHAR, + window_id VARCHAR, + snapshot_data VARCHAR, + created_at DateTime64(6, 'UTC') + + + ) ENGINE = Kafka('test.kafka.broker:9092', 'clickhouse_session_recording_events_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_kafka_table_with_different_kafka_host[kafka_session_replay_events] + ''' + + CREATE TABLE IF NOT EXISTS kafka_session_replay_events ON CLUSTER 'posthog' + ( + session_id VARCHAR, + team_id Int64, + distinct_id VARCHAR, + first_timestamp DateTime64(6, 'UTC'), + last_timestamp DateTime64(6, 'UTC'), + first_url Nullable(VARCHAR), + click_count Int64, + keypress_count Int64, + mouse_activity_count Int64, + active_milliseconds Int64, + console_log_count Int64, + console_warn_count Int64, + console_error_count Int64, + size Int64, + event_count Int64, + message_count Int64, + snapshot_source LowCardinality(Nullable(String)) + ) ENGINE = Kafka('test.kafka.broker:9092', 'clickhouse_session_replay_events_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_table_query[app_metrics2] + ''' + + CREATE TABLE IF NOT EXISTS app_metrics2 ON CLUSTER 'posthog' + ( + team_id Int64, + timestamp DateTime64(6, 'UTC'), + -- The name of the service or product that generated the metrics. + -- Examples: plugins, hog + app_source LowCardinality(String), + -- An id for the app source. + -- Set app_source to avoid collision with ids from other app sources if the id generation is not safe. + -- Examples: A plugin id, a hog application id + app_source_id String, + -- A secondary id e.g. 
for the instance of app_source that generated this metric. + -- This may be ommitted if app_source is a singleton. + -- Examples: A plugin config id, a hog application config id + instance_id String, + metric_kind LowCardinality(String), + metric_name LowCardinality(String), + count SimpleAggregateFunction(sum, Int64) + + , _timestamp DateTime + , _offset UInt64 + , _partition UInt64 + + ) + ENGINE=Distributed('posthog', 'posthog_test', 'sharded_app_metrics2', rand()) + + ''' +# --- +# name: test_create_table_query[app_metrics2_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS app_metrics2_mv ON CLUSTER 'posthog' + TO posthog_test.sharded_app_metrics2 + AS SELECT + team_id, + timestamp, + app_source, + app_source_id, + instance_id, + metric_kind, + metric_name, + count + FROM posthog_test.kafka_app_metrics2 + + ''' +# --- +# name: test_create_table_query[app_metrics] + ''' + + CREATE TABLE IF NOT EXISTS app_metrics ON CLUSTER 'posthog' + ( + team_id Int64, + timestamp DateTime64(6, 'UTC'), + plugin_config_id Int64, + category LowCardinality(String), + job_id String, + successes SimpleAggregateFunction(sum, Int64), + successes_on_retry SimpleAggregateFunction(sum, Int64), + failures SimpleAggregateFunction(sum, Int64), + error_uuid UUID, + error_type String, + error_details String CODEC(ZSTD(3)) + + , _timestamp DateTime + , _offset UInt64 + , _partition UInt64 + + ) + ENGINE=Distributed('posthog', 'posthog_test', 'sharded_app_metrics', rand()) + + ''' +# --- +# name: test_create_table_query[app_metrics_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS app_metrics_mv ON CLUSTER 'posthog' + TO posthog_test.sharded_app_metrics + AS SELECT + team_id, + timestamp, + plugin_config_id, + category, + job_id, + successes, + successes_on_retry, + failures, + error_uuid, + error_type, + error_details + FROM posthog_test.kafka_app_metrics + + ''' +# --- +# name: test_create_table_query[channel_definition] + ''' + + CREATE TABLE IF NOT EXISTS channel_definition ON CLUSTER 'posthog' ( + domain String NOT NULL, + kind String NOT NULL, + domain_type String NULL, + type_if_paid String NULL, + type_if_organic String NULL + ) ENGINE = ReplicatedMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.channel_definition', '{replica}-{shard}') + ORDER BY (domain, kind); + + ''' +# --- +# name: test_create_table_query[cohortpeople] + ''' + + CREATE TABLE IF NOT EXISTS cohortpeople ON CLUSTER 'posthog' + ( + person_id UUID, + cohort_id Int64, + team_id Int64, + sign Int8, + version UInt64 + ) ENGINE = ReplicatedCollapsingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.cohortpeople', '{replica}-{shard}', sign) + Order By (team_id, cohort_id, person_id, version) + + + ''' +# --- +# name: test_create_table_query[events] + ''' + + CREATE TABLE IF NOT EXISTS events ON CLUSTER 'posthog' + ( + uuid UUID, + event VARCHAR, + properties VARCHAR CODEC(ZSTD(3)), + timestamp DateTime64(6, 'UTC'), + team_id Int64, + distinct_id VARCHAR, + elements_chain VARCHAR, + created_at DateTime64(6, 'UTC'), + person_id UUID, + person_created_at DateTime64, + person_properties VARCHAR Codec(ZSTD(3)), + group0_properties VARCHAR Codec(ZSTD(3)), + group1_properties VARCHAR Codec(ZSTD(3)), + group2_properties VARCHAR Codec(ZSTD(3)), + group3_properties VARCHAR Codec(ZSTD(3)), + group4_properties VARCHAR Codec(ZSTD(3)), + group0_created_at DateTime64, + group1_created_at DateTime64, + group2_created_at DateTime64, + group3_created_at DateTime64, + group4_created_at 
DateTime64, + person_mode Enum8('full' = 0, 'propertyless' = 1, 'force_upgrade' = 2) + + , $group_0 VARCHAR COMMENT 'column_materializer::$group_0' + , $group_1 VARCHAR COMMENT 'column_materializer::$group_1' + , $group_2 VARCHAR COMMENT 'column_materializer::$group_2' + , $group_3 VARCHAR COMMENT 'column_materializer::$group_3' + , $group_4 VARCHAR COMMENT 'column_materializer::$group_4' + , $window_id VARCHAR COMMENT 'column_materializer::$window_id' + , $session_id VARCHAR COMMENT 'column_materializer::$session_id' + , elements_chain_href String COMMENT 'column_materializer::elements_chain::href' + , elements_chain_texts Array(String) COMMENT 'column_materializer::elements_chain::texts' + , elements_chain_ids Array(String) COMMENT 'column_materializer::elements_chain::ids' + , elements_chain_elements Array(Enum('a', 'button', 'form', 'input', 'select', 'textarea', 'label')) COMMENT 'column_materializer::elements_chain::elements' + , properties_group_custom Map(String, String), properties_group_feature_flags Map(String, String) + + + , _timestamp DateTime + , _offset UInt64 + , inserted_at Nullable(DateTime64(6, 'UTC')) DEFAULT NULL + + ) ENGINE = Distributed('posthog', 'posthog_test', 'sharded_events', sipHash64(distinct_id)) + + ''' +# --- +# name: test_create_table_query[events_dead_letter_queue] + ''' + + CREATE TABLE IF NOT EXISTS events_dead_letter_queue ON CLUSTER 'posthog' + ( + id UUID, + event_uuid UUID, + event VARCHAR, + properties VARCHAR, + distinct_id VARCHAR, + team_id Int64, + elements_chain VARCHAR, + created_at DateTime64(6, 'UTC'), + ip VARCHAR, + site_url VARCHAR, + now DateTime64(6, 'UTC'), + raw_payload VARCHAR, + error_timestamp DateTime64(6, 'UTC'), + error_location VARCHAR, + error VARCHAR, + tags Array(VARCHAR) + + + , _timestamp DateTime + , _offset UInt64 + + , INDEX kafka_timestamp_minmax_events_dead_letter_queue _timestamp TYPE minmax GRANULARITY 3 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.events_dead_letter_queue', '{replica}-{shard}', _timestamp) + ORDER BY (id, event_uuid, distinct_id, team_id) + + SETTINGS index_granularity=512 + + ''' +# --- +# name: test_create_table_query[events_dead_letter_queue_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS events_dead_letter_queue_mv ON CLUSTER 'posthog' + TO posthog_test.events_dead_letter_queue + AS SELECT + id, + event_uuid, + event, + properties, + distinct_id, + team_id, + elements_chain, + created_at, + ip, + site_url, + now, + raw_payload, + error_timestamp, + error_location, + error, + tags, + _timestamp, + _offset + FROM posthog_test.kafka_events_dead_letter_queue + + ''' +# --- +# name: test_create_table_query[events_json_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS events_json_mv ON CLUSTER 'posthog' + TO posthog_test.writable_events + AS SELECT + uuid, + event, + properties, + timestamp, + team_id, + distinct_id, + elements_chain, + created_at, + person_id, + person_created_at, + person_properties, + group0_properties, + group1_properties, + group2_properties, + group3_properties, + group4_properties, + group0_created_at, + group1_created_at, + group2_created_at, + group3_created_at, + group4_created_at, + person_mode, + NOW64() AS inserted_at, + _timestamp, + _offset + FROM posthog_test.kafka_events_json + + ''' +# --- +# name: test_create_table_query[groups] + ''' + + CREATE TABLE IF NOT EXISTS groups ON CLUSTER 'posthog' + ( + group_type_index UInt8, + group_key VARCHAR, + created_at DateTime64, + team_id Int64, 
+ group_properties VARCHAR + + , _timestamp DateTime + , _offset UInt64 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.groups', '{replica}-{shard}', _timestamp) + Order By (team_id, group_type_index, group_key) + + + ''' +# --- +# name: test_create_table_query[groups_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS groups_mv ON CLUSTER 'posthog' + TO posthog_test.groups + AS SELECT + group_type_index, + group_key, + created_at, + team_id, + group_properties, + _timestamp, + _offset + FROM posthog_test.kafka_groups + + ''' +# --- +# name: test_create_table_query[heatmaps] + ''' + + CREATE TABLE IF NOT EXISTS heatmaps ON CLUSTER 'posthog' + ( + session_id VARCHAR, + team_id Int64, + distinct_id VARCHAR, + timestamp DateTime64(6, 'UTC'), + -- x is the x with resolution applied, the resolution converts high fidelity mouse positions into an NxN grid + x Int16, + -- y is the y with resolution applied, the resolution converts high fidelity mouse positions into an NxN grid + y Int16, + -- stored so that in future we can support other resolutions + scale_factor Int16, + viewport_width Int16, + viewport_height Int16, + -- some elements move when the page scrolls, others do not + pointer_target_fixed Bool, + current_url VARCHAR, + type LowCardinality(String), + _timestamp DateTime, + _offset UInt64, + _partition UInt64 + ) ENGINE = Distributed('posthog', 'posthog_test', 'sharded_heatmaps', cityHash64(concat(toString(team_id), '-', session_id, '-', toString(toDate(timestamp))))) + + ''' +# --- +# name: test_create_table_query[heatmaps_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS heatmaps_mv ON CLUSTER 'posthog' + TO posthog_test.writable_heatmaps + AS SELECT + session_id, + team_id, + distinct_id, + timestamp, + -- x is the x with resolution applied, the resolution converts high fidelity mouse positions into an NxN grid + x, + -- y is the y with resolution applied, the resolution converts high fidelity mouse positions into an NxN grid + y, + -- stored so that in future we can support other resolutions + scale_factor, + viewport_width, + viewport_height, + -- some elements move when the page scrolls, others do not + pointer_target_fixed, + current_url, + type, + _timestamp, + _offset, + _partition + FROM posthog_test.kafka_heatmaps + + ''' +# --- +# name: test_create_table_query[ingestion_warnings] + ''' + + CREATE TABLE IF NOT EXISTS ingestion_warnings ON CLUSTER 'posthog' + ( + team_id Int64, + source LowCardinality(VARCHAR), + type VARCHAR, + details VARCHAR CODEC(ZSTD(3)), + timestamp DateTime64(6, 'UTC') + + , _timestamp DateTime + , _offset UInt64 + , _partition UInt64 + + ) ENGINE = Distributed('posthog', 'posthog_test', 'sharded_ingestion_warnings', rand()) + + ''' +# --- +# name: test_create_table_query[ingestion_warnings_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS ingestion_warnings_mv ON CLUSTER 'posthog' + TO posthog_test.ingestion_warnings + AS SELECT + team_id, + source, + type, + details, + timestamp, + _timestamp, + _offset, + _partition + FROM posthog_test.kafka_ingestion_warnings + + ''' +# --- +# name: test_create_table_query[kafka_app_metrics2] + ''' + + CREATE TABLE IF NOT EXISTS kafka_app_metrics2 ON CLUSTER 'posthog' + ( + team_id Int64, + timestamp DateTime64(6, 'UTC'), + app_source LowCardinality(String), + app_source_id String, + instance_id String, + metric_kind String, + metric_name String, + count Int64 + ) + ENGINE=Kafka('kafka:9092', 'clickhouse_app_metrics2_test', 'group1', 
'JSONEachRow') + + ''' +# --- +# name: test_create_table_query[kafka_app_metrics] + ''' + + CREATE TABLE IF NOT EXISTS kafka_app_metrics ON CLUSTER 'posthog' + ( + team_id Int64, + timestamp DateTime64(6, 'UTC'), + plugin_config_id Int64, + category LowCardinality(String), + job_id String, + successes Int64, + successes_on_retry Int64, + failures Int64, + error_uuid UUID, + error_type String, + error_details String CODEC(ZSTD(3)) + ) + ENGINE=Kafka('kafka:9092', 'clickhouse_app_metrics_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_table_query[kafka_events_dead_letter_queue] + ''' + + CREATE TABLE IF NOT EXISTS kafka_events_dead_letter_queue ON CLUSTER 'posthog' + ( + id UUID, + event_uuid UUID, + event VARCHAR, + properties VARCHAR, + distinct_id VARCHAR, + team_id Int64, + elements_chain VARCHAR, + created_at DateTime64(6, 'UTC'), + ip VARCHAR, + site_url VARCHAR, + now DateTime64(6, 'UTC'), + raw_payload VARCHAR, + error_timestamp DateTime64(6, 'UTC'), + error_location VARCHAR, + error VARCHAR, + tags Array(VARCHAR) + + ) ENGINE = Kafka('kafka:9092', 'events_dead_letter_queue_test', 'group1', 'JSONEachRow') + SETTINGS kafka_skip_broken_messages=1000 + ''' +# --- +# name: test_create_table_query[kafka_events_json] + ''' + + CREATE TABLE IF NOT EXISTS kafka_events_json ON CLUSTER 'posthog' + ( + uuid UUID, + event VARCHAR, + properties VARCHAR CODEC(ZSTD(3)), + timestamp DateTime64(6, 'UTC'), + team_id Int64, + distinct_id VARCHAR, + elements_chain VARCHAR, + created_at DateTime64(6, 'UTC'), + person_id UUID, + person_created_at DateTime64, + person_properties VARCHAR Codec(ZSTD(3)), + group0_properties VARCHAR Codec(ZSTD(3)), + group1_properties VARCHAR Codec(ZSTD(3)), + group2_properties VARCHAR Codec(ZSTD(3)), + group3_properties VARCHAR Codec(ZSTD(3)), + group4_properties VARCHAR Codec(ZSTD(3)), + group0_created_at DateTime64, + group1_created_at DateTime64, + group2_created_at DateTime64, + group3_created_at DateTime64, + group4_created_at DateTime64, + person_mode Enum8('full' = 0, 'propertyless' = 1, 'force_upgrade' = 2) + + + + ) ENGINE = Kafka('kafka:9092', 'clickhouse_events_json_test', 'group1', 'JSONEachRow') + + SETTINGS kafka_skip_broken_messages = 100 + + ''' +# --- +# name: test_create_table_query[kafka_groups] + ''' + + CREATE TABLE IF NOT EXISTS kafka_groups ON CLUSTER 'posthog' + ( + group_type_index UInt8, + group_key VARCHAR, + created_at DateTime64, + team_id Int64, + group_properties VARCHAR + + ) ENGINE = Kafka('kafka:9092', 'clickhouse_groups_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_table_query[kafka_heatmaps] + ''' + + CREATE TABLE IF NOT EXISTS kafka_heatmaps ON CLUSTER 'posthog' + ( + session_id VARCHAR, + team_id Int64, + distinct_id VARCHAR, + timestamp DateTime64(6, 'UTC'), + -- x is the x with resolution applied, the resolution converts high fidelity mouse positions into an NxN grid + x Int16, + -- y is the y with resolution applied, the resolution converts high fidelity mouse positions into an NxN grid + y Int16, + -- stored so that in future we can support other resolutions + scale_factor Int16, + viewport_width Int16, + viewport_height Int16, + -- some elements move when the page scrolls, others do not + pointer_target_fixed Bool, + current_url VARCHAR, + type LowCardinality(String) + ) ENGINE = Kafka('kafka:9092', 'clickhouse_heatmap_events_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_table_query[kafka_ingestion_warnings] + ''' + + CREATE TABLE IF NOT EXISTS kafka_ingestion_warnings 
ON CLUSTER 'posthog' + ( + team_id Int64, + source LowCardinality(VARCHAR), + type VARCHAR, + details VARCHAR CODEC(ZSTD(3)), + timestamp DateTime64(6, 'UTC') + + ) ENGINE = Kafka('kafka:9092', 'clickhouse_ingestion_warnings_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_table_query[kafka_log_entries] + ''' + + CREATE TABLE IF NOT EXISTS kafka_log_entries ON CLUSTER 'posthog' + ( + team_id UInt64, + -- The name of the service or product that generated the logs. + -- Examples: batch_exports + log_source LowCardinality(String), + -- An id for the log source. + -- Set log_source to avoid collision with ids from other log sources if the id generation is not safe. + -- Examples: A batch export id, a cronjob id, a plugin id. + log_source_id String, + -- A secondary id e.g. for the instance of log_source that generated this log. + -- This may be ommitted if log_source is a singleton. + -- Examples: A batch export run id, a plugin_config id, a thread id, a process id, a machine id. + instance_id String, + -- Timestamp indicating when the log was generated. + timestamp DateTime64(6, 'UTC'), + -- The log level. + -- Examples: INFO, WARNING, DEBUG, ERROR. + level LowCardinality(String), + -- The actual log message. + message String + + ) ENGINE = Kafka('kafka:9092', 'log_entries_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_table_query[kafka_performance_events] + ''' + + CREATE TABLE IF NOT EXISTS kafka_performance_events ON CLUSTER 'posthog' + ( + uuid UUID, + session_id String, + window_id String, + pageview_id String, + distinct_id String, + timestamp DateTime64, + time_origin DateTime64(3, 'UTC'), + entry_type LowCardinality(String), + name String, + team_id Int64, + current_url String, + start_time Float64, + duration Float64, + redirect_start Float64, + redirect_end Float64, + worker_start Float64, + fetch_start Float64, + domain_lookup_start Float64, + domain_lookup_end Float64, + connect_start Float64, + secure_connection_start Float64, + connect_end Float64, + request_start Float64, + response_start Float64, + response_end Float64, + decoded_body_size Int64, + encoded_body_size Int64, + initiator_type LowCardinality(String), + next_hop_protocol LowCardinality(String), + render_blocking_status LowCardinality(String), + response_status Int64, + transfer_size Int64, + largest_contentful_paint_element String, + largest_contentful_paint_render_time Float64, + largest_contentful_paint_load_time Float64, + largest_contentful_paint_size Float64, + largest_contentful_paint_id String, + largest_contentful_paint_url String, + dom_complete Float64, + dom_content_loaded_event Float64, + dom_interactive Float64, + load_event_end Float64, + load_event_start Float64, + redirect_count Int64, + navigation_type LowCardinality(String), + unload_event_end Float64, + unload_event_start Float64 + + ) ENGINE = Kafka('kafka:9092', 'clickhouse_performance_events_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_table_query[kafka_person] + ''' + + CREATE TABLE IF NOT EXISTS kafka_person ON CLUSTER 'posthog' + ( + id UUID, + created_at DateTime64, + team_id Int64, + properties VARCHAR, + is_identified Int8, + is_deleted Int8, + version UInt64 + + ) ENGINE = Kafka('kafka:9092', 'clickhouse_person_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_table_query[kafka_person_distinct_id2] + ''' + + CREATE TABLE IF NOT EXISTS kafka_person_distinct_id2 ON CLUSTER 'posthog' + ( + team_id Int64, + distinct_id VARCHAR, + person_id UUID, + is_deleted 
Int8, + version Int64 + + ) ENGINE = Kafka('kafka:9092', 'clickhouse_person_distinct_id_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_table_query[kafka_person_distinct_id] + ''' + + CREATE TABLE IF NOT EXISTS kafka_person_distinct_id ON CLUSTER 'posthog' + ( + distinct_id VARCHAR, + person_id UUID, + team_id Int64, + _sign Nullable(Int8), + is_deleted Nullable(Int8) + ) ENGINE = Kafka('kafka:9092', 'clickhouse_person_unique_id_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_table_query[kafka_person_distinct_id_overrides] + ''' + + CREATE TABLE IF NOT EXISTS kafka_person_distinct_id_overrides ON CLUSTER 'posthog' + ( + team_id Int64, + distinct_id VARCHAR, + person_id UUID, + is_deleted Int8, + version Int64 + + ) ENGINE = Kafka('kafka:9092', 'clickhouse_person_distinct_id_test', 'clickhouse-person-distinct-id-overrides', 'JSONEachRow') + + ''' +# --- +# name: test_create_table_query[kafka_person_overrides] + ''' + + CREATE TABLE IF NOT EXISTS `posthog_test`.`kafka_person_overrides` + ON CLUSTER 'posthog' + + ENGINE = Kafka( + 'kafka:9092', -- Kafka hosts + 'clickhouse_person_override_test', -- Kafka topic + 'clickhouse-person-overrides', -- Kafka consumer group id + 'JSONEachRow' -- Specify that we should pass Kafka messages as JSON + ) + + -- Take the types from the `person_overrides` table, except for the + -- `created_at`, which we want to use the DEFAULT now() from the + -- `person_overrides` definition. See + -- https://github.com/ClickHouse/ClickHouse/pull/38272 for details of `EMPTY + -- AS SELECT` + EMPTY AS SELECT + team_id, + old_person_id, + override_person_id, + merged_at, + oldest_event, + -- We don't want to insert this column via Kafka, as it's + -- set as a default value in the `person_overrides` table. 
+ -- created_at, + version + FROM `posthog_test`.`person_overrides` + + ''' +# --- +# name: test_create_table_query[kafka_plugin_log_entries] + ''' + + CREATE TABLE IF NOT EXISTS kafka_plugin_log_entries ON CLUSTER 'posthog' + ( + id UUID, + team_id Int64, + plugin_id Int64, + plugin_config_id Int64, + timestamp DateTime64(6, 'UTC'), + source VARCHAR, + type VARCHAR, + message VARCHAR, + instance_id UUID + + ) ENGINE = Kafka('kafka:9092', 'plugin_log_entries_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_table_query[kafka_session_recording_events] + ''' + + CREATE TABLE IF NOT EXISTS kafka_session_recording_events ON CLUSTER 'posthog' + ( + uuid UUID, + timestamp DateTime64(6, 'UTC'), + team_id Int64, + distinct_id VARCHAR, + session_id VARCHAR, + window_id VARCHAR, + snapshot_data VARCHAR, + created_at DateTime64(6, 'UTC') + + + ) ENGINE = Kafka('kafka:9092', 'clickhouse_session_recording_events_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_table_query[kafka_session_replay_events] + ''' + + CREATE TABLE IF NOT EXISTS kafka_session_replay_events ON CLUSTER 'posthog' + ( + session_id VARCHAR, + team_id Int64, + distinct_id VARCHAR, + first_timestamp DateTime64(6, 'UTC'), + last_timestamp DateTime64(6, 'UTC'), + first_url Nullable(VARCHAR), + click_count Int64, + keypress_count Int64, + mouse_activity_count Int64, + active_milliseconds Int64, + console_log_count Int64, + console_warn_count Int64, + console_error_count Int64, + size Int64, + event_count Int64, + message_count Int64, + snapshot_source LowCardinality(Nullable(String)) + ) ENGINE = Kafka('kafka:9092', 'clickhouse_session_replay_events_test', 'group1', 'JSONEachRow') + + ''' +# --- +# name: test_create_table_query[log_entries] + ''' + + CREATE TABLE IF NOT EXISTS log_entries ON CLUSTER 'posthog' + ( + team_id UInt64, + -- The name of the service or product that generated the logs. + -- Examples: batch_exports + log_source LowCardinality(String), + -- An id for the log source. + -- Set log_source to avoid collision with ids from other log sources if the id generation is not safe. + -- Examples: A batch export id, a cronjob id, a plugin id. + log_source_id String, + -- A secondary id e.g. for the instance of log_source that generated this log. + -- This may be ommitted if log_source is a singleton. + -- Examples: A batch export run id, a plugin_config id, a thread id, a process id, a machine id. + instance_id String, + -- Timestamp indicating when the log was generated. + timestamp DateTime64(6, 'UTC'), + -- The log level. + -- Examples: INFO, WARNING, DEBUG, ERROR. + level LowCardinality(String), + -- The actual log message. 
+ message String + + , _timestamp DateTime + , _offset UInt64 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.log_entries', '{replica}-{shard}', _timestamp) + PARTITION BY toStartOfHour(timestamp) ORDER BY (team_id, log_source, log_source_id, instance_id, timestamp) + + SETTINGS index_granularity=512 + + ''' +# --- +# name: test_create_table_query[log_entries_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS log_entries_mv ON CLUSTER 'posthog' + TO posthog_test.log_entries + AS SELECT + team_id, + log_source, + log_source_id, + instance_id, + timestamp, + level, + message, + _timestamp, + _offset + FROM posthog_test.kafka_log_entries + + ''' +# --- +# name: test_create_table_query[performance_events] + ''' + + CREATE TABLE IF NOT EXISTS performance_events ON CLUSTER 'posthog' + ( + uuid UUID, + session_id String, + window_id String, + pageview_id String, + distinct_id String, + timestamp DateTime64, + time_origin DateTime64(3, 'UTC'), + entry_type LowCardinality(String), + name String, + team_id Int64, + current_url String, + start_time Float64, + duration Float64, + redirect_start Float64, + redirect_end Float64, + worker_start Float64, + fetch_start Float64, + domain_lookup_start Float64, + domain_lookup_end Float64, + connect_start Float64, + secure_connection_start Float64, + connect_end Float64, + request_start Float64, + response_start Float64, + response_end Float64, + decoded_body_size Int64, + encoded_body_size Int64, + initiator_type LowCardinality(String), + next_hop_protocol LowCardinality(String), + render_blocking_status LowCardinality(String), + response_status Int64, + transfer_size Int64, + largest_contentful_paint_element String, + largest_contentful_paint_render_time Float64, + largest_contentful_paint_load_time Float64, + largest_contentful_paint_size Float64, + largest_contentful_paint_id String, + largest_contentful_paint_url String, + dom_complete Float64, + dom_content_loaded_event Float64, + dom_interactive Float64, + load_event_end Float64, + load_event_start Float64, + redirect_count Int64, + navigation_type LowCardinality(String), + unload_event_end Float64, + unload_event_start Float64 + + , _timestamp DateTime + , _offset UInt64 + , _partition UInt64 + + ) ENGINE = Distributed('posthog', 'posthog_test', 'sharded_performance_events', sipHash64(session_id)) + + ''' +# --- +# name: test_create_table_query[performance_events_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS performance_events_mv ON CLUSTER 'posthog' + TO posthog_test.writeable_performance_events + AS SELECT + uuid, session_id, window_id, pageview_id, distinct_id, timestamp, time_origin, entry_type, name, team_id, current_url, start_time, duration, redirect_start, redirect_end, worker_start, fetch_start, domain_lookup_start, domain_lookup_end, connect_start, secure_connection_start, connect_end, request_start, response_start, response_end, decoded_body_size, encoded_body_size, initiator_type, next_hop_protocol, render_blocking_status, response_status, transfer_size, largest_contentful_paint_element, largest_contentful_paint_render_time, largest_contentful_paint_load_time, largest_contentful_paint_size, largest_contentful_paint_id, largest_contentful_paint_url, dom_complete, dom_content_loaded_event, dom_interactive, load_event_end, load_event_start, redirect_count, navigation_type, unload_event_end, unload_event_start + ,_timestamp, _offset, _partition + FROM posthog_test.kafka_performance_events + + ''' +# --- +# name: 
test_create_table_query[person] + ''' + + CREATE TABLE IF NOT EXISTS person ON CLUSTER 'posthog' + ( + id UUID, + created_at DateTime64, + team_id Int64, + properties VARCHAR, + is_identified Int8, + is_deleted Int8, + version UInt64 + + + , _timestamp DateTime + , _offset UInt64 + + , INDEX kafka_timestamp_minmax_person _timestamp TYPE minmax GRANULARITY 3 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.person', '{replica}-{shard}', version) + Order By (team_id, id) + + + ''' +# --- +# name: test_create_table_query[person_distinct_id2] + ''' + + CREATE TABLE IF NOT EXISTS person_distinct_id2 ON CLUSTER 'posthog' + ( + team_id Int64, + distinct_id VARCHAR, + person_id UUID, + is_deleted Int8, + version Int64 + + + , _timestamp DateTime + , _offset UInt64 + + , _partition UInt64 + , INDEX kafka_timestamp_minmax_person_distinct_id2 _timestamp TYPE minmax GRANULARITY 3 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.person_distinct_id2', '{replica}-{shard}', version) + + ORDER BY (team_id, distinct_id) + SETTINGS index_granularity = 512 + + ''' +# --- +# name: test_create_table_query[person_distinct_id2_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS person_distinct_id2_mv ON CLUSTER 'posthog' + TO posthog_test.person_distinct_id2 + AS SELECT + team_id, + distinct_id, + person_id, + is_deleted, + version, + _timestamp, + _offset, + _partition + FROM posthog_test.kafka_person_distinct_id2 + + ''' +# --- +# name: test_create_table_query[person_distinct_id] + ''' + + CREATE TABLE IF NOT EXISTS person_distinct_id ON CLUSTER 'posthog' + ( + distinct_id VARCHAR, + person_id UUID, + team_id Int64, + _sign Int8 DEFAULT 1, + is_deleted Int8 ALIAS if(_sign==-1, 1, 0) + + , _timestamp DateTime + , _offset UInt64 + + ) ENGINE = ReplicatedCollapsingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.person_distinct_id', '{replica}-{shard}', _sign) + Order By (team_id, distinct_id, person_id) + + + ''' +# --- +# name: test_create_table_query[person_distinct_id_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS person_distinct_id_mv ON CLUSTER 'posthog' + TO posthog_test.person_distinct_id + AS SELECT + distinct_id, + person_id, + team_id, + coalesce(_sign, if(is_deleted==0, 1, -1)) AS _sign, + _timestamp, + _offset + FROM posthog_test.kafka_person_distinct_id + + ''' +# --- +# name: test_create_table_query[person_distinct_id_overrides] + ''' + + CREATE TABLE IF NOT EXISTS person_distinct_id_overrides ON CLUSTER 'posthog' + ( + team_id Int64, + distinct_id VARCHAR, + person_id UUID, + is_deleted Int8, + version Int64 + + + , _timestamp DateTime + , _offset UInt64 + , _partition UInt64 + + , INDEX kafka_timestamp_minmax_person_distinct_id_overrides _timestamp TYPE minmax GRANULARITY 3 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.person_distinct_id_overrides', '{replica}-{shard}', version) + + ORDER BY (team_id, distinct_id) + SETTINGS index_granularity = 512 + + ''' +# --- +# name: test_create_table_query[person_distinct_id_overrides_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS person_distinct_id_overrides_mv ON CLUSTER 'posthog' + TO posthog_test.person_distinct_id_overrides + AS SELECT + team_id, + distinct_id, + person_id, + is_deleted, + version, + _timestamp, + _offset, + _partition + FROM posthog_test.kafka_person_distinct_id_overrides + WHERE version 
> 0 -- only store updated rows, not newly inserted ones + + ''' +# --- +# name: test_create_table_query[person_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS person_mv ON CLUSTER 'posthog' + TO posthog_test.person + AS SELECT + id, + created_at, + team_id, + properties, + is_identified, + is_deleted, + version, + _timestamp, + _offset + FROM posthog_test.kafka_person + + ''' +# --- +# name: test_create_table_query[person_overrides] + ''' + + CREATE TABLE IF NOT EXISTS `posthog_test`.`person_overrides` + ON CLUSTER 'posthog' ( + team_id INT NOT NULL, + + -- When we merge two people `old_person_id` and `override_person_id`, we + -- want to keep track of a mapping from the `old_person_id` to the + -- `override_person_id`. This allows us to join with the + -- `sharded_events` table to find all events that were associated with + -- the `old_person_id` and update them to be associated with the + -- `override_person_id`. + old_person_id UUID NOT NULL, + override_person_id UUID NOT NULL, + + -- The timestamp the merge of the two people was completed. + merged_at DateTime64(6, 'UTC') NOT NULL, + -- The timestamp of the oldest event associated with the + -- `old_person_id`. + oldest_event DateTime64(6, 'UTC') NOT NULL, + -- The timestamp rows are created. This isn't part of the JOIN process + -- with the events table but rather a housekeeping column to allow us to + -- see when the row was created. This shouldn't have any impact of the + -- JOIN as it will be stored separately with the Wide ClickHouse table + -- storage. + created_at DateTime64(6, 'UTC') DEFAULT now(), + + -- the specific version of the `old_person_id` mapping. This is used to + -- allow us to discard old mappings as new ones are added. This version + -- will be provided by the corresponding PostgreSQL + --`posthog_personoverrides` table + version INT NOT NULL + ) + + -- By specifying Replacing merge tree on version, we allow ClickHouse to + -- discard old versions of a `old_person_id` mapping. This should help keep + -- performance in check as new versions are added. Note that given we can + -- have partitioning by `oldest_event` which will change as we update + -- `person_id` on old partitions. + -- + -- We also need to ensure that the data is replicated to all replicas in the + -- cluster, as we do not have any constraints on person_id and which shard + -- associated events are on. To do this we use the ReplicatedReplacingMergeTree + -- engine specifying a static `zk_path`. This will cause the Engine to + -- consider all replicas as the same. See + -- https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/replication + -- for details. + ENGINE = ReplicatedReplacingMergeTree( + -- NOTE: for testing we use a uuid to ensure that we don't get conflicts + -- when the tests tear down and recreate the table. + '/clickhouse/tables/{uuid}noshard/posthog_test.person_overrides', + '{replica}-{shard}', + version + ) + + -- We partition the table by the `oldest_event` column. This allows us to + -- handle updating the events table partition by partition, progressing each + -- override partition by partition in lockstep with the events table. Note + -- that this means it is possible that we have a mapping from + -- `old_person_id` in multiple partitions during the merge process. + PARTITION BY toYYYYMM(oldest_event) + + -- We want to collapse down on the `old_person_id` such that we end up with + -- the newest known mapping for it in the table. 
Query side we will need to + -- ensure that we are always querying the latest version of the mapping. + ORDER BY (team_id, old_person_id) + + ''' +# --- +# name: test_create_table_query[person_overrides_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS `posthog_test`.`person_overrides_mv` + ON CLUSTER 'posthog' + TO `posthog_test`.`person_overrides` + AS SELECT + team_id, + old_person_id, + override_person_id, + merged_at, + oldest_event, + -- We don't want to insert this column via Kafka, as it's + -- set as a default value in the `person_overrides` table. + -- created_at, + version + FROM `posthog_test`.`kafka_person_overrides` + + ''' +# --- +# name: test_create_table_query[person_static_cohort] + ''' + + CREATE TABLE IF NOT EXISTS person_static_cohort ON CLUSTER 'posthog' + ( + id UUID, + person_id UUID, + cohort_id Int64, + team_id Int64 + + , _timestamp DateTime + , _offset UInt64 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.person_static_cohort', '{replica}-{shard}', _timestamp) + Order By (team_id, cohort_id, person_id, id) + + + ''' +# --- +# name: test_create_table_query[plugin_log_entries] + ''' + + CREATE TABLE IF NOT EXISTS plugin_log_entries ON CLUSTER 'posthog' + ( + id UUID, + team_id Int64, + plugin_id Int64, + plugin_config_id Int64, + timestamp DateTime64(6, 'UTC'), + source VARCHAR, + type VARCHAR, + message VARCHAR, + instance_id UUID + + , _timestamp DateTime + , _offset UInt64 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.plugin_log_entries', '{replica}-{shard}', _timestamp) + PARTITION BY toYYYYMMDD(timestamp) ORDER BY (team_id, plugin_id, plugin_config_id, timestamp) + + SETTINGS index_granularity=512 + + ''' +# --- +# name: test_create_table_query[plugin_log_entries_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS plugin_log_entries_mv ON CLUSTER 'posthog' + TO posthog_test.plugin_log_entries + AS SELECT + id, + team_id, + plugin_id, + plugin_config_id, + timestamp, + source, + type, + message, + instance_id, + _timestamp, + _offset + FROM posthog_test.kafka_plugin_log_entries + + ''' +# --- +# name: test_create_table_query[raw_sessions] + ''' + + CREATE TABLE IF NOT EXISTS raw_sessions ON CLUSTER 'posthog' + ( + team_id Int64, + session_id_v7 UInt128, -- integer representation of a uuidv7 + + -- ClickHouse will pick the latest value of distinct_id for the session + -- this is fine since even if the distinct_id changes during a session + distinct_id AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + + min_timestamp SimpleAggregateFunction(min, DateTime64(6, 'UTC')), + max_timestamp SimpleAggregateFunction(max, DateTime64(6, 'UTC')), + + -- urls + urls SimpleAggregateFunction(groupUniqArrayArray, Array(String)), + entry_url AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + end_url AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + last_external_click_url AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + + -- device + initial_browser AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_browser_version AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_os AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_os_version AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_device_type AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_viewport_width AggregateFunction(argMin, Int64, DateTime64(6, 'UTC')), + 
initial_viewport_height AggregateFunction(argMin, Int64, DateTime64(6, 'UTC')), + + -- geoip + -- only store the properties we actually use, as there's tons, see https://posthog.com/docs/cdp/geoip-enrichment + initial_geoip_country_code AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_geoip_subdivision_1_code AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_geoip_subdivision_1_name AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_geoip_subdivision_city_name AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_geoip_time_zone AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + + -- attribution + initial_referring_domain AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_campaign AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_medium AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_term AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_content AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gad_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclsrc AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_dclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_wbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_fbclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_msclkid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_twclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_li_fat_id AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_mc_cid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_igshid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_ttclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + + -- Count pageview, autocapture, and screen events for providing totals. + -- It's unclear if we can use the counts as they are not idempotent, and we had a bug on EU where events were + -- double-counted, so the counts were wrong. To get around this, also keep track of the unique uuids. This will be + -- slower and more expensive to store, but will be correct even if events are double-counted, so can be used to + -- verify correctness and as a backup. Ideally we will be able to delete the uniq columns in the future when we're + -- satisfied that counts are accurate. 
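+            -- A minimal illustrative sketch (not part of the generated DDL; table/column names assumed from above):
+            -- the SimpleAggregateFunction(sum, ...) columns are read with plain sum(), while the
+            -- AggregateFunction(uniq, ...) states must be finalised with uniqMerge, e.g.
+            --   SELECT team_id, session_id_v7,
+            --          sum(pageview_count)      AS pageview_count,
+            --          uniqMerge(pageview_uniq) AS pageview_uniq
+            --   FROM raw_sessions
+            --   GROUP BY team_id, session_id_v7
+            -- which is how the count and uniq columns could be compared for correctness.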
+ pageview_count SimpleAggregateFunction(sum, Int64), + pageview_uniq AggregateFunction(uniq, Nullable(UUID)), + autocapture_count SimpleAggregateFunction(sum, Int64), + autocapture_uniq AggregateFunction(uniq, Nullable(UUID)), + screen_count SimpleAggregateFunction(sum, Int64), + screen_uniq AggregateFunction(uniq, Nullable(UUID)), + + -- replay + maybe_has_session_replay SimpleAggregateFunction(max, Bool), -- will be written False to by the events table mv and True to by the replay table mv + + -- as a performance optimisation, also keep track of the uniq events for all of these combined, a bounce is a session with <2 of these + page_screen_autocapture_uniq_up_to AggregateFunction(uniqUpTo(1), Nullable(UUID)), + + -- web vitals + vitals_lcp AggregateFunction(argMin, Nullable(Float64), DateTime64(6, 'UTC')) + ) ENGINE = Distributed('posthog', 'posthog_test', 'sharded_raw_sessions', cityHash64(session_id_v7)) + + ''' +# --- +# name: test_create_table_query[raw_sessions_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS raw_sessions_mv ON CLUSTER 'posthog' + TO posthog_test.writable_raw_sessions + AS + + SELECT + team_id, + toUInt128(toUUID(`$session_id`)) as session_id_v7, + + argMaxState(distinct_id, timestamp) as distinct_id, + + min(timestamp) AS min_timestamp, + max(timestamp) AS max_timestamp, + + -- urls + groupUniqArray(nullIf(JSONExtractString(properties, '$current_url'), '')) AS urls, + argMinState(JSONExtractString(properties, '$current_url'), timestamp) as entry_url, + argMaxState(JSONExtractString(properties, '$current_url'), timestamp) as end_url, + argMaxState(JSONExtractString(properties, '$external_click_url'), timestamp) as last_external_click_url, + + -- device + argMinState(JSONExtractString(properties, '$browser'), timestamp) as initial_browser, + argMinState(JSONExtractString(properties, '$browser_version'), timestamp) as initial_browser_version, + argMinState(JSONExtractString(properties, '$os'), timestamp) as initial_os, + argMinState(JSONExtractString(properties, '$os_version'), timestamp) as initial_os_version, + argMinState(JSONExtractString(properties, '$device_type'), timestamp) as initial_device_type, + argMinState(JSONExtractInt(properties, '$viewport_width'), timestamp) as initial_viewport_width, + argMinState(JSONExtractInt(properties, '$viewport_height'), timestamp) as initial_viewport_height, + + -- geoip + argMinState(JSONExtractString(properties, '$geoip_country_code'), timestamp) as initial_geoip_country_code, + argMinState(JSONExtractString(properties, '$geoip_subdivision_1_code'), timestamp) as initial_geoip_subdivision_1_code, + argMinState(JSONExtractString(properties, '$geoip_subdivision_1_name'), timestamp) as initial_geoip_subdivision_1_name, + argMinState(JSONExtractString(properties, '$geoip_subdivision_city_name'), timestamp) as initial_geoip_subdivision_city_name, + argMinState(JSONExtractString(properties, '$geoip_time_zone'), timestamp) as initial_geoip_time_zone, + + -- attribution + argMinState(JSONExtractString(properties, '$referring_domain'), timestamp) as initial_referring_domain, + argMinState(JSONExtractString(properties, 'utm_source'), timestamp) as initial_utm_source, + argMinState(JSONExtractString(properties, 'utm_campaign'), timestamp) as initial_utm_campaign, + argMinState(JSONExtractString(properties, 'utm_medium'), timestamp) as initial_utm_medium, + argMinState(JSONExtractString(properties, 'utm_term'), timestamp) as initial_utm_term, + argMinState(JSONExtractString(properties, 'utm_content'), timestamp) as 
initial_utm_content, + argMinState(JSONExtractString(properties, 'gclid'), timestamp) as initial_gclid, + argMinState(JSONExtractString(properties, 'gad_source'), timestamp) as initial_gad_source, + argMinState(JSONExtractString(properties, 'gclsrc'), timestamp) as initial_gclsrc, + argMinState(JSONExtractString(properties, 'dclid'), timestamp) as initial_dclid, + argMinState(JSONExtractString(properties, 'gbraid'), timestamp) as initial_gbraid, + argMinState(JSONExtractString(properties, 'wbraid'), timestamp) as initial_wbraid, + argMinState(JSONExtractString(properties, 'fbclid'), timestamp) as initial_fbclid, + argMinState(JSONExtractString(properties, 'msclkid'), timestamp) as initial_msclkid, + argMinState(JSONExtractString(properties, 'twclid'), timestamp) as initial_twclid, + argMinState(JSONExtractString(properties, 'li_fat_id'), timestamp) as initial_li_fat_id, + argMinState(JSONExtractString(properties, 'mc_cid'), timestamp) as initial_mc_cid, + argMinState(JSONExtractString(properties, 'igshid'), timestamp) as initial_igshid, + argMinState(JSONExtractString(properties, 'ttclid'), timestamp) as initial_ttclid, + + -- count + sumIf(1, event='$pageview') as pageview_count, + uniqState(CAST(if(event='$pageview', uuid, NULL) AS Nullable(UUID))) as pageview_uniq, + sumIf(1, event='$autocapture') as autocapture_count, + uniqState(CAST(if(event='$autocapture', uuid, NULL) AS Nullable(UUID))) as autocapture_uniq, + sumIf(1, event='$screen') as screen_count, + uniqState(CAST(if(event='$screen', uuid, NULL) AS Nullable(UUID))) as screen_uniq, + + -- replay + false as maybe_has_session_replay, + + -- perf + uniqUpToState(1)(CAST(if(event='$pageview' OR event='$screen' OR event='$autocapture', uuid, NULL) AS Nullable(UUID))) as page_screen_autocapture_uniq_up_to, + + -- web vitals + argMinState(accurateCastOrNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(properties, '$web_vitals_LCP_value'), ''), 'null'), '^"|"$', ''), 'Float64'), timestamp) as vitals_lcp + FROM posthog_test.sharded_events + WHERE bitAnd(bitShiftRight(toUInt128(accurateCastOrNull(`$session_id`, 'UUID')), 76), 0xF) == 7 -- has a session id and is valid uuidv7) + GROUP BY + team_id, + toStartOfHour(fromUnixTimestamp(intDiv(toUInt64(bitShiftRight(session_id_v7, 80)), 1000))), + cityHash64(session_id_v7), + session_id_v7 + + + ''' +# --- +# name: test_create_table_query[session_recording_events] + ''' + + CREATE TABLE IF NOT EXISTS session_recording_events ON CLUSTER 'posthog' + ( + uuid UUID, + timestamp DateTime64(6, 'UTC'), + team_id Int64, + distinct_id VARCHAR, + session_id VARCHAR, + window_id VARCHAR, + snapshot_data VARCHAR, + created_at DateTime64(6, 'UTC') + , has_full_snapshot Int8 COMMENT 'column_materializer::has_full_snapshot', events_summary Array(String) COMMENT 'column_materializer::events_summary', click_count Int8 COMMENT 'column_materializer::click_count', keypress_count Int8 COMMENT 'column_materializer::keypress_count', timestamps_summary Array(DateTime64(6, 'UTC')) COMMENT 'column_materializer::timestamps_summary', first_event_timestamp Nullable(DateTime64(6, 'UTC')) COMMENT 'column_materializer::first_event_timestamp', last_event_timestamp Nullable(DateTime64(6, 'UTC')) COMMENT 'column_materializer::last_event_timestamp', urls Array(String) COMMENT 'column_materializer::urls' + + , _timestamp DateTime + , _offset UInt64 + + ) ENGINE = Distributed('posthog', 'posthog_test', 'sharded_session_recording_events', sipHash64(distinct_id)) + + ''' +# --- +# name: 
test_create_table_query[session_recording_events_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS session_recording_events_mv ON CLUSTER 'posthog' + TO posthog_test.writable_session_recording_events + AS SELECT + uuid, + timestamp, + team_id, + distinct_id, + session_id, + window_id, + snapshot_data, + created_at, + _timestamp, + _offset + FROM posthog_test.kafka_session_recording_events + + ''' +# --- +# name: test_create_table_query[session_replay_events] + ''' + + CREATE TABLE IF NOT EXISTS session_replay_events ON CLUSTER 'posthog' + ( + -- part of order by so will aggregate correctly + session_id VARCHAR, + -- part of order by so will aggregate correctly + team_id Int64, + -- ClickHouse will pick any value of distinct_id for the session + -- this is fine since even if the distinct_id changes during a session + -- it will still (or should still) map to the same person + distinct_id VARCHAR, + min_first_timestamp SimpleAggregateFunction(min, DateTime64(6, 'UTC')), + max_last_timestamp SimpleAggregateFunction(max, DateTime64(6, 'UTC')), + first_url AggregateFunction(argMin, Nullable(VARCHAR), DateTime64(6, 'UTC')), + click_count SimpleAggregateFunction(sum, Int64), + keypress_count SimpleAggregateFunction(sum, Int64), + mouse_activity_count SimpleAggregateFunction(sum, Int64), + active_milliseconds SimpleAggregateFunction(sum, Int64), + console_log_count SimpleAggregateFunction(sum, Int64), + console_warn_count SimpleAggregateFunction(sum, Int64), + console_error_count SimpleAggregateFunction(sum, Int64), + -- this column allows us to estimate the amount of data that is being ingested + size SimpleAggregateFunction(sum, Int64), + -- this allows us to count the number of messages received in a session + -- often very useful in incidents or debugging + message_count SimpleAggregateFunction(sum, Int64), + -- this allows us to count the number of snapshot events received in a session + -- often very useful in incidents or debugging + -- because we batch events we expect message_count to be lower than event_count + event_count SimpleAggregateFunction(sum, Int64), + -- which source the snapshots came from Android, iOS, Mobile, Web. 
Web if absent + snapshot_source AggregateFunction(argMin, LowCardinality(Nullable(String)), DateTime64(6, 'UTC')), + _timestamp SimpleAggregateFunction(max, DateTime) + ) ENGINE = Distributed('posthog', 'posthog_test', 'sharded_session_replay_events', sipHash64(distinct_id)) + + ''' +# --- +# name: test_create_table_query[session_replay_events_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS session_replay_events_mv ON CLUSTER 'posthog' + TO posthog_test.writable_session_replay_events ( + `session_id` String, `team_id` Int64, `distinct_id` String, + `min_first_timestamp` DateTime64(6, 'UTC'), + `max_last_timestamp` DateTime64(6, 'UTC'), + `first_url` AggregateFunction(argMin, Nullable(String), DateTime64(6, 'UTC')), + `click_count` Int64, `keypress_count` Int64, + `mouse_activity_count` Int64, `active_milliseconds` Int64, + `console_log_count` Int64, `console_warn_count` Int64, + `console_error_count` Int64, `size` Int64, `message_count` Int64, + `event_count` Int64, + `snapshot_source` AggregateFunction(argMin, LowCardinality(Nullable(String)), DateTime64(6, 'UTC')), + `_timestamp` Nullable(DateTime) + ) + AS SELECT + session_id, + team_id, + any(distinct_id) as distinct_id, + min(first_timestamp) AS min_first_timestamp, + max(last_timestamp) AS max_last_timestamp, + -- TRICKY: ClickHouse will pick a relatively random first_url + -- when it collapses the aggregating merge tree + -- unless we teach it what we want... + -- argMin ignores null values + -- so this will get the first non-null value of first_url + -- for each group of session_id and team_id + -- by min of first_timestamp in the batch + -- this is an aggregate function, not a simple aggregate function + -- so we have to write to argMinState, and query with argMinMerge + argMinState(first_url, first_timestamp) as first_url, + sum(click_count) as click_count, + sum(keypress_count) as keypress_count, + sum(mouse_activity_count) as mouse_activity_count, + sum(active_milliseconds) as active_milliseconds, + sum(console_log_count) as console_log_count, + sum(console_warn_count) as console_warn_count, + sum(console_error_count) as console_error_count, + sum(size) as size, + -- we can count the number of kafka messages instead of sending it explicitly + sum(message_count) as message_count, + sum(event_count) as event_count, + argMinState(snapshot_source, first_timestamp) as snapshot_source, + max(_timestamp) as _timestamp + FROM posthog_test.kafka_session_replay_events + group by session_id, team_id + + ''' +# --- +# name: test_create_table_query[sessions] + ''' + + CREATE TABLE IF NOT EXISTS sessions ON CLUSTER 'posthog' + ( + -- part of order by so will aggregate correctly + session_id VARCHAR, + -- part of order by so will aggregate correctly + team_id Int64, + -- ClickHouse will pick any value of distinct_id for the session + -- this is fine since even if the distinct_id changes during a session + -- it will still (or should still) map to the same person + distinct_id SimpleAggregateFunction(any, String), + + min_timestamp SimpleAggregateFunction(min, DateTime64(6, 'UTC')), + max_timestamp SimpleAggregateFunction(max, DateTime64(6, 'UTC')), + + urls SimpleAggregateFunction(groupUniqArrayArray, Array(String)), + entry_url AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + exit_url AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + + initial_referring_domain AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_campaign 
AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_medium AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_term AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_content AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gad_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclsrc AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_dclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_wbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_fbclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_msclkid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_twclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_li_fat_id AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_mc_cid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_igshid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_ttclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + + -- create a map of how many times we saw each event + event_count_map SimpleAggregateFunction(sumMap, Map(String, Int64)), + -- duplicate the event count as a specific column for pageviews and autocaptures, + -- as these are used in some key queries and need to be fast + pageview_count SimpleAggregateFunction(sum, Int64), + autocapture_count SimpleAggregateFunction(sum, Int64), + ) ENGINE = Distributed('posthog', 'posthog_test', 'sharded_sessions', sipHash64(session_id)) + + ''' +# --- +# name: test_create_table_query[sessions_mv] + ''' + + CREATE MATERIALIZED VIEW IF NOT EXISTS sessions_mv ON CLUSTER 'posthog' + TO posthog_test.writable_sessions + AS + + SELECT + + `$session_id` as session_id, + team_id, + + -- it doesn't matter which distinct_id gets picked (it'll be somewhat random) as they can all join to the right person + any(distinct_id) as distinct_id, + + min(timestamp) AS min_timestamp, + max(timestamp) AS max_timestamp, + + groupUniqArray(replaceRegexpAll(JSONExtractRaw(properties, '$current_url'), '^"|"$', '')) AS urls, + argMinState(replaceRegexpAll(JSONExtractRaw(properties, '$current_url'), '^"|"$', ''), timestamp) as entry_url, + argMaxState(replaceRegexpAll(JSONExtractRaw(properties, '$current_url'), '^"|"$', ''), timestamp) as exit_url, + + argMinState(replaceRegexpAll(JSONExtractRaw(properties, '$referring_domain'), '^"|"$', ''), timestamp) as initial_referring_domain, + argMinState(replaceRegexpAll(JSONExtractRaw(properties, 'utm_source'), '^"|"$', ''), timestamp) as initial_utm_source, + argMinState(replaceRegexpAll(JSONExtractRaw(properties, 'utm_campaign'), '^"|"$', ''), timestamp) as initial_utm_campaign, + argMinState(replaceRegexpAll(JSONExtractRaw(properties, 'utm_medium'), '^"|"$', ''), timestamp) as initial_utm_medium, + argMinState(replaceRegexpAll(JSONExtractRaw(properties, 'utm_term'), '^"|"$', ''), timestamp) as initial_utm_term, + argMinState(replaceRegexpAll(JSONExtractRaw(properties, 'utm_content'), '^"|"$', ''), timestamp) as initial_utm_content, + argMinState(replaceRegexpAll(JSONExtractRaw(properties, 'gclid'), '^"|"$', ''), timestamp) as initial_gclid, + argMinState(replaceRegexpAll(JSONExtractRaw(properties, 'gad_source'), '^"|"$', ''), timestamp) as initial_gad_source, + 
argMinState(replaceRegexpAll(JSONExtractRaw(properties, 'gclsrc'), '^"|"$', ''), timestamp) as initial_gclsrc, + argMinState(replaceRegexpAll(JSONExtractRaw(properties, 'dclid'), '^"|"$', ''), timestamp) as initial_dclid, + argMinState(replaceRegexpAll(JSONExtractRaw(properties, 'gbraid'), '^"|"$', ''), timestamp) as initial_gbraid, + argMinState(replaceRegexpAll(JSONExtractRaw(properties, 'wbraid'), '^"|"$', ''), timestamp) as initial_wbraid, + argMinState(replaceRegexpAll(JSONExtractRaw(properties, 'fbclid'), '^"|"$', ''), timestamp) as initial_fbclid, + argMinState(replaceRegexpAll(JSONExtractRaw(properties, 'msclkid'), '^"|"$', ''), timestamp) as initial_msclkid, + argMinState(replaceRegexpAll(JSONExtractRaw(properties, 'twclid'), '^"|"$', ''), timestamp) as initial_twclid, + argMinState(replaceRegexpAll(JSONExtractRaw(properties, 'li_fat_id'), '^"|"$', ''), timestamp) as initial_li_fat_id, + argMinState(replaceRegexpAll(JSONExtractRaw(properties, 'mc_cid'), '^"|"$', ''), timestamp) as initial_mc_cid, + argMinState(replaceRegexpAll(JSONExtractRaw(properties, 'igshid'), '^"|"$', ''), timestamp) as initial_igshid, + argMinState(replaceRegexpAll(JSONExtractRaw(properties, 'ttclid'), '^"|"$', ''), timestamp) as initial_ttclid, + + sumMap(CAST(([event], [1]), 'Map(String, UInt64)')) as event_count_map, + sumIf(1, event='$pageview') as pageview_count, + sumIf(1, event='$autocapture') as autocapture_count + + FROM posthog_test.sharded_events + WHERE `$session_id` IS NOT NULL AND `$session_id` != '' + GROUP BY `$session_id`, team_id + + + ''' +# --- +# name: test_create_table_query[sharded_app_metrics2] + ''' + + CREATE TABLE IF NOT EXISTS sharded_app_metrics2 ON CLUSTER 'posthog' + ( + team_id Int64, + timestamp DateTime64(6, 'UTC'), + -- The name of the service or product that generated the metrics. + -- Examples: plugins, hog + app_source LowCardinality(String), + -- An id for the app source. + -- Set app_source to avoid collision with ids from other app sources if the id generation is not safe. + -- Examples: A plugin id, a hog application id + app_source_id String, + -- A secondary id e.g. for the instance of app_source that generated this metric. + -- This may be ommitted if app_source is a singleton. 
+ -- Examples: A plugin config id, a hog application config id + instance_id String, + metric_kind LowCardinality(String), + metric_name LowCardinality(String), + count SimpleAggregateFunction(sum, Int64) + + , _timestamp DateTime + , _offset UInt64 + , _partition UInt64 + + ) + ENGINE = ReplicatedAggregatingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.sharded_app_metrics2', '{replica}') + PARTITION BY toYYYYMM(timestamp) + ORDER BY (team_id, app_source, app_source_id, instance_id, toStartOfHour(timestamp), metric_kind, metric_name) + + + ''' +# --- +# name: test_create_table_query[sharded_app_metrics] + ''' + + CREATE TABLE IF NOT EXISTS sharded_app_metrics ON CLUSTER 'posthog' + ( + team_id Int64, + timestamp DateTime64(6, 'UTC'), + plugin_config_id Int64, + category LowCardinality(String), + job_id String, + successes SimpleAggregateFunction(sum, Int64), + successes_on_retry SimpleAggregateFunction(sum, Int64), + failures SimpleAggregateFunction(sum, Int64), + error_uuid UUID, + error_type String, + error_details String CODEC(ZSTD(3)) + + , _timestamp DateTime + , _offset UInt64 + , _partition UInt64 + + ) + ENGINE = ReplicatedAggregatingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.sharded_app_metrics', '{replica}') + PARTITION BY toYYYYMM(timestamp) + ORDER BY (team_id, plugin_config_id, job_id, category, toStartOfHour(timestamp), error_type, error_uuid) + + ''' +# --- +# name: test_create_table_query[sharded_events] + ''' + + CREATE TABLE IF NOT EXISTS sharded_events ON CLUSTER 'posthog' + ( + uuid UUID, + event VARCHAR, + properties VARCHAR CODEC(ZSTD(3)), + timestamp DateTime64(6, 'UTC'), + team_id Int64, + distinct_id VARCHAR, + elements_chain VARCHAR, + created_at DateTime64(6, 'UTC'), + person_id UUID, + person_created_at DateTime64, + person_properties VARCHAR Codec(ZSTD(3)), + group0_properties VARCHAR Codec(ZSTD(3)), + group1_properties VARCHAR Codec(ZSTD(3)), + group2_properties VARCHAR Codec(ZSTD(3)), + group3_properties VARCHAR Codec(ZSTD(3)), + group4_properties VARCHAR Codec(ZSTD(3)), + group0_created_at DateTime64, + group1_created_at DateTime64, + group2_created_at DateTime64, + group3_created_at DateTime64, + group4_created_at DateTime64, + person_mode Enum8('full' = 0, 'propertyless' = 1, 'force_upgrade' = 2) + + , $group_0 VARCHAR MATERIALIZED replaceRegexpAll(JSONExtractRaw(properties, '$group_0'), '^"|"$', '') COMMENT 'column_materializer::$group_0' + , $group_1 VARCHAR MATERIALIZED replaceRegexpAll(JSONExtractRaw(properties, '$group_1'), '^"|"$', '') COMMENT 'column_materializer::$group_1' + , $group_2 VARCHAR MATERIALIZED replaceRegexpAll(JSONExtractRaw(properties, '$group_2'), '^"|"$', '') COMMENT 'column_materializer::$group_2' + , $group_3 VARCHAR MATERIALIZED replaceRegexpAll(JSONExtractRaw(properties, '$group_3'), '^"|"$', '') COMMENT 'column_materializer::$group_3' + , $group_4 VARCHAR MATERIALIZED replaceRegexpAll(JSONExtractRaw(properties, '$group_4'), '^"|"$', '') COMMENT 'column_materializer::$group_4' + , $window_id VARCHAR MATERIALIZED replaceRegexpAll(JSONExtractRaw(properties, '$window_id'), '^"|"$', '') COMMENT 'column_materializer::$window_id' + , $session_id VARCHAR MATERIALIZED replaceRegexpAll(JSONExtractRaw(properties, '$session_id'), '^"|"$', '') COMMENT 'column_materializer::$session_id' + , elements_chain_href String MATERIALIZED extract(elements_chain, '(?::|")href="(.*?)"') + , elements_chain_texts Array(String) MATERIALIZED 
arrayDistinct(extractAll(elements_chain, '(?::|")text="(.*?)"')) + , elements_chain_ids Array(String) MATERIALIZED arrayDistinct(extractAll(elements_chain, '(?::|")attr_id="(.*?)"')) + , elements_chain_elements Array(Enum('a', 'button', 'form', 'input', 'select', 'textarea', 'label')) MATERIALIZED arrayDistinct(extractAll(elements_chain, '(?:^|;)(a|button|form|input|select|textarea|label)(?:\.|$|:)')) + , INDEX `minmax_$group_0` `$group_0` TYPE minmax GRANULARITY 1 + , INDEX `minmax_$group_1` `$group_1` TYPE minmax GRANULARITY 1 + , INDEX `minmax_$group_2` `$group_2` TYPE minmax GRANULARITY 1 + , INDEX `minmax_$group_3` `$group_3` TYPE minmax GRANULARITY 1 + , INDEX `minmax_$group_4` `$group_4` TYPE minmax GRANULARITY 1 + , INDEX `minmax_$window_id` `$window_id` TYPE minmax GRANULARITY 1 + , INDEX `minmax_$session_id` `$session_id` TYPE minmax GRANULARITY 1 + , properties_group_custom Map(String, String) + MATERIALIZED mapSort( + mapFilter((key, _) -> key NOT LIKE '$%' AND key NOT IN ('token', 'distinct_id', 'utm_source', 'utm_medium', 'utm_campaign', 'utm_content', 'utm_term', 'gclid', 'gad_source', 'gclsrc', 'dclid', 'gbraid', 'wbraid', 'fbclid', 'msclkid', 'twclid', 'li_fat_id', 'mc_cid', 'igshid', 'ttclid', 'rdt_cid'), + CAST(JSONExtractKeysAndValues(properties, 'String'), 'Map(String, String)')) + ) + CODEC(ZSTD(1)) + , INDEX properties_group_custom_keys_bf mapKeys(properties_group_custom) TYPE bloom_filter, INDEX properties_group_custom_values_bf mapValues(properties_group_custom) TYPE bloom_filter, properties_group_feature_flags Map(String, String) + MATERIALIZED mapSort( + mapFilter((key, _) -> key like '$feature/%', + CAST(JSONExtractKeysAndValues(properties, 'String'), 'Map(String, String)')) + ) + CODEC(ZSTD(1)) + , INDEX properties_group_feature_flags_keys_bf mapKeys(properties_group_feature_flags) TYPE bloom_filter, INDEX properties_group_feature_flags_values_bf mapValues(properties_group_feature_flags) TYPE bloom_filter + + + , _timestamp DateTime + , _offset UInt64 + , inserted_at Nullable(DateTime64(6, 'UTC')) DEFAULT NULL + + , INDEX kafka_timestamp_minmax_sharded_events _timestamp TYPE minmax GRANULARITY 3 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.events', '{replica}', _timestamp) + PARTITION BY toYYYYMM(timestamp) + ORDER BY (team_id, toDate(timestamp), event, cityHash64(distinct_id), cityHash64(uuid)) + SAMPLE BY cityHash64(distinct_id) + + + ''' +# --- +# name: test_create_table_query[sharded_heatmaps] + ''' + + CREATE TABLE IF NOT EXISTS sharded_heatmaps ON CLUSTER 'posthog' + ( + session_id VARCHAR, + team_id Int64, + distinct_id VARCHAR, + timestamp DateTime64(6, 'UTC'), + -- x is the x with resolution applied, the resolution converts high fidelity mouse positions into an NxN grid + x Int16, + -- y is the y with resolution applied, the resolution converts high fidelity mouse positions into an NxN grid + y Int16, + -- stored so that in future we can support other resolutions + scale_factor Int16, + viewport_width Int16, + viewport_height Int16, + -- some elements move when the page scrolls, others do not + pointer_target_fixed Bool, + current_url VARCHAR, + type LowCardinality(String), + _timestamp DateTime, + _offset UInt64, + _partition UInt64 + ) ENGINE = ReplicatedMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.heatmaps', '{replica}') + + PARTITION BY toYYYYMM(timestamp) + -- almost always this is being queried by + -- * type, + -- * team_id, + -- * date 
range, + -- * URL (maybe matching wild cards), + -- * width + -- we'll almost never query this by session id + -- so from least to most cardinality that's + ORDER BY (type, team_id, toDate(timestamp), current_url, viewport_width) + + -- I am purposefully not setting index granularity + -- the default is 8192, and we will be loading a lot of data + -- per query, we tend to copy this 512 around the place but + -- i don't think it applies here + + ''' +# --- +# name: test_create_table_query[sharded_ingestion_warnings] + ''' + + CREATE TABLE IF NOT EXISTS sharded_ingestion_warnings ON CLUSTER 'posthog' + ( + team_id Int64, + source LowCardinality(VARCHAR), + type VARCHAR, + details VARCHAR CODEC(ZSTD(3)), + timestamp DateTime64(6, 'UTC') + + , _timestamp DateTime + , _offset UInt64 + , _partition UInt64 + + ) ENGINE = ReplicatedMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.sharded_ingestion_warnings', '{replica}') + PARTITION BY toYYYYMMDD(timestamp) + ORDER BY (team_id, toHour(timestamp), type, source, timestamp) + + ''' +# --- +# name: test_create_table_query[sharded_performance_events] + ''' + + CREATE TABLE IF NOT EXISTS sharded_performance_events ON CLUSTER 'posthog' + ( + uuid UUID, + session_id String, + window_id String, + pageview_id String, + distinct_id String, + timestamp DateTime64, + time_origin DateTime64(3, 'UTC'), + entry_type LowCardinality(String), + name String, + team_id Int64, + current_url String, + start_time Float64, + duration Float64, + redirect_start Float64, + redirect_end Float64, + worker_start Float64, + fetch_start Float64, + domain_lookup_start Float64, + domain_lookup_end Float64, + connect_start Float64, + secure_connection_start Float64, + connect_end Float64, + request_start Float64, + response_start Float64, + response_end Float64, + decoded_body_size Int64, + encoded_body_size Int64, + initiator_type LowCardinality(String), + next_hop_protocol LowCardinality(String), + render_blocking_status LowCardinality(String), + response_status Int64, + transfer_size Int64, + largest_contentful_paint_element String, + largest_contentful_paint_render_time Float64, + largest_contentful_paint_load_time Float64, + largest_contentful_paint_size Float64, + largest_contentful_paint_id String, + largest_contentful_paint_url String, + dom_complete Float64, + dom_content_loaded_event Float64, + dom_interactive Float64, + load_event_end Float64, + load_event_start Float64, + redirect_count Int64, + navigation_type LowCardinality(String), + unload_event_end Float64, + unload_event_start Float64 + + , _timestamp DateTime + , _offset UInt64 + , _partition UInt64 + + ) ENGINE = ReplicatedMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.performance_events', '{replica}') + PARTITION BY toYYYYMM(timestamp) + ORDER BY (team_id, toDate(timestamp), session_id, pageview_id, timestamp) + + + + ''' +# --- +# name: test_create_table_query[sharded_raw_sessions] + ''' + + CREATE TABLE IF NOT EXISTS sharded_raw_sessions ON CLUSTER 'posthog' + ( + team_id Int64, + session_id_v7 UInt128, -- integer representation of a uuidv7 + + -- ClickHouse will pick the latest value of distinct_id for the session + -- this is fine since even if the distinct_id changes during a session + distinct_id AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + + min_timestamp SimpleAggregateFunction(min, DateTime64(6, 'UTC')), + max_timestamp SimpleAggregateFunction(max, DateTime64(6, 'UTC')), + + -- urls + urls 
SimpleAggregateFunction(groupUniqArrayArray, Array(String)), + entry_url AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + end_url AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + last_external_click_url AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + + -- device + initial_browser AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_browser_version AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_os AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_os_version AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_device_type AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_viewport_width AggregateFunction(argMin, Int64, DateTime64(6, 'UTC')), + initial_viewport_height AggregateFunction(argMin, Int64, DateTime64(6, 'UTC')), + + -- geoip + -- only store the properties we actually use, as there's tons, see https://posthog.com/docs/cdp/geoip-enrichment + initial_geoip_country_code AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_geoip_subdivision_1_code AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_geoip_subdivision_1_name AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_geoip_subdivision_city_name AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_geoip_time_zone AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + + -- attribution + initial_referring_domain AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_campaign AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_medium AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_term AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_content AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gad_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclsrc AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_dclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_wbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_fbclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_msclkid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_twclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_li_fat_id AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_mc_cid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_igshid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_ttclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + + -- Count pageview, autocapture, and screen events for providing totals. + -- It's unclear if we can use the counts as they are not idempotent, and we had a bug on EU where events were + -- double-counted, so the counts were wrong. To get around this, also keep track of the unique uuids. This will be + -- slower and more expensive to store, but will be correct even if events are double-counted, so can be used to + -- verify correctness and as a backup. Ideally we will be able to delete the uniq columns in the future when we're + -- satisfied that counts are accurate. 
+ pageview_count SimpleAggregateFunction(sum, Int64), + pageview_uniq AggregateFunction(uniq, Nullable(UUID)), + autocapture_count SimpleAggregateFunction(sum, Int64), + autocapture_uniq AggregateFunction(uniq, Nullable(UUID)), + screen_count SimpleAggregateFunction(sum, Int64), + screen_uniq AggregateFunction(uniq, Nullable(UUID)), + + -- replay + maybe_has_session_replay SimpleAggregateFunction(max, Bool), -- will be written False to by the events table mv and True to by the replay table mv + + -- as a performance optimisation, also keep track of the uniq events for all of these combined, a bounce is a session with <2 of these + page_screen_autocapture_uniq_up_to AggregateFunction(uniqUpTo(1), Nullable(UUID)), + + -- web vitals + vitals_lcp AggregateFunction(argMin, Nullable(Float64), DateTime64(6, 'UTC')) + ) ENGINE = ReplicatedAggregatingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.raw_sessions', '{replica}') + + PARTITION BY toYYYYMM(fromUnixTimestamp(intDiv(toUInt64(bitShiftRight(session_id_v7, 80)), 1000))) + ORDER BY ( + team_id, + toStartOfHour(fromUnixTimestamp(intDiv(toUInt64(bitShiftRight(session_id_v7, 80)), 1000))), + cityHash64(session_id_v7), + session_id_v7 + ) + SAMPLE BY cityHash64(session_id_v7) + + ''' +# --- +# name: test_create_table_query[sharded_session_recording_events] + ''' + + CREATE TABLE IF NOT EXISTS sharded_session_recording_events ON CLUSTER 'posthog' + ( + uuid UUID, + timestamp DateTime64(6, 'UTC'), + team_id Int64, + distinct_id VARCHAR, + session_id VARCHAR, + window_id VARCHAR, + snapshot_data VARCHAR, + created_at DateTime64(6, 'UTC') + , has_full_snapshot Int8 MATERIALIZED JSONExtractBool(snapshot_data, 'has_full_snapshot'), events_summary Array(String) MATERIALIZED JSONExtract(JSON_QUERY(snapshot_data, '$.events_summary[*]'), 'Array(String)'), click_count Int8 MATERIALIZED length(arrayFilter((x) -> JSONExtractInt(x, 'type') = 3 AND JSONExtractInt(x, 'data', 'source') = 2, events_summary)), keypress_count Int8 MATERIALIZED length(arrayFilter((x) -> JSONExtractInt(x, 'type') = 3 AND JSONExtractInt(x, 'data', 'source') = 5, events_summary)), timestamps_summary Array(DateTime64(6, 'UTC')) MATERIALIZED arraySort(arrayMap((x) -> toDateTime(JSONExtractInt(x, 'timestamp') / 1000), events_summary)), first_event_timestamp Nullable(DateTime64(6, 'UTC')) MATERIALIZED if(empty(timestamps_summary), NULL, arrayReduce('min', timestamps_summary)), last_event_timestamp Nullable(DateTime64(6, 'UTC')) MATERIALIZED if(empty(timestamps_summary), NULL, arrayReduce('max', timestamps_summary)), urls Array(String) MATERIALIZED arrayFilter(x -> x != '', arrayMap((x) -> JSONExtractString(x, 'data', 'href'), events_summary)) + + + , _timestamp DateTime + , _offset UInt64 + + , INDEX kafka_timestamp_minmax_sharded_session_recording_events _timestamp TYPE minmax GRANULARITY 3 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.session_recording_events', '{replica}', _timestamp) + PARTITION BY toYYYYMMDD(timestamp) + ORDER BY (team_id, toHour(timestamp), session_id, timestamp, uuid) + + SETTINGS index_granularity=512 + + ''' +# --- +# name: test_create_table_query[sharded_session_replay_events] + ''' + + CREATE TABLE IF NOT EXISTS sharded_session_replay_events ON CLUSTER 'posthog' + ( + -- part of order by so will aggregate correctly + session_id VARCHAR, + -- part of order by so will aggregate correctly + team_id Int64, + -- ClickHouse will pick any value of distinct_id for the 
session + -- this is fine since even if the distinct_id changes during a session + -- it will still (or should still) map to the same person + distinct_id VARCHAR, + min_first_timestamp SimpleAggregateFunction(min, DateTime64(6, 'UTC')), + max_last_timestamp SimpleAggregateFunction(max, DateTime64(6, 'UTC')), + first_url AggregateFunction(argMin, Nullable(VARCHAR), DateTime64(6, 'UTC')), + click_count SimpleAggregateFunction(sum, Int64), + keypress_count SimpleAggregateFunction(sum, Int64), + mouse_activity_count SimpleAggregateFunction(sum, Int64), + active_milliseconds SimpleAggregateFunction(sum, Int64), + console_log_count SimpleAggregateFunction(sum, Int64), + console_warn_count SimpleAggregateFunction(sum, Int64), + console_error_count SimpleAggregateFunction(sum, Int64), + -- this column allows us to estimate the amount of data that is being ingested + size SimpleAggregateFunction(sum, Int64), + -- this allows us to count the number of messages received in a session + -- often very useful in incidents or debugging + message_count SimpleAggregateFunction(sum, Int64), + -- this allows us to count the number of snapshot events received in a session + -- often very useful in incidents or debugging + -- because we batch events we expect message_count to be lower than event_count + event_count SimpleAggregateFunction(sum, Int64), + -- which source the snapshots came from Android, iOS, Mobile, Web. Web if absent + snapshot_source AggregateFunction(argMin, LowCardinality(Nullable(String)), DateTime64(6, 'UTC')), + _timestamp SimpleAggregateFunction(max, DateTime) + ) ENGINE = ReplicatedAggregatingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.session_replay_events', '{replica}') + + PARTITION BY toYYYYMM(min_first_timestamp) + -- order by is used by the aggregating merge tree engine to + -- identify candidates to merge, e.g. 
toDate(min_first_timestamp) + -- would mean we would have one row per day per session_id + -- if CH could completely merge to match the order by + -- it is also used to organise data to make queries faster + -- we want the fewest rows possible but also the fastest queries + -- since we query by date and not by time + -- and order by must be in order of increasing cardinality + -- so we order by date first, then team_id, then session_id + -- hopefully, this is a good balance between the two + ORDER BY (toDate(min_first_timestamp), team_id, session_id) + SETTINGS index_granularity=512 + + ''' +# --- +# name: test_create_table_query[sharded_sessions] + ''' + + CREATE TABLE IF NOT EXISTS sharded_sessions ON CLUSTER 'posthog' + ( + -- part of order by so will aggregate correctly + session_id VARCHAR, + -- part of order by so will aggregate correctly + team_id Int64, + -- ClickHouse will pick any value of distinct_id for the session + -- this is fine since even if the distinct_id changes during a session + -- it will still (or should still) map to the same person + distinct_id SimpleAggregateFunction(any, String), + + min_timestamp SimpleAggregateFunction(min, DateTime64(6, 'UTC')), + max_timestamp SimpleAggregateFunction(max, DateTime64(6, 'UTC')), + + urls SimpleAggregateFunction(groupUniqArrayArray, Array(String)), + entry_url AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + exit_url AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + + initial_referring_domain AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_campaign AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_medium AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_term AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_content AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gad_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclsrc AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_dclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_wbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_fbclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_msclkid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_twclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_li_fat_id AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_mc_cid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_igshid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_ttclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + + -- create a map of how many times we saw each event + event_count_map SimpleAggregateFunction(sumMap, Map(String, Int64)), + -- duplicate the event count as a specific column for pageviews and autocaptures, + -- as these are used in some key queries and need to be fast + pageview_count SimpleAggregateFunction(sum, Int64), + autocapture_count SimpleAggregateFunction(sum, Int64), + ) ENGINE = ReplicatedAggregatingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.sessions', '{replica}') + + PARTITION BY toYYYYMM(min_timestamp) + -- order by is used by the aggregating merge tree engine to + -- 
identify candidates to merge, e.g. toDate(min_timestamp) + -- would mean we would have one row per day per session_id + -- if CH could completely merge to match the order by + -- it is also used to organise data to make queries faster + -- we want the fewest rows possible but also the fastest queries + -- since we query by date and not by time + -- and order by must be in order of increasing cardinality + -- so we order by date first, then team_id, then session_id + -- hopefully, this is a good balance between the two + ORDER BY (toStartOfDay(min_timestamp), team_id, session_id) + SETTINGS index_granularity=512 + + ''' +# --- +# name: test_create_table_query[writable_events] + ''' + + CREATE TABLE IF NOT EXISTS writable_events ON CLUSTER 'posthog' + ( + uuid UUID, + event VARCHAR, + properties VARCHAR CODEC(ZSTD(3)), + timestamp DateTime64(6, 'UTC'), + team_id Int64, + distinct_id VARCHAR, + elements_chain VARCHAR, + created_at DateTime64(6, 'UTC'), + person_id UUID, + person_created_at DateTime64, + person_properties VARCHAR Codec(ZSTD(3)), + group0_properties VARCHAR Codec(ZSTD(3)), + group1_properties VARCHAR Codec(ZSTD(3)), + group2_properties VARCHAR Codec(ZSTD(3)), + group3_properties VARCHAR Codec(ZSTD(3)), + group4_properties VARCHAR Codec(ZSTD(3)), + group0_created_at DateTime64, + group1_created_at DateTime64, + group2_created_at DateTime64, + group3_created_at DateTime64, + group4_created_at DateTime64, + person_mode Enum8('full' = 0, 'propertyless' = 1, 'force_upgrade' = 2) + + + , _timestamp DateTime + , _offset UInt64 + , inserted_at Nullable(DateTime64(6, 'UTC')) DEFAULT NULL + + ) ENGINE = Distributed('posthog', 'posthog_test', 'sharded_events', sipHash64(distinct_id)) + + ''' +# --- +# name: test_create_table_query[writable_heatmaps] + ''' + + CREATE TABLE IF NOT EXISTS writable_heatmaps ON CLUSTER 'posthog' + ( + session_id VARCHAR, + team_id Int64, + distinct_id VARCHAR, + timestamp DateTime64(6, 'UTC'), + -- x is the x with resolution applied, the resolution converts high fidelity mouse positions into an NxN grid + x Int16, + -- y is the y with resolution applied, the resolution converts high fidelity mouse positions into an NxN grid + y Int16, + -- stored so that in future we can support other resolutions + scale_factor Int16, + viewport_width Int16, + viewport_height Int16, + -- some elements move when the page scrolls, others do not + pointer_target_fixed Bool, + current_url VARCHAR, + type LowCardinality(String), + _timestamp DateTime, + _offset UInt64, + _partition UInt64 + ) ENGINE = Distributed('posthog', 'posthog_test', 'sharded_heatmaps', cityHash64(concat(toString(team_id), '-', session_id, '-', toString(toDate(timestamp))))) + + ''' +# --- +# name: test_create_table_query[writable_raw_sessions] + ''' + + CREATE TABLE IF NOT EXISTS writable_raw_sessions ON CLUSTER 'posthog' + ( + team_id Int64, + session_id_v7 UInt128, -- integer representation of a uuidv7 + + -- ClickHouse will pick the latest value of distinct_id for the session + -- this is fine since even if the distinct_id changes during a session + distinct_id AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + + min_timestamp SimpleAggregateFunction(min, DateTime64(6, 'UTC')), + max_timestamp SimpleAggregateFunction(max, DateTime64(6, 'UTC')), + + -- urls + urls SimpleAggregateFunction(groupUniqArrayArray, Array(String)), + entry_url AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + end_url AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + last_external_click_url 
AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + + -- device + initial_browser AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_browser_version AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_os AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_os_version AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_device_type AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_viewport_width AggregateFunction(argMin, Int64, DateTime64(6, 'UTC')), + initial_viewport_height AggregateFunction(argMin, Int64, DateTime64(6, 'UTC')), + + -- geoip + -- only store the properties we actually use, as there's tons, see https://posthog.com/docs/cdp/geoip-enrichment + initial_geoip_country_code AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_geoip_subdivision_1_code AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_geoip_subdivision_1_name AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_geoip_subdivision_city_name AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_geoip_time_zone AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + + -- attribution + initial_referring_domain AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_campaign AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_medium AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_term AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_content AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gad_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclsrc AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_dclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_wbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_fbclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_msclkid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_twclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_li_fat_id AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_mc_cid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_igshid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_ttclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + + -- Count pageview, autocapture, and screen events for providing totals. + -- It's unclear if we can use the counts as they are not idempotent, and we had a bug on EU where events were + -- double-counted, so the counts were wrong. To get around this, also keep track of the unique uuids. This will be + -- slower and more expensive to store, but will be correct even if events are double-counted, so can be used to + -- verify correctness and as a backup. Ideally we will be able to delete the uniq columns in the future when we're + -- satisfied that counts are accurate. 
+ pageview_count SimpleAggregateFunction(sum, Int64), + pageview_uniq AggregateFunction(uniq, Nullable(UUID)), + autocapture_count SimpleAggregateFunction(sum, Int64), + autocapture_uniq AggregateFunction(uniq, Nullable(UUID)), + screen_count SimpleAggregateFunction(sum, Int64), + screen_uniq AggregateFunction(uniq, Nullable(UUID)), + + -- replay + maybe_has_session_replay SimpleAggregateFunction(max, Bool), -- will be written False to by the events table mv and True to by the replay table mv + + -- as a performance optimisation, also keep track of the uniq events for all of these combined, a bounce is a session with <2 of these + page_screen_autocapture_uniq_up_to AggregateFunction(uniqUpTo(1), Nullable(UUID)), + + -- web vitals + vitals_lcp AggregateFunction(argMin, Nullable(Float64), DateTime64(6, 'UTC')) + ) ENGINE = Distributed('posthog', 'posthog_test', 'sharded_raw_sessions', cityHash64(session_id_v7)) + + ''' +# --- +# name: test_create_table_query[writable_session_recording_events] + ''' + + CREATE TABLE IF NOT EXISTS writable_session_recording_events ON CLUSTER 'posthog' + ( + uuid UUID, + timestamp DateTime64(6, 'UTC'), + team_id Int64, + distinct_id VARCHAR, + session_id VARCHAR, + window_id VARCHAR, + snapshot_data VARCHAR, + created_at DateTime64(6, 'UTC') + + + , _timestamp DateTime + , _offset UInt64 + + ) ENGINE = Distributed('posthog', 'posthog_test', 'sharded_session_recording_events', sipHash64(distinct_id)) + + ''' +# --- +# name: test_create_table_query[writable_sessions] + ''' + + CREATE TABLE IF NOT EXISTS writable_sessions ON CLUSTER 'posthog' + ( + -- part of order by so will aggregate correctly + session_id VARCHAR, + -- part of order by so will aggregate correctly + team_id Int64, + -- ClickHouse will pick any value of distinct_id for the session + -- this is fine since even if the distinct_id changes during a session + -- it will still (or should still) map to the same person + distinct_id SimpleAggregateFunction(any, String), + + min_timestamp SimpleAggregateFunction(min, DateTime64(6, 'UTC')), + max_timestamp SimpleAggregateFunction(max, DateTime64(6, 'UTC')), + + urls SimpleAggregateFunction(groupUniqArrayArray, Array(String)), + entry_url AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + exit_url AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + + initial_referring_domain AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_campaign AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_medium AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_term AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_content AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gad_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclsrc AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_dclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_wbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_fbclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_msclkid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_twclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_li_fat_id AggregateFunction(argMin, String, DateTime64(6, 
'UTC')), + initial_mc_cid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_igshid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_ttclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + + -- create a map of how many times we saw each event + event_count_map SimpleAggregateFunction(sumMap, Map(String, Int64)), + -- duplicate the event count as a specific column for pageviews and autocaptures, + -- as these are used in some key queries and need to be fast + pageview_count SimpleAggregateFunction(sum, Int64), + autocapture_count SimpleAggregateFunction(sum, Int64), + ) ENGINE = Distributed('posthog', 'posthog_test', 'sharded_sessions', sipHash64(session_id)) + + ''' +# --- +# name: test_create_table_query[writeable_performance_events] + ''' + + CREATE TABLE IF NOT EXISTS writeable_performance_events ON CLUSTER 'posthog' + ( + uuid UUID, + session_id String, + window_id String, + pageview_id String, + distinct_id String, + timestamp DateTime64, + time_origin DateTime64(3, 'UTC'), + entry_type LowCardinality(String), + name String, + team_id Int64, + current_url String, + start_time Float64, + duration Float64, + redirect_start Float64, + redirect_end Float64, + worker_start Float64, + fetch_start Float64, + domain_lookup_start Float64, + domain_lookup_end Float64, + connect_start Float64, + secure_connection_start Float64, + connect_end Float64, + request_start Float64, + response_start Float64, + response_end Float64, + decoded_body_size Int64, + encoded_body_size Int64, + initiator_type LowCardinality(String), + next_hop_protocol LowCardinality(String), + render_blocking_status LowCardinality(String), + response_status Int64, + transfer_size Int64, + largest_contentful_paint_element String, + largest_contentful_paint_render_time Float64, + largest_contentful_paint_load_time Float64, + largest_contentful_paint_size Float64, + largest_contentful_paint_id String, + largest_contentful_paint_url String, + dom_complete Float64, + dom_content_loaded_event Float64, + dom_interactive Float64, + load_event_end Float64, + load_event_start Float64, + redirect_count Int64, + navigation_type LowCardinality(String), + unload_event_end Float64, + unload_event_start Float64 + + , _timestamp DateTime + , _offset UInt64 + , _partition UInt64 + + ) ENGINE = Distributed('posthog', 'posthog_test', 'sharded_performance_events', sipHash64(session_id)) + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[channel_definition] + ''' + + CREATE TABLE IF NOT EXISTS channel_definition ON CLUSTER 'posthog' ( + domain String NOT NULL, + kind String NOT NULL, + domain_type String NULL, + type_if_paid String NULL, + type_if_organic String NULL + ) ENGINE = ReplicatedMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.channel_definition', '{replica}-{shard}') + ORDER BY (domain, kind); + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[cohortpeople] + ''' + + CREATE TABLE IF NOT EXISTS cohortpeople ON CLUSTER 'posthog' + ( + person_id UUID, + cohort_id Int64, + team_id Int64, + sign Int8, + version UInt64 + ) ENGINE = ReplicatedCollapsingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.cohortpeople', '{replica}-{shard}', sign) + Order By (team_id, cohort_id, person_id, version) + + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[events_dead_letter_queue] + ''' + + CREATE TABLE IF NOT EXISTS events_dead_letter_queue ON CLUSTER 'posthog' + ( + id UUID, + 
event_uuid UUID, + event VARCHAR, + properties VARCHAR, + distinct_id VARCHAR, + team_id Int64, + elements_chain VARCHAR, + created_at DateTime64(6, 'UTC'), + ip VARCHAR, + site_url VARCHAR, + now DateTime64(6, 'UTC'), + raw_payload VARCHAR, + error_timestamp DateTime64(6, 'UTC'), + error_location VARCHAR, + error VARCHAR, + tags Array(VARCHAR) + + + , _timestamp DateTime + , _offset UInt64 + + , INDEX kafka_timestamp_minmax_events_dead_letter_queue _timestamp TYPE minmax GRANULARITY 3 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.events_dead_letter_queue', '{replica}-{shard}', _timestamp) + ORDER BY (id, event_uuid, distinct_id, team_id) + + SETTINGS index_granularity=512 + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[groups] + ''' + + CREATE TABLE IF NOT EXISTS groups ON CLUSTER 'posthog' + ( + group_type_index UInt8, + group_key VARCHAR, + created_at DateTime64, + team_id Int64, + group_properties VARCHAR + + , _timestamp DateTime + , _offset UInt64 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.groups', '{replica}-{shard}', _timestamp) + Order By (team_id, group_type_index, group_key) + SETTINGS storage_policy = 'hot_to_cold' + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[log_entries] + ''' + + CREATE TABLE IF NOT EXISTS log_entries ON CLUSTER 'posthog' + ( + team_id UInt64, + -- The name of the service or product that generated the logs. + -- Examples: batch_exports + log_source LowCardinality(String), + -- An id for the log source. + -- Set log_source to avoid collision with ids from other log sources if the id generation is not safe. + -- Examples: A batch export id, a cronjob id, a plugin id. + log_source_id String, + -- A secondary id e.g. for the instance of log_source that generated this log. + -- This may be omitted if log_source is a singleton. + -- Examples: A batch export run id, a plugin_config id, a thread id, a process id, a machine id. + instance_id String, + -- Timestamp indicating when the log was generated. + timestamp DateTime64(6, 'UTC'), + -- The log level. + -- Examples: INFO, WARNING, DEBUG, ERROR. + level LowCardinality(String), + -- The actual log message.
+ message String + + , _timestamp DateTime + , _offset UInt64 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.log_entries', '{replica}-{shard}', _timestamp) + PARTITION BY toStartOfHour(timestamp) ORDER BY (team_id, log_source, log_source_id, instance_id, timestamp) + + SETTINGS index_granularity=512 + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[person] + ''' + + CREATE TABLE IF NOT EXISTS person ON CLUSTER 'posthog' + ( + id UUID, + created_at DateTime64, + team_id Int64, + properties VARCHAR, + is_identified Int8, + is_deleted Int8, + version UInt64 + + + , _timestamp DateTime + , _offset UInt64 + + , INDEX kafka_timestamp_minmax_person _timestamp TYPE minmax GRANULARITY 3 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.person', '{replica}-{shard}', version) + Order By (team_id, id) + SETTINGS storage_policy = 'hot_to_cold' + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[person_distinct_id2] + ''' + + CREATE TABLE IF NOT EXISTS person_distinct_id2 ON CLUSTER 'posthog' + ( + team_id Int64, + distinct_id VARCHAR, + person_id UUID, + is_deleted Int8, + version Int64 + + + , _timestamp DateTime + , _offset UInt64 + + , _partition UInt64 + , INDEX kafka_timestamp_minmax_person_distinct_id2 _timestamp TYPE minmax GRANULARITY 3 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.person_distinct_id2', '{replica}-{shard}', version) + + ORDER BY (team_id, distinct_id) + SETTINGS index_granularity = 512 + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[person_distinct_id] + ''' + + CREATE TABLE IF NOT EXISTS person_distinct_id ON CLUSTER 'posthog' + ( + distinct_id VARCHAR, + person_id UUID, + team_id Int64, + _sign Int8 DEFAULT 1, + is_deleted Int8 ALIAS if(_sign==-1, 1, 0) + + , _timestamp DateTime + , _offset UInt64 + + ) ENGINE = ReplicatedCollapsingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.person_distinct_id', '{replica}-{shard}', _sign) + Order By (team_id, distinct_id, person_id) + SETTINGS storage_policy = 'hot_to_cold' + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[person_distinct_id_overrides] + ''' + + CREATE TABLE IF NOT EXISTS person_distinct_id_overrides ON CLUSTER 'posthog' + ( + team_id Int64, + distinct_id VARCHAR, + person_id UUID, + is_deleted Int8, + version Int64 + + + , _timestamp DateTime + , _offset UInt64 + , _partition UInt64 + + , INDEX kafka_timestamp_minmax_person_distinct_id_overrides _timestamp TYPE minmax GRANULARITY 3 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.person_distinct_id_overrides', '{replica}-{shard}', version) + + ORDER BY (team_id, distinct_id) + SETTINGS index_granularity = 512 + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[person_overrides] + ''' + + CREATE TABLE IF NOT EXISTS `posthog_test`.`person_overrides` + ON CLUSTER 'posthog' ( + team_id INT NOT NULL, + + -- When we merge two people `old_person_id` and `override_person_id`, we + -- want to keep track of a mapping from the `old_person_id` to the + -- `override_person_id`. This allows us to join with the + -- `sharded_events` table to find all events that were associated with + -- the `old_person_id` and update them to be associated with the + -- `override_person_id`. 
+ old_person_id UUID NOT NULL, + override_person_id UUID NOT NULL, + + -- The timestamp at which the merge of the two people was completed. + merged_at DateTime64(6, 'UTC') NOT NULL, + -- The timestamp of the oldest event associated with the + -- `old_person_id`. + oldest_event DateTime64(6, 'UTC') NOT NULL, + -- The timestamp at which rows are created. This isn't part of the JOIN process + -- with the events table but rather a housekeeping column to allow us to + -- see when the row was created. This shouldn't have any impact on the + -- JOIN as it will be stored separately with the Wide ClickHouse table + -- storage. + created_at DateTime64(6, 'UTC') DEFAULT now(), + + -- the specific version of the `old_person_id` mapping. This is used to + -- allow us to discard old mappings as new ones are added. This version + -- will be provided by the corresponding PostgreSQL + -- `posthog_personoverrides` table + version INT NOT NULL + ) + + -- By specifying Replacing merge tree on version, we allow ClickHouse to + -- discard old versions of an `old_person_id` mapping. This should help keep + -- performance in check as new versions are added. Note that we + -- partition by `oldest_event`, which will change as we update + -- `person_id` on old partitions. + -- + -- We also need to ensure that the data is replicated to all replicas in the + -- cluster, as we do not have any constraints on person_id and which shard + -- associated events are on. To do this we use the ReplicatedReplacingMergeTree + -- engine specifying a static `zk_path`. This will cause the Engine to + -- consider all replicas as the same. See + -- https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/replication + -- for details. + ENGINE = ReplicatedReplacingMergeTree( + -- NOTE: for testing we use a uuid to ensure that we don't get conflicts + -- when the tests tear down and recreate the table. + '/clickhouse/tables/{uuid}noshard/posthog_test.person_overrides', + '{replica}-{shard}', + version + ) + + -- We partition the table by the `oldest_event` column. This allows us to + -- handle updating the events table partition by partition, progressing each + -- override partition by partition in lockstep with the events table. Note + -- that this means it is possible that we have a mapping from + -- `old_person_id` in multiple partitions during the merge process. + PARTITION BY toYYYYMM(oldest_event) + + -- We want to collapse down on the `old_person_id` such that we end up with + -- the newest known mapping for it in the table. Query side, we will need to + -- ensure that we are always querying the latest version of the mapping.
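+ -- An assumed read-side sketch (not the production query) of that pattern: + -- SELECT team_id, old_person_id, argMax(override_person_id, version) AS override_person_id + -- FROM person_overrides + -- GROUP BY team_id, old_person_id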
+ ORDER BY (team_id, old_person_id) + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[person_static_cohort] + ''' + + CREATE TABLE IF NOT EXISTS person_static_cohort ON CLUSTER 'posthog' + ( + id UUID, + person_id UUID, + cohort_id Int64, + team_id Int64 + + , _timestamp DateTime + , _offset UInt64 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.person_static_cohort', '{replica}-{shard}', _timestamp) + Order By (team_id, cohort_id, person_id, id) + SETTINGS storage_policy = 'hot_to_cold' + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[plugin_log_entries] + ''' + + CREATE TABLE IF NOT EXISTS plugin_log_entries ON CLUSTER 'posthog' + ( + id UUID, + team_id Int64, + plugin_id Int64, + plugin_config_id Int64, + timestamp DateTime64(6, 'UTC'), + source VARCHAR, + type VARCHAR, + message VARCHAR, + instance_id UUID + + , _timestamp DateTime + , _offset UInt64 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_noshard/posthog.plugin_log_entries', '{replica}-{shard}', _timestamp) + PARTITION BY toYYYYMMDD(timestamp) ORDER BY (team_id, plugin_id, plugin_config_id, timestamp) + + SETTINGS index_granularity=512 + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[sharded_app_metrics2] + ''' + + CREATE TABLE IF NOT EXISTS sharded_app_metrics2 ON CLUSTER 'posthog' + ( + team_id Int64, + timestamp DateTime64(6, 'UTC'), + -- The name of the service or product that generated the metrics. + -- Examples: plugins, hog + app_source LowCardinality(String), + -- An id for the app source. + -- Set app_source to avoid collision with ids from other app sources if the id generation is not safe. + -- Examples: A plugin id, a hog application id + app_source_id String, + -- A secondary id e.g. for the instance of app_source that generated this metric. + -- This may be omitted if app_source is a singleton.
+ -- Examples: A plugin config id, a hog application config id + instance_id String, + metric_kind LowCardinality(String), + metric_name LowCardinality(String), + count SimpleAggregateFunction(sum, Int64) + + , _timestamp DateTime + , _offset UInt64 + , _partition UInt64 + + ) + ENGINE = ReplicatedAggregatingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.sharded_app_metrics2', '{replica}') + PARTITION BY toYYYYMM(timestamp) + ORDER BY (team_id, app_source, app_source_id, instance_id, toStartOfHour(timestamp), metric_kind, metric_name) + + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[sharded_app_metrics] + ''' + + CREATE TABLE IF NOT EXISTS sharded_app_metrics ON CLUSTER 'posthog' + ( + team_id Int64, + timestamp DateTime64(6, 'UTC'), + plugin_config_id Int64, + category LowCardinality(String), + job_id String, + successes SimpleAggregateFunction(sum, Int64), + successes_on_retry SimpleAggregateFunction(sum, Int64), + failures SimpleAggregateFunction(sum, Int64), + error_uuid UUID, + error_type String, + error_details String CODEC(ZSTD(3)) + + , _timestamp DateTime + , _offset UInt64 + , _partition UInt64 + + ) + ENGINE = ReplicatedAggregatingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.sharded_app_metrics', '{replica}') + PARTITION BY toYYYYMM(timestamp) + ORDER BY (team_id, plugin_config_id, job_id, category, toStartOfHour(timestamp), error_type, error_uuid) + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[sharded_events] + ''' + + CREATE TABLE IF NOT EXISTS sharded_events ON CLUSTER 'posthog' + ( + uuid UUID, + event VARCHAR, + properties VARCHAR CODEC(ZSTD(3)), + timestamp DateTime64(6, 'UTC'), + team_id Int64, + distinct_id VARCHAR, + elements_chain VARCHAR, + created_at DateTime64(6, 'UTC'), + person_id UUID, + person_created_at DateTime64, + person_properties VARCHAR Codec(ZSTD(3)), + group0_properties VARCHAR Codec(ZSTD(3)), + group1_properties VARCHAR Codec(ZSTD(3)), + group2_properties VARCHAR Codec(ZSTD(3)), + group3_properties VARCHAR Codec(ZSTD(3)), + group4_properties VARCHAR Codec(ZSTD(3)), + group0_created_at DateTime64, + group1_created_at DateTime64, + group2_created_at DateTime64, + group3_created_at DateTime64, + group4_created_at DateTime64, + person_mode Enum8('full' = 0, 'propertyless' = 1, 'force_upgrade' = 2) + + , $group_0 VARCHAR MATERIALIZED replaceRegexpAll(JSONExtractRaw(properties, '$group_0'), '^"|"$', '') COMMENT 'column_materializer::$group_0' + , $group_1 VARCHAR MATERIALIZED replaceRegexpAll(JSONExtractRaw(properties, '$group_1'), '^"|"$', '') COMMENT 'column_materializer::$group_1' + , $group_2 VARCHAR MATERIALIZED replaceRegexpAll(JSONExtractRaw(properties, '$group_2'), '^"|"$', '') COMMENT 'column_materializer::$group_2' + , $group_3 VARCHAR MATERIALIZED replaceRegexpAll(JSONExtractRaw(properties, '$group_3'), '^"|"$', '') COMMENT 'column_materializer::$group_3' + , $group_4 VARCHAR MATERIALIZED replaceRegexpAll(JSONExtractRaw(properties, '$group_4'), '^"|"$', '') COMMENT 'column_materializer::$group_4' + , $window_id VARCHAR MATERIALIZED replaceRegexpAll(JSONExtractRaw(properties, '$window_id'), '^"|"$', '') COMMENT 'column_materializer::$window_id' + , $session_id VARCHAR MATERIALIZED replaceRegexpAll(JSONExtractRaw(properties, '$session_id'), '^"|"$', '') COMMENT 'column_materializer::$session_id' + , elements_chain_href String MATERIALIZED extract(elements_chain, '(?::|")href="(.*?)"') + , elements_chain_texts Array(String) 
MATERIALIZED arrayDistinct(extractAll(elements_chain, '(?::|")text="(.*?)"')) + , elements_chain_ids Array(String) MATERIALIZED arrayDistinct(extractAll(elements_chain, '(?::|")attr_id="(.*?)"')) + , elements_chain_elements Array(Enum('a', 'button', 'form', 'input', 'select', 'textarea', 'label')) MATERIALIZED arrayDistinct(extractAll(elements_chain, '(?:^|;)(a|button|form|input|select|textarea|label)(?:\.|$|:)')) + , INDEX `minmax_$group_0` `$group_0` TYPE minmax GRANULARITY 1 + , INDEX `minmax_$group_1` `$group_1` TYPE minmax GRANULARITY 1 + , INDEX `minmax_$group_2` `$group_2` TYPE minmax GRANULARITY 1 + , INDEX `minmax_$group_3` `$group_3` TYPE minmax GRANULARITY 1 + , INDEX `minmax_$group_4` `$group_4` TYPE minmax GRANULARITY 1 + , INDEX `minmax_$window_id` `$window_id` TYPE minmax GRANULARITY 1 + , INDEX `minmax_$session_id` `$session_id` TYPE minmax GRANULARITY 1 + , properties_group_custom Map(String, String) + MATERIALIZED mapSort( + mapFilter((key, _) -> key NOT LIKE '$%' AND key NOT IN ('token', 'distinct_id', 'utm_source', 'utm_medium', 'utm_campaign', 'utm_content', 'utm_term', 'gclid', 'gad_source', 'gclsrc', 'dclid', 'gbraid', 'wbraid', 'fbclid', 'msclkid', 'twclid', 'li_fat_id', 'mc_cid', 'igshid', 'ttclid', 'rdt_cid'), + CAST(JSONExtractKeysAndValues(properties, 'String'), 'Map(String, String)')) + ) + CODEC(ZSTD(1)) + , INDEX properties_group_custom_keys_bf mapKeys(properties_group_custom) TYPE bloom_filter, INDEX properties_group_custom_values_bf mapValues(properties_group_custom) TYPE bloom_filter, properties_group_feature_flags Map(String, String) + MATERIALIZED mapSort( + mapFilter((key, _) -> key like '$feature/%', + CAST(JSONExtractKeysAndValues(properties, 'String'), 'Map(String, String)')) + ) + CODEC(ZSTD(1)) + , INDEX properties_group_feature_flags_keys_bf mapKeys(properties_group_feature_flags) TYPE bloom_filter, INDEX properties_group_feature_flags_values_bf mapValues(properties_group_feature_flags) TYPE bloom_filter + + + , _timestamp DateTime + , _offset UInt64 + , inserted_at Nullable(DateTime64(6, 'UTC')) DEFAULT NULL + + , INDEX kafka_timestamp_minmax_sharded_events _timestamp TYPE minmax GRANULARITY 3 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.events', '{replica}', _timestamp) + PARTITION BY toYYYYMM(timestamp) + ORDER BY (team_id, toDate(timestamp), event, cityHash64(distinct_id), cityHash64(uuid)) + SAMPLE BY cityHash64(distinct_id) + SETTINGS storage_policy = 'hot_to_cold' + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[sharded_heatmaps] + ''' + + CREATE TABLE IF NOT EXISTS sharded_heatmaps ON CLUSTER 'posthog' + ( + session_id VARCHAR, + team_id Int64, + distinct_id VARCHAR, + timestamp DateTime64(6, 'UTC'), + -- x is the x with resolution applied, the resolution converts high fidelity mouse positions into an NxN grid + x Int16, + -- y is the y with resolution applied, the resolution converts high fidelity mouse positions into an NxN grid + y Int16, + -- stored so that in future we can support other resolutions + scale_factor Int16, + viewport_width Int16, + viewport_height Int16, + -- some elements move when the page scrolls, others do not + pointer_target_fixed Bool, + current_url VARCHAR, + type LowCardinality(String), + _timestamp DateTime, + _offset UInt64, + _partition UInt64 + ) ENGINE = ReplicatedMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.heatmaps', '{replica}') + + PARTITION BY toYYYYMM(timestamp) + -- 
almost always this is being queried by + -- * type, + -- * team_id, + -- * date range, + -- * URL (maybe matching wild cards), + -- * width + -- we'll almost never query this by session id + -- so from least to most cardinality that's + ORDER BY (type, team_id, toDate(timestamp), current_url, viewport_width) + + -- I am purposefully not setting index granularity + -- the default is 8192, and we will be loading a lot of data + -- per query, we tend to copy this 512 around the place but + -- i don't think it applies here + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[sharded_ingestion_warnings] + ''' + + CREATE TABLE IF NOT EXISTS sharded_ingestion_warnings ON CLUSTER 'posthog' + ( + team_id Int64, + source LowCardinality(VARCHAR), + type VARCHAR, + details VARCHAR CODEC(ZSTD(3)), + timestamp DateTime64(6, 'UTC') + + , _timestamp DateTime + , _offset UInt64 + , _partition UInt64 + + ) ENGINE = ReplicatedMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.sharded_ingestion_warnings', '{replica}') + PARTITION BY toYYYYMMDD(timestamp) + ORDER BY (team_id, toHour(timestamp), type, source, timestamp) + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[sharded_performance_events] + ''' + + CREATE TABLE IF NOT EXISTS sharded_performance_events ON CLUSTER 'posthog' + ( + uuid UUID, + session_id String, + window_id String, + pageview_id String, + distinct_id String, + timestamp DateTime64, + time_origin DateTime64(3, 'UTC'), + entry_type LowCardinality(String), + name String, + team_id Int64, + current_url String, + start_time Float64, + duration Float64, + redirect_start Float64, + redirect_end Float64, + worker_start Float64, + fetch_start Float64, + domain_lookup_start Float64, + domain_lookup_end Float64, + connect_start Float64, + secure_connection_start Float64, + connect_end Float64, + request_start Float64, + response_start Float64, + response_end Float64, + decoded_body_size Int64, + encoded_body_size Int64, + initiator_type LowCardinality(String), + next_hop_protocol LowCardinality(String), + render_blocking_status LowCardinality(String), + response_status Int64, + transfer_size Int64, + largest_contentful_paint_element String, + largest_contentful_paint_render_time Float64, + largest_contentful_paint_load_time Float64, + largest_contentful_paint_size Float64, + largest_contentful_paint_id String, + largest_contentful_paint_url String, + dom_complete Float64, + dom_content_loaded_event Float64, + dom_interactive Float64, + load_event_end Float64, + load_event_start Float64, + redirect_count Int64, + navigation_type LowCardinality(String), + unload_event_end Float64, + unload_event_start Float64 + + , _timestamp DateTime + , _offset UInt64 + , _partition UInt64 + + ) ENGINE = ReplicatedMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.performance_events', '{replica}') + PARTITION BY toYYYYMM(timestamp) + ORDER BY (team_id, toDate(timestamp), session_id, pageview_id, timestamp) + + SETTINGS storage_policy = 'hot_to_cold' + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[sharded_raw_sessions] + ''' + + CREATE TABLE IF NOT EXISTS sharded_raw_sessions ON CLUSTER 'posthog' + ( + team_id Int64, + session_id_v7 UInt128, -- integer representation of a uuidv7 + + -- ClickHouse will pick the latest value of distinct_id for the session + -- this is fine since even if the distinct_id changes during a session + distinct_id AggregateFunction(argMax, String, DateTime64(6, 'UTC')), 
+ + min_timestamp SimpleAggregateFunction(min, DateTime64(6, 'UTC')), + max_timestamp SimpleAggregateFunction(max, DateTime64(6, 'UTC')), + + -- urls + urls SimpleAggregateFunction(groupUniqArrayArray, Array(String)), + entry_url AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + end_url AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + last_external_click_url AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + + -- device + initial_browser AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_browser_version AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_os AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_os_version AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_device_type AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_viewport_width AggregateFunction(argMin, Int64, DateTime64(6, 'UTC')), + initial_viewport_height AggregateFunction(argMin, Int64, DateTime64(6, 'UTC')), + + -- geoip + -- only store the properties we actually use, as there's tons, see https://posthog.com/docs/cdp/geoip-enrichment + initial_geoip_country_code AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_geoip_subdivision_1_code AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_geoip_subdivision_1_name AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_geoip_subdivision_city_name AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_geoip_time_zone AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + + -- attribution + initial_referring_domain AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_campaign AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_medium AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_term AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_content AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gad_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclsrc AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_dclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_wbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_fbclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_msclkid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_twclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_li_fat_id AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_mc_cid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_igshid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_ttclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + + -- Count pageview, autocapture, and screen events for providing totals. + -- It's unclear if we can use the counts as they are not idempotent, and we had a bug on EU where events were + -- double-counted, so the counts were wrong. To get around this, also keep track of the unique uuids. This will be + -- slower and more expensive to store, but will be correct even if events are double-counted, so can be used to + -- verify correctness and as a backup. 
Ideally we will be able to delete the uniq columns in the future when we're + -- satisfied that counts are accurate. + pageview_count SimpleAggregateFunction(sum, Int64), + pageview_uniq AggregateFunction(uniq, Nullable(UUID)), + autocapture_count SimpleAggregateFunction(sum, Int64), + autocapture_uniq AggregateFunction(uniq, Nullable(UUID)), + screen_count SimpleAggregateFunction(sum, Int64), + screen_uniq AggregateFunction(uniq, Nullable(UUID)), + + -- replay + maybe_has_session_replay SimpleAggregateFunction(max, Bool), -- will be written False to by the events table mv and True to by the replay table mv + + -- as a performance optimisation, also keep track of the uniq events for all of these combined, a bounce is a session with <2 of these + page_screen_autocapture_uniq_up_to AggregateFunction(uniqUpTo(1), Nullable(UUID)), + + -- web vitals + vitals_lcp AggregateFunction(argMin, Nullable(Float64), DateTime64(6, 'UTC')) + ) ENGINE = ReplicatedAggregatingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.raw_sessions', '{replica}') + + PARTITION BY toYYYYMM(fromUnixTimestamp(intDiv(toUInt64(bitShiftRight(session_id_v7, 80)), 1000))) + ORDER BY ( + team_id, + toStartOfHour(fromUnixTimestamp(intDiv(toUInt64(bitShiftRight(session_id_v7, 80)), 1000))), + cityHash64(session_id_v7), + session_id_v7 + ) + SAMPLE BY cityHash64(session_id_v7) + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[sharded_session_recording_events] + ''' + + CREATE TABLE IF NOT EXISTS sharded_session_recording_events ON CLUSTER 'posthog' + ( + uuid UUID, + timestamp DateTime64(6, 'UTC'), + team_id Int64, + distinct_id VARCHAR, + session_id VARCHAR, + window_id VARCHAR, + snapshot_data VARCHAR, + created_at DateTime64(6, 'UTC') + , has_full_snapshot Int8 MATERIALIZED JSONExtractBool(snapshot_data, 'has_full_snapshot'), events_summary Array(String) MATERIALIZED JSONExtract(JSON_QUERY(snapshot_data, '$.events_summary[*]'), 'Array(String)'), click_count Int8 MATERIALIZED length(arrayFilter((x) -> JSONExtractInt(x, 'type') = 3 AND JSONExtractInt(x, 'data', 'source') = 2, events_summary)), keypress_count Int8 MATERIALIZED length(arrayFilter((x) -> JSONExtractInt(x, 'type') = 3 AND JSONExtractInt(x, 'data', 'source') = 5, events_summary)), timestamps_summary Array(DateTime64(6, 'UTC')) MATERIALIZED arraySort(arrayMap((x) -> toDateTime(JSONExtractInt(x, 'timestamp') / 1000), events_summary)), first_event_timestamp Nullable(DateTime64(6, 'UTC')) MATERIALIZED if(empty(timestamps_summary), NULL, arrayReduce('min', timestamps_summary)), last_event_timestamp Nullable(DateTime64(6, 'UTC')) MATERIALIZED if(empty(timestamps_summary), NULL, arrayReduce('max', timestamps_summary)), urls Array(String) MATERIALIZED arrayFilter(x -> x != '', arrayMap((x) -> JSONExtractString(x, 'data', 'href'), events_summary)) + + + , _timestamp DateTime + , _offset UInt64 + + , INDEX kafka_timestamp_minmax_sharded_session_recording_events _timestamp TYPE minmax GRANULARITY 3 + + ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.session_recording_events', '{replica}', _timestamp) + PARTITION BY toYYYYMMDD(timestamp) + ORDER BY (team_id, toHour(timestamp), session_id, timestamp, uuid) + + SETTINGS index_granularity=512 + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[sharded_session_replay_events] + ''' + + CREATE TABLE IF NOT EXISTS sharded_session_replay_events ON CLUSTER 'posthog' + ( + -- part of order by so will 
aggregate correctly + session_id VARCHAR, + -- part of order by so will aggregate correctly + team_id Int64, + -- ClickHouse will pick any value of distinct_id for the session + -- this is fine since even if the distinct_id changes during a session + -- it will still (or should still) map to the same person + distinct_id VARCHAR, + min_first_timestamp SimpleAggregateFunction(min, DateTime64(6, 'UTC')), + max_last_timestamp SimpleAggregateFunction(max, DateTime64(6, 'UTC')), + first_url AggregateFunction(argMin, Nullable(VARCHAR), DateTime64(6, 'UTC')), + click_count SimpleAggregateFunction(sum, Int64), + keypress_count SimpleAggregateFunction(sum, Int64), + mouse_activity_count SimpleAggregateFunction(sum, Int64), + active_milliseconds SimpleAggregateFunction(sum, Int64), + console_log_count SimpleAggregateFunction(sum, Int64), + console_warn_count SimpleAggregateFunction(sum, Int64), + console_error_count SimpleAggregateFunction(sum, Int64), + -- this column allows us to estimate the amount of data that is being ingested + size SimpleAggregateFunction(sum, Int64), + -- this allows us to count the number of messages received in a session + -- often very useful in incidents or debugging + message_count SimpleAggregateFunction(sum, Int64), + -- this allows us to count the number of snapshot events received in a session + -- often very useful in incidents or debugging + -- because we batch events we expect message_count to be lower than event_count + event_count SimpleAggregateFunction(sum, Int64), + -- which source the snapshots came from Android, iOS, Mobile, Web. Web if absent + snapshot_source AggregateFunction(argMin, LowCardinality(Nullable(String)), DateTime64(6, 'UTC')), + _timestamp SimpleAggregateFunction(max, DateTime) + ) ENGINE = ReplicatedAggregatingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.session_replay_events', '{replica}') + + PARTITION BY toYYYYMM(min_first_timestamp) + -- order by is used by the aggregating merge tree engine to + -- identify candidates to merge, e.g. 
toDate(min_first_timestamp) + -- would mean we would have one row per day per session_id + -- if CH could completely merge to match the order by + -- it is also used to organise data to make queries faster + -- we want the fewest rows possible but also the fastest queries + -- since we query by date and not by time + -- and order by must be in order of increasing cardinality + -- so we order by date first, then team_id, then session_id + -- hopefully, this is a good balance between the two + ORDER BY (toDate(min_first_timestamp), team_id, session_id) + SETTINGS index_granularity=512 + + ''' +# --- +# name: test_create_table_query_replicated_and_storage[sharded_sessions] + ''' + + CREATE TABLE IF NOT EXISTS sharded_sessions ON CLUSTER 'posthog' + ( + -- part of order by so will aggregate correctly + session_id VARCHAR, + -- part of order by so will aggregate correctly + team_id Int64, + -- ClickHouse will pick any value of distinct_id for the session + -- this is fine since even if the distinct_id changes during a session + -- it will still (or should still) map to the same person + distinct_id SimpleAggregateFunction(any, String), + + min_timestamp SimpleAggregateFunction(min, DateTime64(6, 'UTC')), + max_timestamp SimpleAggregateFunction(max, DateTime64(6, 'UTC')), + + urls SimpleAggregateFunction(groupUniqArrayArray, Array(String)), + entry_url AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + exit_url AggregateFunction(argMax, String, DateTime64(6, 'UTC')), + + initial_referring_domain AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_campaign AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_medium AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_term AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_utm_content AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gad_source AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gclsrc AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_dclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_gbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_wbraid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_fbclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_msclkid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_twclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_li_fat_id AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_mc_cid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_igshid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + initial_ttclid AggregateFunction(argMin, String, DateTime64(6, 'UTC')), + + -- create a map of how many times we saw each event + event_count_map SimpleAggregateFunction(sumMap, Map(String, Int64)), + -- duplicate the event count as a specific column for pageviews and autocaptures, + -- as these are used in some key queries and need to be fast + pageview_count SimpleAggregateFunction(sum, Int64), + autocapture_count SimpleAggregateFunction(sum, Int64), + ) ENGINE = ReplicatedAggregatingMergeTree('/clickhouse/tables/77f1df52-4b43-11e9-910f-b8ca3a9b9f3e_{shard}/posthog.sessions', '{replica}') + + PARTITION BY toYYYYMM(min_timestamp) + -- order by is used by the aggregating merge 
tree engine to + -- identify candidates to merge, e.g. toDate(min_timestamp) + -- would mean we would have one row per day per session_id + -- if CH could completely merge to match the order by + -- it is also used to organise data to make queries faster + -- we want the fewest rows possible but also the fastest queries + -- since we query by date and not by time + -- and order by must be in order of increasing cardinality + -- so we order by date first, then team_id, then session_id + -- hopefully, this is a good balance between the two + ORDER BY (toStartOfDay(min_timestamp), team_id, session_id) + SETTINGS index_granularity=512 + + ''' +# ---
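For reference, a hypothetical read-side query (a sketch, not one of the snapshotted statements; the date filter is a placeholder) showing how the aggregate states in sharded_sessions are typically finalised, and which the ORDER BY above is shaped to serve:

    SELECT
        session_id,
        min(min_timestamp) AS session_start,
        max(max_timestamp) AS session_end,
        argMinMerge(entry_url) AS entry_url,
        argMaxMerge(exit_url) AS exit_url,
        sum(pageview_count) AS pageview_count
    FROM sharded_sessions
    WHERE team_id = 2
      AND min_timestamp >= toDateTime('2020-01-01 00:00:00', 'UTC')
    GROUP BY session_id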