From 170827a07ae29cd3d823d26eb0997564ddc67ae5 Mon Sep 17 00:00:00 2001 From: Robbie Date: Fri, 13 Oct 2023 11:57:20 +0100 Subject: [PATCH] feat(web-analytics): Fix source query views and uniques - remove blended source (#17961) Fix source query views and uniques --- posthog/hogql_queries/web_analytics/ctes.py | 45 +++++++------------ .../hogql_queries/web_analytics/top_pages.py | 6 +-- .../web_analytics/top_sources.py | 34 +++++++++----- 3 files changed, 43 insertions(+), 42 deletions(-) diff --git a/posthog/hogql_queries/web_analytics/ctes.py b/posthog/hogql_queries/web_analytics/ctes.py index 2e17b5710e874..0e20fcdc3c7dc 100644 --- a/posthog/hogql_queries/web_analytics/ctes.py +++ b/posthog/hogql_queries/web_analytics/ctes.py @@ -9,35 +9,9 @@ max(events.timestamp) AS max_timestamp, dateDiff('second', min_timestamp, max_timestamp) AS duration_s, - argMin(events.properties.`$referrer`, events.timestamp) AS earliest_referrer, - argMin(events.properties.`$pathname`, events.timestamp) AS entry_pathname, - argMax(events.properties.`$pathname`, events.timestamp ) AS exit_pathname, - argMax(events.properties.utm_source, events.timestamp) AS earliest_utm_source, - - if(domain(earliest_referrer) = '', earliest_referrer, domain(earliest_referrer)) AS referrer_domain, - multiIf( - earliest_utm_source IS NOT NULL, earliest_utm_source, - -- This will need to be an approach that scales better - referrer_domain == 'app.posthog.com', 'posthog', - referrer_domain == 'eu.posthog.com', 'posthog', - referrer_domain == 'posthog.com', 'posthog', - referrer_domain == 'www.google.com', 'google', - referrer_domain == 'www.google.co.uk', 'google', - referrer_domain == 'www.google.com.hk', 'google', - referrer_domain == 'www.google.de', 'google', - referrer_domain == 't.co', 'twitter', - referrer_domain == 'github.com', 'github', - referrer_domain == 'duckduckgo.com', 'duckduckgo', - referrer_domain == 'www.bing.com', 'bing', - referrer_domain == 'bing.com', 'bing', - referrer_domain == 'yandex.ru', 'yandex', - referrer_domain == 'quora.com', 'quora', - referrer_domain == 'www.quora.com', 'quora', - referrer_domain == 'linkedin.com', 'linkedin', - referrer_domain == 'www.linkedin.com', 'linkedin', - startsWith(referrer_domain, 'http://localhost:'), 'localhost', - referrer_domain - ) AS blended_source, + any(events.properties.$initial_referring_domain) AS $initial_referring_domain, + any(events.properties.$set_once.$initial_pathname) AS $initial_pathname, + any(events.properties.$set_once.$initial_utm_source) AS $initial_utm_source, countIf(events.event == '$pageview') AS num_pageviews, countIf(events.event == '$autocapture') AS num_autocaptures, @@ -57,6 +31,19 @@ ({session_having}) """ +SOURCE_CTE = """ +SELECT + events.properties.$set_once.$initial_utm_source AS $initial_utm_source, + count() as total_pageviews, + uniq(events.person_id) as unique_visitors +FROM + events +WHERE + (event = '$pageview') + AND ({source_where}) + GROUP BY $initial_utm_source +""" + PATHNAME_CTE = """ SELECT events.properties.`$pathname` AS $pathname, diff --git a/posthog/hogql_queries/web_analytics/top_pages.py b/posthog/hogql_queries/web_analytics/top_pages.py index 16e1923fa1b0c..d219495d3aa24 100644 --- a/posthog/hogql_queries/web_analytics/top_pages.py +++ b/posthog/hogql_queries/web_analytics/top_pages.py @@ -42,15 +42,15 @@ def to_query(self) -> ast.SelectQuery | ast.SelectUnionQuery: LEFT OUTER JOIN ( SELECT - session.entry_pathname, + session.$initial_pathname, avg(session.is_bounce) as bounce_rate FROM {session_query} AS session GROUP BY - session.entry_pathname + session.$initial_pathname ) AS bounce_rate ON - pathname.$pathname = bounce_rate.entry_pathname + pathname.$pathname = bounce_rate.$initial_pathname LEFT OUTER JOIN {pathname_scroll_query} AS scroll_data ON diff --git a/posthog/hogql_queries/web_analytics/top_sources.py b/posthog/hogql_queries/web_analytics/top_sources.py index 22522877a66df..1cc65c8ee9cec 100644 --- a/posthog/hogql_queries/web_analytics/top_sources.py +++ b/posthog/hogql_queries/web_analytics/top_sources.py @@ -1,7 +1,7 @@ from posthog.hogql import ast from posthog.hogql.parser import parse_select from posthog.hogql.query import execute_hogql_query -from posthog.hogql_queries.web_analytics.ctes import SESSION_CTE +from posthog.hogql_queries.web_analytics.ctes import SESSION_CTE, SOURCE_CTE from posthog.hogql_queries.web_analytics.web_analytics_query_runner import WebAnalyticsQueryRunner from posthog.schema import WebTopSourcesQuery, WebTopSourcesQueryResponse @@ -17,25 +17,39 @@ def to_query(self) -> ast.SelectQuery | ast.SelectUnionQuery: timings=self.timings, placeholders={"session_where": self.session_where(), "session_having": self.session_having()}, ) + with self.timings.measure("sources_query"): + source_query = parse_select( + SOURCE_CTE, + timings=self.timings, + placeholders={"source_where": self.events_where()}, + ) with self.timings.measure("top_sources_query"): top_sources_query = parse_select( """ SELECT - blended_source as "Source", - count(num_pageviews) as "context.columns.views", - count(DISTINCT person_id) as "context.columns.visitors", - avg(is_bounce) AS "context.columns.bounce_rate" + source_query.$initial_utm_source as "Initial UTM Source", + source_query.total_pageviews as "context.columns.views", + source_query.unique_visitors as "context.columns.visitors", + bounce_rate.bounce_rate AS "context.columns.bounce_rate" FROM - {session_query} + {source_query} AS source_query +LEFT JOIN ( + SELECT + session.$initial_utm_source, + avg(session.is_bounce) as bounce_rate + FROM + {session_query} AS session + GROUP BY + session.$initial_utm_source + ) AS bounce_rate +ON source_query.$initial_utm_source = bounce_rate.$initial_utm_source WHERE - "Source" IS NOT NULL -GROUP BY "Source" - + "Initial UTM Source" IS NOT NULL ORDER BY "context.columns.views" DESC LIMIT 10 """, timings=self.timings, - placeholders={"session_query": session_query}, + placeholders={"session_query": session_query, "source_query": source_query}, ) return top_sources_query