Skip to content

Commit

Permalink
feat(web-analytics): Fix source query views and uniques - remove blended source (#17961)
Browse files Browse the repository at this point in the history

Fix source query views and uniques
  • Loading branch information
robbie-c authored Oct 13, 2023
1 parent bbe24ba commit 170827a
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 42 deletions.
45 changes: 16 additions & 29 deletions posthog/hogql_queries/web_analytics/ctes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,35 +9,9 @@
max(events.timestamp) AS max_timestamp,
dateDiff('second', min_timestamp, max_timestamp) AS duration_s,
argMin(events.properties.`$referrer`, events.timestamp) AS earliest_referrer,
argMin(events.properties.`$pathname`, events.timestamp) AS entry_pathname,
argMax(events.properties.`$pathname`, events.timestamp ) AS exit_pathname,
argMax(events.properties.utm_source, events.timestamp) AS earliest_utm_source,
if(domain(earliest_referrer) = '', earliest_referrer, domain(earliest_referrer)) AS referrer_domain,
multiIf(
earliest_utm_source IS NOT NULL, earliest_utm_source,
-- This will need to be an approach that scales better
referrer_domain == 'app.posthog.com', 'posthog',
referrer_domain == 'eu.posthog.com', 'posthog',
referrer_domain == 'posthog.com', 'posthog',
referrer_domain == 'www.google.com', 'google',
referrer_domain == 'www.google.co.uk', 'google',
referrer_domain == 'www.google.com.hk', 'google',
referrer_domain == 'www.google.de', 'google',
referrer_domain == 't.co', 'twitter',
referrer_domain == 'github.com', 'github',
referrer_domain == 'duckduckgo.com', 'duckduckgo',
referrer_domain == 'www.bing.com', 'bing',
referrer_domain == 'bing.com', 'bing',
referrer_domain == 'yandex.ru', 'yandex',
referrer_domain == 'quora.com', 'quora',
referrer_domain == 'www.quora.com', 'quora',
referrer_domain == 'linkedin.com', 'linkedin',
referrer_domain == 'www.linkedin.com', 'linkedin',
startsWith(referrer_domain, 'http://localhost:'), 'localhost',
referrer_domain
) AS blended_source,
any(events.properties.$initial_referring_domain) AS $initial_referring_domain,
any(events.properties.$set_once.$initial_pathname) AS $initial_pathname,
any(events.properties.$set_once.$initial_utm_source) AS $initial_utm_source,
countIf(events.event == '$pageview') AS num_pageviews,
countIf(events.event == '$autocapture') AS num_autocaptures,
Expand All @@ -57,6 +31,19 @@
({session_having})
"""

# Query template producing one row per initial UTM source.
#
# For every `$pageview` event that also satisfies the `{source_where}`
# placeholder, rows are grouped by
# `events.properties.$set_once.$initial_utm_source` and two aggregates are
# emitted:
#   - total_pageviews: count() of matching events in the group
#   - unique_visitors: uniq(events.person_id) over the group
#
# `{source_where}` is left unfilled here; the code that parses this template
# must supply it as a placeholder expression.
SOURCE_CTE = """
SELECT
events.properties.$set_once.$initial_utm_source AS $initial_utm_source,
count() as total_pageviews,
uniq(events.person_id) as unique_visitors
FROM
events
WHERE
(event = '$pageview')
AND ({source_where})
GROUP BY $initial_utm_source
"""

PATHNAME_CTE = """
SELECT
events.properties.`$pathname` AS $pathname,
Expand Down
6 changes: 3 additions & 3 deletions posthog/hogql_queries/web_analytics/top_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,15 @@ def to_query(self) -> ast.SelectQuery | ast.SelectUnionQuery:
LEFT OUTER JOIN
(
SELECT
session.entry_pathname,
session.$initial_pathname,
avg(session.is_bounce) as bounce_rate
FROM
{session_query} AS session
GROUP BY
session.entry_pathname
session.$initial_pathname
) AS bounce_rate
ON
pathname.$pathname = bounce_rate.entry_pathname
pathname.$pathname = bounce_rate.$initial_pathname
LEFT OUTER JOIN
{pathname_scroll_query} AS scroll_data
ON
Expand Down
34 changes: 24 additions & 10 deletions posthog/hogql_queries/web_analytics/top_sources.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from posthog.hogql import ast
from posthog.hogql.parser import parse_select
from posthog.hogql.query import execute_hogql_query
from posthog.hogql_queries.web_analytics.ctes import SESSION_CTE
from posthog.hogql_queries.web_analytics.ctes import SESSION_CTE, SOURCE_CTE
from posthog.hogql_queries.web_analytics.web_analytics_query_runner import WebAnalyticsQueryRunner
from posthog.schema import WebTopSourcesQuery, WebTopSourcesQueryResponse

Expand All @@ -17,25 +17,39 @@ def to_query(self) -> ast.SelectQuery | ast.SelectUnionQuery:
timings=self.timings,
placeholders={"session_where": self.session_where(), "session_having": self.session_having()},
)
with self.timings.measure("sources_query"):
source_query = parse_select(
SOURCE_CTE,
timings=self.timings,
placeholders={"source_where": self.events_where()},
)
with self.timings.measure("top_sources_query"):
top_sources_query = parse_select(
"""
SELECT
blended_source as "Source",
count(num_pageviews) as "context.columns.views",
count(DISTINCT person_id) as "context.columns.visitors",
avg(is_bounce) AS "context.columns.bounce_rate"
source_query.$initial_utm_source as "Initial UTM Source",
source_query.total_pageviews as "context.columns.views",
source_query.unique_visitors as "context.columns.visitors",
bounce_rate.bounce_rate AS "context.columns.bounce_rate"
FROM
{session_query}
{source_query} AS source_query
LEFT JOIN (
SELECT
session.$initial_utm_source,
avg(session.is_bounce) as bounce_rate
FROM
{session_query} AS session
GROUP BY
session.$initial_utm_source
) AS bounce_rate
ON source_query.$initial_utm_source = bounce_rate.$initial_utm_source
WHERE
"Source" IS NOT NULL
GROUP BY "Source"
"Initial UTM Source" IS NOT NULL
ORDER BY "context.columns.views" DESC
LIMIT 10
""",
timings=self.timings,
placeholders={"session_query": session_query},
placeholders={"session_query": session_query, "source_query": source_query},
)
return top_sources_query

Expand Down

0 comments on commit 170827a

Please sign in to comment.