Skip to content

Commit

Permalink
Add web top sources query to the backend
Browse files Browse the repository at this point in the history
  • Loading branch information
robbie-c committed Sep 26, 2023
1 parent 2cae4bd commit 106d7b3
Show file tree
Hide file tree
Showing 11 changed files with 306 additions and 266 deletions.
15 changes: 15 additions & 0 deletions frontend/src/queries/schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,9 @@
},
{
"$ref": "#/definitions/TimeToSeeDataSessionsQuery"
},
{
"$ref": "#/definitions/WebTopSourcesQuery"
}
],
"description": "Source of the events"
Expand Down Expand Up @@ -2269,6 +2272,18 @@
},
"required": ["result"],
"type": "object"
},
"WebTopSourcesQuery": {
"additionalProperties": false,
"properties": {
"kind": {
"const": "WebTopSourcesQuery",
"type": "string"
},
"query": {}
},
"required": ["kind", "query"],
"type": "object"
}
}
}
13 changes: 12 additions & 1 deletion frontend/src/queries/schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ export enum NodeKind {
StickinessQuery = 'StickinessQuery',
LifecycleQuery = 'LifecycleQuery',

// Web analytics queries
WebTopSourcesQuery = 'WebTopSourcesQuery',

// Time to see data
TimeToSeeDataSessionsQuery = 'TimeToSeeDataSessionsQuery',
TimeToSeeDataQuery = 'TimeToSeeDataQuery',
Expand Down Expand Up @@ -277,7 +280,7 @@ export type HasPropertiesNode = EventsNode | EventsQuery | PersonsNode
export interface DataTableNode extends Node, DataTableNodeViewProps {
kind: NodeKind.DataTableNode
/** Source of the events */
source: EventsNode | EventsQuery | PersonsNode | HogQLQuery | TimeToSeeDataSessionsQuery
source: EventsNode | EventsQuery | PersonsNode | HogQLQuery | TimeToSeeDataSessionsQuery | WebTopSourcesQuery

/** Columns shown in the table, unless the `source` provides them. */
columns?: HogQLExpression[]
Expand Down Expand Up @@ -483,6 +486,14 @@ export interface LifecycleQuery extends InsightsQueryBase {
response?: LifecycleQueryResponse
}

// Shared base for web analytics query nodes. It declares no fields yet.
// (Line comments are used here on purpose: JSDoc blocks in this file end up as
// "description" entries in the generated schema.json.)
export interface WebAnalyticsQueryBase {}

// Query node for the web analytics "top sources" table, discriminated by `kind`.
export interface WebTopSourcesQuery extends WebAnalyticsQueryBase {
kind: NodeKind.WebTopSourcesQuery
// NOTE(review): typed as `any`, so the payload is unchecked and the generated
// JSON schema accepts any value here. Tighten once the query parameters are defined.
query: any
}
// Response type for WebTopSourcesQuery; adds nothing beyond QueryResponse.
export interface WebTopSourcesQueryResponse extends QueryResponse {}

export type InsightQueryNode =
| TrendsQuery
| FunnelsQuery
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import { SSO_PROVIDER_NAMES } from 'lib/constants'
import { preflightLogic } from 'scenes/PreflightCheck/preflightLogic'
import { SSOProvider } from '~/types'

interface SSOSelectInterface {
export interface SSOSelectInterface {
value: SSOProvider | ''
loading: boolean
onChange: (value: SSOProvider | '') => void
Expand Down
6 changes: 6 additions & 0 deletions frontend/src/scenes/saved-insights/SavedInsights.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,12 @@ export const QUERY_TYPES_METADATA: Record<NodeKind, InsightTypeMetadata> = {
icon: InsightSQLIcon,
inMenu: true,
},
[NodeKind.WebTopSourcesQuery]: {
name: 'Top Sources',
description: 'View top sources for a website',
icon: InsightsTrendsIcon,
inMenu: true,
},
}

export const INSIGHT_TYPE_OPTIONS: LemonSelectOptions<string> = [
Expand Down
272 changes: 11 additions & 261 deletions frontend/src/scenes/web-analytics/WebAnalyticsScene.tsx
Original file line number Diff line number Diff line change
@@ -1,47 +1,22 @@
import { SceneExport } from 'scenes/sceneTypes'
import { webAnalyticsLogic } from 'scenes/web-analytics/webAnalyticsLogic'
import { Query } from '~/queries/Query/Query'
import { NodeKind } from '~/queries/schema'
import { DataTableNode, NodeKind } from '~/queries/schema'

export function WebAnalyticsScene(): JSX.Element {
const query: DataTableNode = {
full: true,
kind: NodeKind.DataTableNode,
source: {
kind: NodeKind.WebTopSourcesQuery,
query: {},
},
}

return (
<div>
Top pages
<Query
query={{
full: true,
kind: NodeKind.DataTableNode,
source: {
kind: NodeKind.HogQLQuery,
query: TOP_PAGES_SQL,
},
}}
readOnly={true}
/>
Top sources
<Query
query={{
full: true,
kind: NodeKind.DataTableNode,
source: {
kind: NodeKind.HogQLQuery,
query: TOP_SOURCES,
},
}}
readOnly={true}
/>
Top clicks
<Query
query={{
full: true,
kind: NodeKind.DataTableNode,
source: {
kind: NodeKind.HogQLQuery,
query: TOP_CLICKS_SQL,
},
}}
readOnly={true}
/>
<Query query={query} readOnly={true} />
</div>
)
}
Expand All @@ -50,228 +25,3 @@ export const scene: SceneExport = {
component: WebAnalyticsScene,
logic: webAnalyticsLogic,
}

// Whilst this component is using HogQLQuery nodes, the SQL needs to exist client-side.
// These should be deleted once the new web analytics node types are created

/**
 * SQL for the "Top clicks" table.
 *
 * Reads `$autocapture` events from the last 7 days whose `$event_type` property is
 * 'click' and whose `$el_text` property is not null, groups them by that element
 * text, and returns per text the number of clicks and the number of distinct
 * `person_id`s, ordered by click count descending. The query has no LIMIT.
 */
const TOP_CLICKS_SQL = `
SELECT
properties.$el_text as el_text,
count() as total_clicks,
COUNT(DISTINCT events.person_id) as unique_visitors
FROM
events
WHERE
event == '$autocapture'
AND events.timestamp >= now() - INTERVAL 7 DAY
AND events.properties.$event_type = 'click'
AND el_text IS NOT NULL
GROUP BY
el_text
ORDER BY total_clicks DESC
`

/**
 * SQL for the "Top pages" table. Built from three CTEs:
 *
 * - `scroll_depth_cte`: over `$pageview` / `$pageleave` events of the last 7 days that
 *   carry a `$prev_pageview_pathname`, grouped by that pathname. Yields the pageview
 *   count, the distinct count of the `distinct_id` property, the percentage of rows
 *   whose `$prev_pageview_max_content_percentage` exceeds 0.8, and the average
 *   `$prev_pageview_max_scroll_percentage` scaled by 100.
 * - `session_cte`: one row per `$session_id`. Scans 8 days of events but keeps only
 *   sessions whose first event falls within the last 7 days. Derives the earliest
 *   referrer / pathname / utm_source, a `blended_source`, pageview and autocapture
 *   counts, and the `is_bounce` flag (no autocaptures, at most one pageview, under
 *   10 seconds long).
 * - `bounce_rate_cte`: average `is_bounce` per earliest pathname of a session.
 *
 * The final SELECT left-joins scroll depth to bounce rate on pathname and orders by
 * total pageviews descending. `blended_source` and `index_of_latest` are computed in
 * `session_cte` but are not read by the rest of this query.
 */
const TOP_PAGES_SQL = `
WITH
scroll_depth_cte AS (
SELECT
events.properties.\`$prev_pageview_pathname\` AS pathname,
countIf(events.event == '$pageview') as total_pageviews,
COUNT(DISTINCT events.properties.distinct_id) as unique_visitors, -- might want to use person id? have seen a small number of pages where unique > total
avg(CASE
WHEN events.properties.\`$prev_pageview_max_content_percentage\` IS NULL THEN NULL
WHEN events.properties.\`$prev_pageview_max_content_percentage\` > 0.8 THEN 100
ELSE 0
END) AS scroll_gt80_percentage,
avg(events.properties.$prev_pageview_max_scroll_percentage) * 100 as average_scroll_percentage
FROM
events
WHERE
(event = '$pageview' OR event = '$pageleave') AND events.properties.\`$prev_pageview_pathname\` IS NOT NULL
AND events.timestamp >= now() - INTERVAL 7 DAY
GROUP BY pathname
)
,
session_cte AS (
SELECT
events.properties.\`$session_id\` AS session_id,
min(events.timestamp) AS min_timestamp,
max(events.timestamp) AS max_timestamp,
dateDiff('second', min_timestamp, max_timestamp) AS duration_s,
-- create a tuple so that these are grouped in the same order, see https://github.com/ClickHouse/ClickHouse/discussions/42338
groupArray((events.timestamp, events.properties.\`$referrer\`, events.properties.\`$pathname\`, events.properties.utm_source)) AS tuple_array,
arrayFirstIndex(x -> tupleElement(x, 1) == min_timestamp, tuple_array) as index_of_earliest,
arrayFirstIndex(x -> tupleElement(x, 1) == max_timestamp, tuple_array) as index_of_latest,
tupleElement(arrayElement(
tuple_array,
index_of_earliest
), 2) AS earliest_referrer,
tupleElement(arrayElement(
tuple_array,
index_of_earliest
), 3) AS earliest_pathname,
tupleElement(arrayElement(
tuple_array,
index_of_earliest
), 4) AS earliest_utm_source,
if(domain(earliest_referrer) = '', earliest_referrer, domain(earliest_referrer)) AS referrer_domain,
multiIf(
earliest_utm_source IS NOT NULL, earliest_utm_source,
-- This will need to be an approach that scales better
referrer_domain == 'app.posthog.com', 'posthog',
referrer_domain == 'eu.posthog.com', 'posthog',
referrer_domain == 'posthog.com', 'posthog',
referrer_domain == 'www.google.com', 'google',
referrer_domain == 'www.google.co.uk', 'google',
referrer_domain == 'www.google.com.hk', 'google',
referrer_domain == 'www.google.de', 'google',
referrer_domain == 't.co', 'twitter',
referrer_domain == 'github.com', 'github',
referrer_domain == 'duckduckgo.com', 'duckduckgo',
referrer_domain == 'www.bing.com', 'bing',
referrer_domain == 'bing.com', 'bing',
referrer_domain == 'yandex.ru', 'yandex',
referrer_domain == 'quora.com', 'quora',
referrer_domain == 'www.quora.com', 'quora',
referrer_domain == 'linkedin.com', 'linkedin',
referrer_domain == 'www.linkedin.com', 'linkedin',
startsWith(referrer_domain, 'http://localhost:'), 'localhost',
referrer_domain
) AS blended_source,
countIf(events.event == '$pageview') AS num_pageviews,
countIf(events.event == '$autocapture') AS num_autocaptures,
-- in v1 we'd also want to count whether there were any conversion events
any(events.person_id) as person_id,
-- definition of a GA4 bounce from here https://support.google.com/analytics/answer/12195621?hl=en
(num_autocaptures == 0 AND num_pageviews <= 1 AND duration_s < 10) AS is_bounce
FROM
events
WHERE
session_id IS NOT NULL
AND
events.timestamp >= now() - INTERVAL 8 DAY
GROUP BY
events.properties.\`$session_id\`
HAVING
min_timestamp >= now() - INTERVAL 7 DAY
)
,
bounce_rate_cte AS (
SELECT session_cte.earliest_pathname,
avg(session_cte.is_bounce) as bounce_rate
FROM session_cte
GROUP BY earliest_pathname
)
SELECT scroll_depth_cte.pathname as pathname,
scroll_depth_cte.total_pageviews as total_pageviews,
scroll_depth_cte.unique_visitors as unique_visitors,
scroll_depth_cte.scroll_gt80_percentage as scroll_gt80_percentage,
scroll_depth_cte.average_scroll_percentage as average_scroll_percentage,
bounce_rate_cte.bounce_rate as bounce_rate
FROM
scroll_depth_cte LEFT OUTER JOIN bounce_rate_cte
ON scroll_depth_cte.pathname = bounce_rate_cte.earliest_pathname
ORDER BY total_pageviews DESC
`

/**
 * SQL for the "Top sources" table.
 *
 * `session_cte` builds one row per `$session_id`. It scans 8 days of events but keeps
 * only sessions whose first event falls within the last 7 days. For each session it
 * derives the earliest referrer / pathname / utm_source, a `blended_source` (the
 * utm_source when present, otherwise the referrer domain mapped through a hard-coded
 * list of known domains), pageview and autocapture counts, and the `is_bounce` flag
 * (no autocaptures, at most one pageview, under 10 seconds long).
 *
 * The outer query groups sessions by `blended_source` and returns the top 100 sources
 * by total pageviews, with distinct visitors and average bounce rate.
 *
 * `total_pageviews` sums the per-session `num_pageviews`. A plain
 * `count(num_pageviews)` would only count session rows, because `num_pageviews` is
 * never NULL, and so would report sessions rather than pageviews.
 */
const TOP_SOURCES = `
WITH
session_cte AS (
SELECT
events.properties.\`$session_id\` AS session_id,
min(events.timestamp) AS min_timestamp,
max(events.timestamp) AS max_timestamp,
dateDiff('second', min_timestamp, max_timestamp) AS duration_s,
-- create a tuple so that these are grouped in the same order, see https://github.com/ClickHouse/ClickHouse/discussions/42338
groupArray((events.timestamp, events.properties.\`$referrer\`, events.properties.\`$pathname\`, events.properties.utm_source)) AS tuple_array,
arrayFirstIndex(x -> tupleElement(x, 1) == min_timestamp, tuple_array) as index_of_earliest,
arrayFirstIndex(x -> tupleElement(x, 1) == max_timestamp, tuple_array) as index_of_latest,
tupleElement(arrayElement(
tuple_array,
index_of_earliest
), 2) AS earliest_referrer,
tupleElement(arrayElement(
tuple_array,
index_of_earliest
), 3) AS earliest_pathname,
tupleElement(arrayElement(
tuple_array,
index_of_earliest
), 4) AS earliest_utm_source,
if(domain(earliest_referrer) = '', earliest_referrer, domain(earliest_referrer)) AS referrer_domain,
multiIf(
earliest_utm_source IS NOT NULL, earliest_utm_source,
-- This will need to be an approach that scales better
referrer_domain == 'app.posthog.com', 'posthog',
referrer_domain == 'eu.posthog.com', 'posthog',
referrer_domain == 'posthog.com', 'posthog',
referrer_domain == 'www.google.com', 'google',
referrer_domain == 'www.google.co.uk', 'google',
referrer_domain == 'www.google.com.hk', 'google',
referrer_domain == 'www.google.de', 'google',
referrer_domain == 't.co', 'twitter',
referrer_domain == 'github.com', 'github',
referrer_domain == 'duckduckgo.com', 'duckduckgo',
referrer_domain == 'www.bing.com', 'bing',
referrer_domain == 'bing.com', 'bing',
referrer_domain == 'yandex.ru', 'yandex',
referrer_domain == 'quora.com', 'quora',
referrer_domain == 'www.quora.com', 'quora',
referrer_domain == 'linkedin.com', 'linkedin',
referrer_domain == 'www.linkedin.com', 'linkedin',
startsWith(referrer_domain, 'http://localhost:'), 'localhost',
referrer_domain
) AS blended_source,
countIf(events.event == '$pageview') AS num_pageviews,
countIf(events.event == '$autocapture') AS num_autocaptures,
-- in v1 we'd also want to count whether there were any conversion events
any(events.person_id) as person_id,
-- definition of a GA4 bounce from here https://support.google.com/analytics/answer/12195621?hl=en
(num_autocaptures == 0 AND num_pageviews <= 1 AND duration_s < 10) AS is_bounce
FROM
events
WHERE
session_id IS NOT NULL
AND
events.timestamp >= now() - INTERVAL 8 DAY
GROUP BY
events.properties.\`$session_id\`
HAVING
min_timestamp >= now() - INTERVAL 7 DAY
)
SELECT
blended_source,
sum(num_pageviews) as total_pageviews,
count(DISTINCT person_id) as unique_visitors,
avg(is_bounce) AS bounce_rate
FROM
session_cte
WHERE
blended_source IS NOT NULL
GROUP BY blended_source
ORDER BY total_pageviews DESC
LIMIT 100
`
5 changes: 5 additions & 0 deletions posthog/api/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

from posthog.hogql_queries.lifecycle_query_runner import LifecycleQueryRunner
from posthog.hogql_queries.trends_query_runner import TrendsQueryRunner
from posthog.hogql_queries.web_analytics.top_sources import TopSourcesQueryRunner
from posthog.models import Team
from posthog.models.event.events_query import run_events_query
from posthog.models.user import User
Expand Down Expand Up @@ -250,6 +251,10 @@ def process_query(
)
serializer.is_valid(raise_exception=True)
return get_session_events(serializer) or {}
elif query_kind == "WebTopSourcesQuery":
refresh_requested = refresh_requested_by_client(request) if request else False
top_sources_query_runner = TopSourcesQueryRunner(query_json, team)
return _unwrap_pydantic_dict(top_sources_query_runner.run(refresh_requested=refresh_requested))
else:
if query_json.get("source"):
return process_query(team, query_json["source"])
Expand Down
Empty file.
Loading

0 comments on commit 106d7b3

Please sign in to comment.