diff --git a/frontend/src/queries/nodes/DataTable/DataTable.tsx b/frontend/src/queries/nodes/DataTable/DataTable.tsx index 5a12f15368b9e..477846231a8a2 100644 --- a/frontend/src/queries/nodes/DataTable/DataTable.tsx +++ b/frontend/src/queries/nodes/DataTable/DataTable.tsx @@ -34,6 +34,8 @@ import { isHogQLQuery, isPersonsNode, isWebTopSourcesQuery, + isWebTopClicksQuery, + isWebTopPagesQuery, taxonomicFilterToHogQl, } from '~/queries/utils' import { PersonPropertyFilters } from '~/queries/nodes/PersonsNode/PersonPropertyFilters' @@ -123,7 +125,10 @@ export function DataTable({ uniqueKey, query, setQuery, context, cachedResults } const actionsColumnShown = showActions && isEventsQuery(query.source) && columnsInResponse?.includes('*') const columnsInLemonTable = - isHogQLQuery(query.source) || isWebTopSourcesQuery(query.source) + isHogQLQuery(query.source) || + isWebTopSourcesQuery(query.source) || + isWebTopClicksQuery(query.source) || + isWebTopPagesQuery(query.source) ? columnsInResponse ?? columnsInQuery : columnsInQuery @@ -145,7 +150,9 @@ export function DataTable({ uniqueKey, query, setQuery, context, cachedResults } if ( isEventsQuery(query.source) || isHogQLQuery(query.source) || - isWebTopSourcesQuery(query.source) + isWebTopSourcesQuery(query.source) || + isWebTopClicksQuery(query.source) || + isWebTopPagesQuery(query.source) ) { return renderColumn(key, result[index], result, query, setQuery, context) } @@ -506,9 +513,9 @@ export function DataTable({ uniqueKey, query, setQuery, context, cachedResults } } footer={ canLoadNextData && - ((response as any).results.length > 0 || !responseLoading) && ( - - ) + ((response as any).results.length > 0 || + (response as any).result.length > 0 || + !responseLoading) && } /> )} diff --git a/frontend/src/queries/nodes/DataTable/dataTableLogic.ts b/frontend/src/queries/nodes/DataTable/dataTableLogic.ts index 5fb75476e0af3..e61d0130b8219 100644 --- a/frontend/src/queries/nodes/DataTable/dataTableLogic.ts +++ b/frontend/src/queries/nodes/DataTable/dataTableLogic.ts @@ -130,9 +130,15 @@ export const dataTableLogic = kea([ })) } - return response && 'results' in response && Array.isArray(response.results) - ? response.results.map((result: any) => ({ result })) ?? null + const results = !response + ? null + : 'results' in response && Array.isArray(response.results) + ? response.results + : 'result' in response && Array.isArray(response.result) + ? response.result : null + + return results ? results.map((result: any) => ({ result })) ?? null : null }, ], queryWithDefaults: [ diff --git a/frontend/src/queries/schema.json b/frontend/src/queries/schema.json index 53c0dfcf3ce88..0dd784cffced8 100644 --- a/frontend/src/queries/schema.json +++ b/frontend/src/queries/schema.json @@ -98,6 +98,12 @@ }, { "$ref": "#/definitions/WebTopSourcesQuery" + }, + { + "$ref": "#/definitions/WebTopClicksQuery" + }, + { + "$ref": "#/definitions/WebTopPagesQuery" } ] }, @@ -395,6 +401,12 @@ }, { "$ref": "#/definitions/WebTopSourcesQuery" + }, + { + "$ref": "#/definitions/WebTopClicksQuery" + }, + { + "$ref": "#/definitions/WebTopPagesQuery" } ], "description": "Source of the events" @@ -2276,8 +2288,113 @@ "required": ["result"], "type": "object" }, - "WebAnalyticsFilters": { + "WebAnalyticsFilters": {}, + "WebTopClicksQuery": { + "additionalProperties": false, + "properties": { + "dateRange": { + "$ref": "#/definitions/DateRange" + }, + "filters": { + "$ref": "#/definitions/WebAnalyticsFilters" + }, + "kind": { + "const": "WebTopClicksQuery", + "type": "string" + }, + "response": { + "$ref": "#/definitions/WebTopClicksQueryResponse" + } + }, + "required": ["kind", "filters"], + "type": "object" + }, + "WebTopClicksQueryResponse": { + "additionalProperties": false, + "properties": { + "columns": { + "items": {}, + "type": "array" + }, + "is_cached": { + "type": "boolean" + }, + "last_refresh": { + "type": "string" + }, + "next_allowed_client_refresh": { + "type": "string" + }, + "result": { + "items": {}, + "type": "array" + }, + "timings": { + "items": { + "$ref": "#/definitions/QueryTiming" + }, + "type": "array" + }, + "types": { + "items": {}, + "type": "array" + } + }, + "required": ["result"], + "type": "object" + }, + "WebTopPagesQuery": { + "additionalProperties": false, + "properties": { + "dateRange": { + "$ref": "#/definitions/DateRange" + }, + "filters": { + "$ref": "#/definitions/WebAnalyticsFilters" + }, + "kind": { + "const": "WebTopPagesQuery", + "type": "string" + }, + "response": { + "$ref": "#/definitions/WebTopPagesQueryResponse" + } + }, + "required": ["kind", "filters"], + "type": "object" + }, + "WebTopPagesQueryResponse": { "additionalProperties": false, + "properties": { + "columns": { + "items": {}, + "type": "array" + }, + "is_cached": { + "type": "boolean" + }, + "last_refresh": { + "type": "string" + }, + "next_allowed_client_refresh": { + "type": "string" + }, + "result": { + "items": {}, + "type": "array" + }, + "timings": { + "items": { + "$ref": "#/definitions/QueryTiming" + }, + "type": "array" + }, + "types": { + "items": {}, + "type": "array" + } + }, + "required": ["result"], "type": "object" }, "WebTopSourcesQuery": { @@ -2294,11 +2411,45 @@ "type": "string" }, "response": { - "$ref": "#/definitions/HogQLQueryResponse" + "$ref": "#/definitions/WebTopSourcesQueryResponse" } }, "required": ["kind", "filters"], "type": "object" + }, + "WebTopSourcesQueryResponse": { + "additionalProperties": false, + "properties": { + "columns": { + "items": {}, + "type": "array" + }, + "is_cached": { + "type": "boolean" + }, + "last_refresh": { + "type": "string" + }, + "next_allowed_client_refresh": { + "type": "string" + }, + "result": { + "items": {}, + "type": "array" + }, + "timings": { + "items": { + "$ref": "#/definitions/QueryTiming" + }, + "type": "array" + }, + "types": { + "items": {}, + "type": "array" + } + }, + "required": ["result"], + "type": "object" } } } diff --git a/frontend/src/queries/schema.ts b/frontend/src/queries/schema.ts index d400e2e1743d7..f1f803c41e886 100644 --- a/frontend/src/queries/schema.ts +++ b/frontend/src/queries/schema.ts @@ -59,6 +59,8 @@ export enum NodeKind { // Web analytics queries WebTopSourcesQuery = 'WebTopSourcesQuery', + WebTopPagesQuery = 'WebTopPagesQuery', + WebTopClicksQuery = 'WebTopClicksQuery', // Time to see data TimeToSeeDataSessionsQuery = 'TimeToSeeDataSessionsQuery', @@ -79,6 +81,8 @@ export type AnyDataNode = | HogQLMetadata | TimeToSeeDataSessionsQuery | WebTopSourcesQuery + | WebTopClicksQuery + | WebTopPagesQuery export type QuerySchema = // Data nodes (see utils.ts) @@ -281,7 +285,15 @@ export type HasPropertiesNode = EventsNode | EventsQuery | PersonsNode export interface DataTableNode extends Node, DataTableNodeViewProps { kind: NodeKind.DataTableNode /** Source of the events */ - source: EventsNode | EventsQuery | PersonsNode | HogQLQuery | TimeToSeeDataSessionsQuery | WebTopSourcesQuery + source: + | EventsNode + | EventsQuery + | PersonsNode + | HogQLQuery + | TimeToSeeDataSessionsQuery + | WebTopSourcesQuery + | WebTopClicksQuery + | WebTopPagesQuery /** Columns shown in the table, unless the `source` provides them. */ columns?: HogQLExpression[] @@ -487,7 +499,7 @@ export interface LifecycleQuery extends InsightsQueryBase { response?: LifecycleQueryResponse } -export interface WebAnalyticsFilters {} +export type WebAnalyticsFilters = any export interface WebAnalyticsQueryBase { dateRange?: DateRange @@ -496,9 +508,35 @@ export interface WebAnalyticsQueryBase { export interface WebTopSourcesQuery extends WebAnalyticsQueryBase { kind: NodeKind.WebTopSourcesQuery filters: WebAnalyticsFilters - response?: HogQLQueryResponse + response?: WebTopSourcesQueryResponse +} +export interface WebTopSourcesQueryResponse extends QueryResponse { + result: unknown[] + types?: unknown[] + columns?: unknown[] +} + +export interface WebTopClicksQuery extends WebAnalyticsQueryBase { + kind: NodeKind.WebTopClicksQuery + filters: WebAnalyticsFilters + response?: WebTopClicksQueryResponse +} +export interface WebTopClicksQueryResponse extends QueryResponse { + result: unknown[] + types?: unknown[] + columns?: unknown[] +} + +export interface WebTopPagesQuery extends WebAnalyticsQueryBase { + kind: NodeKind.WebTopPagesQuery + filters: WebAnalyticsFilters + response?: WebTopPagesQueryResponse +} +export interface WebTopPagesQueryResponse extends QueryResponse { + result: unknown[] + types?: unknown[] + columns?: unknown[] } -export interface WebTopSourcesQueryResponse extends QueryResponse {} export type InsightQueryNode = | TrendsQuery diff --git a/frontend/src/queries/utils.ts b/frontend/src/queries/utils.ts index be47cd4a02e9d..cdbfcedfafb90 100644 --- a/frontend/src/queries/utils.ts +++ b/frontend/src/queries/utils.ts @@ -27,6 +27,8 @@ import { DatabaseSchemaQuery, SavedInsightNode, WebTopSourcesQuery, + WebTopClicksQuery, + WebTopPagesQuery, } from '~/queries/schema' import { TaxonomicFilterGroupType, TaxonomicFilterValue } from 'lib/components/TaxonomicFilter/types' import { dayjs } from 'lib/dayjs' @@ -95,6 +97,14 @@ export function isWebTopSourcesQuery(node?: Node | null): node is WebTopSourcesQ return node?.kind === NodeKind.WebTopSourcesQuery } +export function isWebTopClicksQuery(node?: Node | null): node is WebTopClicksQuery { + return node?.kind === NodeKind.WebTopClicksQuery +} + +export function isWebTopPagesQuery(node?: Node | null): node is WebTopPagesQuery { + return node?.kind === NodeKind.WebTopPagesQuery +} + export function containsHogQLQuery(node?: Node | null): boolean { if (!node) { return false diff --git a/frontend/src/scenes/saved-insights/SavedInsights.tsx b/frontend/src/scenes/saved-insights/SavedInsights.tsx index af72dfb21241d..cf2265f308ae9 100644 --- a/frontend/src/scenes/saved-insights/SavedInsights.tsx +++ b/frontend/src/scenes/saved-insights/SavedInsights.tsx @@ -248,6 +248,18 @@ export const QUERY_TYPES_METADATA: Record = { icon: InsightsTrendsIcon, inMenu: true, }, + [NodeKind.WebTopPagesQuery]: { + name: 'Top Pages', + description: 'View top pages for a website', + icon: InsightsTrendsIcon, + inMenu: true, + }, + [NodeKind.WebTopClicksQuery]: { + name: 'Top Clicks', + description: 'View top clicks for a website', + icon: InsightsTrendsIcon, + inMenu: true, + }, } export const INSIGHT_TYPE_OPTIONS: LemonSelectOptions = [ diff --git a/frontend/src/scenes/web-analytics/WebAnalyticsScene.tsx b/frontend/src/scenes/web-analytics/WebAnalyticsScene.tsx index 5b3859cf9e298..4fb41fe7261cc 100644 --- a/frontend/src/scenes/web-analytics/WebAnalyticsScene.tsx +++ b/frontend/src/scenes/web-analytics/WebAnalyticsScene.tsx @@ -6,7 +6,7 @@ import { NodeKind } from '~/queries/schema' export function WebAnalyticsScene(): JSX.Element { return (
- Top pages + Top sources + Top clicks + + Top pages +
) } diff --git a/posthog/api/query.py b/posthog/api/query.py index cefe52b841f89..21ede66e8e91e 100644 --- a/posthog/api/query.py +++ b/posthog/api/query.py @@ -28,6 +28,8 @@ from posthog.hogql_queries.lifecycle_query_runner import LifecycleQueryRunner from posthog.hogql_queries.trends_query_runner import TrendsQueryRunner +from posthog.hogql_queries.web_analytics.top_clicks import TopClicksQueryRunner +from posthog.hogql_queries.web_analytics.top_pages import TopPagesQueryRunner from posthog.hogql_queries.web_analytics.top_sources import TopSourcesQueryRunner from posthog.models import Team from posthog.models.event.events_query import run_events_query @@ -252,8 +254,17 @@ def process_query( serializer.is_valid(raise_exception=True) return get_session_events(serializer) or {} elif query_kind == "WebTopSourcesQuery": + refresh_requested = refresh_requested_by_client(request) if request else False top_sources_query_runner = TopSourcesQueryRunner(query_json, team) - return _unwrap_pydantic_dict(top_sources_query_runner.run()) + return _unwrap_pydantic_dict(top_sources_query_runner.run(refresh_requested=refresh_requested)) + elif query_kind == "WebTopClicksQuery": + refresh_requested = refresh_requested_by_client(request) if request else False + top_clicks_query_runner = TopClicksQueryRunner(query_json, team) + return _unwrap_pydantic_dict(top_clicks_query_runner.run(refresh_requested=refresh_requested)) + elif query_kind == "WebTopPagesQuery": + refresh_requested = refresh_requested_by_client(request) if request else False + top_pages_query_runner = TopPagesQueryRunner(query_json, team) + return _unwrap_pydantic_dict(top_pages_query_runner.run(refresh_requested=refresh_requested)) else: if query_json.get("source"): return process_query(team, query_json["source"]) diff --git a/posthog/hogql_queries/lifecycle_query_runner.py b/posthog/hogql_queries/lifecycle_query_runner.py index c7bebc76be062..9583a9eb3bd38 100644 --- a/posthog/hogql_queries/lifecycle_query_runner.py +++ b/posthog/hogql_queries/lifecycle_query_runner.py @@ -11,14 +11,14 @@ from posthog.hogql.property import property_to_expr, action_to_expr from posthog.hogql.query import execute_hogql_query from posthog.hogql.timings import HogQLTimings -from posthog.hogql_queries.query_runner import QueryRunner +from posthog.hogql_queries.query_runner import InsightQueryRunner from posthog.models import Team, Action from posthog.hogql_queries.utils.query_date_range import QueryDateRange from posthog.models.filters.mixins.utils import cached_property from posthog.schema import LifecycleQuery, ActionsNode, EventsNode, LifecycleQueryResponse -class LifecycleQueryRunner(QueryRunner): +class LifecycleQueryRunner(InsightQueryRunner): query: LifecycleQuery query_type = LifecycleQuery diff --git a/posthog/hogql_queries/query_runner.py b/posthog/hogql_queries/query_runner.py index 5dbd4850e599d..629e1b578984a 100644 --- a/posthog/hogql_queries/query_runner.py +++ b/posthog/hogql_queries/query_runner.py @@ -2,11 +2,12 @@ from datetime import datetime from typing import Any, Generic, List, Optional, Type, Dict, TypeVar -from prometheus_client import Counter -from django.core.cache import cache from django.conf import settings +from django.core.cache import cache +from prometheus_client import Counter from pydantic import BaseModel, ConfigDict +from posthog.caching.insights_api import BASE_MINIMUM_INSIGHT_REFRESH_INTERVAL from posthog.clickhouse.query_tagging import tag_queries from posthog.hogql import ast from posthog.hogql.context import HogQLContext @@ -15,7 +16,7 @@ from posthog.metrics import LABEL_TEAM_ID from posthog.models import Team from posthog.schema import QueryTiming -from posthog.types import InsightQueryNode +from posthog.types import InsightOrWebAnalyticsQueryNode from posthog.utils import generate_cache_key, get_safe_cache QUERY_CACHE_WRITE_COUNTER = Counter( @@ -39,6 +40,8 @@ class QueryResponse(BaseModel, Generic[DataT]): ) result: DataT timings: Optional[List[QueryTiming]] = None + types: Optional[Any] = None + columns: Optional[Any] = None class CachedQueryResponse(QueryResponse): @@ -50,13 +53,15 @@ class CachedQueryResponse(QueryResponse): next_allowed_client_refresh: str -class QueryRunner(ABC): - query: InsightQueryNode - query_type: Type[InsightQueryNode] +class BaseQueryRunner(ABC): + query: InsightOrWebAnalyticsQueryNode + query_type: Type[InsightOrWebAnalyticsQueryNode] team: Team timings: HogQLTimings - def __init__(self, query: InsightQueryNode | Dict[str, Any], team: Team, timings: Optional[HogQLTimings] = None): + def __init__( + self, query: InsightOrWebAnalyticsQueryNode | Dict[str, Any], team: Team, timings: Optional[HogQLTimings] = None + ): self.team = team self.timings = timings or HogQLTimings() if isinstance(query, self.query_type): @@ -99,11 +104,6 @@ def run(self, refresh_requested: bool) -> CachedQueryResponse: def to_query(self) -> ast.SelectQuery: raise NotImplementedError() - @abstractmethod - def to_persons_query(self) -> str: - # TODO: add support for selecting and filtering by breakdowns - raise NotImplementedError() - def to_hogql(self) -> str: with self.timings.measure("to_hogql"): return print_ast( @@ -116,7 +116,9 @@ def toJSON(self) -> str: return self.query.model_dump_json(exclude_defaults=True, exclude_none=True) def _cache_key(self) -> str: - return generate_cache_key(f"query_{self.toJSON()}_{self.team.pk}_{self.team.timezone}") + return generate_cache_key( + f"query_{self.toJSON()}_{self.__class__.__name__}_{self.team.pk}_{self.team.timezone}" + ) @abstractmethod def _is_stale(self, cached_result_package): @@ -125,3 +127,18 @@ def _is_stale(self, cached_result_package): @abstractmethod def _refresh_frequency(self): raise NotImplementedError() + + +class InsightQueryRunner(BaseQueryRunner): + @abstractmethod + def to_persons_query(self) -> str: + # TODO: add support for selecting and filtering by breakdowns + raise NotImplementedError() + + +class WebAnalyticsQueryRunner(BaseQueryRunner): + def _is_stale(self, cached_result_package): + return True + + def _refresh_frequency(self): + return BASE_MINIMUM_INSIGHT_REFRESH_INTERVAL diff --git a/posthog/hogql_queries/test/test_query_runner.py b/posthog/hogql_queries/test/test_query_runner.py index d9af90a1e4ff9..2d014fee2d6cf 100644 --- a/posthog/hogql_queries/test/test_query_runner.py +++ b/posthog/hogql_queries/test/test_query_runner.py @@ -1,10 +1,12 @@ from datetime import datetime, timedelta -from dateutil.parser import isoparse -from zoneinfo import ZoneInfo from typing import Any, List, Literal, Optional, Type +from zoneinfo import ZoneInfo + +from dateutil.parser import isoparse from freezegun import freeze_time from pydantic import BaseModel -from posthog.hogql_queries.query_runner import QueryResponse, QueryRunner + +from posthog.hogql_queries.query_runner import QueryResponse, BaseQueryRunner from posthog.models.team.team import Team from posthog.test.base import BaseTest from posthog.types import InsightQueryNode @@ -20,7 +22,7 @@ class QueryRunnerTest(BaseTest): def setup_test_query_runner_class(self, query_class: Type[InsightQueryNode] = TestQuery): # type: ignore """Setup required methods and attributes of the abstract base class.""" - class TestQueryRunner(QueryRunner): + class TestQueryRunner(BaseQueryRunner): query_type = query_class def calculate(self) -> QueryResponse: @@ -86,7 +88,20 @@ def test_cache_key(self): runner = TestQueryRunner(query={"some_attr": "bla"}, team=team) # type: ignore cache_key = runner._cache_key() - self.assertEqual(cache_key, "cache_f0f2ce8b1f3d107b9671a178b25be2aa") + self.assertEqual(cache_key, "cache_33c9ea3098895d5a363a75feefafef06") + + def test_cache_key_runner_subclass(self): + TestQueryRunner = self.setup_test_query_runner_class() + + class TestSubclassQueryRunner(TestQueryRunner): # type: ignore + pass + + team = Team.objects.create(pk=42, organization=self.organization) + + runner = TestSubclassQueryRunner(query={"some_attr": "bla"}, team=team) # type: ignore + + cache_key = runner._cache_key() + self.assertEqual(cache_key, "cache_d626615de8ad0df73c1d8610ca586597") def test_cache_key_different_timezone(self): TestQueryRunner = self.setup_test_query_runner_class() @@ -97,7 +112,7 @@ def test_cache_key_different_timezone(self): runner = TestQueryRunner(query={"some_attr": "bla"}, team=team) # type: ignore cache_key = runner._cache_key() - self.assertEqual(cache_key, "cache_0fa2172980705adb41741351f40189b7") + self.assertEqual(cache_key, "cache_aeb23ec9e8de56dd8499f99f2e976d5a") def test_cache_response(self): TestQueryRunner = self.setup_test_query_runner_class() diff --git a/posthog/hogql_queries/trends_query_runner.py b/posthog/hogql_queries/trends_query_runner.py index 373b55b32790b..076fa8b109dfc 100644 --- a/posthog/hogql_queries/trends_query_runner.py +++ b/posthog/hogql_queries/trends_query_runner.py @@ -13,7 +13,7 @@ from posthog.hogql.property import property_to_expr from posthog.hogql.query import execute_hogql_query from posthog.hogql.timings import HogQLTimings -from posthog.hogql_queries.query_runner import QueryRunner +from posthog.hogql_queries.query_runner import InsightQueryRunner from posthog.hogql_queries.utils.formula_ast import FormulaAST from posthog.hogql_queries.utils.query_date_range import QueryDateRange from posthog.hogql_queries.utils.query_previous_period_date_range import QueryPreviousPeriodDateRange @@ -31,7 +31,7 @@ def __init__(self, series: EventsNode | ActionsNode, is_previous_period_series: self.is_previous_period_series = is_previous_period_series -class TrendsQueryRunner(QueryRunner): +class TrendsQueryRunner(InsightQueryRunner): query: TrendsQuery query_type = TrendsQuery series: List[SeriesWithExtras] diff --git a/posthog/hogql_queries/web_analytics/top_clicks.py b/posthog/hogql_queries/web_analytics/top_clicks.py new file mode 100644 index 0000000000000..9c912a6b6f149 --- /dev/null +++ b/posthog/hogql_queries/web_analytics/top_clicks.py @@ -0,0 +1,53 @@ +from django.utils.timezone import datetime + +from posthog.hogql import ast +from posthog.hogql.parser import parse_select +from posthog.hogql.query import execute_hogql_query +from posthog.hogql_queries.query_runner import WebAnalyticsQueryRunner +from posthog.hogql_queries.utils.query_date_range import QueryDateRange +from posthog.models.filters.mixins.utils import cached_property +from posthog.schema import WebTopClicksQuery, WebTopClicksQueryResponse + + +class TopClicksQueryRunner(WebAnalyticsQueryRunner): + query: WebTopClicksQuery + query_type = WebTopClicksQuery + + def to_query(self) -> ast.SelectQuery | ast.SelectUnionQuery: + with self.timings.measure("top_clicks_query"): + top_sources_query = parse_select( + """ +SELECT + properties.$el_text as el_text, + count() as total_clicks, + COUNT(DISTINCT events.person_id) as unique_visitors +FROM + events +WHERE + event == '$autocapture' +AND events.timestamp >= now() - INTERVAL 7 DAY +AND events.properties.$event_type = 'click' +AND el_text IS NOT NULL +GROUP BY + el_text +ORDER BY total_clicks DESC + """, + timings=self.timings, + ) + return top_sources_query + + def calculate(self): + response = execute_hogql_query( + query_type="top_sources_query", + query=self.to_query(), + team=self.team, + timings=self.timings, + ) + + return WebTopClicksQueryResponse( + columns=response.columns, result=response.results, timings=response.timings, types=response.types + ) + + @cached_property + def query_date_range(self): + return QueryDateRange(date_range=self.query.dateRange, team=self.team, interval=None, now=datetime.now()) diff --git a/posthog/hogql_queries/web_analytics/top_pages.py b/posthog/hogql_queries/web_analytics/top_pages.py new file mode 100644 index 0000000000000..1de2a2936c8dd --- /dev/null +++ b/posthog/hogql_queries/web_analytics/top_pages.py @@ -0,0 +1,151 @@ +from django.utils.timezone import datetime + +from posthog.hogql import ast +from posthog.hogql.parser import parse_select +from posthog.hogql.query import execute_hogql_query +from posthog.hogql_queries.query_runner import WebAnalyticsQueryRunner +from posthog.hogql_queries.utils.query_date_range import QueryDateRange +from posthog.models.filters.mixins.utils import cached_property +from posthog.schema import WebTopPagesQuery, WebTopPagesQueryResponse + + +class TopPagesQueryRunner(WebAnalyticsQueryRunner): + query: WebTopPagesQuery + query_type = WebTopPagesQuery + + def to_query(self) -> ast.SelectQuery | ast.SelectUnionQuery: + with self.timings.measure("top_pages_query"): + top_sources_query = parse_select( + """ +WITH + +scroll_depth_cte AS ( +SELECT + events.properties.`$prev_pageview_pathname` AS pathname, + countIf(events.event == '$pageview') as total_pageviews, + COUNT(DISTINCT events.properties.distinct_id) as unique_visitors, -- might want to use person id? have seen a small number of pages where unique > total + avg(CASE + WHEN events.properties.`$prev_pageview_max_content_percentage` IS NULL THEN NULL + WHEN events.properties.`$prev_pageview_max_content_percentage` > 0.8 THEN 100 + ELSE 0 + END) AS scroll_gt80_percentage, + avg(events.properties.$prev_pageview_max_scroll_percentage) * 100 as average_scroll_percentage +FROM + events +WHERE + (event = '$pageview' OR event = '$pageleave') AND events.properties.`$prev_pageview_pathname` IS NOT NULL + AND events.timestamp >= now() - INTERVAL 7 DAY +GROUP BY pathname +) + +, + +session_cte AS ( +SELECT + events.properties.`$session_id` AS session_id, + min(events.timestamp) AS min_timestamp, + max(events.timestamp) AS max_timestamp, + dateDiff('second', min_timestamp, max_timestamp) AS duration_s, + + -- create a tuple so that these are grouped in the same order, see https://github.com/ClickHouse/ClickHouse/discussions/42338 + groupArray((events.timestamp, events.properties.`$referrer`, events.properties.`$pathname`, events.properties.utm_source)) AS tuple_array, + arrayFirstIndex(x -> tupleElement(x, 1) == min_timestamp, tuple_array) as index_of_earliest, + arrayFirstIndex(x -> tupleElement(x, 1) == max_timestamp, tuple_array) as index_of_latest, + tupleElement(arrayElement( + tuple_array, + index_of_earliest + ), 2) AS earliest_referrer, + tupleElement(arrayElement( + tuple_array, + index_of_earliest + ), 3) AS earliest_pathname, + tupleElement(arrayElement( + tuple_array, + index_of_earliest + ), 4) AS earliest_utm_source, + + if(domain(earliest_referrer) = '', earliest_referrer, domain(earliest_referrer)) AS referrer_domain, + multiIf( + earliest_utm_source IS NOT NULL, earliest_utm_source, + -- This will need to be an approach that scales better + referrer_domain == 'app.posthog.com', 'posthog', + referrer_domain == 'eu.posthog.com', 'posthog', + referrer_domain == 'posthog.com', 'posthog', + referrer_domain == 'www.google.com', 'google', + referrer_domain == 'www.google.co.uk', 'google', + referrer_domain == 'www.google.com.hk', 'google', + referrer_domain == 'www.google.de', 'google', + referrer_domain == 't.co', 'twitter', + referrer_domain == 'github.com', 'github', + referrer_domain == 'duckduckgo.com', 'duckduckgo', + referrer_domain == 'www.bing.com', 'bing', + referrer_domain == 'bing.com', 'bing', + referrer_domain == 'yandex.ru', 'yandex', + referrer_domain == 'quora.com', 'quora', + referrer_domain == 'www.quora.com', 'quora', + referrer_domain == 'linkedin.com', 'linkedin', + referrer_domain == 'www.linkedin.com', 'linkedin', + startsWith(referrer_domain, 'http://localhost:'), 'localhost', + referrer_domain + ) AS blended_source, + + countIf(events.event == '$pageview') AS num_pageviews, + countIf(events.event == '$autocapture') AS num_autocaptures, + -- in v1 we'd also want to count whether there were any conversion events + + any(events.person_id) as person_id, + -- definition of a GA4 bounce from here https://support.google.com/analytics/answer/12195621?hl=en + (num_autocaptures == 0 AND num_pageviews <= 1 AND duration_s < 10) AS is_bounce +FROM + events +WHERE + session_id IS NOT NULL +AND + events.timestamp >= now() - INTERVAL 8 DAY +GROUP BY + events.properties.`$session_id` +HAVING + min_timestamp >= now() - INTERVAL 7 DAY +) + +, + +bounce_rate_cte AS ( +SELECT session_cte.earliest_pathname, + avg(session_cte.is_bounce) as bounce_rate +FROM session_cte +GROUP BY earliest_pathname +) + + + +SELECT scroll_depth_cte.pathname as pathname, +scroll_depth_cte.total_pageviews as total_pageviews, +scroll_depth_cte.unique_visitors as unique_visitors, +scroll_depth_cte.scroll_gt80_percentage as scroll_gt80_percentage, +scroll_depth_cte.average_scroll_percentage as average_scroll_percentage, +bounce_rate_cte.bounce_rate as bounce_rate +FROM + scroll_depth_cte LEFT OUTER JOIN bounce_rate_cte +ON scroll_depth_cte.pathname = bounce_rate_cte.earliest_pathname +ORDER BY total_pageviews DESC + """, + timings=self.timings, + ) + return top_sources_query + + def calculate(self): + response = execute_hogql_query( + query_type="top_sources_query", + query=self.to_query(), + team=self.team, + timings=self.timings, + ) + + return WebTopPagesQueryResponse( + columns=response.columns, result=response.results, timings=response.timings, types=response.types + ) + + @cached_property + def query_date_range(self): + return QueryDateRange(date_range=self.query.dateRange, team=self.team, interval=None, now=datetime.now()) diff --git a/posthog/hogql_queries/web_analytics/top_sources.py b/posthog/hogql_queries/web_analytics/top_sources.py index 4372b98002f0a..df08dbc843e39 100644 --- a/posthog/hogql_queries/web_analytics/top_sources.py +++ b/posthog/hogql_queries/web_analytics/top_sources.py @@ -1,21 +1,18 @@ -from datetime import timedelta -from math import ceil -from typing import Optional - from django.utils.timezone import datetime -from posthog.caching.insights_api import BASE_MINIMUM_INSIGHT_REFRESH_INTERVAL, REDUCED_MINIMUM_INSIGHT_REFRESH_INTERVAL -from posthog.caching.utils import is_stale from posthog.hogql import ast from posthog.hogql.parser import parse_select from posthog.hogql.query import execute_hogql_query +from posthog.hogql_queries.query_runner import WebAnalyticsQueryRunner from posthog.hogql_queries.utils.query_date_range import QueryDateRange -from posthog.hogql_queries.web_analytics.web_analytics_query_runner import WebAnalyticsQueryRunner from posthog.models.filters.mixins.utils import cached_property -from posthog.schema import HogQLQueryResponse +from posthog.schema import WebTopSourcesQuery, WebTopSourcesQueryResponse class TopSourcesQueryRunner(WebAnalyticsQueryRunner): + query: WebTopSourcesQuery + query_type = WebTopSourcesQuery + def to_query(self) -> ast.SelectQuery | ast.SelectUnionQuery: with self.timings.measure("top_sources_query"): top_sources_query = parse_select( @@ -110,7 +107,7 @@ def to_query(self) -> ast.SelectQuery | ast.SelectUnionQuery: ) return top_sources_query - def calculate(self) -> HogQLQueryResponse: + def calculate(self): response = execute_hogql_query( query_type="top_sources_query", query=self.to_query(), @@ -118,30 +115,10 @@ def calculate(self) -> HogQLQueryResponse: timings=self.timings, ) - return response + return WebTopSourcesQueryResponse( + columns=response.columns, result=response.results, timings=response.timings, types=response.types + ) @cached_property def query_date_range(self): return QueryDateRange(date_range=self.query.dateRange, team=self.team, interval=None, now=datetime.now()) - - def _is_stale(self, cached_result_package): - date_to = self.query_date_range.date_to() - interval = self.query_date_range.interval_name - return is_stale(self.team, date_to, interval, cached_result_package) - - def _refresh_frequency(self): - date_to = self.query_date_range.date_to() - date_from = self.query_date_range.date_from() - interval = self.query_date_range.interval_name - - delta_days: Optional[int] = None - if date_from and date_to: - delta = date_to - date_from - delta_days = ceil(delta.total_seconds() / timedelta(days=1).total_seconds()) - - refresh_frequency = BASE_MINIMUM_INSIGHT_REFRESH_INTERVAL - if interval == "hour" or (delta_days is not None and delta_days <= 7): - # The interval is shorter for short-term insights - refresh_frequency = REDUCED_MINIMUM_INSIGHT_REFRESH_INTERVAL - - return refresh_frequency diff --git a/posthog/hogql_queries/web_analytics/web_analytics_query_runner.py b/posthog/hogql_queries/web_analytics/web_analytics_query_runner.py deleted file mode 100644 index 641f8a5c5e582..0000000000000 --- a/posthog/hogql_queries/web_analytics/web_analytics_query_runner.py +++ /dev/null @@ -1,47 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Any, Optional, Dict - -from posthog.hogql.timings import HogQLTimings -from posthog.models import Team -from posthog.schema import HogQLQueryResponse -from posthog.types import WebAnalyticsQueryNode -from posthog.utils import generate_cache_key - - -class WebAnalyticsQueryRunner(ABC): - team: Team - timings: HogQLTimings - query: WebAnalyticsQueryNode - - def __init__( - self, query: WebAnalyticsQueryNode | Dict[str, Any], team: Team, timings: Optional[HogQLTimings] = None - ): - self.team = team - self.timings = timings or HogQLTimings() - if isinstance(query, WebAnalyticsQueryNode): - self.query = query - else: - self.query = WebAnalyticsQueryNode.model_validate(query) - - @abstractmethod - def calculate(self) -> HogQLQueryResponse: - raise NotImplementedError() - - def run(self) -> HogQLQueryResponse: - return self.calculate() - - def toJSON(self) -> str: - return self.query.model_dump_json(exclude_defaults=True, exclude_none=True) - - def _cache_key(self) -> str: - return generate_cache_key( - f"query_{self.__class__.__name__}_{self.toJSON()}_{self.team.pk}_{self.team.timezone}" - ) - - @abstractmethod - def _is_stale(self, cached_result_package): - raise NotImplementedError() - - @abstractmethod - def _refresh_frequency(self): - raise NotImplementedError() diff --git a/posthog/schema.py b/posthog/schema.py index bbe34de311ec6..094f194ac0fc1 100644 --- a/posthog/schema.py +++ b/posthog/schema.py @@ -468,11 +468,43 @@ class TrendsQueryResponse(BaseModel): timings: Optional[List[QueryTiming]] = None -class WebAnalyticsFilters(BaseModel): - pass +class WebTopClicksQueryResponse(BaseModel): + model_config = ConfigDict( + extra="forbid", + ) + columns: Optional[List] = None + is_cached: Optional[bool] = None + last_refresh: Optional[str] = None + next_allowed_client_refresh: Optional[str] = None + result: List + timings: Optional[List[QueryTiming]] = None + types: Optional[List] = None + + +class WebTopPagesQueryResponse(BaseModel): + model_config = ConfigDict( + extra="forbid", + ) + columns: Optional[List] = None + is_cached: Optional[bool] = None + last_refresh: Optional[str] = None + next_allowed_client_refresh: Optional[str] = None + result: List + timings: Optional[List[QueryTiming]] = None + types: Optional[List] = None + + +class WebTopSourcesQueryResponse(BaseModel): model_config = ConfigDict( extra="forbid", ) + columns: Optional[List] = None + is_cached: Optional[bool] = None + last_refresh: Optional[str] = None + next_allowed_client_refresh: Optional[str] = None + result: List + timings: Optional[List[QueryTiming]] = None + types: Optional[List] = None class Breakdown(BaseModel): @@ -661,14 +693,34 @@ class TimeToSeeDataSessionsQuery(BaseModel): teamId: Optional[float] = Field(default=None, description="Project to filter on. Defaults to current project") +class WebTopClicksQuery(BaseModel): + model_config = ConfigDict( + extra="forbid", + ) + dateRange: Optional[DateRange] = None + filters: Any + kind: Literal["WebTopClicksQuery"] = "WebTopClicksQuery" + response: Optional[WebTopClicksQueryResponse] = None + + +class WebTopPagesQuery(BaseModel): + model_config = ConfigDict( + extra="forbid", + ) + dateRange: Optional[DateRange] = None + filters: Any + kind: Literal["WebTopPagesQuery"] = "WebTopPagesQuery" + response: Optional[WebTopPagesQueryResponse] = None + + class WebTopSourcesQuery(BaseModel): model_config = ConfigDict( extra="forbid", ) dateRange: Optional[DateRange] = None - filters: WebAnalyticsFilters + filters: Any kind: Literal["WebTopSourcesQuery"] = "WebTopSourcesQuery" - response: Optional[HogQLQueryResponse] = None + response: Optional[WebTopSourcesQueryResponse] = None class DatabaseSchemaQuery(BaseModel): @@ -1000,7 +1052,14 @@ class DataTableNode(BaseModel): showSearch: Optional[bool] = Field(default=None, description="Include a free text search field (PersonsNode only)") showTimings: Optional[bool] = Field(default=None, description="Show a detailed query timing breakdown") source: Union[ - EventsNode, EventsQuery, PersonsNode, HogQLQuery, TimeToSeeDataSessionsQuery, WebTopSourcesQuery + EventsNode, + EventsQuery, + PersonsNode, + HogQLQuery, + TimeToSeeDataSessionsQuery, + WebTopSourcesQuery, + WebTopClicksQuery, + WebTopPagesQuery, ] = Field(..., description="Source of the events") @@ -1279,6 +1338,8 @@ class Model(RootModel): HogQLMetadata, TimeToSeeDataSessionsQuery, WebTopSourcesQuery, + WebTopClicksQuery, + WebTopPagesQuery, ], ] diff --git a/posthog/types.py b/posthog/types.py index bb126661dbad3..20a90a09473df 100644 --- a/posthog/types.py +++ b/posthog/types.py @@ -12,10 +12,21 @@ StickinessQuery, TrendsQuery, WebTopSourcesQuery, + WebTopClicksQuery, + WebTopPagesQuery, ) FilterType = Union[Filter, PathFilter, RetentionFilter, StickinessFilter] InsightQueryNode = Union[TrendsQuery, FunnelsQuery, RetentionQuery, PathsQuery, StickinessQuery, LifecycleQuery] - -WebAnalyticsQueryNode = Union[WebTopSourcesQuery] +InsightOrWebAnalyticsQueryNode = Union[ + TrendsQuery, + FunnelsQuery, + RetentionQuery, + PathsQuery, + StickinessQuery, + LifecycleQuery, + WebTopSourcesQuery, + WebTopClicksQuery, + WebTopPagesQuery, +]