From f21d6fc07ad948603c50af05fd6e6ae3dac69005 Mon Sep 17 00:00:00 2001
From: Robbie Coomber
Date: Tue, 10 Oct 2023 15:57:54 +0100
Subject: [PATCH] Get working pathname filtering

Move the date-range and property-filter helpers used by the web analytics
query runners into the shared WebAnalyticsQueryRunner base class.

- query_date_range, session_where, session_having and events_where are
  defined once on the base class. top_pages.py and top_sources.py drop
  their private copies; top_clicks.py drops event_properties().
- session_where applies every property filter except $pathname, while
  session_having re-applies the $pathname filter (same label, operator
  and value) under the session-level key entry_pathname.
- events_where combines "events.timestamp >= date_from" with all of the
  query's property filters.
- property_to_expr returns an ast.Expr argument unchanged; the runner
  relies on this to pass lists that mix parsed expressions and property
  filters.
- The session CTE aliases earliest_pathname/latest_pathname are renamed
  to entry_pathname/exit_pathname, and both pathname CTEs alias their
  grouping column as $pathname.

---
Review notes (everything between "---" and the first "diff --git" line is
ignored when the patch is applied):

* top_clicks.py: the query text references {events_where}, so the
  placeholder dict now registers the expression under "events_where".
  It was "event_properties", which left {events_where} without a value.
* web_analytics_query_runner.py: session_having keys its pathname filter
  on "entry_pathname" to follow the alias rename in ctes.py. It was
  "earliest_pathname", a name this patch removes. This assumes the key
  is meant to track the session CTE alias - TODO confirm against
  property_to_expr.
* Both fixes edit added lines in place, so hunk line counts and the
  diffstat are unchanged. The post-image blob ids on the "index" lines
  of those two files were not recomputed.
* Not changed here: events_where() only bounds events by date_from,
  whereas top_clicks.py previously also had
  "events.timestamp < {date_to}". The date_from/date_to entries in the
  top_clicks.py placeholder dict are no longer referenced by its query.
* Line breaks and indentation were restored from a whitespace-collapsed
  copy of this patch. Blank lines were placed so that every hunk header
  and the diffstat add up; verify context-line whitespace against the
  tree before applying. The author address on the "From:" line was
  missing from that copy and has not been reconstructed.

 posthog/hogql/property.py                     |  4 +-
 posthog/hogql_queries/web_analytics/ctes.py   | 14 ++--
 .../hogql_queries/web_analytics/top_clicks.py | 10 +--
 .../hogql_queries/web_analytics/top_pages.py  | 52 ++-----------
 .../web_analytics/top_sources.py              | 26 +------
 .../web_analytics_query_runner.py             | 74 +++++++++++++++++++
 6 files changed, 95 insertions(+), 85 deletions(-)

diff --git a/posthog/hogql/property.py b/posthog/hogql/property.py
index b97cf37eb31a45..9c2ea6f341d17e 100644
--- a/posthog/hogql/property.py
+++ b/posthog/hogql/property.py
@@ -48,7 +48,7 @@ def visit_call(self, node: ast.Call):
 
 
 def property_to_expr(
-    property: Union[BaseModel, PropertyGroup, Property, dict, list],
+    property: Union[BaseModel, PropertyGroup, Property, dict, list, ast.Expr],
     team: Team,
     scope: Literal["event", "person"] = "event",
 ) -> ast.Expr:
@@ -63,6 +63,8 @@ def property_to_expr(
         return ast.And(exprs=properties)
     elif isinstance(property, Property):
         pass
+    elif isinstance(property, ast.Expr):
+        return property
     elif (
         isinstance(property, PropertyGroup)
         or isinstance(property, PropertyGroupFilter)
diff --git a/posthog/hogql_queries/web_analytics/ctes.py b/posthog/hogql_queries/web_analytics/ctes.py
index af5776a30b47ef..2e17b5710e874b 100644
--- a/posthog/hogql_queries/web_analytics/ctes.py
+++ b/posthog/hogql_queries/web_analytics/ctes.py
@@ -10,8 +10,8 @@
     dateDiff('second', min_timestamp, max_timestamp) AS duration_s,
 
     argMin(events.properties.`$referrer`, events.timestamp) AS earliest_referrer,
-    argMin(events.properties.`$pathname`, events.timestamp) AS earliest_pathname,
-    argMax(events.properties.`$pathname`, events.timestamp ) AS latest_pathname,
+    argMin(events.properties.`$pathname`, events.timestamp) AS entry_pathname,
+    argMax(events.properties.`$pathname`, events.timestamp ) AS exit_pathname,
     argMax(events.properties.utm_source, events.timestamp) AS earliest_utm_source,
 
     if(domain(earliest_referrer) = '', earliest_referrer, domain(earliest_referrer)) AS referrer_domain,
@@ -59,20 +59,20 @@
 
 PATHNAME_CTE = """
 SELECT
-    events.properties.`$pathname` AS pathname,
+    events.properties.`$pathname` AS $pathname,
     count() as total_pageviews,
-    uniq(events.person_id) as unique_visitors -- might want to use person id? have seen a small number of pages where unique > total
+    uniq(events.person_id) as unique_visitors
 FROM
     events
 WHERE
     (event = '$pageview')
     AND ({pathname_where})
-    GROUP BY pathname
+    GROUP BY $pathname
 """
 
 PATHNAME_SCROLL_CTE = """
 SELECT
-    events.properties.`$prev_pageview_pathname` AS pathname,
+    events.properties.`$prev_pageview_pathname` AS $pathname,
     avg(CASE
         WHEN toFloat(JSONExtractRaw(events.properties, '$prev_pageview_max_content_percentage')) IS NULL THEN NULL
         WHEN toFloat(JSONExtractRaw(events.properties, '$prev_pageview_max_content_percentage')) > 0.8 THEN 100
@@ -84,5 +84,5 @@
 WHERE
     (event = '$pageview' OR event = '$pageleave') AND events.properties.`$prev_pageview_pathname` IS NOT NULL
     AND ({pathname_scroll_where})
-GROUP BY pathname
+GROUP BY $pathname
 """
diff --git a/posthog/hogql_queries/web_analytics/top_clicks.py b/posthog/hogql_queries/web_analytics/top_clicks.py
index 1acc9e1b10733f..004cad7947c934 100644
--- a/posthog/hogql_queries/web_analytics/top_clicks.py
+++ b/posthog/hogql_queries/web_analytics/top_clicks.py
@@ -2,7 +2,6 @@
 
 from posthog.hogql import ast
 from posthog.hogql.parser import parse_select
-from posthog.hogql.property import property_to_expr
 from posthog.hogql.query import execute_hogql_query
 from posthog.hogql_queries.utils.query_date_range import QueryDateRange
 from posthog.hogql_queries.web_analytics.web_analytics_query_runner import WebAnalyticsQueryRunner
@@ -26,11 +25,9 @@ def to_query(self) -> ast.SelectQuery | ast.SelectUnionQuery:
     events
 WHERE
     event == '$autocapture'
-AND events.timestamp >= {date_from}
-AND events.timestamp < {date_to}
 AND events.properties.$event_type = 'click'
 AND el_text IS NOT NULL
-AND ({event_properties})
+AND ({events_where})
 GROUP BY
     el_text
 ORDER BY total_clicks DESC
@@ -38,7 +35,7 @@ def to_query(self) -> ast.SelectQuery | ast.SelectUnionQuery:
                 """,
                 timings=self.timings,
                 placeholders={
-                    "event_properties": self.event_properties(),
+                    "events_where": self.events_where(),
                     "date_from": self.query_date_range.date_from_as_hogql(),
                     "date_to": self.query_date_range.date_to_as_hogql(),
                 },
@@ -60,6 +57,3 @@ def calculate(self):
     @cached_property
     def query_date_range(self):
         return QueryDateRange(date_range=self.query.dateRange, team=self.team, interval=None, now=datetime.now())
-
-    def event_properties(self) -> ast.Expr:
-        return property_to_expr(self.query.properties, team=self.team)
diff --git a/posthog/hogql_queries/web_analytics/top_pages.py b/posthog/hogql_queries/web_analytics/top_pages.py
index 9cf8d81e524f3e..e994e5363c5f5c 100644
--- a/posthog/hogql_queries/web_analytics/top_pages.py
+++ b/posthog/hogql_queries/web_analytics/top_pages.py
@@ -1,12 +1,8 @@
-from django.utils.timezone import datetime
-
 from posthog.hogql import ast
-from posthog.hogql.parser import parse_select, parse_expr
+from posthog.hogql.parser import parse_select
 from posthog.hogql.query import execute_hogql_query
-from posthog.hogql_queries.utils.query_date_range import QueryDateRange
 from posthog.hogql_queries.web_analytics.ctes import SESSION_CTE, PATHNAME_CTE, PATHNAME_SCROLL_CTE
 from posthog.hogql_queries.web_analytics.web_analytics_query_runner import WebAnalyticsQueryRunner
-from posthog.models.filters.mixins.utils import cached_property
 from posthog.schema import WebTopPagesQuery, WebTopPagesQueryResponse
 
 
@@ -23,19 +19,19 @@ def to_query(self) -> ast.SelectQuery | ast.SelectUnionQuery:
             )
         with self.timings.measure("pathname_query"):
             pathname_query = parse_select(
-                PATHNAME_CTE, timings=self.timings, placeholders={"pathname_where": self.pathname_where()}
+                PATHNAME_CTE, timings=self.timings, placeholders={"pathname_where": self.events_where()}
             )
         with self.timings.measure("pathname_scroll_query"):
             pathname_scroll_query = parse_select(
                 PATHNAME_SCROLL_CTE,
                 timings=self.timings,
-                placeholders={"pathname_scroll_where": self.pathname_scroll_where()},
+                placeholders={"pathname_scroll_where": self.events_where()},
             )
         with self.timings.measure("top_pages_query"):
             top_sources_query = parse_select(
                 """
 SELECT
-    pathname.pathname as pathname,
+    pathname.$pathname as pathname,
     pathname.total_pageviews as total_pageviews,
     pathname.unique_visitors as unique_visitors,
     bounce_rate.bounce_rate as bounce_rate,
@@ -46,19 +42,19 @@ def to_query(self) -> ast.SelectQuery | ast.SelectUnionQuery:
 LEFT OUTER JOIN
     (
         SELECT
-            session.earliest_pathname,
+            session.entry_pathname,
             avg(session.is_bounce) as bounce_rate
         FROM
             {session_query} AS session
         GROUP BY
-            session.earliest_pathname
+            session.entry_pathname
     ) AS bounce_rate
 ON
-    pathname.pathname = bounce_rate.earliest_pathname
+    pathname.$pathname = bounce_rate.entry_pathname
 LEFT OUTER JOIN
     {pathname_scroll_query} AS scroll_data
 ON
-    pathname.pathname = scroll_data.pathname
+    pathname.$pathname = scroll_data.$pathname
 ORDER BY
     total_pageviews DESC
 LIMIT 10
@@ -83,35 +79,3 @@ def calculate(self):
         return WebTopPagesQueryResponse(
             columns=response.columns, results=response.results, timings=response.timings, types=response.types
         )
-
-    @cached_property
-    def query_date_range(self):
-        return QueryDateRange(date_range=self.query.dateRange, team=self.team, interval=None, now=datetime.now())
-
-    def session_where(self):
-        # TODO needs to consider some session properties, e.g. what to do with utm parameters?
-        return parse_expr(
-            "events.timestamp < {date_to} AND events.timestamp >= minus({date_from}, toIntervalHour(1))",
-            placeholders={
-                "date_from": self.query_date_range.date_from_as_hogql(),
-                "date_to": self.query_date_range.date_to_as_hogql(),
-            },
-        )
-
-    def session_having(self):
-        # TODO needs to consider some session properties, e.g. what to do with utm parameters?
-        return parse_expr(
-            "min_timestamp >= {date_from}", placeholders={"date_from": self.query_date_range.date_from_as_hogql()}
-        )
-
-    def pathname_where(self):
-        # TODO needs to apply filters
-        return parse_expr(
-            "events.timestamp >= {date_from}", placeholders={"date_from": self.query_date_range.date_from_as_hogql()}
-        )
-
-    def pathname_scroll_where(self):
-        # TODO needs to apply filters
-        return parse_expr(
-            "events.timestamp >= {date_from}", placeholders={"date_from": self.query_date_range.date_from_as_hogql()}
-        )
diff --git a/posthog/hogql_queries/web_analytics/top_sources.py b/posthog/hogql_queries/web_analytics/top_sources.py
index 96e530968ac134..493844e74c0d94 100644
--- a/posthog/hogql_queries/web_analytics/top_sources.py
+++ b/posthog/hogql_queries/web_analytics/top_sources.py
@@ -1,12 +1,8 @@
-from django.utils.timezone import datetime
-
 from posthog.hogql import ast
-from posthog.hogql.parser import parse_select, parse_expr
+from posthog.hogql.parser import parse_select
 from posthog.hogql.query import execute_hogql_query
-from posthog.hogql_queries.utils.query_date_range import QueryDateRange
 from posthog.hogql_queries.web_analytics.ctes import SESSION_CTE
 from posthog.hogql_queries.web_analytics.web_analytics_query_runner import WebAnalyticsQueryRunner
-from posthog.models.filters.mixins.utils import cached_property
 from posthog.schema import WebTopSourcesQuery, WebTopSourcesQueryResponse
 
 
@@ -54,23 +50,3 @@ def calculate(self):
         return WebTopSourcesQueryResponse(
             columns=response.columns, results=response.results, timings=response.timings, types=response.types
         )
-
-    @cached_property
-    def query_date_range(self):
-        return QueryDateRange(date_range=self.query.dateRange, team=self.team, interval=None, now=datetime.now())
-
-    def session_where(self):
-        # TODO needs to consider some session properties, e.g. what to do with utm parameters?
-        return parse_expr(
-            "events.timestamp < {date_to} AND events.timestamp >= minus({date_from}, toIntervalHour(1))",
-            placeholders={
-                "date_from": self.query_date_range.date_from_as_hogql(),
-                "date_to": self.query_date_range.date_to_as_hogql(),
-            },
-        )
-
-    def session_having(self):
-        # TODO needs to consider some session properties, e.g. what to do with utm parameters?
-        return parse_expr(
-            "min_timestamp >= {date_from}", placeholders={"date_from": self.query_date_range.date_from_as_hogql()}
-        )
diff --git a/posthog/hogql_queries/web_analytics/web_analytics_query_runner.py b/posthog/hogql_queries/web_analytics/web_analytics_query_runner.py
index e023ad32954b35..56d44ca239077c 100644
--- a/posthog/hogql_queries/web_analytics/web_analytics_query_runner.py
+++ b/posthog/hogql_queries/web_analytics/web_analytics_query_runner.py
@@ -1,12 +1,86 @@
 from abc import ABC
+from typing import Optional, List, Union, Type
 
+from django.utils.timezone import datetime
 from posthog.caching.insights_api import BASE_MINIMUM_INSIGHT_REFRESH_INTERVAL
+from posthog.hogql.parser import parse_expr
+from posthog.hogql.property import property_to_expr
 from posthog.hogql_queries.query_runner import QueryRunner
+from posthog.hogql_queries.utils.query_date_range import QueryDateRange
+from posthog.models.filters.mixins.utils import cached_property
+from posthog.schema import EventPropertyFilter, WebTopSourcesQuery, WebTopClicksQuery, WebTopPagesQuery
+
+WebQueryNode = Union[
+    WebTopSourcesQuery,
+    WebTopClicksQuery,
+    WebTopPagesQuery,
+]
 
 
 class WebAnalyticsQueryRunner(QueryRunner, ABC):
+    query: WebQueryNode
+    query_type: Type[WebQueryNode]
+
     def _is_stale(self, cached_result_package):
         return True
 
     def _refresh_frequency(self):
         return BASE_MINIMUM_INSIGHT_REFRESH_INTERVAL
+
+    @cached_property
+    def query_date_range(self):
+        return QueryDateRange(date_range=self.query.dateRange, team=self.team, interval=None, now=datetime.now())
+
+    @cached_property
+    def pathname_property_filter(self) -> Optional[EventPropertyFilter]:
+        return next((p for p in self.query.properties if p.key == "$pathname"), None)
+
+    @cached_property
+    def property_filters_without_pathname(self) -> List[EventPropertyFilter]:
+        return [p for p in self.query.properties if p.key != "$pathname"]
+
+    def session_where(self):
+        properties = [
+            parse_expr(
+                "events.timestamp < {date_to} AND events.timestamp >= minus({date_from}, toIntervalHour(1))",
+                placeholders={
+                    "date_from": self.query_date_range.date_from_as_hogql(),
+                    "date_to": self.query_date_range.date_to_as_hogql(),
+                },
+            )
+        ] + self.property_filters_without_pathname
+        return property_to_expr(
+            properties,
+            self.team,
+        )
+
+    def session_having(self):
+        properties = [
+            parse_expr(
+                "min_timestamp >= {date_from}",
+                placeholders={"date_from": self.query_date_range.date_from_as_hogql()},
+            )
+        ]
+        pathname = self.pathname_property_filter
+        if pathname:
+            properties.append(
+                EventPropertyFilter(
+                    key="entry_pathname", label=pathname.label, operator=pathname.operator, value=pathname.value
+                )
+            )
+        return property_to_expr(
+            properties,
+            self.team,
+        )
+
+    def events_where(self):
+        properties = [
+            parse_expr(
+                "events.timestamp >= {date_from}",
+                placeholders={"date_from": self.query_date_range.date_from_as_hogql()},
+            )
+        ] + self.query.properties
+        return property_to_expr(
+            properties,
+            self.team,
+        )