Skip to content

Commit

Permalink
Get working pathname filtering
Browse files Browse the repository at this point in the history
  • Loading branch information
robbie-c committed Oct 10, 2023
1 parent eaff047 commit f21d6fc
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 85 deletions.
4 changes: 3 additions & 1 deletion posthog/hogql/property.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def visit_call(self, node: ast.Call):


def property_to_expr(
property: Union[BaseModel, PropertyGroup, Property, dict, list],
property: Union[BaseModel, PropertyGroup, Property, dict, list, ast.Expr],
team: Team,
scope: Literal["event", "person"] = "event",
) -> ast.Expr:
Expand All @@ -63,6 +63,8 @@ def property_to_expr(
return ast.And(exprs=properties)
elif isinstance(property, Property):
pass
elif isinstance(property, ast.Expr):
return property
elif (
isinstance(property, PropertyGroup)
or isinstance(property, PropertyGroupFilter)
Expand Down
14 changes: 7 additions & 7 deletions posthog/hogql_queries/web_analytics/ctes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
dateDiff('second', min_timestamp, max_timestamp) AS duration_s,
argMin(events.properties.`$referrer`, events.timestamp) AS earliest_referrer,
argMin(events.properties.`$pathname`, events.timestamp) AS earliest_pathname,
argMax(events.properties.`$pathname`, events.timestamp ) AS latest_pathname,
argMin(events.properties.`$pathname`, events.timestamp) AS entry_pathname,
argMax(events.properties.`$pathname`, events.timestamp ) AS exit_pathname,
argMax(events.properties.utm_source, events.timestamp) AS earliest_utm_source,
if(domain(earliest_referrer) = '', earliest_referrer, domain(earliest_referrer)) AS referrer_domain,
Expand Down Expand Up @@ -59,20 +59,20 @@

PATHNAME_CTE = """
SELECT
events.properties.`$pathname` AS pathname,
events.properties.`$pathname` AS $pathname,
count() as total_pageviews,
uniq(events.person_id) as unique_visitors -- might want to use person id? have seen a small number of pages where unique > total
uniq(events.person_id) as unique_visitors
FROM
events
WHERE
(event = '$pageview')
AND ({pathname_where})
GROUP BY pathname
GROUP BY $pathname
"""

PATHNAME_SCROLL_CTE = """
SELECT
events.properties.`$prev_pageview_pathname` AS pathname,
events.properties.`$prev_pageview_pathname` AS $pathname,
avg(CASE
WHEN toFloat(JSONExtractRaw(events.properties, '$prev_pageview_max_content_percentage')) IS NULL THEN NULL
WHEN toFloat(JSONExtractRaw(events.properties, '$prev_pageview_max_content_percentage')) > 0.8 THEN 100
Expand All @@ -84,5 +84,5 @@
WHERE
(event = '$pageview' OR event = '$pageleave') AND events.properties.`$prev_pageview_pathname` IS NOT NULL
AND ({pathname_scroll_where})
GROUP BY pathname
GROUP BY $pathname
"""
10 changes: 2 additions & 8 deletions posthog/hogql_queries/web_analytics/top_clicks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

from posthog.hogql import ast
from posthog.hogql.parser import parse_select
from posthog.hogql.property import property_to_expr
from posthog.hogql.query import execute_hogql_query
from posthog.hogql_queries.utils.query_date_range import QueryDateRange
from posthog.hogql_queries.web_analytics.web_analytics_query_runner import WebAnalyticsQueryRunner
Expand All @@ -26,19 +25,17 @@ def to_query(self) -> ast.SelectQuery | ast.SelectUnionQuery:
events
WHERE
event == '$autocapture'
AND events.timestamp >= {date_from}
AND events.timestamp < {date_to}
AND events.properties.$event_type = 'click'
AND el_text IS NOT NULL
AND ({event_properties})
AND ({events_where})
GROUP BY
el_text
ORDER BY total_clicks DESC
LIMIT 10
""",
timings=self.timings,
placeholders={
"event_properties": self.event_properties(),
"event_properties": self.events_where(),
"date_from": self.query_date_range.date_from_as_hogql(),
"date_to": self.query_date_range.date_to_as_hogql(),
},
Expand All @@ -60,6 +57,3 @@ def calculate(self):
@cached_property
def query_date_range(self):
return QueryDateRange(date_range=self.query.dateRange, team=self.team, interval=None, now=datetime.now())

def event_properties(self) -> ast.Expr:
return property_to_expr(self.query.properties, team=self.team)
52 changes: 8 additions & 44 deletions posthog/hogql_queries/web_analytics/top_pages.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
from django.utils.timezone import datetime

from posthog.hogql import ast
from posthog.hogql.parser import parse_select, parse_expr
from posthog.hogql.parser import parse_select
from posthog.hogql.query import execute_hogql_query
from posthog.hogql_queries.utils.query_date_range import QueryDateRange
from posthog.hogql_queries.web_analytics.ctes import SESSION_CTE, PATHNAME_CTE, PATHNAME_SCROLL_CTE
from posthog.hogql_queries.web_analytics.web_analytics_query_runner import WebAnalyticsQueryRunner
from posthog.models.filters.mixins.utils import cached_property
from posthog.schema import WebTopPagesQuery, WebTopPagesQueryResponse


Expand All @@ -23,19 +19,19 @@ def to_query(self) -> ast.SelectQuery | ast.SelectUnionQuery:
)
with self.timings.measure("pathname_query"):
pathname_query = parse_select(
PATHNAME_CTE, timings=self.timings, placeholders={"pathname_where": self.pathname_where()}
PATHNAME_CTE, timings=self.timings, placeholders={"pathname_where": self.events_where()}
)
with self.timings.measure("pathname_scroll_query"):
pathname_scroll_query = parse_select(
PATHNAME_SCROLL_CTE,
timings=self.timings,
placeholders={"pathname_scroll_where": self.pathname_scroll_where()},
placeholders={"pathname_scroll_where": self.events_where()},
)
with self.timings.measure("top_pages_query"):
top_sources_query = parse_select(
"""
SELECT
pathname.pathname as pathname,
pathname.$pathname as pathname,
pathname.total_pageviews as total_pageviews,
pathname.unique_visitors as unique_visitors,
bounce_rate.bounce_rate as bounce_rate,
Expand All @@ -46,19 +42,19 @@ def to_query(self) -> ast.SelectQuery | ast.SelectUnionQuery:
LEFT OUTER JOIN
(
SELECT
session.earliest_pathname,
session.entry_pathname,
avg(session.is_bounce) as bounce_rate
FROM
{session_query} AS session
GROUP BY
session.earliest_pathname
session.entry_pathname
) AS bounce_rate
ON
pathname.pathname = bounce_rate.earliest_pathname
pathname.$pathname = bounce_rate.entry_pathname
LEFT OUTER JOIN
{pathname_scroll_query} AS scroll_data
ON
pathname.pathname = scroll_data.pathname
pathname.$pathname = scroll_data.$pathname
ORDER BY
total_pageviews DESC
LIMIT 10
Expand All @@ -83,35 +79,3 @@ def calculate(self):
return WebTopPagesQueryResponse(
columns=response.columns, results=response.results, timings=response.timings, types=response.types
)

@cached_property
def query_date_range(self):
return QueryDateRange(date_range=self.query.dateRange, team=self.team, interval=None, now=datetime.now())

def session_where(self):
# TODO needs to consider some session properties, e.g. what to do with utm parameters?
return parse_expr(
"events.timestamp < {date_to} AND events.timestamp >= minus({date_from}, toIntervalHour(1))",
placeholders={
"date_from": self.query_date_range.date_from_as_hogql(),
"date_to": self.query_date_range.date_to_as_hogql(),
},
)

def session_having(self):
# TODO needs to consider some session properties, e.g. what to do with utm parameters?
return parse_expr(
"min_timestamp >= {date_from}", placeholders={"date_from": self.query_date_range.date_from_as_hogql()}
)

def pathname_where(self):
# TODO needs to apply filters
return parse_expr(
"events.timestamp >= {date_from}", placeholders={"date_from": self.query_date_range.date_from_as_hogql()}
)

def pathname_scroll_where(self):
# TODO needs to apply filters
return parse_expr(
"events.timestamp >= {date_from}", placeholders={"date_from": self.query_date_range.date_from_as_hogql()}
)
26 changes: 1 addition & 25 deletions posthog/hogql_queries/web_analytics/top_sources.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
from django.utils.timezone import datetime

from posthog.hogql import ast
from posthog.hogql.parser import parse_select, parse_expr
from posthog.hogql.parser import parse_select
from posthog.hogql.query import execute_hogql_query
from posthog.hogql_queries.utils.query_date_range import QueryDateRange
from posthog.hogql_queries.web_analytics.ctes import SESSION_CTE
from posthog.hogql_queries.web_analytics.web_analytics_query_runner import WebAnalyticsQueryRunner
from posthog.models.filters.mixins.utils import cached_property
from posthog.schema import WebTopSourcesQuery, WebTopSourcesQueryResponse


Expand Down Expand Up @@ -54,23 +50,3 @@ def calculate(self):
return WebTopSourcesQueryResponse(
columns=response.columns, results=response.results, timings=response.timings, types=response.types
)

@cached_property
def query_date_range(self):
return QueryDateRange(date_range=self.query.dateRange, team=self.team, interval=None, now=datetime.now())

def session_where(self):
# TODO needs to consider some session properties, e.g. what to do with utm parameters?
return parse_expr(
"events.timestamp < {date_to} AND events.timestamp >= minus({date_from}, toIntervalHour(1))",
placeholders={
"date_from": self.query_date_range.date_from_as_hogql(),
"date_to": self.query_date_range.date_to_as_hogql(),
},
)

def session_having(self):
# TODO needs to consider some session properties, e.g. what to do with utm parameters?
return parse_expr(
"min_timestamp >= {date_from}", placeholders={"date_from": self.query_date_range.date_from_as_hogql()}
)
74 changes: 74 additions & 0 deletions posthog/hogql_queries/web_analytics/web_analytics_query_runner.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,86 @@
from abc import ABC
from typing import Optional, List, Union, Type

from django.utils.timezone import datetime
from posthog.caching.insights_api import BASE_MINIMUM_INSIGHT_REFRESH_INTERVAL
from posthog.hogql.parser import parse_expr
from posthog.hogql.property import property_to_expr
from posthog.hogql_queries.query_runner import QueryRunner
from posthog.hogql_queries.utils.query_date_range import QueryDateRange
from posthog.models.filters.mixins.utils import cached_property
from posthog.schema import EventPropertyFilter, WebTopSourcesQuery, WebTopClicksQuery, WebTopPagesQuery

# Union of the query schema types that a web analytics query runner can be given
# (used below as the type of the runner's `query` attribute).
WebQueryNode = Union[
WebTopSourcesQuery,
WebTopClicksQuery,
WebTopPagesQuery,
]


class WebAnalyticsQueryRunner(QueryRunner, ABC):
    """Shared base class for the web analytics query runners.

    Supplies the query date range and the filter expressions (session-level
    WHERE / HAVING and event-level WHERE) that concrete runners pass to their
    queries as placeholders.
    """

    query: WebQueryNode
    query_type: Type[WebQueryNode]

    def _is_stale(self, cached_result_package):
        """Report every cached result as stale, regardless of its contents."""
        return True

    def _refresh_frequency(self):
        """Return the base minimum insight refresh interval."""
        return BASE_MINIMUM_INSIGHT_REFRESH_INTERVAL

    @cached_property
    def query_date_range(self):
        """Build the date range helper from the query's ``dateRange`` and the team."""
        return QueryDateRange(
            date_range=self.query.dateRange,
            team=self.team,
            interval=None,
            now=datetime.now(),
        )

    @cached_property
    def pathname_property_filter(self) -> Optional[EventPropertyFilter]:
        """Return the first property filter keyed on ``$pathname``, or ``None``."""
        for candidate in self.query.properties:
            if candidate.key == "$pathname":
                return candidate
        return None

    @cached_property
    def property_filters_without_pathname(self) -> List[EventPropertyFilter]:
        """Return every property filter whose key is not ``$pathname``."""
        return [
            candidate
            for candidate in self.query.properties
            if candidate.key != "$pathname"
        ]

    def session_where(self):
        """Build the WHERE expression for session-level queries.

        Combines a timestamp window (before ``date_to`` and no earlier than one
        hour before ``date_from``) with all non-``$pathname`` property filters.
        """
        date_range = self.query_date_range
        timestamp_window = parse_expr(
            "events.timestamp < {date_to} AND events.timestamp >= minus({date_from}, toIntervalHour(1))",
            placeholders={
                "date_from": date_range.date_from_as_hogql(),
                "date_to": date_range.date_to_as_hogql(),
            },
        )
        conditions = [timestamp_window] + self.property_filters_without_pathname
        return property_to_expr(conditions, self.team)

    def session_having(self):
        """Build the HAVING expression for session-level queries.

        Always requires ``min_timestamp >= date_from``; when a ``$pathname``
        filter is present, a copy of it keyed on ``earliest_pathname`` is
        added as well.
        """
        started_in_range = parse_expr(
            "min_timestamp >= {date_from}",
            placeholders={"date_from": self.query_date_range.date_from_as_hogql()},
        )
        conditions = [started_in_range]

        pathname_filter = self.pathname_property_filter
        if pathname_filter:
            # NOTE(review): the filter is re-keyed to "earliest_pathname",
            # presumably to target a session-level column rather than the raw
            # event property - confirm this matches the column name the
            # session query actually exposes.
            session_pathname_filter = EventPropertyFilter(
                key="earliest_pathname",
                label=pathname_filter.label,
                operator=pathname_filter.operator,
                value=pathname_filter.value,
            )
            conditions.append(session_pathname_filter)

        return property_to_expr(conditions, self.team)

    def events_where(self):
        """Build the WHERE expression for event-level queries.

        Combines ``events.timestamp >= date_from`` with every property filter
        on the query (including any ``$pathname`` filter).
        """
        lower_bound = parse_expr(
            "events.timestamp >= {date_from}",
            placeholders={"date_from": self.query_date_range.date_from_as_hogql()},
        )
        conditions = [lower_bound] + self.query.properties
        return property_to_expr(conditions, self.team)

0 comments on commit f21d6fc

Please sign in to comment.