From c491bf2cf5c7cc4eac862a15ab5c5b3fbf6d385f Mon Sep 17 00:00:00 2001 From: Julian Bez Date: Thu, 21 Mar 2024 12:44:12 +0000 Subject: [PATCH] fix(hogql): Optimize querying for date frame in trends actors query (#21060) --- mypy-baseline.txt | 4 +- .../insights/insight_actors_query_runner.py | 4 +- .../insights/trends/trends_query_builder.py | 60 ++++++++----------- .../insights/trends/trends_query_runner.py | 2 +- .../hogql_queries/utils/query_date_range.py | 47 +++++++++------ 5 files changed, 59 insertions(+), 58 deletions(-) diff --git a/mypy-baseline.txt b/mypy-baseline.txt index 781ad2980830b..2143c119ea5c2 100644 --- a/mypy-baseline.txt +++ b/mypy-baseline.txt @@ -377,11 +377,11 @@ posthog/hogql_queries/insights/trends/trends_query_runner.py:0: error: Signature posthog/hogql_queries/insights/trends/trends_query_runner.py:0: note: Superclass: posthog/hogql_queries/insights/trends/trends_query_runner.py:0: note: def to_actors_query(self) -> SelectQuery | SelectUnionQuery posthog/hogql_queries/insights/trends/trends_query_runner.py:0: note: Subclass: -posthog/hogql_queries/insights/trends/trends_query_runner.py:0: note: def to_actors_query(self, time_frame: str | int | None, series_index: int, breakdown_value: str | int | None = ..., compare: Compare | None = ...) -> SelectQuery | SelectUnionQuery +posthog/hogql_queries/insights/trends/trends_query_runner.py:0: note: def to_actors_query(self, time_frame: str | None, series_index: int, breakdown_value: str | int | None = ..., compare: Compare | None = ...) -> SelectQuery | SelectUnionQuery posthog/hogql_queries/insights/trends/trends_query_runner.py:0: note: Superclass: posthog/hogql_queries/insights/trends/trends_query_runner.py:0: note: def to_actors_query(self) -> SelectQuery | SelectUnionQuery posthog/hogql_queries/insights/trends/trends_query_runner.py:0: note: Subclass: -posthog/hogql_queries/insights/trends/trends_query_runner.py:0: note: def to_actors_query(self, time_frame: str | int | None, series_index: int, breakdown_value: str | int | None = ..., compare: Compare | None = ...) -> SelectQuery | SelectUnionQuery +posthog/hogql_queries/insights/trends/trends_query_runner.py:0: note: def to_actors_query(self, time_frame: str | None, series_index: int, breakdown_value: str | int | None = ..., compare: Compare | None = ...) -> SelectQuery | SelectUnionQuery posthog/hogql_queries/insights/trends/trends_query_runner.py:0: error: Statement is unreachable [unreachable] posthog/hogql_queries/insights/trends/trends_query_runner.py:0: error: Argument 1 to "_event_property" of "TrendsQueryRunner" has incompatible type "str | float | list[str | float] | None"; expected "str" [arg-type] posthog/hogql_queries/insights/retention_query_runner.py:0: error: Incompatible types in assignment (expression has type "Expr", variable has type "Call") [assignment] diff --git a/posthog/hogql_queries/insights/insight_actors_query_runner.py b/posthog/hogql_queries/insights/insight_actors_query_runner.py index 782dd5b054a0e..a0e38abae1776 100644 --- a/posthog/hogql_queries/insights/insight_actors_query_runner.py +++ b/posthog/hogql_queries/insights/insight_actors_query_runner.py @@ -1,5 +1,5 @@ from datetime import timedelta -from typing import cast +from typing import cast, Optional from posthog.hogql import ast from posthog.hogql.query import execute_hogql_query @@ -37,7 +37,7 @@ def to_query(self) -> ast.SelectQuery | ast.SelectUnionQuery: trends_runner = cast(TrendsQueryRunner, self.source_runner) query = cast(InsightActorsQuery, self.query) return trends_runner.to_actors_query( - time_frame=query.day, + time_frame=cast(Optional[str], query.day), # Other runner accept day as int, but not this one series_index=query.series or 0, breakdown_value=query.breakdown, compare=query.compare, diff --git a/posthog/hogql_queries/insights/trends/trends_query_builder.py b/posthog/hogql_queries/insights/trends/trends_query_builder.py index 4a2536750cb2c..7be735d3b0a8b 100644 --- a/posthog/hogql_queries/insights/trends/trends_query_builder.py +++ b/posthog/hogql_queries/insights/trends/trends_query_builder.py @@ -68,7 +68,7 @@ def build_query(self) -> ast.SelectQuery | ast.SelectUnionQuery: return full_query def build_actors_query( - self, time_frame: Optional[str | int] = None, breakdown_filter: Optional[str | int] = None + self, time_frame: Optional[str] = None, breakdown_filter: Optional[str | int] = None ) -> ast.SelectQuery | ast.SelectUnionQuery: breakdown = self._breakdown(is_actors_query=True, breakdown_values_override=breakdown_filter) @@ -169,7 +169,7 @@ def _get_events_subquery( is_actors_query: bool, breakdown: Breakdown, breakdown_values_override: Optional[str | int] = None, - actors_query_time_frame: Optional[str | int] = None, + actors_query_time_frame: Optional[str] = None, ) -> ast.SelectQuery: day_start = ast.Alias( alias="day_start", @@ -186,31 +186,16 @@ def _get_events_subquery( actors_query_time_frame=actors_query_time_frame, ) - default_query = cast( - ast.SelectQuery, - parse_select( - """ - SELECT - {aggregation_operation} AS total - FROM {table} AS e - WHERE {events_filter} - """ - if isinstance(self.series, DataWarehouseNode) - else """ - SELECT - {aggregation_operation} AS total - FROM {table} AS e - SAMPLE {sample} - WHERE {events_filter} - """, - placeholders={ - "table": self._table_expr, - "events_filter": events_filter, - "aggregation_operation": self._aggregation_operation.select_aggregation(), - "sample": self._sample_value(), - }, - ), + default_query = ast.SelectQuery( + select=[ast.Alias(alias="total", expr=self._aggregation_operation.select_aggregation())], + select_from=ast.JoinExpr(table=self._table_expr, alias="e"), + where=events_filter, ) + if not isinstance(self.series, DataWarehouseNode): + assert default_query.select_from is not None + default_query.select_from.sample = ast.SampleExpr( + sample_value=self._sample_value(), + ) default_query.group_by = [] @@ -463,20 +448,27 @@ def _events_filter( breakdown: Breakdown | None, ignore_breakdowns: bool = False, breakdown_values_override: Optional[str | int] = None, - actors_query_time_frame: Optional[str | int] = None, + actors_query_time_frame: Optional[str] = None, ) -> ast.Expr: series = self.series filters: List[ast.Expr] = [] # Dates if is_actors_query and actors_query_time_frame is not None: - to_start_of_time_frame = f"toStartOf{self.query_date_range.interval_name.capitalize()}" - filters.append( - ast.CompareOperation( - left=ast.Call(name=to_start_of_time_frame, args=[ast.Field(chain=["timestamp"])]), - op=ast.CompareOperationOp.Eq, - right=ast.Call(name="toDateTime", args=[ast.Constant(value=actors_query_time_frame)]), - ) + actors_from, actors_to = self.query_date_range.interval_bounds_from_str(actors_query_time_frame) + filters.extend( + [ + ast.CompareOperation( + left=ast.Field(chain=["timestamp"]), + op=ast.CompareOperationOp.GtEq, + right=ast.Constant(value=actors_from), + ), + ast.CompareOperation( + left=ast.Field(chain=["timestamp"]), + op=ast.CompareOperationOp.Lt, + right=ast.Constant(value=actors_to), + ), + ] ) elif not self._aggregation_operation.requires_query_orchestration(): filters.extend( diff --git a/posthog/hogql_queries/insights/trends/trends_query_runner.py b/posthog/hogql_queries/insights/trends/trends_query_runner.py index 29d29b55e8b0f..40f26664c2585 100644 --- a/posthog/hogql_queries/insights/trends/trends_query_runner.py +++ b/posthog/hogql_queries/insights/trends/trends_query_runner.py @@ -142,7 +142,7 @@ def to_queries(self) -> List[ast.SelectQuery | ast.SelectUnionQuery]: def to_actors_query( self, - time_frame: Optional[str | int], + time_frame: Optional[str], series_index: int, breakdown_value: Optional[str | int] = None, compare: Optional[Compare] = None, diff --git a/posthog/hogql_queries/utils/query_date_range.py b/posthog/hogql_queries/utils/query_date_range.py index f2e5cef3d82a3..5453d878b4017 100644 --- a/posthog/hogql_queries/utils/query_date_range.py +++ b/posthog/hogql_queries/utils/query_date_range.py @@ -4,6 +4,7 @@ from typing import Literal, Optional, Dict, List from zoneinfo import ZoneInfo +from dateutil.parser import parse from dateutil.relativedelta import relativedelta from posthog.hogql.errors import HogQLException @@ -116,36 +117,39 @@ def interval_type(self) -> IntervalType: def interval_name(self) -> Literal["hour", "day", "week", "month"]: return self.interval_type.name - def all_values(self) -> List[str]: - start: datetime = self.date_from() - end: datetime = self.date_to() - interval = self.interval_name - - if interval == "hour": - start = start.replace(minute=0, second=0, microsecond=0) - elif interval == "day": - start = start.replace(hour=0, minute=0, second=0, microsecond=0) - elif interval == "week": + def align_with_interval(self, start: datetime) -> datetime: + if self.interval_name == "hour": + return start.replace(minute=0, second=0, microsecond=0) + elif self.interval_name == "day": + return start.replace(hour=0, minute=0, second=0, microsecond=0) + elif self.interval_name == "week": start = start.replace(hour=0, minute=0, second=0, microsecond=0) week_start_alignment_days = start.isoweekday() % 7 if self._team.week_start_day == WeekStartDay.MONDAY: week_start_alignment_days = start.weekday() start -= timedelta(days=week_start_alignment_days) - elif interval == "month": - start = start.replace(day=1, hour=0, minute=0, second=0, microsecond=0) + return start + elif self.interval_name == "month": + return start.replace(day=1, hour=0, minute=0, second=0, microsecond=0) + + def interval_relativedelta(self) -> relativedelta: + return relativedelta( + days=1 if self.interval_name == "day" else 0, + weeks=1 if self.interval_name == "week" else 0, + months=1 if self.interval_name == "month" else 0, + hours=1 if self.interval_name == "hour" else 0, + ) + def all_values(self) -> List[str]: + start = self.align_with_interval(self.date_from()) + end: datetime = self.date_to() values: List[str] = [] while start <= end: - if interval == "hour": + if self.interval_name == "hour": values.append(start.strftime("%Y-%m-%d %H:%M:%S")) else: values.append(start.strftime("%Y-%m-%d")) - start += relativedelta( - days=1 if interval == "day" else 0, - weeks=1 if interval == "week" else 0, - months=1 if interval == "month" else 0, - hours=1 if interval == "hour" else 0, - ) + start += self.interval_relativedelta() return values def date_to_as_hogql(self) -> ast.Expr: @@ -257,6 +261,11 @@ def to_placeholders(self) -> Dict[str, ast.Expr]: else self.date_from_as_hogql(), } + def interval_bounds_from_str(self, time_frame: str) -> tuple[datetime, datetime]: + date_from = parse(time_frame, tzinfos={None: self._team.timezone_info}) + date_to = date_from + self.interval_relativedelta() + return date_from, date_to + class QueryDateRangeWithIntervals(QueryDateRange): def __init__(