diff --git a/posthog/api/test/test_cohort.py b/posthog/api/test/test_cohort.py index 14b6b60b51484..eaf2c4a3c6f41 100644 --- a/posthog/api/test/test_cohort.py +++ b/posthog/api/test/test_cohort.py @@ -33,7 +33,7 @@ class TestCohort(TestExportMixin, ClickhouseTestMixin, APIBaseTest, QueryMatchingTest): # select all queries for snapshots def capture_select_queries(self): - return self.capture_queries(("INSERT INTO cohortpeople", "SELECT", "ALTER", "select", "DELETE")) + return self.capture_queries_startswith(("INSERT INTO cohortpeople", "SELECT", "ALTER", "select", "DELETE")) def _get_cohort_activity( self, @@ -101,7 +101,7 @@ def test_creating_update_and_calculating(self, patch_sync_execute, patch_calcula }, ) - with self.capture_queries("INSERT INTO cohortpeople") as insert_statements: + with self.capture_queries_startswith("INSERT INTO cohortpeople") as insert_statements: response = self.client.patch( f"/api/projects/{self.team.id}/cohorts/{response.json()['id']}", data={ diff --git a/posthog/hogql/visitor.py b/posthog/hogql/visitor.py index cc27920dd3b81..80e573124b307 100644 --- a/posthog/hogql/visitor.py +++ b/posthog/hogql/visitor.py @@ -114,7 +114,7 @@ def visit_join_expr(self, node: ast.JoinExpr): def visit_select_query(self, node: ast.SelectQuery): # :TRICKY: when adding new fields, also add them to visit_select_query of resolver.py - # pass the CTEs of the node to its children + # pass the CTEs of the node to select_froms (needed for nested joins to have access to CTEs) if node.type is not None and node.type.ctes is not None and node.select_from is not None and hasattr(node.select_from.type, "ctes"): node.select_from.type.ctes = {**node.type.ctes, **node.select_from.type.ctes} self.visit(node.select_from) diff --git a/posthog/hogql_queries/actors_query_runner.py b/posthog/hogql_queries/actors_query_runner.py index a717f9bf3fad0..3746d5aeeb1ae 100644 --- a/posthog/hogql_queries/actors_query_runner.py +++ b/posthog/hogql_queries/actors_query_runner.py @@ -272,14 +272,13 @@ def to_query(self) -> ast.SelectQuery: ctes = { source_alias: ast.CTE(name=source_alias, expr=source_query, cte_type="subquery"), } - if True: - if isinstance(self.strategy, PersonStrategy) and any( - isinstance(x, C) for x in [self.query.source.source] for C in (TrendsQuery,) - ): - s = parse_select("SELECT distinct actor_id as person_id FROM source") - s.select_from.table = source_query - # How to get rid of the extra superfluous select - ctes["person_ids"] = ast.CTE(name="person_ids", expr=s, cte_type="subquery") + if isinstance(self.strategy, PersonStrategy) and any( + isinstance(x, C) for x in [self.query.source.source] for C in (TrendsQuery,) + ): + s = parse_select("SELECT distinct actor_id as person_id FROM source") + s.select_from.table = source_query + # This feels like it adds one extra level of SELECT which is unnecessary + ctes["person_ids"] = ast.CTE(name="person_ids", expr=s, cte_type="subquery") stmt = ast.SelectQuery( ctes=ctes, diff --git a/posthog/hogql_queries/insights/test/test_insight_actors_query_runner.py b/posthog/hogql_queries/insights/test/test_insight_actors_query_runner.py index 51e078def6db0..3498838f0cc5e 100644 --- a/posthog/hogql_queries/insights/test/test_insight_actors_query_runner.py +++ b/posthog/hogql_queries/insights/test/test_insight_actors_query_runner.py @@ -1,4 +1,5 @@ from typing import Any, Optional +import re from freezegun import freeze_time @@ -217,23 +218,26 @@ def test_insight_persons_trends_query_with_argmaxV2(self): self.team.timezone = "US/Pacific" self.team.save() - response = self.select( - """ - select * from ( - - - } - series={[]} - /> - - + with self.capture_queries(lambda query: re.match("^SELECT\s+name\s+AS\s+name", query)) as queries: + response = self.select( + """ + select * from ( + + + } + series={[]} + /> + + + ) + """, + modifiers={"personsArgMaxVersion": PersonsArgMaxVersion.V2}, ) - """, - modifiers={"personsArgMaxVersion": PersonsArgMaxVersion.V2}, - ) self.assertEqual([("p2",)], response.results) + assert "in(distinct_id" in queries[0] + assert "in(person.id" in queries[0] @snapshot_clickhouse_queries def test_insight_persons_trends_query_with_argmaxV1(self): @@ -241,23 +245,26 @@ def test_insight_persons_trends_query_with_argmaxV1(self): self.team.timezone = "US/Pacific" self.team.save() - response = self.select( - """ - select * from ( - - - } - series={[]} - /> - - + with self.capture_queries(lambda query: re.match("^SELECT\s+name\s+AS\s+name", query)) as queries: + response = self.select( + """ + select * from ( + + + } + series={[]} + /> + + + ) + """, + modifiers={"personsArgMaxVersion": PersonsArgMaxVersion.V1}, ) - """, - modifiers={"personsArgMaxVersion": PersonsArgMaxVersion.V1}, - ) self.assertEqual([("p2",)], response.results) + assert "in(distinct_id" in queries[0] + assert "in(person.id" in queries[0] @snapshot_clickhouse_queries def test_insight_persons_trends_groups_query(self): diff --git a/posthog/hogql_queries/insights/trends/test/__snapshots__/test_trends.ambr b/posthog/hogql_queries/insights/trends/test/__snapshots__/test_trends.ambr index 23ff710fea0ed..731939491b535 100644 --- a/posthog/hogql_queries/insights/trends/test/__snapshots__/test_trends.ambr +++ b/posthog/hogql_queries/insights/trends/test/__snapshots__/test_trends.ambr @@ -245,6 +245,7 @@ groupUniqArray(100)(tuple(timestamp, uuid, `$session_id`, `$window_id`)) AS matching_events FROM (SELECT e.person_id AS actor_id, + e.distinct_id AS distinct_id, toTimeZone(e.timestamp, 'UTC') AS timestamp, e.uuid AS uuid, e.`$session_id` AS `$session_id`, @@ -259,12 +260,28 @@ GROUP BY groups.group_type_index, groups.group_key) AS e__group_0 ON equals(e.`$group_0`, e__group_0.key) WHERE and(equals(e.team_id, 2), equals(e.event, 'sign up'), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2020-01-02 00:00:00.000000', 6, 'UTC')), less(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2020-01-03 00:00:00.000000', 6, 'UTC')), ifNull(equals(e__group_0.properties___industry, 'technology'), 0))) - GROUP BY actor_id) AS source + GROUP BY actor_id SETTINGS use_query_cache=1, + query_cache_ttl=600) AS source INNER JOIN (SELECT argMax(person.created_at, person.version) AS created_at, person.id AS id FROM person - WHERE equals(person.team_id, 2) + WHERE and(equals(person.team_id, 2), in(person.id, + (SELECT person_ids.person_id AS person_id + FROM + (SELECT DISTINCT actor_id AS person_id + FROM + (SELECT actor_id AS actor_id, count() AS event_count, groupUniqArray(100)(tuple(timestamp, uuid, `$session_id`, `$window_id`)) AS matching_events + FROM + (SELECT e.person_id AS actor_id, e.distinct_id AS distinct_id, e.timestamp AS timestamp, e.uuid AS uuid, e.`$session_id` AS `$session_id`, e.`$window_id` AS `$window_id` + FROM events AS e + LEFT JOIN + (SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(groups.group_properties, 'industry'), ''), 'null'), '^"|"$', ''), groups._timestamp) AS properties___industry, groups.group_type_index AS index, groups.group_key AS key + FROM groups + WHERE and(equals(groups.team_id, 2), ifNull(equals(index, 0), 0)) + GROUP BY groups.group_type_index, groups.group_key) AS e__group_0 ON equals(e.`$group_0`, e__group_0.key) + WHERE and(equals(e.team_id, 2), equals(e.event, 'sign up'), greaterOrEquals(e.timestamp, toDateTime64('2020-01-02 00:00:00.000000', 6, 'UTC')), less(e.timestamp, toDateTime64('2020-01-03 00:00:00.000000', 6, 'UTC')), ifNull(equals(e__group_0.properties___industry, 'technology'), 0))) + GROUP BY actor_id SETTINGS use_query_cache=1, query_cache_ttl=600)) AS person_ids))) GROUP BY person.id HAVING and(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(person.created_at, person.version), plus(now64(6, 'UTC'), toIntervalDay(1))), 0)) SETTINGS optimize_aggregation_in_order=1) AS persons ON equals(persons.id, source.actor_id) ORDER BY source.event_count DESC @@ -1050,6 +1067,7 @@ groupUniqArray(100)(tuple(timestamp, uuid, `$session_id`, `$window_id`)) AS matching_events FROM (SELECT e.person_id AS actor_id, + e.distinct_id AS distinct_id, toTimeZone(e.timestamp, 'UTC') AS timestamp, e.uuid AS uuid, e.`$session_id` AS `$session_id`, @@ -1072,12 +1090,33 @@ GROUP BY groups.group_type_index, groups.group_key) AS e__group_0 ON equals(e.`$group_0`, e__group_0.key) WHERE and(equals(e.team_id, 2), equals(e.event, 'sign up'), and(ifNull(equals(e__group_0.properties___industry, 'finance'), 0), ifNull(equals(e__group_2.properties___name, 'six'), 0)), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2020-01-02 00:00:00.000000', 6, 'UTC')), less(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2020-01-03 00:00:00.000000', 6, 'UTC')))) - GROUP BY actor_id) AS source + GROUP BY actor_id SETTINGS use_query_cache=1, + query_cache_ttl=600) AS source INNER JOIN (SELECT argMax(person.created_at, person.version) AS created_at, person.id AS id FROM person - WHERE equals(person.team_id, 2) + WHERE and(equals(person.team_id, 2), in(person.id, + (SELECT person_ids.person_id AS person_id + FROM + (SELECT DISTINCT actor_id AS person_id + FROM + (SELECT actor_id AS actor_id, count() AS event_count, groupUniqArray(100)(tuple(timestamp, uuid, `$session_id`, `$window_id`)) AS matching_events + FROM + (SELECT e.person_id AS actor_id, e.distinct_id AS distinct_id, e.timestamp AS timestamp, e.uuid AS uuid, e.`$session_id` AS `$session_id`, e.`$window_id` AS `$window_id` + FROM events AS e + LEFT JOIN + (SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(groups.group_properties, 'name'), ''), 'null'), '^"|"$', ''), groups._timestamp) AS properties___name, groups.group_type_index AS index, groups.group_key AS key + FROM groups + WHERE and(equals(groups.team_id, 2), ifNull(equals(index, 2), 0)) + GROUP BY groups.group_type_index, groups.group_key) AS e__group_2 ON equals(e.`$group_2`, e__group_2.key) + LEFT JOIN + (SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(groups.group_properties, 'industry'), ''), 'null'), '^"|"$', ''), groups._timestamp) AS properties___industry, groups.group_type_index AS index, groups.group_key AS key + FROM groups + WHERE and(equals(groups.team_id, 2), ifNull(equals(index, 0), 0)) + GROUP BY groups.group_type_index, groups.group_key) AS e__group_0 ON equals(e.`$group_0`, e__group_0.key) + WHERE and(equals(e.team_id, 2), equals(e.event, 'sign up'), and(ifNull(equals(e__group_0.properties___industry, 'finance'), 0), ifNull(equals(e__group_2.properties___name, 'six'), 0)), greaterOrEquals(e.timestamp, toDateTime64('2020-01-02 00:00:00.000000', 6, 'UTC')), less(e.timestamp, toDateTime64('2020-01-03 00:00:00.000000', 6, 'UTC')))) + GROUP BY actor_id SETTINGS use_query_cache=1, query_cache_ttl=600)) AS person_ids))) GROUP BY person.id HAVING and(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(person.created_at, person.version), plus(now64(6, 'UTC'), toIntervalDay(1))), 0)) SETTINGS optimize_aggregation_in_order=1) AS persons ON equals(persons.id, source.actor_id) ORDER BY source.event_count DESC diff --git a/posthog/hogql_queries/insights/trends/trends_actors_query_builder.py b/posthog/hogql_queries/insights/trends/trends_actors_query_builder.py index 485ba0d1a265b..dde7832e815d3 100644 --- a/posthog/hogql_queries/insights/trends/trends_actors_query_builder.py +++ b/posthog/hogql_queries/insights/trends/trends_actors_query_builder.py @@ -28,6 +28,8 @@ TrendsFilter, TrendsQuery, CompareFilter, + BreakdownType, + PersonPropertyFilter, ) from posthog.settings import HOGQL_INCREASED_MAX_EXECUTION_TIME @@ -166,7 +168,6 @@ def is_total_value(self) -> bool: return self.trends_display.is_total_value() def build_actors_query(self) -> ast.SelectQuery | ast.SelectUnionQuery: - # Insert CTE here cte_events_query = self._cte_events_query() if cte_events_query.settings is None: cte_events_query.settings = HogQLQuerySettings() @@ -215,13 +216,14 @@ def _get_events_query(self) -> ast.SelectQuery: def _cte_events_query(self) -> ast.SelectQuery: return ast.SelectQuery( - select=[ast.Field(chain=["*"])], # Filter this down to save space + # Could filter this down to what we actually use to save memory + select=[ast.Field(chain=["*"])], select_from=ast.JoinExpr( table=ast.Field(chain=["events"]), alias="e", sample=self._sample_expr(), ), - where=self._cte_events_where_expr(), + where=self._persons_cte_events_where_expr(), ) def _get_actor_value_expr(self) -> ast.Expr: @@ -248,13 +250,17 @@ def _events_where_expr(self, with_breakdown_expr: bool = True) -> ast.And: ] ) - def _cte_events_where_expr(self, with_breakdown_expr: bool = True) -> ast.And: + def _persons_cte_events_where_expr(self, with_breakdown_expr: bool = True) -> ast.And: return ast.And( exprs=[ *self._entity_where_expr(), # *self._prop_where_expr(), *self._date_where_expr(), - *(self._breakdown_where_expr() if with_breakdown_expr else []), + *( + self._breakdown_where_expr() + if with_breakdown_expr and self.trends_query.breakdownFilter.breakdown_type != BreakdownType.PERSON + else [] + ), *self._filter_empty_actors_expr(), ] ) @@ -293,12 +299,13 @@ def _entity_where_expr(self) -> list[ast.Expr]: return conditions - def _prop_where_expr(self) -> list[ast.Expr]: + def _prop_where_expr(self, exclude_person_props=False) -> list[ast.Expr]: conditions: list[ast.Expr] = [] # Filter Test Accounts if ( - self.trends_query.filterTestAccounts + not exclude_person_props + and self.trends_query.filterTestAccounts and isinstance(self.team.test_account_filters, list) and len(self.team.test_account_filters) > 0 ): @@ -307,7 +314,10 @@ def _prop_where_expr(self) -> list[ast.Expr]: # Properties if self.trends_query.properties is not None and self.trends_query.properties != []: - conditions.append(property_to_expr(self.trends_query.properties, self.team)) + properties = self.trends_query.properties + if exclude_person_props: + properties = [x for x in properties if isinstance(x, PersonPropertyFilter)] + conditions.append(property_to_expr(properties, self.team)) return conditions diff --git a/posthog/test/base.py b/posthog/test/base.py index 6258bd34ee6b5..6e9922910a683 100644 --- a/posthog/test/base.py +++ b/posthog/test/base.py @@ -885,10 +885,13 @@ class ClickhouseTestMixin(QueryMatchingTest): snapshot: Any def capture_select_queries(self): - return self.capture_queries(("SELECT", "WITH", "select", "with")) + return self.capture_queries_startswith(("SELECT", "WITH", "select", "with")) + + def capture_queries_startswith(self, query_prefixes: Union[str, tuple[str, ...]]): + return self.capture_queries(lambda x: x.startswith(query_prefixes)) @contextmanager - def capture_queries(self, query_prefixes: Union[str, tuple[str, ...]]): + def capture_queries(self, query_filter: Callable[[str], bool]): queries = [] original_get_client = ch_pool.get_client @@ -901,7 +904,7 @@ def get_client(): original_client_execute = client.execute def execute_wrapper(query, *args, **kwargs): - if sqlparse.format(query, strip_comments=True).strip().startswith(query_prefixes): + if query_filter(sqlparse.format(query, strip_comments=True).strip()): queries.append(query) return original_client_execute(query, *args, **kwargs)