diff --git a/posthog/api/test/test_cohort.py b/posthog/api/test/test_cohort.py
index 14b6b60b51484..eaf2c4a3c6f41 100644
--- a/posthog/api/test/test_cohort.py
+++ b/posthog/api/test/test_cohort.py
@@ -33,7 +33,7 @@
class TestCohort(TestExportMixin, ClickhouseTestMixin, APIBaseTest, QueryMatchingTest):
# select all queries for snapshots
def capture_select_queries(self):
- return self.capture_queries(("INSERT INTO cohortpeople", "SELECT", "ALTER", "select", "DELETE"))
+ return self.capture_queries_startswith(("INSERT INTO cohortpeople", "SELECT", "ALTER", "select", "DELETE"))
def _get_cohort_activity(
self,
@@ -101,7 +101,7 @@ def test_creating_update_and_calculating(self, patch_sync_execute, patch_calcula
},
)
- with self.capture_queries("INSERT INTO cohortpeople") as insert_statements:
+ with self.capture_queries_startswith("INSERT INTO cohortpeople") as insert_statements:
response = self.client.patch(
f"/api/projects/{self.team.id}/cohorts/{response.json()['id']}",
data={
diff --git a/posthog/hogql/visitor.py b/posthog/hogql/visitor.py
index cc27920dd3b81..80e573124b307 100644
--- a/posthog/hogql/visitor.py
+++ b/posthog/hogql/visitor.py
@@ -114,7 +114,7 @@ def visit_join_expr(self, node: ast.JoinExpr):
def visit_select_query(self, node: ast.SelectQuery):
# :TRICKY: when adding new fields, also add them to visit_select_query of resolver.py
- # pass the CTEs of the node to its children
+ # pass the CTEs of the node to select_froms (needed for nested joins to have access to CTEs)
if node.type is not None and node.type.ctes is not None and node.select_from is not None and hasattr(node.select_from.type, "ctes"):
node.select_from.type.ctes = {**node.type.ctes, **node.select_from.type.ctes}
self.visit(node.select_from)
diff --git a/posthog/hogql_queries/actors_query_runner.py b/posthog/hogql_queries/actors_query_runner.py
index a717f9bf3fad0..3746d5aeeb1ae 100644
--- a/posthog/hogql_queries/actors_query_runner.py
+++ b/posthog/hogql_queries/actors_query_runner.py
@@ -272,14 +272,13 @@ def to_query(self) -> ast.SelectQuery:
ctes = {
source_alias: ast.CTE(name=source_alias, expr=source_query, cte_type="subquery"),
}
- if True:
- if isinstance(self.strategy, PersonStrategy) and any(
- isinstance(x, C) for x in [self.query.source.source] for C in (TrendsQuery,)
- ):
- s = parse_select("SELECT distinct actor_id as person_id FROM source")
- s.select_from.table = source_query
- # How to get rid of the extra superfluous select
- ctes["person_ids"] = ast.CTE(name="person_ids", expr=s, cte_type="subquery")
+ if isinstance(self.strategy, PersonStrategy) and any(
+ isinstance(x, C) for x in [self.query.source.source] for C in (TrendsQuery,)
+ ):
+ s = parse_select("SELECT distinct actor_id as person_id FROM source")
+ s.select_from.table = source_query
+ # This feels like it adds one extra level of SELECT which is unnecessary
+ ctes["person_ids"] = ast.CTE(name="person_ids", expr=s, cte_type="subquery")
stmt = ast.SelectQuery(
ctes=ctes,
diff --git a/posthog/hogql_queries/insights/test/test_insight_actors_query_runner.py b/posthog/hogql_queries/insights/test/test_insight_actors_query_runner.py
index 51e078def6db0..3498838f0cc5e 100644
--- a/posthog/hogql_queries/insights/test/test_insight_actors_query_runner.py
+++ b/posthog/hogql_queries/insights/test/test_insight_actors_query_runner.py
@@ -1,4 +1,5 @@
from typing import Any, Optional
+import re
from freezegun import freeze_time
@@ -217,23 +218,26 @@ def test_insight_persons_trends_query_with_argmaxV2(self):
self.team.timezone = "US/Pacific"
self.team.save()
- response = self.select(
- """
- select * from (
-
-
- }
- series={[]}
- />
-
-
+ with self.capture_queries(lambda query: re.match(r"^SELECT\s+name\s+AS\s+name", query)) as queries:
+ response = self.select(
+ """
+ select * from (
+
+
+ }
+ series={[]}
+ />
+
+
+ )
+ """,
+ modifiers={"personsArgMaxVersion": PersonsArgMaxVersion.V2},
)
- """,
- modifiers={"personsArgMaxVersion": PersonsArgMaxVersion.V2},
- )
self.assertEqual([("p2",)], response.results)
+ assert "in(distinct_id" in queries[0]
+ assert "in(person.id" in queries[0]
@snapshot_clickhouse_queries
def test_insight_persons_trends_query_with_argmaxV1(self):
@@ -241,23 +245,26 @@ def test_insight_persons_trends_query_with_argmaxV1(self):
self.team.timezone = "US/Pacific"
self.team.save()
- response = self.select(
- """
- select * from (
-
-
- }
- series={[]}
- />
-
-
+ with self.capture_queries(lambda query: re.match(r"^SELECT\s+name\s+AS\s+name", query)) as queries:
+ response = self.select(
+ """
+ select * from (
+
+
+ }
+ series={[]}
+ />
+
+
+ )
+ """,
+ modifiers={"personsArgMaxVersion": PersonsArgMaxVersion.V1},
)
- """,
- modifiers={"personsArgMaxVersion": PersonsArgMaxVersion.V1},
- )
self.assertEqual([("p2",)], response.results)
+ assert "in(distinct_id" in queries[0]
+ assert "in(person.id" in queries[0]
@snapshot_clickhouse_queries
def test_insight_persons_trends_groups_query(self):
diff --git a/posthog/hogql_queries/insights/trends/test/__snapshots__/test_trends.ambr b/posthog/hogql_queries/insights/trends/test/__snapshots__/test_trends.ambr
index 23ff710fea0ed..731939491b535 100644
--- a/posthog/hogql_queries/insights/trends/test/__snapshots__/test_trends.ambr
+++ b/posthog/hogql_queries/insights/trends/test/__snapshots__/test_trends.ambr
@@ -245,6 +245,7 @@
groupUniqArray(100)(tuple(timestamp, uuid, `$session_id`, `$window_id`)) AS matching_events
FROM
(SELECT e.person_id AS actor_id,
+ e.distinct_id AS distinct_id,
toTimeZone(e.timestamp, 'UTC') AS timestamp,
e.uuid AS uuid,
e.`$session_id` AS `$session_id`,
@@ -259,12 +260,28 @@
GROUP BY groups.group_type_index,
groups.group_key) AS e__group_0 ON equals(e.`$group_0`, e__group_0.key)
WHERE and(equals(e.team_id, 2), equals(e.event, 'sign up'), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2020-01-02 00:00:00.000000', 6, 'UTC')), less(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2020-01-03 00:00:00.000000', 6, 'UTC')), ifNull(equals(e__group_0.properties___industry, 'technology'), 0)))
- GROUP BY actor_id) AS source
+ GROUP BY actor_id SETTINGS use_query_cache=1,
+ query_cache_ttl=600) AS source
INNER JOIN
(SELECT argMax(person.created_at, person.version) AS created_at,
person.id AS id
FROM person
- WHERE equals(person.team_id, 2)
+ WHERE and(equals(person.team_id, 2), in(person.id,
+ (SELECT person_ids.person_id AS person_id
+ FROM
+ (SELECT DISTINCT actor_id AS person_id
+ FROM
+ (SELECT actor_id AS actor_id, count() AS event_count, groupUniqArray(100)(tuple(timestamp, uuid, `$session_id`, `$window_id`)) AS matching_events
+ FROM
+ (SELECT e.person_id AS actor_id, e.distinct_id AS distinct_id, e.timestamp AS timestamp, e.uuid AS uuid, e.`$session_id` AS `$session_id`, e.`$window_id` AS `$window_id`
+ FROM events AS e
+ LEFT JOIN
+ (SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(groups.group_properties, 'industry'), ''), 'null'), '^"|"$', ''), groups._timestamp) AS properties___industry, groups.group_type_index AS index, groups.group_key AS key
+ FROM groups
+ WHERE and(equals(groups.team_id, 2), ifNull(equals(index, 0), 0))
+ GROUP BY groups.group_type_index, groups.group_key) AS e__group_0 ON equals(e.`$group_0`, e__group_0.key)
+ WHERE and(equals(e.team_id, 2), equals(e.event, 'sign up'), greaterOrEquals(e.timestamp, toDateTime64('2020-01-02 00:00:00.000000', 6, 'UTC')), less(e.timestamp, toDateTime64('2020-01-03 00:00:00.000000', 6, 'UTC')), ifNull(equals(e__group_0.properties___industry, 'technology'), 0)))
+ GROUP BY actor_id SETTINGS use_query_cache=1, query_cache_ttl=600)) AS person_ids)))
GROUP BY person.id
HAVING and(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(person.created_at, person.version), plus(now64(6, 'UTC'), toIntervalDay(1))), 0)) SETTINGS optimize_aggregation_in_order=1) AS persons ON equals(persons.id, source.actor_id)
ORDER BY source.event_count DESC
@@ -1050,6 +1067,7 @@
groupUniqArray(100)(tuple(timestamp, uuid, `$session_id`, `$window_id`)) AS matching_events
FROM
(SELECT e.person_id AS actor_id,
+ e.distinct_id AS distinct_id,
toTimeZone(e.timestamp, 'UTC') AS timestamp,
e.uuid AS uuid,
e.`$session_id` AS `$session_id`,
@@ -1072,12 +1090,33 @@
GROUP BY groups.group_type_index,
groups.group_key) AS e__group_0 ON equals(e.`$group_0`, e__group_0.key)
WHERE and(equals(e.team_id, 2), equals(e.event, 'sign up'), and(ifNull(equals(e__group_0.properties___industry, 'finance'), 0), ifNull(equals(e__group_2.properties___name, 'six'), 0)), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2020-01-02 00:00:00.000000', 6, 'UTC')), less(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2020-01-03 00:00:00.000000', 6, 'UTC'))))
- GROUP BY actor_id) AS source
+ GROUP BY actor_id SETTINGS use_query_cache=1,
+ query_cache_ttl=600) AS source
INNER JOIN
(SELECT argMax(person.created_at, person.version) AS created_at,
person.id AS id
FROM person
- WHERE equals(person.team_id, 2)
+ WHERE and(equals(person.team_id, 2), in(person.id,
+ (SELECT person_ids.person_id AS person_id
+ FROM
+ (SELECT DISTINCT actor_id AS person_id
+ FROM
+ (SELECT actor_id AS actor_id, count() AS event_count, groupUniqArray(100)(tuple(timestamp, uuid, `$session_id`, `$window_id`)) AS matching_events
+ FROM
+ (SELECT e.person_id AS actor_id, e.distinct_id AS distinct_id, e.timestamp AS timestamp, e.uuid AS uuid, e.`$session_id` AS `$session_id`, e.`$window_id` AS `$window_id`
+ FROM events AS e
+ LEFT JOIN
+ (SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(groups.group_properties, 'name'), ''), 'null'), '^"|"$', ''), groups._timestamp) AS properties___name, groups.group_type_index AS index, groups.group_key AS key
+ FROM groups
+ WHERE and(equals(groups.team_id, 2), ifNull(equals(index, 2), 0))
+ GROUP BY groups.group_type_index, groups.group_key) AS e__group_2 ON equals(e.`$group_2`, e__group_2.key)
+ LEFT JOIN
+ (SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(groups.group_properties, 'industry'), ''), 'null'), '^"|"$', ''), groups._timestamp) AS properties___industry, groups.group_type_index AS index, groups.group_key AS key
+ FROM groups
+ WHERE and(equals(groups.team_id, 2), ifNull(equals(index, 0), 0))
+ GROUP BY groups.group_type_index, groups.group_key) AS e__group_0 ON equals(e.`$group_0`, e__group_0.key)
+ WHERE and(equals(e.team_id, 2), equals(e.event, 'sign up'), and(ifNull(equals(e__group_0.properties___industry, 'finance'), 0), ifNull(equals(e__group_2.properties___name, 'six'), 0)), greaterOrEquals(e.timestamp, toDateTime64('2020-01-02 00:00:00.000000', 6, 'UTC')), less(e.timestamp, toDateTime64('2020-01-03 00:00:00.000000', 6, 'UTC'))))
+ GROUP BY actor_id SETTINGS use_query_cache=1, query_cache_ttl=600)) AS person_ids)))
GROUP BY person.id
HAVING and(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(person.created_at, person.version), plus(now64(6, 'UTC'), toIntervalDay(1))), 0)) SETTINGS optimize_aggregation_in_order=1) AS persons ON equals(persons.id, source.actor_id)
ORDER BY source.event_count DESC
diff --git a/posthog/hogql_queries/insights/trends/trends_actors_query_builder.py b/posthog/hogql_queries/insights/trends/trends_actors_query_builder.py
index 485ba0d1a265b..dde7832e815d3 100644
--- a/posthog/hogql_queries/insights/trends/trends_actors_query_builder.py
+++ b/posthog/hogql_queries/insights/trends/trends_actors_query_builder.py
@@ -28,6 +28,8 @@
TrendsFilter,
TrendsQuery,
CompareFilter,
+ BreakdownType,
+ PersonPropertyFilter,
)
from posthog.settings import HOGQL_INCREASED_MAX_EXECUTION_TIME
@@ -166,7 +168,6 @@ def is_total_value(self) -> bool:
return self.trends_display.is_total_value()
def build_actors_query(self) -> ast.SelectQuery | ast.SelectUnionQuery:
- # Insert CTE here
cte_events_query = self._cte_events_query()
if cte_events_query.settings is None:
cte_events_query.settings = HogQLQuerySettings()
@@ -215,13 +216,14 @@ def _get_events_query(self) -> ast.SelectQuery:
def _cte_events_query(self) -> ast.SelectQuery:
return ast.SelectQuery(
- select=[ast.Field(chain=["*"])], # Filter this down to save space
+ # Could filter this down to what we actually use to save memory
+ select=[ast.Field(chain=["*"])],
select_from=ast.JoinExpr(
table=ast.Field(chain=["events"]),
alias="e",
sample=self._sample_expr(),
),
- where=self._cte_events_where_expr(),
+ where=self._persons_cte_events_where_expr(),
)
def _get_actor_value_expr(self) -> ast.Expr:
@@ -248,13 +250,17 @@ def _events_where_expr(self, with_breakdown_expr: bool = True) -> ast.And:
]
)
- def _cte_events_where_expr(self, with_breakdown_expr: bool = True) -> ast.And:
+ def _persons_cte_events_where_expr(self, with_breakdown_expr: bool = True) -> ast.And:
return ast.And(
exprs=[
*self._entity_where_expr(),
# *self._prop_where_expr(),
*self._date_where_expr(),
- *(self._breakdown_where_expr() if with_breakdown_expr else []),
+ *(
+ self._breakdown_where_expr()
+ if with_breakdown_expr
+ and (
+ self.trends_query.breakdownFilter is None
+ or self.trends_query.breakdownFilter.breakdown_type != BreakdownType.PERSON
+ )
+ else []
+ ),
*self._filter_empty_actors_expr(),
]
)
@@ -293,12 +299,13 @@ def _entity_where_expr(self) -> list[ast.Expr]:
return conditions
- def _prop_where_expr(self) -> list[ast.Expr]:
+ def _prop_where_expr(self, exclude_person_props=False) -> list[ast.Expr]:
conditions: list[ast.Expr] = []
# Filter Test Accounts
if (
- self.trends_query.filterTestAccounts
+ not exclude_person_props
+ and self.trends_query.filterTestAccounts
and isinstance(self.team.test_account_filters, list)
and len(self.team.test_account_filters) > 0
):
@@ -307,7 +314,10 @@ def _prop_where_expr(self) -> list[ast.Expr]:
# Properties
if self.trends_query.properties is not None and self.trends_query.properties != []:
- conditions.append(property_to_expr(self.trends_query.properties, self.team))
+ properties = self.trends_query.properties
+ if exclude_person_props:
+ properties = [x for x in properties if not isinstance(x, PersonPropertyFilter)]
+ conditions.append(property_to_expr(properties, self.team))
return conditions
diff --git a/posthog/test/base.py b/posthog/test/base.py
index 6258bd34ee6b5..6e9922910a683 100644
--- a/posthog/test/base.py
+++ b/posthog/test/base.py
@@ -885,10 +885,13 @@ class ClickhouseTestMixin(QueryMatchingTest):
snapshot: Any
def capture_select_queries(self):
- return self.capture_queries(("SELECT", "WITH", "select", "with"))
+ return self.capture_queries_startswith(("SELECT", "WITH", "select", "with"))
+
+ def capture_queries_startswith(self, query_prefixes: Union[str, tuple[str, ...]]):
+ return self.capture_queries(lambda x: x.startswith(query_prefixes))
@contextmanager
- def capture_queries(self, query_prefixes: Union[str, tuple[str, ...]]):
+ def capture_queries(self, query_filter: Callable[[str], bool]):
queries = []
original_get_client = ch_pool.get_client
@@ -901,7 +904,7 @@ def get_client():
original_client_execute = client.execute
def execute_wrapper(query, *args, **kwargs):
- if sqlparse.format(query, strip_comments=True).strip().startswith(query_prefixes):
+ if query_filter(sqlparse.format(query, strip_comments=True).strip()):
queries.append(query)
return original_client_execute(query, *args, **kwargs)