From 4c070fe00afa068e7ae9ddc35236c2dc46da9a80 Mon Sep 17 00:00:00 2001 From: Georgiy Tarasov Date: Tue, 17 Dec 2024 18:47:25 +0100 Subject: [PATCH 1/2] feat: most used property values --- frontend/src/queries/schema.json | 3 + frontend/src/queries/schema.ts | 1 + .../ai/event_taxonomy_query_runner.py | 80 ++++++++++++----- .../test/test_event_taxonomy_query_runner.py | 85 +++++++++++++++++++ posthog/schema.py | 1 + 5 files changed, 151 insertions(+), 19 deletions(-) diff --git a/frontend/src/queries/schema.json b/frontend/src/queries/schema.json index b81e4669af38a..e079ddf8ca393 100644 --- a/frontend/src/queries/schema.json +++ b/frontend/src/queries/schema.json @@ -5630,6 +5630,9 @@ "$ref": "#/definitions/HogQLQueryModifiers", "description": "Modifiers used when performing the query" }, + "property": { + "type": "string" + }, "response": { "$ref": "#/definitions/EventTaxonomyQueryResponse" } diff --git a/frontend/src/queries/schema.ts b/frontend/src/queries/schema.ts index 7375910003a3f..a3501f8a94a39 100644 --- a/frontend/src/queries/schema.ts +++ b/frontend/src/queries/schema.ts @@ -2420,6 +2420,7 @@ export type EventTaxonomyResponse = EventTaxonomyItem[] export interface EventTaxonomyQuery extends DataNode { kind: NodeKind.EventTaxonomyQuery event: string + property?: string } export type EventTaxonomyQueryResponse = AnalyticsQueryResponseBase diff --git a/posthog/hogql_queries/ai/event_taxonomy_query_runner.py b/posthog/hogql_queries/ai/event_taxonomy_query_runner.py index c00f2a4f771c4..681553e150a1e 100644 --- a/posthog/hogql_queries/ai/event_taxonomy_query_runner.py +++ b/posthog/hogql_queries/ai/event_taxonomy_query_runner.py @@ -50,8 +50,9 @@ def calculate(self): ) def to_query(self) -> ast.SelectQuery | ast.SelectSetQuery: - query = parse_select( - """ + if not self.query.property: + return parse_select( + """ SELECT key, -- Pick five latest distinct sample values. @@ -64,10 +65,22 @@ def to_query(self) -> ast.SelectQuery | ast.SelectSetQuery: ORDER BY total_count DESC LIMIT 500 """, - placeholders={"from_query": self._get_subquery(), "filter": self._get_omit_filter()}, - ) + placeholders={"from_query": self._get_subquery(), "filter": self._get_omit_filter()}, + ) - return query + return parse_select( + """ + SELECT + {const}, + arraySlice(arrayDistinct(groupArray(value)), 1, 5) AS values, + count(DISTINCT value) AS total_count + FROM {from_query} + """, + placeholders={ + "const": ast.Constant(value=self.query.property), + "from_query": self._get_subquery(), + }, + ) def _get_omit_filter(self): """ @@ -107,21 +120,50 @@ def _get_omit_filter(self): def _get_subquery_filter(self) -> ast.Expr: date_filter = parse_expr("timestamp >= now() - INTERVAL 30 DAY") - filter_expr = ast.And( - exprs=[ - date_filter, + filter_expr: list[ast.Expr] = [ + date_filter, + ast.CompareOperation( + left=ast.Field(chain=["event"]), + right=ast.Constant(value=self.query.event), + op=ast.CompareOperationOp.Eq, + ), + ] + + if self.query.property: + filter_expr.append( ast.CompareOperation( - left=ast.Field(chain=["event"]), - right=ast.Constant(value=self.query.event), - op=ast.CompareOperationOp.Eq, - ), - ] - ) - return filter_expr + left=ast.Field(chain=["properties", self.query.property]), + op=ast.CompareOperationOp.NotEq, + right=ast.Constant(value=""), + ) + ) + + return ast.And(exprs=filter_expr) def _get_subquery(self) -> ast.SelectQuery: - query = parse_select( - """ + if self.query.property: + query = parse_select( + """ + SELECT + {prop} as value, + count(*) AS count + FROM + events + WHERE + {filter} + GROUP BY + value + ORDER BY + count DESC + """, + placeholders={ + "prop": ast.Field(chain=["properties", self.query.property]), + "filter": self._get_subquery_filter(), + }, + ) + else: + query = parse_select( + """ SELECT JSONExtractKeysAndValues(properties, 'String') as kv FROM @@ -130,7 +172,7 @@ def _get_subquery(self) -> ast.SelectQuery: ORDER BY timestamp desc LIMIT 100 """, - placeholders={"filter": self._get_subquery_filter()}, - ) + placeholders={"filter": self._get_subquery_filter()}, + ) return cast(ast.SelectQuery, query) diff --git a/posthog/hogql_queries/ai/test/test_event_taxonomy_query_runner.py b/posthog/hogql_queries/ai/test/test_event_taxonomy_query_runner.py index 539033b40b21b..28cb3f498fd17 100644 --- a/posthog/hogql_queries/ai/test/test_event_taxonomy_query_runner.py +++ b/posthog/hogql_queries/ai/test/test_event_taxonomy_query_runner.py @@ -251,3 +251,88 @@ def test_limit(self): response = EventTaxonomyQueryRunner(team=self.team, query=EventTaxonomyQuery(event="event1")).calculate() self.assertEqual(len(response.results), 500) + + def test_property_taxonomy(self): + _create_person( + distinct_ids=["person1"], + properties={"email": "person1@example.com"}, + team=self.team, + ) + _create_person( + distinct_ids=["person2"], + properties={"email": "person1@example.com"}, + team=self.team, + ) + + _create_event( + event="event1", + distinct_id="person1", + properties={"$host": "us.posthog.com"}, + team=self.team, + ) + + for _ in range(10): + _create_event( + event="event1", + distinct_id="person1", + properties={"$host": "posthog.com"}, + team=self.team, + ) + + for _ in range(3): + _create_event( + event="event1", + distinct_id="person2", + properties={"$host": "eu.posthog.com"}, + team=self.team, + ) + + response = EventTaxonomyQueryRunner( + team=self.team, query=EventTaxonomyQuery(event="event1", property="$host") + ).calculate() + self.assertEqual(len(response.results), 1) + self.assertEqual(response.results[0].property, "$host") + self.assertEqual(response.results[0].sample_values, ["posthog.com", "eu.posthog.com", "us.posthog.com"]) + self.assertEqual(response.results[0].sample_count, 3) + + def test_property_taxonomy_filters(self): + _create_person( + distinct_ids=["person1"], + properties={"email": "person1@example.com"}, + team=self.team, + ) + _create_person( + distinct_ids=["person2"], + properties={"email": "person1@example.com"}, + team=self.team, + ) + + _create_event( + event="event1", + distinct_id="person1", + properties={"$host": "us.posthog.com", "$browser": "Chrome"}, + team=self.team, + ) + + for _ in range(10): + _create_event( + event="event2", + distinct_id="person1", + properties={"$host": "posthog.com", "prop": 10}, + team=self.team, + ) + + for _ in range(3): + _create_event( + event="event1", + distinct_id="person2", + team=self.team, + ) + + response = EventTaxonomyQueryRunner( + team=self.team, query=EventTaxonomyQuery(event="event1", property="$host") + ).calculate() + self.assertEqual(len(response.results), 1) + self.assertEqual(response.results[0].property, "$host") + self.assertEqual(response.results[0].sample_values, ["us.posthog.com"]) + self.assertEqual(response.results[0].sample_count, 1) diff --git a/posthog/schema.py b/posthog/schema.py index 003c22eac087b..77c4154cb78a7 100644 --- a/posthog/schema.py +++ b/posthog/schema.py @@ -5582,6 +5582,7 @@ class EventTaxonomyQuery(BaseModel): modifiers: Optional[HogQLQueryModifiers] = Field( default=None, description="Modifiers used when performing the query" ) + property: Optional[str] = None response: Optional[EventTaxonomyQueryResponse] = None From d50fb60bfb25c7078650b0d56c6234587b598524 Mon Sep 17 00:00:00 2001 From: Georgiy Tarasov Date: Tue, 17 Dec 2024 20:54:24 +0100 Subject: [PATCH 2/2] fix: support multiple events --- frontend/src/queries/schema.json | 7 +- frontend/src/queries/schema.ts | 2 +- .../ai/event_taxonomy_query_runner.py | 63 +++++++++----- .../test/test_event_taxonomy_query_runner.py | 83 ++++++++++++++++++- posthog/schema.py | 2 +- 5 files changed, 129 insertions(+), 28 deletions(-) diff --git a/frontend/src/queries/schema.json b/frontend/src/queries/schema.json index e079ddf8ca393..1d2a4d94012aa 100644 --- a/frontend/src/queries/schema.json +++ b/frontend/src/queries/schema.json @@ -5630,8 +5630,11 @@ "$ref": "#/definitions/HogQLQueryModifiers", "description": "Modifiers used when performing the query" }, - "property": { - "type": "string" + "properties": { + "items": { + "type": "string" + }, + "type": "array" }, "response": { "$ref": "#/definitions/EventTaxonomyQueryResponse" diff --git a/frontend/src/queries/schema.ts b/frontend/src/queries/schema.ts index a3501f8a94a39..5360ae06d99f4 100644 --- a/frontend/src/queries/schema.ts +++ b/frontend/src/queries/schema.ts @@ -2420,7 +2420,7 @@ export type EventTaxonomyResponse = EventTaxonomyItem[] export interface EventTaxonomyQuery extends DataNode { kind: NodeKind.EventTaxonomyQuery event: string - property?: string + properties?: string[] } export type EventTaxonomyQueryResponse = AnalyticsQueryResponseBase diff --git a/posthog/hogql_queries/ai/event_taxonomy_query_runner.py b/posthog/hogql_queries/ai/event_taxonomy_query_runner.py index 681553e150a1e..2b72e985b3a59 100644 --- a/posthog/hogql_queries/ai/event_taxonomy_query_runner.py +++ b/posthog/hogql_queries/ai/event_taxonomy_query_runner.py @@ -50,7 +50,7 @@ def calculate(self): ) def to_query(self) -> ast.SelectQuery | ast.SelectSetQuery: - if not self.query.property: + if not self.query.properties: return parse_select( """ SELECT @@ -71,13 +71,14 @@ def to_query(self) -> ast.SelectQuery | ast.SelectSetQuery: return parse_select( """ SELECT - {const}, + key, arraySlice(arrayDistinct(groupArray(value)), 1, 5) AS values, count(DISTINCT value) AS total_count FROM {from_query} + GROUP BY key + LIMIT 500 """, placeholders={ - "const": ast.Constant(value=self.query.property), "from_query": self._get_subquery(), }, ) @@ -129,35 +130,57 @@ def _get_subquery_filter(self) -> ast.Expr: ), ] - if self.query.property: + if self.query.properties: filter_expr.append( - ast.CompareOperation( - left=ast.Field(chain=["properties", self.query.property]), - op=ast.CompareOperationOp.NotEq, - right=ast.Constant(value=""), + ast.Or( + exprs=[ + ast.CompareOperation( + left=ast.Field(chain=["properties", prop]), + op=ast.CompareOperationOp.NotEq, + right=ast.Constant(value=""), + ) + for prop in self.query.properties + ] ) ) return ast.And(exprs=filter_expr) def _get_subquery(self) -> ast.SelectQuery: - if self.query.property: + if self.query.properties: query = parse_select( """ SELECT - {prop} as value, - count(*) AS count - FROM - events - WHERE - {filter} - GROUP BY - value - ORDER BY - count DESC + key, + value, + count() as count + FROM ( + SELECT + {props} as kv + FROM + events + WHERE {filter} + ) + ARRAY JOIN kv.1 AS key, kv.2 AS value + WHERE value != '' + GROUP BY key, value + ORDER BY count DESC """, placeholders={ - "prop": ast.Field(chain=["properties", self.query.property]), + "props": ast.Array( + exprs=[ + ast.Tuple( + exprs=[ + ast.Constant(value=prop), + ast.Call( + name="JSONExtractString", + args=[ast.Field(chain=["properties"]), ast.Constant(value=prop)], + ), + ] + ) + for prop in self.query.properties + ] + ), "filter": self._get_subquery_filter(), }, ) diff --git a/posthog/hogql_queries/ai/test/test_event_taxonomy_query_runner.py b/posthog/hogql_queries/ai/test/test_event_taxonomy_query_runner.py index 28cb3f498fd17..147bf5bcfdc14 100644 --- a/posthog/hogql_queries/ai/test/test_event_taxonomy_query_runner.py +++ b/posthog/hogql_queries/ai/test/test_event_taxonomy_query_runner.py @@ -252,7 +252,7 @@ def test_limit(self): response = EventTaxonomyQueryRunner(team=self.team, query=EventTaxonomyQuery(event="event1")).calculate() self.assertEqual(len(response.results), 500) - def test_property_taxonomy(self): + def test_property_taxonomy_returns_unique_values_for_specified_property(self): _create_person( distinct_ids=["person1"], properties={"email": "person1@example.com"}, @@ -288,14 +288,14 @@ def test_property_taxonomy(self): ) response = EventTaxonomyQueryRunner( - team=self.team, query=EventTaxonomyQuery(event="event1", property="$host") + team=self.team, query=EventTaxonomyQuery(event="event1", properties=["$host"]) ).calculate() self.assertEqual(len(response.results), 1) self.assertEqual(response.results[0].property, "$host") self.assertEqual(response.results[0].sample_values, ["posthog.com", "eu.posthog.com", "us.posthog.com"]) self.assertEqual(response.results[0].sample_count, 3) - def test_property_taxonomy_filters(self): + def test_property_taxonomy_filters_events_by_event_name(self): _create_person( distinct_ids=["person1"], properties={"email": "person1@example.com"}, @@ -330,9 +330,84 @@ def test_property_taxonomy_filters(self): ) response = EventTaxonomyQueryRunner( - team=self.team, query=EventTaxonomyQuery(event="event1", property="$host") + team=self.team, query=EventTaxonomyQuery(event="event1", properties=["$host"]) ).calculate() self.assertEqual(len(response.results), 1) self.assertEqual(response.results[0].property, "$host") self.assertEqual(response.results[0].sample_values, ["us.posthog.com"]) self.assertEqual(response.results[0].sample_count, 1) + + def test_property_taxonomy_handles_multiple_properties_in_query(self): + _create_person( + distinct_ids=["person1"], + properties={"email": "person1@example.com"}, + team=self.team, + ) + _create_person( + distinct_ids=["person2"], + properties={"email": "person1@example.com"}, + team=self.team, + ) + + _create_event( + event="event1", + distinct_id="person1", + properties={"$host": "us.posthog.com", "$browser": "Chrome"}, + team=self.team, + ) + + for _ in range(5): + _create_event( + event="event1", + distinct_id="person1", + properties={"$host": "posthog.com", "prop": 10}, + team=self.team, + ) + + for _ in range(3): + _create_event( + event="event1", + distinct_id="person2", + team=self.team, + ) + + response = EventTaxonomyQueryRunner( + team=self.team, query=EventTaxonomyQuery(event="event1", properties=["$host", "prop"]) + ).calculate() + self.assertEqual(len(response.results), 2) + self.assertEqual(response.results[0].property, "prop") + self.assertEqual(response.results[0].sample_values, ["10"]) + self.assertEqual(response.results[0].sample_count, 1) + self.assertEqual(response.results[1].property, "$host") + self.assertEqual(response.results[1].sample_values, ["posthog.com", "us.posthog.com"]) + self.assertEqual(response.results[1].sample_count, 2) + + def test_property_taxonomy_includes_events_with_partial_property_matches(self): + _create_person( + distinct_ids=["person1"], + properties={"email": "person1@example.com"}, + team=self.team, + ) + _create_event( + event="event1", + distinct_id="person1", + properties={"$host": "us.posthog.com"}, + team=self.team, + ) + _create_event( + event="event1", + distinct_id="person2", + properties={"prop": 10}, + team=self.team, + ) + + response = EventTaxonomyQueryRunner( + team=self.team, query=EventTaxonomyQuery(event="event1", properties=["$host", "prop"]) + ).calculate() + self.assertEqual(len(response.results), 2) + self.assertEqual(response.results[0].property, "prop") + self.assertEqual(response.results[0].sample_values, ["10"]) + self.assertEqual(response.results[0].sample_count, 1) + self.assertEqual(response.results[1].property, "$host") + self.assertEqual(response.results[1].sample_values, ["us.posthog.com"]) + self.assertEqual(response.results[1].sample_count, 1) diff --git a/posthog/schema.py b/posthog/schema.py index 77c4154cb78a7..564dcc321fa60 100644 --- a/posthog/schema.py +++ b/posthog/schema.py @@ -5582,7 +5582,7 @@ class EventTaxonomyQuery(BaseModel): modifiers: Optional[HogQLQueryModifiers] = Field( default=None, description="Modifiers used when performing the query" ) - property: Optional[str] = None + properties: Optional[list[str]] = None response: Optional[EventTaxonomyQueryResponse] = None