Skip to content

Commit

Permalink
perf: Also use property group optimizations for JSONHas on properti…
Browse files Browse the repository at this point in the history
…es materialized into their own columns (#24924)
  • Loading branch information
tkaemming authored Sep 12, 2024
1 parent 2c9e1db commit 51b77c5
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 29 deletions.
38 changes: 23 additions & 15 deletions posthog/hogql/printer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
from collections.abc import Iterable
from dataclasses import dataclass
from datetime import datetime, date
from difflib import get_close_matches
Expand Down Expand Up @@ -965,9 +966,14 @@ def resolve_field_type(expr: ast.Expr) -> ast.Type | None:
if not isinstance(field_type, ast.FieldType):
return None

property_source = self.__get_materialized_property_source(field_type, str(property_name))
if isinstance(property_source, PrintableMaterializedPropertyGroupItem):
return property_source.has_expr
# TRICKY: Materialized property columns do not currently support null values (see comment in
# `visit_property_type`) so checking whether or not a property is set for a row cannot safely use that
# field and falls back to the equivalent ``JSONHas(properties, ...)`` call instead. However, if this
# property is part of *any* property group, we can use that column instead to evaluate this expression
# more efficiently -- even if the materialized column would be a better choice in other situations.
for property_source in self.__get_all_materialized_property_sources(field_type, str(property_name)):
if isinstance(property_source, PrintableMaterializedPropertyGroupItem):
return property_source.has_expr

return None # nothing to optimize

Expand Down Expand Up @@ -1256,19 +1262,22 @@ def __get_materialized_property_source_for_property_type(
self, type: ast.PropertyType
) -> PrintableMaterializedColumn | PrintableMaterializedPropertyGroupItem | None:
"""
Find a materialized property for the provided property type.
Find the most efficient materialized property source for the provided property type.
"""
return self.__get_materialized_property_source(type.field_type, str(type.chain[0]))
for source in self.__get_all_materialized_property_sources(type.field_type, str(type.chain[0])):
return source
return None

def __get_materialized_property_source(
def __get_all_materialized_property_sources(
self, field_type: ast.FieldType, property_name: str
) -> PrintableMaterializedColumn | PrintableMaterializedPropertyGroupItem | None:
) -> Iterable[PrintableMaterializedColumn | PrintableMaterializedPropertyGroupItem]:
"""
Find a materialized property for the provided field type and property name.
Find all materialized property sources for the provided field type and property name, ordered from what is
likely to be the most efficient access path to the least efficient.
"""
# TODO: It likely makes sense to make this independent of whether or not property groups are used.
if self.context.modifiers.materializationMode == "disabled":
return None
return

field = field_type.resolve_database_field(self.context)

Expand All @@ -1288,11 +1297,12 @@ def __get_materialized_property_source(

materialized_column = self._get_materialized_column(table_name, property_name, field_name)
if materialized_column:
return PrintableMaterializedColumn(
yield PrintableMaterializedColumn(
self.visit(field_type.table_type),
self._print_identifier(materialized_column),
)
elif self.context.modifiers.propertyGroupsMode in (

if self.context.modifiers.propertyGroupsMode in (
PropertyGroupsMode.ENABLED,
PropertyGroupsMode.OPTIMIZED,
):
Expand All @@ -1302,7 +1312,7 @@ def __get_materialized_property_source(
for property_group_column in property_groups.get_property_group_columns(
table_name, field_name, property_name
):
return PrintableMaterializedPropertyGroupItem(
yield PrintableMaterializedPropertyGroupItem(
self.visit(field_type.table_type),
self._print_identifier(property_group_column),
self.context.add_value(property_name),
Expand All @@ -1318,9 +1328,7 @@ def __get_materialized_property_source(
else:
materialized_column = self._get_materialized_column("person", property_name, "properties")
if materialized_column:
return PrintableMaterializedColumn(None, self._print_identifier(materialized_column))

return None
yield PrintableMaterializedColumn(None, self._print_identifier(materialized_column))

def visit_property_type(self, type: ast.PropertyType):
if type.joined_subquery is not None and type.joined_subquery_field_name is not None:
Expand Down
46 changes: 32 additions & 14 deletions posthog/hogql/test/test_printer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
from typing import Any, Literal, Optional, cast
from collections.abc import Mapping
from contextlib import contextmanager
from typing import Any, Literal, Optional, cast

import pytest
from django.test import override_settings
Expand All @@ -26,6 +27,21 @@
from posthog.test.base import BaseTest, _create_event, cleanup_materialized_columns


@contextmanager
def materialized(table, property):
"""Materialize a property within the managed block, removing it on exit."""
try:
from ee.clickhouse.materialized_columns.analyze import materialize
except ModuleNotFoundError as e:
pytest.xfail(str(e))

try:
materialize(table, property)
yield
finally:
cleanup_materialized_columns()


class TestPrinter(BaseTest):
maxDiff = None

Expand Down Expand Up @@ -341,25 +357,19 @@ def test_property_groups(self):
),
)

cleanup_materialized_columns()

self.assertEqual(
self._expr("properties['foo']", context),
"has(events.properties_group_custom, %(hogql_val_0)s) ? events.properties_group_custom[%(hogql_val_0)s] : null",
)
self.assertEqual(context.values["hogql_val_0"], "foo")

try:
from ee.clickhouse.materialized_columns.analyze import materialize
except ModuleNotFoundError:
return

# Properties that are materialized as columns should take precedence over the values in the group's map column.
materialize("events", "foo")
self.assertEqual(
self._expr("properties['foo']", context),
"nullIf(nullIf(events.mat_foo, ''), 'null')",
)
with materialized("events", "foo"):
# Properties that are materialized as columns should take precedence over the values in the group's map
# column.
self.assertEqual(
self._expr("properties['foo']", context),
"nullIf(nullIf(events.mat_foo, ''), 'null')",
)

def _test_property_group_comparison(
self,
Expand Down Expand Up @@ -526,6 +536,14 @@ def test_property_groups_optimized_has(self) -> None:
# TODO: Chained operations/path traversal could be optimized further, but is left alone for now.
self._test_property_group_comparison("JSONHas(properties, 'foo', 'bar')", None)

with materialized("events", "key"):
self._test_property_group_comparison(
"JSONHas(properties, 'key')",
"has(events.properties_group_custom, %(hogql_val_0)s)",
{"hogql_val_0": "key"},
expected_skip_indexes_used={"properties_group_custom_keys_bf"},
)

def test_property_groups_optimized_in_comparisons(self) -> None:
# The IN operator works much like equality when the right hand side of the expression is all constants. Like
# equality, it also needs to handle the empty string special case.
Expand Down

0 comments on commit 51b77c5

Please sign in to comment.