-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(clickhouse): Start writing property groups on events (#24152)
Co-authored-by: Daesgar <[email protected]> Co-authored-by: James Greenhill <[email protected]>
- Loading branch information
1 parent
4107195
commit 6ec9210
Showing
2 changed files
with
85 additions
and
0 deletions.
There are no files selected for viewing
10 changes: 10 additions & 0 deletions
10
posthog/clickhouse/migrations/0074_add_custom_and_features_property_groups.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
from posthog.clickhouse.client.migration_tools import run_sql_with_exceptions | ||
from posthog.clickhouse.property_groups import sharded_events_property_groups | ||
|
||
# Run every ALTER statement needed to create the "custom" and "feature_flags"
# property groups. All statements are collected into one list before any of
# them is wrapped, so an unknown group name fails before any operation is built.
operations = [
    run_sql_with_exceptions(statement)
    for statement in [
        *sharded_events_property_groups.get_alter_create_statements("custom"),
        *sharded_events_property_groups.get_alter_create_statements("feature_flags"),
    ]
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
from collections.abc import Iterable, MutableMapping | ||
from dataclasses import dataclass | ||
|
||
from posthog import settings | ||
|
||
|
||
@dataclass
class PropertyGroupDefinition:
    """Describe which property keys belong to a group and how the column is compressed."""

    # SQL boolean expression over `key`; it is spliced verbatim into the
    # lambda body of the generated `mapFilter((key, _) -> ...)` call.
    filter_expression: str
    # Spliced verbatim into the `CODEC(...)` clause of the generated column.
    codec: str = "ZSTD(1)"
|
||
|
||
class PropertyGroupManager:
    """Registry of named property groups derived from one JSON column of a table.

    Each registered group is a filter over the keys of the source column.
    ``get_alter_create_statements`` renders the ``ALTER TABLE`` statements that
    add a materialized ``Map(String, String)`` column for a group, plus bloom
    filter indexes over that column's keys and values.
    """

    def __init__(self, cluster: str, table: str, source_column: str) -> None:
        """Create an empty registry.

        Args:
            cluster: Cluster name used in the ``ON CLUSTER`` clause.
            table: Table the generated statements alter.
            source_column: Column holding the JSON the groups are extracted from;
                also the prefix of every generated column name.
        """
        self.__cluster = cluster
        self.__table = table
        self.__source_column = source_column
        self.__groups: MutableMapping[str, "PropertyGroupDefinition"] = {}

    def register(self, name: str, definition: "PropertyGroupDefinition") -> None:
        """Register ``definition`` under ``name``.

        Raises:
            AssertionError: If ``name`` has already been registered.
        """
        # Raised explicitly (rather than via `assert`) so the uniqueness check
        # is still enforced when Python runs with -O, which strips asserts.
        if name in self.__groups:
            raise AssertionError("property group names can only be used once")
        self.__groups[name] = definition

    def __get_map_expression(self, definition: "PropertyGroupDefinition") -> str:
        """Return the SQL expression that builds the group's map from the source column."""
        return f"mapSort(mapFilter((key, _) -> {definition.filter_expression}, CAST(JSONExtractKeysAndValues({self.__source_column}, 'String'), 'Map(String, String)')))"

    def get_alter_create_statements(self, name: str) -> Iterable[str]:
        """Return the statements that create the column and indexes for group ``name``.

        The result holds three statements, in order: add the materialized map
        column, add a bloom filter index on its keys, add one on its values.

        Raises:
            KeyError: If ``name`` has not been registered.
        """
        column_name = f"{self.__source_column}_group_{name}"
        definition = self.__groups[name]
        # Every statement targets the same table and cluster.
        alter_prefix = f"ALTER TABLE {self.__table} ON CLUSTER {self.__cluster}"
        return [
            f"{alter_prefix} ADD COLUMN {column_name} Map(String, String) MATERIALIZED {self.__get_map_expression(definition)} CODEC({definition.codec})",
            f"{alter_prefix} ADD INDEX {column_name}_keys_bf mapKeys({column_name}) TYPE bloom_filter",
            f"{alter_prefix} ADD INDEX {column_name}_values_bf mapValues({column_name}) TYPE bloom_filter",
        ]
|
||
|
||
# Property groups extracted from the `properties` column of `sharded_events`.
sharded_events_property_groups = PropertyGroupManager(
    cluster=settings.CLICKHOUSE_CLUSTER,
    table="sharded_events",
    source_column="properties",
)
|
||
# Property keys that do not start with `$` but are still excluded from the
# "custom" property group.
ignore_custom_properties = [
    # `token` & `distinct_id` properties are sent with ~50% of events and by
    # many teams, and should not be treated as custom properties and their use
    # should be optimized separately
    "token",
    "distinct_id",
    # campaign properties are defined by external entities and are commonly used
    # across a large number of teams, and should also be optimized separately
    "utm_source",
    "utm_medium",
    "utm_campaign",
    "utm_content",
    "utm_term",
    "gclid",  # google ads
    "gad_source",  # google ads
    "gclsrc",  # google ads 360
    "dclid",  # google display ads
    "gbraid",  # google ads, web to app
    "wbraid",  # google ads, app to web
    "fbclid",  # facebook
    "msclkid",  # microsoft
    "twclid",  # twitter
    "li_fat_id",  # linkedin
    "mc_cid",  # mailchimp campaign id
    "igshid",  # instagram
    "ttclid",  # tiktok
    "rdt_cid",  # reddit
]
|
||
# "custom" group: every key that does not start with `$` and is not listed in
# `ignore_custom_properties`. The names are rendered as a quoted SQL list.
sharded_events_property_groups.register(
    "custom",
    PropertyGroupDefinition(
        "key NOT LIKE '$%' AND key NOT IN ("
        + ", ".join(f"'{name}'" for name in ignore_custom_properties)
        + ")"
    ),
)

# "feature_flags" group: every key that starts with `$feature/`.
sharded_events_property_groups.register("feature_flags", PropertyGroupDefinition("key like '$feature/%'"))