From d210fcad3b7bd0f0aaad52f42595a54e2f1cda6d Mon Sep 17 00:00:00 2001 From: Robbie Date: Tue, 12 Mar 2024 13:03:02 +0000 Subject: [PATCH] fix(web-analytics): Be defensive against numeric query params (#20848) * Be defensive against numeric query params * Use placeholders --- posthog/hogql/database/schema/channel_type.py | 61 ++++++++++++------- .../database/schema/test/test_channel_type.py | 13 ++++ 2 files changed, 52 insertions(+), 22 deletions(-) diff --git a/posthog/hogql/database/schema/channel_type.py b/posthog/hogql/database/schema/channel_type.py index 39374022c227c..702681aeeb29f 100644 --- a/posthog/hogql/database/schema/channel_type.py +++ b/posthog/hogql/database/schema/channel_type.py @@ -1,6 +1,8 @@ +from posthog.hogql import ast from posthog.hogql.database.models import ExpressionField from posthog.hogql.parser import parse_expr + # Create a virtual field that categories the type of channel that a user was acquired through. Use GA4's definitions as # a starting point, but also add some custom logic to handle some edge cases that GA4 doesn't handle. # The source for this logic is: @@ -22,11 +24,16 @@ def create_initial_domain_type(name: str): expr=parse_expr( """ if( - properties.$initial_referring_domain = '$direct', + {referring_domain} = '$direct', '$direct', - hogql_lookupDomainType(properties.$initial_referring_domain) + hogql_lookupDomainType({referring_domain}) ) -""" +""", + { + "referring_domain": ast.Call( + name="toString", args=[ast.Field(chain=["properties", "$initial_referring_domain"])] + ) + }, ), ) @@ -37,28 +44,28 @@ def create_initial_channel_type(name: str): expr=parse_expr( """ multiIf( - match(properties.$initial_utm_campaign, 'cross-network'), + match({campaign}, 'cross-network'), 'Cross Network', ( - match(properties.$initial_utm_medium, '^(.*cp.*|ppc|retargeting|paid.*)$') OR - properties.$initial_gclid IS NOT NULL OR - properties.$initial_gad_source IS NOT NULL + match({medium}, '^(.*cp.*|ppc|retargeting|paid.*)$') OR + {gclid} IS NOT NULL OR + {gad_source} IS NOT NULL ), coalesce( - hogql_lookupPaidSourceType(properties.$initial_utm_source), - hogql_lookupPaidDomainType(properties.$initial_referring_domain), + hogql_lookupPaidSourceType({source}), + hogql_lookupPaidDomainType({referring_domain}), if( - match(properties.$initial_utm_campaign, '^(.*(([^a-df-z]|^)shop|shopping).*)$'), + match({campaign}, '^(.*(([^a-df-z]|^)shop|shopping).*)$'), 'Paid Shopping', NULL ), - hogql_lookupPaidMediumType(properties.$initial_utm_medium), + hogql_lookupPaidMediumType({medium}), multiIf ( - properties.$initial_gad_source = '1', + {gad_source} = '1', 'Paid Search', - match(properties.$initial_utm_campaign, '^(.*video.*)$'), + match({campaign}, '^(.*video.*)$'), 'Paid Video', 'Paid Other' @@ -66,26 +73,26 @@ def create_initial_channel_type(name: str): ), ( - properties.$initial_referring_domain = '$direct' - AND (properties.$initial_utm_medium IS NULL OR properties.$initial_utm_medium = '') - AND (properties.$initial_utm_source IS NULL OR properties.$initial_utm_source IN ('', '(direct)', 'direct')) + {referring_domain} = '$direct' + AND ({medium} IS NULL OR {medium} = '') + AND ({source} IS NULL OR {source} IN ('', '(direct)', 'direct')) ), 'Direct', coalesce( - hogql_lookupOrganicSourceType(properties.$initial_utm_source), - hogql_lookupOrganicDomainType(properties.$initial_referring_domain), + hogql_lookupOrganicSourceType({source}), + hogql_lookupOrganicDomainType({referring_domain}), if( - match(properties.$initial_utm_campaign, '^(.*(([^a-df-z]|^)shop|shopping).*)$'), + match({campaign}, '^(.*(([^a-df-z]|^)shop|shopping).*)$'), 'Organic Shopping', NULL ), - hogql_lookupOrganicMediumType(properties.$initial_utm_medium), + hogql_lookupOrganicMediumType({medium}), multiIf( - match(properties.$initial_utm_campaign, '^(.*video.*)$'), + match({campaign}, '^(.*video.*)$'), 'Organic Video', - match(properties.$initial_utm_medium, 'push$'), + match({medium}, 'push$'), 'Push', 'Other' @@ -93,5 +100,15 @@ def create_initial_channel_type(name: str): ) )""", start=None, + placeholders={ + "campaign": ast.Call(name="toString", args=[ast.Field(chain=["properties", "$initial_utm_campaign"])]), + "medium": ast.Call(name="toString", args=[ast.Field(chain=["properties", "$initial_utm_medium"])]), + "source": ast.Call(name="toString", args=[ast.Field(chain=["properties", "$initial_utm_source"])]), + "referring_domain": ast.Call( + name="toString", args=[ast.Field(chain=["properties", "$initial_referring_domain"])] + ), + "gclid": ast.Call(name="toString", args=[ast.Field(chain=["properties", "$initial_gclid"])]), + "gad_source": ast.Call(name="toString", args=[ast.Field(chain=["properties", "$initial_gad_source"])]), + }, ), ) diff --git a/posthog/hogql/database/schema/test/test_channel_type.py b/posthog/hogql/database/schema/test/test_channel_type.py index ce287430575f6..89e026ff3aed0 100644 --- a/posthog/hogql/database/schema/test/test_channel_type.py +++ b/posthog/hogql/database/schema/test/test_channel_type.py @@ -220,6 +220,19 @@ def test_unknown_domain_is_other(self): ), ) + def test_doesnt_fail_on_numbers(self): + self.assertEqual( + "Other", + self._get_initial_channel_type( + { + "$initial_referring_domain": "example.com", + "$initial_utm_source": 123, + "$initial_utm_medium": 123, + "$initial_utm_campaign": 123, + } + ), + ) + def _get_initial_channel_type_from_wild_clicks(self, url: str, referrer: str): parsed_url = urlparse(url) query = parse_qs(parsed_url.query)