Skip to content

Commit

Permalink
Add referring domain type
Browse files Browse the repository at this point in the history
  • Loading branch information
robbie-c committed Dec 5, 2023
1 parent 9cc2c22 commit 6a8d92b
Show file tree
Hide file tree
Showing 14 changed files with 1,498 additions and 1 deletion.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
"editor:update-tsd": "pnpm packages:build && node frontend/editor-update-tsd.mjs",
"prettier": "prettier --write \"./**/*.{js,mjs,ts,tsx,json,yaml,yml,css,scss}\"",
"prettier:check": "prettier --check \"frontend/**/*.{js,mjs,ts,tsx,json,yaml,yml,css,scss}\"",
"prettier:file": "prettier --write",
"typescript:check": "tsc --noEmit && echo \"No errors reported by tsc.\"",
"lint:js": "eslint frontend/src",
"lint:css": "stylelint \"frontend/**/*.{css,scss}\"",
Expand Down
12 changes: 12 additions & 0 deletions posthog/clickhouse/migrations/0050_add_ga4_channel_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from posthog.clickhouse.client.migration_tools import run_sql_with_exceptions
from posthog.models.channel_type.sql import (
GA4_CHANNEL_DEFINITION_TABLE_SQL,
GA_CHANNEL_DEFINITIONS_DATA_SQL,
GA4_CHANNEL_DEFINITION_DICTIONARY_SQL,
)

# Statements are listed as: table DDL, then the channel-definition data, then
# the dictionary DDL.
# NOTE(review): presumably the dictionary is backed by the table created first,
# so this order should be kept -- confirm in posthog.models.channel_type.sql.
operations = [
    run_sql_with_exceptions(statement)
    for statement in (
        GA4_CHANNEL_DEFINITION_TABLE_SQL,
        GA_CHANNEL_DEFINITIONS_DATA_SQL,
        GA4_CHANNEL_DEFINITION_DICTIONARY_SQL,
    )
]
10 changes: 9 additions & 1 deletion posthog/clickhouse/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@
)
from posthog.clickhouse.plugin_log_entries import *
from posthog.models.app_metrics.sql import *
from posthog.models.channel_type.sql import (
GA4_CHANNEL_DEFINITION_TABLE_SQL,
GA_CHANNEL_DEFINITIONS_DATA_SQL,
GA4_CHANNEL_DEFINITION_DICTIONARY_SQL,
)
from posthog.models.cohort.sql import *
from posthog.models.event.sql import *
from posthog.models.group.sql import *
Expand Down Expand Up @@ -57,6 +62,7 @@
APP_METRICS_DATA_TABLE_SQL,
PERFORMANCE_EVENTS_TABLE_SQL,
SESSION_REPLAY_EVENTS_TABLE_SQL,
GA4_CHANNEL_DEFINITION_TABLE_SQL,
)
CREATE_DISTRIBUTED_TABLE_QUERIES = (
WRITABLE_EVENTS_TABLE_SQL,
Expand Down Expand Up @@ -109,7 +115,9 @@
+ CREATE_MV_TABLE_QUERIES
)

CREATE_DICTIONARY_QUERIES = (PERSON_OVERRIDES_CREATE_DICTIONARY_SQL,)
CREATE_DICTIONARY_QUERIES = (PERSON_OVERRIDES_CREATE_DICTIONARY_SQL, GA4_CHANNEL_DEFINITION_DICTIONARY_SQL)

CREATE_DATA_QUERIES = (GA_CHANNEL_DEFINITIONS_DATA_SQL,)

build_query = lambda query: query if isinstance(query, str) else query()
get_table_name = lambda query: re.findall(r"[\.\s]`?([a-z0-9_]+)`?\s+ON CLUSTER", build_query(query))[0]
6 changes: 6 additions & 0 deletions posthog/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def create_clickhouse_tables(num_tables: int):
from posthog.clickhouse.schema import (
CREATE_DISTRIBUTED_TABLE_QUERIES,
CREATE_MERGETREE_TABLE_QUERIES,
CREATE_DATA_QUERIES,
build_query,
)

Expand All @@ -27,6 +28,9 @@ def create_clickhouse_tables(num_tables: int):
queries = list(map(build_query, CREATE_TABLE_QUERIES))
run_clickhouse_statement_in_parallel(queries)

data_queries = list(map(build_query, CREATE_DATA_QUERIES))
run_clickhouse_statement_in_parallel(data_queries)


def reset_clickhouse_tables():
# Truncate clickhouse tables to default before running test
Expand All @@ -51,6 +55,7 @@ def reset_clickhouse_tables():
from posthog.session_recordings.sql.session_recording_event_sql import (
TRUNCATE_SESSION_RECORDING_EVENTS_TABLE_SQL,
)
from posthog.models.channel_type.sql import TRUNCATE_GA4_CHANNEL_DEFINITION_TABLE_SQL

# REMEMBER TO ADD ANY NEW CLICKHOUSE TABLES TO THIS ARRAY!
TABLES_TO_CREATE_DROP = [
Expand All @@ -66,6 +71,7 @@ def reset_clickhouse_tables():
TRUNCATE_GROUPS_TABLE_SQL,
TRUNCATE_APP_METRICS_TABLE_SQL,
TRUNCATE_PERFORMANCE_EVENTS_TABLE_SQL,
TRUNCATE_GA4_CHANNEL_DEFINITION_TABLE_SQL,
]

run_clickhouse_statement_in_parallel(TABLES_TO_CREATE_DROP)
Expand Down
6 changes: 6 additions & 0 deletions posthog/hogql/database/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
FunctionCallTable,
ExpressionField,
)
from posthog.hogql.database.schema.channel_type import create_initial_channel_type, create_initial_domain_type
from posthog.hogql.database.schema.log_entries import (
LogEntriesTable,
ReplayConsoleLogsLogEntriesTable,
Expand Down Expand Up @@ -167,6 +168,11 @@ def create_hogql_database(team_id: int, modifiers: Optional[HogQLQueryModifiers]
database.events.fields["poe"].fields["id"] = database.events.fields["person_id"]
database.events.fields["person"] = FieldTraverser(chain=["poe"])

database.persons.fields["$initial_referring_domain_type"] = create_initial_domain_type(
"$initial_referring_domain_type"
)
database.persons.fields["$initial_channel_type"] = create_initial_channel_type("$initial_channel_type")

for mapping in GroupTypeMapping.objects.filter(team=team):
if database.events.fields.get(mapping.group_type) is None:
database.events.fields[mapping.group_type] = FieldTraverser(chain=[f"group_{mapping.group_type_index}"])
Expand Down
94 changes: 94 additions & 0 deletions posthog/hogql/database/schema/channel_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Create a virtual field that uses GA's channel grouping logic to group events into acquisition channels.
# The source for this logic is:
# UA: https://support.google.com/analytics/answer/3297892?hl=en
# GA4: https://support.google.com/analytics/answer/9756891?hl=en

# I'm not fully convinced that this approach will work on its own, as GA4 has far more information about paid ads
# than we have access to. We'll need to get this live and see what it looks like on PostHog data.
from posthog.hogql.database.models import ExpressionField
from posthog.hogql.parser import parse_expr


def create_initial_domain_type(name: str):
    """Build the expression field that classifies ``$initial_referring_domain``.

    The HogQL expression evaluates to the literal ``'$direct'`` when the
    property equals ``'$direct'``. Otherwise it reduces the domain with
    ``cutToFirstSignificantSubdomain`` (a missing property is coalesced to
    ``''``) and looks up the ``type`` attribute in
    ``ga4_channel_definition_dict`` via ``dictGetOrNull``.

    Args:
        name: Name given to the returned field.

    Returns:
        An ``ExpressionField`` wrapping the parsed HogQL expression.
    """
    # The HogQL text is kept verbatim; only the Python around it is restructured.
    domain_type_hogql = """
if(
properties.$initial_referring_domain = '$direct',
'$direct',
dictGetOrNull(
'ga4_channel_definition_dict',
'type',
cutToFirstSignificantSubdomain(coalesce(properties.$initial_referring_domain, ''))
)
)
"""
    domain_type_expr = parse_expr(domain_type_hogql)
    return ExpressionField(name=name, expr=domain_type_expr)


def create_initial_channel_type(name: str):
    """Build the expression field that derives an acquisition channel.

    The HogQL expression is a ``multiIf`` cascade over the person properties
    ``$initial_utm_campaign``, ``$initial_utm_medium``, ``$initial_utm_source``
    and ``$initial_referring_domain``; the first matching rule wins:

    1. Campaign matches ``cross-network`` -> ``'Cross Network'``.
    2. Medium matches the paid regex (``cp``, ``ppc``, ``retargeting``,
       ``paid...``) -> ``'Paid Shopping'``/``'Paid Search'``/``'Paid Video'``/
       ``'Paid Social'`` according to the referring domain's ``type`` in
       ``ga4_channel_definition_dict``; otherwise ``'Paid Shopping'`` (campaign
       shop regex), ``'Display'`` (medium list) or ``'Paid Other'``.
    3. Referring domain is ``'$direct'`` and medium is NULL or empty ->
       ``'Direct'``.
    4. Otherwise the organic equivalents of the dictionary lookup, then
       campaign/medium/source rules for Organic Shopping, Organic Social,
       Organic Video, Organic Search, Referral, Email, Affiliate, Audio, SMS
       and Push, falling back to NULL.

    NOTE(review): the ``'Display'`` medium list is only consulted after the
    paid-medium regex has matched, and of the listed values only ``'cpm'``
    matches that regex -- confirm that ``display``/``banner``/``expandable``/
    ``interstitial`` are meant to be unreachable.
    NOTE(review): ``'Organic Video'`` is matched on ``$initial_utm_campaign``;
    the GA4 channel documentation linked in this file's header appears to
    define that rule on the medium -- confirm which is intended.

    Args:
        name: Name given to the returned field.

    Returns:
        An ``ExpressionField`` wrapping the parsed HogQL expression.
    """
    # The HogQL text is kept verbatim; only the Python around it is restructured.
    channel_type_hogql = """
multiIf(
match(properties.$initial_utm_campaign, 'cross-network'),
'Cross Network',
match(properties.$initial_utm_medium, '^(.*cp.*|ppc|retargeting|paid.*)$'),
CASE dictGetOrNull('ga4_channel_definition_dict', 'type', cutToFirstSignificantSubdomain(coalesce(properties.$initial_referring_domain, '')))
WHEN 'Shopping' THEN 'Paid Shopping'
WHEN 'Search' THEN 'Paid Search'
WHEN 'Video' THEN 'Paid Video'
WHEN 'Social' THEN 'Paid Social'
ELSE multiIf(
match(properties.$initial_utm_campaign, '^(.*(([^a-df-z]|^)shop|shopping).*)$'),
'Paid Shopping',
properties.$initial_utm_medium IN
('display', 'banner', 'expandable', 'interstitial', 'cpm'),
'Display',
'Paid Other'
)
END,
properties.$initial_referring_domain = '$direct' AND (properties.$initial_utm_medium IS NULL OR properties.$initial_utm_medium = ''),
'Direct',
CASE dictGetOrNull('ga4_channel_definition_dict', 'type', cutToFirstSignificantSubdomain(coalesce(properties.$initial_referring_domain, '')))
WHEN 'Shopping' THEN 'Organic Shopping'
WHEN 'Search' THEN 'Organic Search'
WHEN 'Video' THEN 'Organic Video'
WHEN 'Social' THEN 'Organic Social'
ELSE multiIf(
match(properties.$initial_utm_campaign, '^(.*(([^a-df-z]|^)shop|shopping).*)$'),
'Organic Shopping',
properties.$initial_utm_medium IN
('social', 'social-network', 'social-media', 'sm', 'social network', 'social media'),
'Organic Social',
match(properties.$initial_utm_campaign, '^(.*video.*)$'),
'Organic Video',
properties.$initial_utm_medium = 'organic',
'Organic Search',
properties.$initial_utm_medium IN ('referral', 'app', 'link'),
'Referral',
properties.$initial_utm_source IN ('email', 'e-mail', 'e_mail', 'e mail')
OR properties.$initial_utm_medium IN ('email', 'e-mail', 'e_mail', 'e mail'),
'Email',
properties.$initial_utm_medium = 'affiliate',
'Affiliate',
properties.$initial_utm_medium = 'audio',
'Audio',
properties.$initial_utm_source = 'sms' OR properties.$initial_utm_medium = 'sms',
'SMS',
match(properties.$initial_utm_medium, '(push$|mobile|notification)')
OR properties.$initial_utm_source = 'firebase',
'Push',
NULL
)
END
)"""
    channel_type_expr = parse_expr(channel_type_hogql, start=None)
    return ExpressionField(name=name, expr=channel_type_expr)
160 changes: 160 additions & 0 deletions posthog/hogql/database/schema/test/test_channel_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
import uuid

from posthog.hogql import ast
from posthog.hogql.parser import parse_select
from posthog.hogql.query import execute_hogql_query
from posthog.test.base import (
APIBaseTest,
ClickhouseTestMixin,
_create_person,
)


class ReferringDomainTypeQueryRunner(ClickhouseTestMixin, APIBaseTest):
    """Tests for the ``$initial_referring_domain_type`` field on ``persons``."""

    maxDiff = None

    def _get_initial_referring_domain_type(self, initial_referring_domain: str):
        """Create a person with the given referring domain and return its domain type.

        A fresh UUID is used both as the person's uuid and as its only
        distinct id. The returned value is the first column of the first row
        of a HogQL query selecting ``$initial_referring_domain_type`` for that
        person.
        """
        person_uuid = str(uuid.uuid4())
        person_properties = {
            "$initial_referring_domain": initial_referring_domain,
        }
        _create_person(
            uuid=person_uuid,
            team_id=self.team.pk,
            distinct_ids=[person_uuid],
            properties=person_properties,
        )

        query = parse_select(
            "select $initial_referring_domain_type as channel_type from persons where id = {person_id}",
            placeholders={"person_id": ast.Constant(value=person_uuid)},
        )
        rows = execute_hogql_query(query, self.team).results
        return rows[0][0]

    def test_direct(self):
        domain_type = self._get_initial_referring_domain_type("$direct")
        self.assertEqual("$direct", domain_type)

    def test_search(self):
        for domain in ("www.google.co.uk", "yahoo.co.jp"):
            self.assertEqual("Search", self._get_initial_referring_domain_type(domain))

    def test_shopping(self):
        for domain in ("m.alibaba.com", "stripe.com"):
            self.assertEqual("Shopping", self._get_initial_referring_domain_type(domain))

    def test_social(self):
        for domain in ("lnkd.in", "old.reddit.com"):
            self.assertEqual("Social", self._get_initial_referring_domain_type(domain))


class ChannelTypeQueryRunner(ClickhouseTestMixin, APIBaseTest):
    """Tests for the ``$initial_channel_type`` field on ``persons``."""

    maxDiff = None

    def _get_initial_channel_type(self, properties=None):
        """Create a person with ``properties`` and return its initial channel type.

        A fresh UUID is used both as the person's uuid and as its only
        distinct id. The returned value is the first column of the first row
        of a HogQL query selecting ``$initial_channel_type`` for that person.
        """
        person_id = str(uuid.uuid4())

        _create_person(
            uuid=person_id,
            team_id=self.team.pk,
            distinct_ids=[person_id],
            properties=properties,
        )

        response = execute_hogql_query(
            parse_select(
                "select $initial_channel_type as channel_type from persons where id = {person_id}",
                placeholders={"person_id": ast.Constant(value=person_id)},
            ),
            self.team,
        )

        return response.results[0][0]

    def test_direct(self):
        self.assertEqual(
            "Direct",
            self._get_initial_channel_type(
                {
                    "$initial_referring_domain": "$direct",
                }
            ),
        )

    def test_cross_network(self):
        self.assertEqual(
            "Cross Network",
            self._get_initial_channel_type(
                {
                    "$initial_referring_domain": "$direct",
                    "$initial_utm_campaign": "cross-network",
                }
            ),
        )

    def test_paid_shopping(self):
        self.assertEqual(
            "Paid Shopping",
            self._get_initial_channel_type(
                {
                    "$initial_referring_domain": "www.ebay.co.uk",
                    "$initial_utm_medium": "ppc",
                }
            ),
        )

    def test_paid_search(self):
        # Previously a verbatim copy of test_paid_shopping (ebay -> "Paid Shopping"),
        # which left the Paid Search branch untested. Use a referrer that the
        # domain-type tests in this file classify as "Search", with a paid medium.
        self.assertEqual(
            "Paid Search",
            self._get_initial_channel_type(
                {
                    "$initial_referring_domain": "www.google.co.uk",
                    "$initial_utm_medium": "ppc",
                }
            ),
        )

    def test_paid_video(self):
        self.assertEqual(
            "Paid Video",
            self._get_initial_channel_type(
                {
                    "$initial_referring_domain": "youtube.com",
                    "$initial_utm_medium": "cpm",
                }
            ),
        )

    def test_organic_video(self):
        self.assertEqual(
            "Organic Video",
            self._get_initial_channel_type(
                {
                    "$initial_referring_domain": "youtube.com",
                }
            ),
        )
4 changes: 4 additions & 0 deletions posthog/hogql/functions/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,10 @@ class HogQLFunctionMeta:
"nth_value": HogQLFunctionMeta("nth_value", 2, 2),
"lagInFrame": HogQLFunctionMeta("lagInFrame", 1, 1),
"leadInFrame": HogQLFunctionMeta("leadInFrame", 1, 1),
# dict
"dictGet": HogQLFunctionMeta("dictGet", 3, 3),
"dictGetOrNull": HogQLFunctionMeta("dictGetOrNull", 3, 3),
"dictGetOrDefault": HogQLFunctionMeta("dictGetOrDefault", 4, 4),
}
# Permitted HogQL aggregations
HOGQL_AGGREGATIONS: Dict[str, HogQLFunctionMeta] = {
Expand Down
Loading

0 comments on commit 6a8d92b

Please sign in to comment.