Skip to content

Commit

Permalink
normalize urls
Browse files Browse the repository at this point in the history
  • Loading branch information
mariusandra committed Mar 28, 2024
1 parent c7910d8 commit 13064e8
Show file tree
Hide file tree
Showing 5 changed files with 199 additions and 1 deletion.
4 changes: 4 additions & 0 deletions posthog/hogql_queries/insights/trends/breakdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,10 @@ def _get_breakdown_transform_func(self) -> ast.Call:
return self._get_breakdown_values_transform(ast.Field(chain=self._properties_chain))

def _get_breakdown_values_transform(self, node: ast.Expr) -> ast.Call:
if self.query.breakdownFilter and self.query.breakdownFilter.breakdown_normalize_url:
node = parse_expr(
"empty(trimTrailing('/?#', {node})) ? '/' : trimTrailing('/?#', {node})", placeholders={"node": node}
)
return cast(
ast.Call,
parse_expr(
Expand Down
8 changes: 8 additions & 0 deletions posthog/hogql_queries/insights/trends/breakdown_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class BreakdownValues:
histogram_bin_count: Optional[int]
group_type_index: Optional[int]
hide_other_aggregation: Optional[bool]
normalize_url: Optional[bool]
breakdown_limit: Optional[int]
query_date_range: QueryDateRange
modifiers: HogQLQueryModifiers
Expand Down Expand Up @@ -66,6 +67,7 @@ def __init__(
else None
)
self.hide_other_aggregation = breakdown_filter.breakdown_hide_other_aggregation
self.normalize_url = breakdown_filter.breakdown_normalize_url
self.breakdown_limit = breakdown_filter.breakdown_limit
self.query_date_range = query_date_range
self.modifiers = modifiers
Expand Down Expand Up @@ -98,6 +100,12 @@ def get_breakdown_values(self) -> List[str | int]:
)

if not self.histogram_bin_count:
if self.normalize_url:
select_field.expr = parse_expr(
"empty(trimTrailing('/?#', {node})) ? '/' : trimTrailing('/?#', {node})",
placeholders={"node": select_field.expr},
)

select_field.expr = ast.Call(name="toString", args=[select_field.expr])

if self.chart_display_type == ChartDisplayType.WorldMap:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3663,6 +3663,145 @@
allow_experimental_object_type=1
'''
# ---
# name: TestTrends.test_trends_breakdown_normalize_url
'''
SELECT toString(if(empty(trim(TRAILING '/?#'
FROM replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$current_url'), ''), 'null'), '^"|"$', ''))), '/', trim(TRAILING '/?#'
FROM replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$current_url'), ''), 'null'), '^"|"$', '')))) AS value,
count(e.uuid) AS count
FROM events AS e
WHERE and(equals(e.team_id, 2), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC')))), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up')))
GROUP BY value
ORDER BY count DESC, value DESC
LIMIT 26 SETTINGS readonly=2,
max_execution_time=60,
allow_experimental_object_type=1
'''
# ---
# name: TestTrends.test_trends_breakdown_normalize_url.1
'''
SELECT groupArray(day_start) AS date,
groupArray(count) AS total,
ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value
FROM
(SELECT day_start AS day_start,
sum(count) OVER (PARTITION BY breakdown_value
ORDER BY day_start ASC) AS count,
breakdown_value AS breakdown_value
FROM
(SELECT sum(total) AS count,
day_start AS day_start,
breakdown_value AS breakdown_value
FROM
(SELECT 0 AS total,
ticks.day_start AS day_start,
sec.breakdown_value AS breakdown_value
FROM
(SELECT minus(toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), toIntervalDay(numbers.number)) AS day_start
FROM numbers(coalesce(dateDiff('day', assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), 0)) AS numbers
UNION ALL SELECT toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC'))) AS day_start) AS ticks
CROSS JOIN
(SELECT breakdown_value
FROM
(SELECT ['http://hogflix/first', 'http://hogflix/second'] AS breakdown_value) ARRAY
JOIN breakdown_value AS breakdown_value) AS sec
ORDER BY sec.breakdown_value ASC, day_start ASC
UNION ALL SELECT count(DISTINCT e__pdi.person_id) AS total,
min(toStartOfDay(toTimeZone(e.timestamp, 'UTC'))) AS day_start,
transform(ifNull(nullIf(toString(if(empty(trim(TRAILING '/?#'
FROM replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$current_url'), ''), 'null'), '^"|"$', ''))), '/', trim(TRAILING '/?#'
FROM replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$current_url'), ''), 'null'), '^"|"$', '')))), ''), '$$_posthog_breakdown_null_$$'), ['http://hogflix/first', 'http://hogflix/second'], ['http://hogflix/first', 'http://hogflix/second'], '$$_posthog_breakdown_other_$$') AS breakdown_value
FROM events AS e SAMPLE 1
INNER JOIN
(SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id,
person_distinct_id2.distinct_id AS distinct_id
FROM person_distinct_id2
WHERE equals(person_distinct_id2.team_id, 2)
GROUP BY person_distinct_id2.distinct_id
HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS e__pdi ON equals(e.distinct_id, e__pdi.distinct_id)
WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'))
GROUP BY e__pdi.person_id,
breakdown_value)
GROUP BY day_start,
breakdown_value
ORDER BY day_start ASC, breakdown_value ASC)
ORDER BY day_start ASC)
GROUP BY breakdown_value
ORDER BY sum(count) DESC, breakdown_value ASC
LIMIT 10000 SETTINGS readonly=2,
max_execution_time=60,
allow_experimental_object_type=1
'''
# ---
# name: TestTrends.test_trends_breakdown_normalize_url_poe_v2
'''
SELECT toString(if(empty(trim(TRAILING '/?#'
FROM replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$current_url'), ''), 'null'), '^"|"$', ''))), '/', trim(TRAILING '/?#'
FROM replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$current_url'), ''), 'null'), '^"|"$', '')))) AS value,
count(e.uuid) AS count
FROM events AS e
WHERE and(equals(e.team_id, 2), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC')))), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up')))
GROUP BY value
ORDER BY count DESC, value DESC
LIMIT 26 SETTINGS readonly=2,
max_execution_time=60,
allow_experimental_object_type=1
'''
# ---
# name: TestTrends.test_trends_breakdown_normalize_url_poe_v2.1
'''
SELECT groupArray(day_start) AS date,
groupArray(count) AS total,
ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value
FROM
(SELECT day_start AS day_start,
sum(count) OVER (PARTITION BY breakdown_value
ORDER BY day_start ASC) AS count,
breakdown_value AS breakdown_value
FROM
(SELECT sum(total) AS count,
day_start AS day_start,
breakdown_value AS breakdown_value
FROM
(SELECT 0 AS total,
ticks.day_start AS day_start,
sec.breakdown_value AS breakdown_value
FROM
(SELECT minus(toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), toIntervalDay(numbers.number)) AS day_start
FROM numbers(coalesce(dateDiff('day', assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), 0)) AS numbers
UNION ALL SELECT toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC'))) AS day_start) AS ticks
CROSS JOIN
(SELECT breakdown_value
FROM
(SELECT ['http://hogflix/first', 'http://hogflix/second'] AS breakdown_value) ARRAY
JOIN breakdown_value AS breakdown_value) AS sec
ORDER BY sec.breakdown_value ASC, day_start ASC
UNION ALL SELECT count(DISTINCT ifNull(nullIf(e__override.override_person_id, '00000000-0000-0000-0000-000000000000'), e.person_id)) AS total,
min(toStartOfDay(toTimeZone(e.timestamp, 'UTC'))) AS day_start,
transform(ifNull(nullIf(toString(if(empty(trim(TRAILING '/?#'
FROM replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$current_url'), ''), 'null'), '^"|"$', ''))), '/', trim(TRAILING '/?#'
FROM replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$current_url'), ''), 'null'), '^"|"$', '')))), ''), '$$_posthog_breakdown_null_$$'), ['http://hogflix/first', 'http://hogflix/second'], ['http://hogflix/first', 'http://hogflix/second'], '$$_posthog_breakdown_other_$$') AS breakdown_value
FROM events AS e SAMPLE 1
LEFT OUTER JOIN
(SELECT argMax(person_overrides.override_person_id, person_overrides.version) AS override_person_id,
person_overrides.old_person_id AS old_person_id
FROM person_overrides
WHERE equals(person_overrides.team_id, 2)
GROUP BY person_overrides.old_person_id) AS e__override ON equals(e.person_id, e__override.old_person_id)
WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'))
GROUP BY ifNull(nullIf(e__override.override_person_id, '00000000-0000-0000-0000-000000000000'), e.person_id),
breakdown_value)
GROUP BY day_start,
breakdown_value
ORDER BY day_start ASC, breakdown_value ASC)
ORDER BY day_start ASC)
GROUP BY breakdown_value
ORDER BY sum(count) DESC, breakdown_value ASC
LIMIT 10000 SETTINGS readonly=2,
max_execution_time=60,
allow_experimental_object_type=1
'''
# ---
# name: TestTrends.test_trends_breakdown_with_session_property_single_aggregate_math_and_breakdown
'''
SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')) AS value,
Expand Down
47 changes: 47 additions & 0 deletions posthog/hogql_queries/insights/trends/test/test_trends.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,29 @@ def _create_breakdown_events(self):
)
_create_action(team=self.team, name="sign up")

def _create_breakdown_url_events(self):
    """Create three "sign up" events whose ``$current_url`` values differ only by a trailing slash or path.

    All events are created for distinct id "blabla" while time is frozen at
    2020-01-02. The first two URLs are identical apart from a trailing "/".
    """
    current_urls = (
        "http://hogflix/first",
        "http://hogflix/first/",
        "http://hogflix/second",
    )

    with freeze_time("2020-01-02"):
        for current_url in current_urls:
            self._create_event(
                team=self.team,
                event="sign up",
                distinct_id="blabla",
                properties={"$current_url": current_url},
            )

def _create_event_count_per_actor_events(self):
self._create_person(
team_id=self.team.pk,
Expand Down Expand Up @@ -731,6 +754,30 @@ def test_trends_breakdown_cumulative(self):
self.assertEqual(response[2]["label"], "other_value")
self.assertEqual(response[2]["data"], [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0])

@also_test_with_person_on_events_v2
@snapshot_clickhouse_queries
def test_trends_breakdown_normalize_url(self):
    """Break down "sign up" events by ``$current_url`` with ``breakdown_normalize_url`` enabled.

    Uses the events from ``_create_breakdown_url_events`` and expects exactly
    two series, labelled "http://hogflix/first" and "http://hogflix/second",
    with breakdown values matching the labels.
    """
    self._create_breakdown_url_events()

    filter_data = {
        "date_from": "-7d",
        "display": "ActionsLineGraphCumulative",
        "events": [{"id": "sign up", "math": "dau"}],
        "breakdown": "$current_url",
        "breakdown_normalize_url": True,
    }

    with freeze_time("2020-01-04T13:00:01Z"):
        response = self._run(Filter(team=self.team, data=filter_data), self.team)

    # Compare in sorted order so the assertions do not depend on series ordering.
    sorted_labels = sorted(series["label"] for series in response)
    assert sorted_labels == ["http://hogflix/first", "http://hogflix/second"]

    sorted_breakdown_values = sorted(series["breakdown_value"] for series in response)
    assert sorted_breakdown_values == sorted_labels

def test_trends_single_aggregate_dau(self):
self._create_events()
with freeze_time("2020-01-04T13:00:01Z"):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -522,7 +522,7 @@ def _events_filter(
filters.append(parse_expr("1 = 2"))

# Breakdown
if not ignore_breakdowns and breakdown is not None and breakdown.enabled:
if not ignore_breakdowns and breakdown is not None and self.query.breakdownFilter and breakdown.enabled:
# Only add the filter for non-histogram breakdowns that hide the "other" column.
if not breakdown.is_histogram_breakdown and self.query.breakdownFilter.breakdown_hide_other_aggregation:
breakdown_filter = breakdown.events_where_filter()
Expand Down

0 comments on commit 13064e8

Please sign in to comment.