Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(web-analytics): Turn timezones into UTC offsets instead of timezone names #26409

Merged
merged 4 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions frontend/src/scenes/web-analytics/tiles/WebAnalyticsTile.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,23 @@ import { DataTableNode, InsightVizNode, NodeKind, QuerySchema, WebStatsBreakdown
import { QueryContext, QueryContextColumnComponent, QueryContextColumnTitleComponent } from '~/queries/types'
import { ChartDisplayType, GraphPointPayload, InsightLogicProps, ProductKey, PropertyFilterType } from '~/types'

const toUtcOffsetFormat = (value: number): string => {
if (value === 0) {
return 'UTC'
}

const integerPart = Math.floor(value)
const sign = integerPart > 0 ? '+' : '-'

// India has half-hour offsets, and Australia has 45-minute offsets, why?
const decimalPart = value - integerPart
const decimalPartAsMinutes = decimalPart * 60
const formattedMinutes = decimalPartAsMinutes > 0 ? `:${decimalPartAsMinutes}` : ''

// E.g. UTC-3, UTC, UTC+5:30, UTC+11:45
return `UTC${sign}${integerPart}${formattedMinutes}`
}

const PercentageCell: QueryContextColumnComponent = ({ value }) => {
if (typeof value === 'number') {
return <span>{`${(value * 100).toFixed(1)}%`}</span>
Expand Down Expand Up @@ -121,6 +138,11 @@ const BreakdownValueCell: QueryContextColumnComponent = (props) => {
)
}
break
case WebStatsBreakdown.Timezone:
if (typeof value === 'number') {
return <>{toUtcOffsetFormat(value)}</>
}
break
}

if (typeof value === 'string') {
Expand Down
4 changes: 3 additions & 1 deletion posthog/hogql/functions/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,13 +421,15 @@ def compare_types(arg_types: list[ConstantType], sig_arg_types: tuple[ConstantTy
"toString": HogQLFunctionMeta(
"toString",
1,
1,
2,
signatures=[
((IntegerType(),), StringType()),
((StringType(),), StringType()),
((FloatType(),), StringType()),
((DateType(),), StringType()),
((DateType(), StringType()), StringType()),
((DateTimeType(),), StringType()),
((DateTimeType(), StringType()), StringType()),
],
),
"toJSONString": HogQLFunctionMeta("toJSONString", 1, 1),
Expand Down
13 changes: 10 additions & 3 deletions posthog/hogql_queries/web_analytics/stats_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def to_main_query(self) -> ast.SelectQuery:
)
GROUP BY "context.columns.breakdown_value"
ORDER BY "context.columns.visitors" DESC,
"context.columns.views" DESC,
robbie-c marked this conversation as resolved.
Show resolved Hide resolved
"context.columns.breakdown_value" ASC
""",
timings=self.timings,
Expand Down Expand Up @@ -118,6 +119,7 @@ def _to_main_query_with_session_properties(self) -> ast.SelectQuery:
)
GROUP BY "context.columns.breakdown_value"
ORDER BY "context.columns.visitors" DESC,
"context.columns.views" DESC,
"context.columns.breakdown_value" ASC
""",
timings=self.timings,
Expand Down Expand Up @@ -162,6 +164,7 @@ def to_entry_bounce_query(self) -> ast.SelectQuery:
)
GROUP BY "context.columns.breakdown_value"
ORDER BY "context.columns.visitors" DESC,
"context.columns.views" DESC,
"context.columns.breakdown_value" ASC
""",
timings=self.timings,
Expand Down Expand Up @@ -269,6 +272,7 @@ def to_path_scroll_bounce_query(self) -> ast.SelectQuery:
) AS scroll
ON counts.breakdown_value = scroll.breakdown_value
ORDER BY "context.columns.visitors" DESC,
"context.columns.views" DESC,
"context.columns.breakdown_value" ASC
""",
timings=self.timings,
Expand Down Expand Up @@ -346,6 +350,7 @@ def to_path_bounce_query(self) -> ast.SelectQuery:
) as bounce
ON counts.breakdown_value = bounce.breakdown_value
ORDER BY "context.columns.visitors" DESC,
"context.columns.views" DESC,
"context.columns.breakdown_value" ASC
""",
timings=self.timings,
Expand Down Expand Up @@ -500,9 +505,11 @@ def _counts_breakdown_value(self):
case WebStatsBreakdown.CITY:
return parse_expr("tuple(properties.$geoip_country_code, properties.$geoip_city_name)")
case WebStatsBreakdown.TIMEZONE:
# Timezone offsets would be slightly more useful, but that's not easily achievable
# with Clickhouse, we might attempt to change this in the future
return ast.Field(chain=["properties", "$timezone"])
# Get the difference between the UNIX timestamp at UTC and the UNIX timestamp at the event's timezone
# Value is in milliseconds, turn it to hours, works even for fractional timezone offsets (I'm looking at you, Australia)
return parse_expr(
"if(or(isNull(properties.$timezone), empty(properties.$timezone)), NULL, (toUnixTimestamp64Milli(parseDateTimeBestEffort(assumeNotNull(toString(timestamp, properties.$timezone)))) - toUnixTimestamp64Milli(timestamp)) / 3600000)"
)
case _:
raise NotImplementedError("Breakdown not implemented")

Expand Down
136 changes: 114 additions & 22 deletions posthog/hogql_queries/web_analytics/test/test_web_stats_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -1006,50 +1006,142 @@ def test_cohort_test_filters(self):
assert results == [["/path1", 1, 1]]

def test_timezone_filter(self):
d1, s1 = "d1", str(uuid7("2024-07-30"))
d2, s2 = "d2", str(uuid7("2024-07-30"))
date = "2024-07-30"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For me to be suuuper confidant this works, I'd like to see some tests that cover before / after the daylight savings change for the same timezone

Copy link
Member Author

@rafaeelaudibert rafaeelaudibert Nov 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just added it, good call

3a98085


for idx, (distinct_id, session_id) in enumerate(
[
("UTC", str(uuid7(date))),
("Asia/Calcutta", str(uuid7(date))),
("America/New_York", str(uuid7(date))),
("America/Sao_Paulo", str(uuid7(date))),
]
):
_create_person(
team_id=self.team.pk,
distinct_ids=[distinct_id],
properties={"name": session_id, "email": f"{distinct_id}@example.com"},
)

for i in range(idx + 1):
_create_event(
team=self.team,
event="$pageview",
distinct_id=distinct_id,
timestamp=date,
properties={"$session_id": session_id, "$pathname": f"/path{i}", "$timezone": distinct_id},
)

results = self._run_web_stats_table_query(
"all",
None,
breakdown_by=WebStatsBreakdown.TIMEZONE,
).results

# Brasilia UTC-3, New York UTC-4, Calcutta UTC+5:30, UTC
assert results == [[-3.0, 1.0, 4.0], [-4.0, 1.0, 3.0], [5.5, 1.0, 2.0], [0.0, 1.0, 1.0]]

def test_timezone_filter_dst_change(self):
did = "id"
sid = str(uuid7("2019-02-17"))

_create_person(
team_id=self.team.pk,
distinct_ids=[d1],
properties={"name": d1, "email": "[email protected]"},
distinct_ids=[did],
properties={"name": sid, "email": f"[email protected]"},
)

# Cross daylight savings time change in Brazil
for i in range(6):
_create_event(
team=self.team,
event="$pageview",
distinct_id=did,
timestamp=f"2019-02-17 0{i}:00:00",
properties={"$session_id": sid, "$pathname": f"/path1", "$timezone": "America/Sao_Paulo"},
)

results = self._run_web_stats_table_query(
"all",
None,
breakdown_by=WebStatsBreakdown.TIMEZONE,
).results

# Change from UTC-2 to UTC-3 in the middle of the night
assert results == [[-3.0, 1.0, 4.0], [-2.0, 1.0, 2.0]]

def test_timezone_filter_with_invalid_timezone(self):
date = "2024-07-30"

for idx, (distinct_id, session_id) in enumerate(
[
("UTC", str(uuid7(date))),
("Timezone_not_exists", str(uuid7(date))),
]
):
_create_person(
team_id=self.team.pk,
distinct_ids=[distinct_id],
properties={"name": session_id, "email": f"{distinct_id}@example.com"},
)

for i in range(idx + 1):
_create_event(
team=self.team,
event="$pageview",
distinct_id=distinct_id,
timestamp=date,
properties={"$session_id": session_id, "$pathname": f"/path{i}", "$timezone": distinct_id},
)

with self.assertRaisesRegex(Exception, "Cannot load time zone"):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reasonable error?

self._run_web_stats_table_query(
"all",
None,
breakdown_by=WebStatsBreakdown.TIMEZONE,
)

def test_timezone_filter_with_empty_timezone(self):
did = "id"
sid = str(uuid7("2019-02-17"))

_create_person(
team_id=self.team.pk,
distinct_ids=[did],
properties={"name": sid, "email": f"[email protected]"},
)

# Key not exists
_create_event(
team=self.team,
event="$pageview",
distinct_id=d1,
timestamp="2024-07-30",
properties={"$session_id": s1, "$pathname": "/path1", "$timezone": "America/New_York"},
distinct_id=did,
timestamp=f"2019-02-17 00:00:00",
properties={"$session_id": sid, "$pathname": f"/path1"},
)

_create_person(
team_id=self.team.pk,
distinct_ids=[d2],
properties={"name": d2, "email": "[email protected]"},
)
# Key exists, it's null
_create_event(
team=self.team,
event="$pageview",
distinct_id=d2,
timestamp="2024-07-30",
properties={"$session_id": s2, "$pathname": "/path2", "$timezone": "America/Brasilia"},
distinct_id=did,
timestamp=f"2019-02-17 00:00:00",
properties={"$session_id": sid, "$pathname": f"/path1", "$timezone": None},
)

# Key exists, it's empty string
_create_event(
team=self.team,
event="$pageview",
distinct_id=d2,
timestamp="2024-07-30",
properties={"$session_id": s2, "$pathname": "/path3", "$timezone": "America/Brasilia"},
distinct_id=did,
timestamp=f"2019-02-17 00:00:00",
properties={"$session_id": sid, "$pathname": f"/path1", "$timezone": ""},
)

flush_persons_and_events()

results = self._run_web_stats_table_query(
"all",
None,
breakdown_by=WebStatsBreakdown.TIMEZONE,
filter_test_accounts=True,
).results

assert results == [["America/Brasilia", 1.0, 2.0], ["America/New_York", 1.0, 1.0]]
# Don't crash, treat all of them null
assert results == []
Loading