From c0652cf3ca07bb590e5336ad73ec55658e6d6f35 Mon Sep 17 00:00:00 2001 From: Rafael Audibert <32079912+rafaeelaudibert@users.noreply.github.com> Date: Tue, 26 Nov 2024 16:10:35 +0000 Subject: [PATCH] feat(web-analytics): Turn timezones into UTC offsets instead of timezone names (#26409) --- .../web-analytics/tiles/WebAnalyticsTile.tsx | 22 +++ posthog/hogql/functions/mapping.py | 4 +- .../web_analytics/stats_table.py | 13 +- .../test/test_web_stats_table.py | 136 +++++++++++++++--- 4 files changed, 149 insertions(+), 26 deletions(-) diff --git a/frontend/src/scenes/web-analytics/tiles/WebAnalyticsTile.tsx b/frontend/src/scenes/web-analytics/tiles/WebAnalyticsTile.tsx index 1820360e0ec39..44f85a38d8eea 100644 --- a/frontend/src/scenes/web-analytics/tiles/WebAnalyticsTile.tsx +++ b/frontend/src/scenes/web-analytics/tiles/WebAnalyticsTile.tsx @@ -19,6 +19,23 @@ import { DataTableNode, InsightVizNode, NodeKind, QuerySchema, WebStatsBreakdown import { QueryContext, QueryContextColumnComponent, QueryContextColumnTitleComponent } from '~/queries/types' import { ChartDisplayType, GraphPointPayload, InsightLogicProps, ProductKey, PropertyFilterType } from '~/types' +const toUtcOffsetFormat = (value: number): string => { + if (value === 0) { + return 'UTC' + } + + const integerPart = Math.floor(value) + const sign = integerPart > 0 ? '+' : '-' + + // India has half-hour offsets, and Australia has 45-minute offsets, why? + const decimalPart = value - integerPart + const decimalPartAsMinutes = decimalPart * 60 + const formattedMinutes = decimalPartAsMinutes > 0 ? `:${decimalPartAsMinutes}` : '' + + // E.g. UTC-3, UTC, UTC+5:30, UTC+11:45 + return `UTC${sign}${integerPart}${formattedMinutes}` +} + const PercentageCell: QueryContextColumnComponent = ({ value }) => { if (typeof value === 'number') { return {`${(value * 100).toFixed(1)}%`} @@ -121,6 +138,11 @@ const BreakdownValueCell: QueryContextColumnComponent = (props) => { ) } break + case WebStatsBreakdown.Timezone: + if (typeof value === 'number') { + return <>{toUtcOffsetFormat(value)} + } + break } if (typeof value === 'string') { diff --git a/posthog/hogql/functions/mapping.py b/posthog/hogql/functions/mapping.py index 3a8615d6cf100..9eb0980b9d933 100644 --- a/posthog/hogql/functions/mapping.py +++ b/posthog/hogql/functions/mapping.py @@ -421,13 +421,15 @@ def compare_types(arg_types: list[ConstantType], sig_arg_types: tuple[ConstantTy "toString": HogQLFunctionMeta( "toString", 1, - 1, + 2, signatures=[ ((IntegerType(),), StringType()), ((StringType(),), StringType()), ((FloatType(),), StringType()), ((DateType(),), StringType()), + ((DateType(), StringType()), StringType()), ((DateTimeType(),), StringType()), + ((DateTimeType(), StringType()), StringType()), ], ), "toJSONString": HogQLFunctionMeta("toJSONString", 1, 1), diff --git a/posthog/hogql_queries/web_analytics/stats_table.py b/posthog/hogql_queries/web_analytics/stats_table.py index a5c2b6640bf94..fef2c71af7fdb 100644 --- a/posthog/hogql_queries/web_analytics/stats_table.py +++ b/posthog/hogql_queries/web_analytics/stats_table.py @@ -77,6 +77,7 @@ def to_main_query(self) -> ast.SelectQuery: ) GROUP BY "context.columns.breakdown_value" ORDER BY "context.columns.visitors" DESC, +"context.columns.views" DESC, "context.columns.breakdown_value" ASC """, timings=self.timings, @@ -118,6 +119,7 @@ def _to_main_query_with_session_properties(self) -> ast.SelectQuery: ) GROUP BY "context.columns.breakdown_value" ORDER BY "context.columns.visitors" DESC, +"context.columns.views" DESC, "context.columns.breakdown_value" ASC """, timings=self.timings, @@ -162,6 +164,7 @@ def to_entry_bounce_query(self) -> ast.SelectQuery: ) GROUP BY "context.columns.breakdown_value" ORDER BY "context.columns.visitors" DESC, +"context.columns.views" DESC, "context.columns.breakdown_value" ASC """, timings=self.timings, @@ -269,6 +272,7 @@ def to_path_scroll_bounce_query(self) -> ast.SelectQuery: ) AS scroll ON counts.breakdown_value = scroll.breakdown_value ORDER BY "context.columns.visitors" DESC, +"context.columns.views" DESC, "context.columns.breakdown_value" ASC """, timings=self.timings, @@ -346,6 +350,7 @@ def to_path_bounce_query(self) -> ast.SelectQuery: ) as bounce ON counts.breakdown_value = bounce.breakdown_value ORDER BY "context.columns.visitors" DESC, +"context.columns.views" DESC, "context.columns.breakdown_value" ASC """, timings=self.timings, @@ -500,9 +505,11 @@ def _counts_breakdown_value(self): case WebStatsBreakdown.CITY: return parse_expr("tuple(properties.$geoip_country_code, properties.$geoip_city_name)") case WebStatsBreakdown.TIMEZONE: - # Timezone offsets would be slightly more useful, but that's not easily achievable - # with Clickhouse, we might attempt to change this in the future - return ast.Field(chain=["properties", "$timezone"]) + # Get the difference between the UNIX timestamp at UTC and the UNIX timestamp at the event's timezone + # Value is in milliseconds, turn it to hours, works even for fractional timezone offsets (I'm looking at you, Australia) + return parse_expr( + "if(or(isNull(properties.$timezone), empty(properties.$timezone)), NULL, (toUnixTimestamp64Milli(parseDateTimeBestEffort(assumeNotNull(toString(timestamp, properties.$timezone)))) - toUnixTimestamp64Milli(timestamp)) / 3600000)" + ) case _: raise NotImplementedError("Breakdown not implemented") diff --git a/posthog/hogql_queries/web_analytics/test/test_web_stats_table.py b/posthog/hogql_queries/web_analytics/test/test_web_stats_table.py index a58b8353daa1a..616d406978ca1 100644 --- a/posthog/hogql_queries/web_analytics/test/test_web_stats_table.py +++ b/posthog/hogql_queries/web_analytics/test/test_web_stats_table.py @@ -1006,50 +1006,142 @@ def test_cohort_test_filters(self): assert results == [["/path1", 1, 1]] def test_timezone_filter(self): - d1, s1 = "d1", str(uuid7("2024-07-30")) - d2, s2 = "d2", str(uuid7("2024-07-30")) + date = "2024-07-30" + + for idx, (distinct_id, session_id) in enumerate( + [ + ("UTC", str(uuid7(date))), + ("Asia/Calcutta", str(uuid7(date))), + ("America/New_York", str(uuid7(date))), + ("America/Sao_Paulo", str(uuid7(date))), + ] + ): + _create_person( + team_id=self.team.pk, + distinct_ids=[distinct_id], + properties={"name": session_id, "email": f"{distinct_id}@example.com"}, + ) + + for i in range(idx + 1): + _create_event( + team=self.team, + event="$pageview", + distinct_id=distinct_id, + timestamp=date, + properties={"$session_id": session_id, "$pathname": f"/path{i}", "$timezone": distinct_id}, + ) + + results = self._run_web_stats_table_query( + "all", + None, + breakdown_by=WebStatsBreakdown.TIMEZONE, + ).results + + # Brasilia UTC-3, New York UTC-4, Calcutta UTC+5:30, UTC + assert results == [[-3.0, 1.0, 4.0], [-4.0, 1.0, 3.0], [5.5, 1.0, 2.0], [0.0, 1.0, 1.0]] + + def test_timezone_filter_dst_change(self): + did = "id" + sid = str(uuid7("2019-02-17")) _create_person( team_id=self.team.pk, - distinct_ids=[d1], - properties={"name": d1, "email": "test@example.com"}, + distinct_ids=[did], + properties={"name": sid, "email": f"test@example.com"}, + ) + + # Cross daylight savings time change in Brazil + for i in range(6): + _create_event( + team=self.team, + event="$pageview", + distinct_id=did, + timestamp=f"2019-02-17 0{i}:00:00", + properties={"$session_id": sid, "$pathname": f"/path1", "$timezone": "America/Sao_Paulo"}, + ) + + results = self._run_web_stats_table_query( + "all", + None, + breakdown_by=WebStatsBreakdown.TIMEZONE, + ).results + + # Change from UTC-2 to UTC-3 in the middle of the night + assert results == [[-3.0, 1.0, 4.0], [-2.0, 1.0, 2.0]] + + def test_timezone_filter_with_invalid_timezone(self): + date = "2024-07-30" + + for idx, (distinct_id, session_id) in enumerate( + [ + ("UTC", str(uuid7(date))), + ("Timezone_not_exists", str(uuid7(date))), + ] + ): + _create_person( + team_id=self.team.pk, + distinct_ids=[distinct_id], + properties={"name": session_id, "email": f"{distinct_id}@example.com"}, + ) + + for i in range(idx + 1): + _create_event( + team=self.team, + event="$pageview", + distinct_id=distinct_id, + timestamp=date, + properties={"$session_id": session_id, "$pathname": f"/path{i}", "$timezone": distinct_id}, + ) + + with self.assertRaisesRegex(Exception, "Cannot load time zone"): + self._run_web_stats_table_query( + "all", + None, + breakdown_by=WebStatsBreakdown.TIMEZONE, + ) + + def test_timezone_filter_with_empty_timezone(self): + did = "id" + sid = str(uuid7("2019-02-17")) + + _create_person( + team_id=self.team.pk, + distinct_ids=[did], + properties={"name": sid, "email": f"test@example.com"}, ) + # Key not exists _create_event( team=self.team, event="$pageview", - distinct_id=d1, - timestamp="2024-07-30", - properties={"$session_id": s1, "$pathname": "/path1", "$timezone": "America/New_York"}, + distinct_id=did, + timestamp=f"2019-02-17 00:00:00", + properties={"$session_id": sid, "$pathname": f"/path1"}, ) - _create_person( - team_id=self.team.pk, - distinct_ids=[d2], - properties={"name": d2, "email": "d2@hedgebox.net"}, - ) + # Key exists, it's null _create_event( team=self.team, event="$pageview", - distinct_id=d2, - timestamp="2024-07-30", - properties={"$session_id": s2, "$pathname": "/path2", "$timezone": "America/Brasilia"}, + distinct_id=did, + timestamp=f"2019-02-17 00:00:00", + properties={"$session_id": sid, "$pathname": f"/path1", "$timezone": None}, ) + + # Key exists, it's empty string _create_event( team=self.team, event="$pageview", - distinct_id=d2, - timestamp="2024-07-30", - properties={"$session_id": s2, "$pathname": "/path3", "$timezone": "America/Brasilia"}, + distinct_id=did, + timestamp=f"2019-02-17 00:00:00", + properties={"$session_id": sid, "$pathname": f"/path1", "$timezone": ""}, ) - flush_persons_and_events() - results = self._run_web_stats_table_query( "all", None, breakdown_by=WebStatsBreakdown.TIMEZONE, - filter_test_accounts=True, ).results - assert results == [["America/Brasilia", 1.0, 2.0], ["America/New_York", 1.0, 1.0]] + # Don't crash, treat all of them null + assert results == []