From 25db3273c41ed7667925905b967d6b7d180959ba Mon Sep 17 00:00:00 2001 From: Raquel Smith Date: Tue, 2 Apr 2024 09:08:54 -0700 Subject: [PATCH] feat: start collecting enhanced_persons metrics in usage reports (#21246) * start collecting metrics in usage reports * Update query snapshots --------- Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com> --- .../test/__snapshots__/test_usage_report.ambr | 106 ++++++++++-------- posthog/tasks/test/test_usage_report.py | 6 + posthog/tasks/usage_report.py | 75 ++++++++++--- 3 files changed, 125 insertions(+), 62 deletions(-) diff --git a/posthog/tasks/test/__snapshots__/test_usage_report.ambr b/posthog/tasks/test/__snapshots__/test_usage_report.ambr index 4e5039c1df0b1..e2038e3de9f89 100644 --- a/posthog/tasks/test/__snapshots__/test_usage_report.ambr +++ b/posthog/tasks/test/__snapshots__/test_usage_report.ambr @@ -24,6 +24,19 @@ # --- # name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.10 ''' + + SELECT distinct_id as team, + sum(JSONExtractInt(properties, 'count')) as sum + FROM events + WHERE team_id = 2 + AND event='local evaluation usage' + AND timestamp between '2022-01-01 00:00:00' AND '2022-01-10 23:59:59' + AND has(['correct'], replaceRegexpAll(JSONExtractRaw(properties, 'token'), '^"|"$', '')) + GROUP BY team + ''' +# --- +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.11 + ''' WITH JSONExtractInt(log_comment, 'team_id') as team_id, JSONExtractString(log_comment, 'query_type') as query_type, JSONExtractString(log_comment, 'access_method') as access_method @@ -39,7 +52,7 @@ GROUP BY team_id ''' # --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.11 +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.12 ''' WITH JSONExtractInt(log_comment, 'team_id') as team_id, JSONExtractString(log_comment, 'query_type') as query_type, @@ -56,7 +69,7 @@ GROUP BY team_id ''' # --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.12 +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.13 ''' WITH JSONExtractInt(log_comment, 'team_id') as team_id, JSONExtractString(log_comment, 'query_type') as query_type, @@ -73,7 +86,7 @@ GROUP BY team_id ''' # --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.13 +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.14 ''' WITH JSONExtractInt(log_comment, 'team_id') as team_id, JSONExtractString(log_comment, 'query_type') as query_type, @@ -90,7 +103,7 @@ GROUP BY team_id ''' # --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.14 +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.15 ''' WITH JSONExtractInt(log_comment, 'team_id') as team_id, JSONExtractString(log_comment, 'query_type') as query_type, @@ -107,7 +120,7 @@ GROUP BY team_id ''' # --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.15 +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.16 ''' WITH JSONExtractInt(log_comment, 'team_id') as team_id, JSONExtractString(log_comment, 'query_type') as query_type, @@ -124,7 +137,7 @@ GROUP BY team_id ''' # --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.16 +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.17 ''' WITH JSONExtractInt(log_comment, 'team_id') as team_id, JSONExtractString(log_comment, 'query_type') as query_type, @@ -141,7 +154,7 @@ GROUP BY team_id ''' # --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.17 +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.18 ''' WITH JSONExtractInt(log_comment, 'team_id') as team_id, JSONExtractString(log_comment, 'query_type') as query_type, @@ -158,7 +171,7 @@ GROUP BY team_id ''' # --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.18 +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.19 ''' WITH JSONExtractInt(log_comment, 'team_id') as team_id, JSONExtractString(log_comment, 'query_type') as query_type, @@ -175,7 +188,21 @@ GROUP BY team_id ''' # --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.19 +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.2 + ''' + + SELECT team_id, + count(distinct toDate(timestamp), event, cityHash64(distinct_id), cityHash64(uuid)) as count + FROM events + WHERE timestamp between '2022-01-10 00:00:00' AND '2022-01-10 23:59:59' + AND event != '$feature_flag_called' + AND event NOT IN ('survey sent', + 'survey shown', + 'survey dismissed') + GROUP BY team_id + ''' +# --- +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.20 ''' WITH JSONExtractInt(log_comment, 'team_id') as team_id, JSONExtractString(log_comment, 'query_type') as query_type, @@ -192,21 +219,7 @@ GROUP BY team_id ''' # --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.2 - ''' - - SELECT team_id, - count(1) as count - FROM events - WHERE timestamp between '2022-01-01 00:00:00' AND '2022-01-10 23:59:59' - AND event != '$feature_flag_called' - AND event NOT IN ('survey sent', - 'survey shown', - 'survey dismissed') - GROUP BY team_id - ''' -# --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.20 +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.21 ''' WITH JSONExtractInt(log_comment, 'team_id') as team_id, JSONExtractString(log_comment, 'query_type') as query_type, @@ -223,7 +236,7 @@ GROUP BY team_id ''' # --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.21 +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.22 ''' WITH JSONExtractInt(log_comment, 'team_id') as team_id, JSONExtractString(log_comment, 'query_type') as query_type, @@ -240,7 +253,7 @@ GROUP BY team_id ''' # --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.22 +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.23 ''' SELECT team_id, @@ -251,7 +264,7 @@ GROUP BY team_id ''' # --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.23 +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.24 ''' SELECT team_id, @@ -262,7 +275,7 @@ GROUP BY team_id ''' # --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.24 +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.25 ''' SELECT team, @@ -283,6 +296,20 @@ # name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.3 ''' + SELECT team_id, + count(1) as count + FROM events + WHERE timestamp between '2022-01-01 00:00:00' AND '2022-01-10 23:59:59' + AND event != '$feature_flag_called' + AND event NOT IN ('survey sent', + 'survey shown', + 'survey dismissed') + GROUP BY team_id + ''' +# --- +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.4 + ''' + SELECT team_id, count(1) as count FROM events @@ -295,7 +322,7 @@ GROUP BY team_id ''' # --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.4 +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.5 ''' SELECT team_id, @@ -310,7 +337,7 @@ GROUP BY team_id ''' # --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.5 +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.6 ''' SELECT team_id, @@ -319,7 +346,7 @@ GROUP BY team_id ''' # --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.6 +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.7 ''' SELECT distinct_id as team, @@ -332,7 +359,7 @@ GROUP BY team ''' # --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.7 +# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.8 ''' SELECT distinct_id as team, @@ -345,19 +372,6 @@ GROUP BY team ''' # --- -# name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.8 - ''' - - SELECT distinct_id as team, - sum(JSONExtractInt(properties, 'count')) as sum - FROM events - WHERE team_id = 2 - AND event='local evaluation usage' - AND timestamp between '2022-01-10 00:00:00' AND '2022-01-10 23:59:59' - AND has(['correct'], replaceRegexpAll(JSONExtractRaw(properties, 'token'), '^"|"$', '')) - GROUP BY team - ''' -# --- # name: TestFeatureFlagsUsageReport.test_usage_report_decide_requests.9 ''' @@ -366,7 +380,7 @@ FROM events WHERE team_id = 2 AND event='local evaluation usage' - AND timestamp between '2022-01-01 00:00:00' AND '2022-01-10 23:59:59' + AND timestamp between '2022-01-10 00:00:00' AND '2022-01-10 23:59:59' AND has(['correct'], replaceRegexpAll(JSONExtractRaw(properties, 'token'), '^"|"$', '')) GROUP BY team ''' diff --git a/posthog/tasks/test/test_usage_report.py b/posthog/tasks/test/test_usage_report.py index 9f27af35da191..75e13fc845a70 100644 --- a/posthog/tasks/test/test_usage_report.py +++ b/posthog/tasks/test/test_usage_report.py @@ -369,6 +369,8 @@ def _test_usage_report(self) -> List[dict]: "instance_tag": "none", "event_count_lifetime": 55, "event_count_in_period": 22, + # TODO: enhanced_persons: modify this test so that there are fewer of these events than the base + "enhanced_persons_event_count_in_period": 22, "event_count_in_month": 42, "event_count_with_groups_in_period": 2, "recording_count_in_period": 5, @@ -411,6 +413,7 @@ def _test_usage_report(self) -> List[dict]: str(self.org_1_team_1.id): { "event_count_lifetime": 44, "event_count_in_period": 12, + "enhanced_persons_event_count_in_period": 12, "event_count_in_month": 32, "event_count_with_groups_in_period": 2, "recording_count_in_period": 0, @@ -447,6 +450,7 @@ def _test_usage_report(self) -> List[dict]: str(self.org_1_team_2.id): { "event_count_lifetime": 11, "event_count_in_period": 10, + "enhanced_persons_event_count_in_period": 10, "event_count_in_month": 10, "event_count_with_groups_in_period": 0, "recording_count_in_period": 5, @@ -506,6 +510,7 @@ def _test_usage_report(self) -> List[dict]: "instance_tag": "none", "event_count_lifetime": 11, "event_count_in_period": 10, + "enhanced_persons_event_count_in_period": 10, "event_count_in_month": 10, "event_count_with_groups_in_period": 0, "recording_count_in_period": 0, @@ -548,6 +553,7 @@ def _test_usage_report(self) -> List[dict]: str(self.org_2_team_3.id): { "event_count_lifetime": 11, "event_count_in_period": 10, + "enhanced_persons_event_count_in_period": 10, "event_count_in_month": 10, "event_count_with_groups_in_period": 0, "recording_count_in_period": 0, diff --git a/posthog/tasks/usage_report.py b/posthog/tasks/usage_report.py index c2daef6a4ec73..2a1f1f2591df9 100644 --- a/posthog/tasks/usage_report.py +++ b/posthog/tasks/usage_report.py @@ -76,6 +76,7 @@ class UsageReportCounters: event_count_lifetime: int event_count_in_period: int + enhanced_persons_event_count_in_period: int event_count_in_month: int event_count_with_groups_in_period: int # event_count_by_lib: Dict @@ -214,27 +215,31 @@ def get_instance_metadata(period: Tuple[datetime, datetime]) -> InstanceMetadata metadata.clickhouse_version = str(version_requirement.get_clickhouse_version()) metadata.users_who_logged_in = [ - {"id": user.id, "distinct_id": user.distinct_id} - if user.anonymize_data - else { - "id": user.id, - "distinct_id": user.distinct_id, - "first_name": user.first_name, - "email": user.email, - } + ( + {"id": user.id, "distinct_id": user.distinct_id} + if user.anonymize_data + else { + "id": user.id, + "distinct_id": user.distinct_id, + "first_name": user.first_name, + "email": user.email, + } + ) for user in User.objects.filter(is_active=True, last_login__gte=period_start, last_login__lte=period_end) ] metadata.users_who_logged_in_count = len(metadata.users_who_logged_in) metadata.users_who_signed_up = [ - {"id": user.id, "distinct_id": user.distinct_id} - if user.anonymize_data - else { - "id": user.id, - "distinct_id": user.distinct_id, - "first_name": user.first_name, - "email": user.email, - } + ( + {"id": user.id, "distinct_id": user.distinct_id} + if user.anonymize_data + else { + "id": user.id, + "distinct_id": user.distinct_id, + "first_name": user.first_name, + "email": user.email, + } + ) for user in User.objects.filter( is_active=True, date_joined__gte=period_start, @@ -411,6 +416,37 @@ def get_teams_with_billable_event_count_in_period( return result +@timed_log() +@retry(tries=QUERY_RETRIES, delay=QUERY_RETRY_DELAY, backoff=QUERY_RETRY_BACKOFF) +def get_teams_with_billable_enhanced_persons_event_count_in_period( + begin: datetime, end: datetime, count_distinct: bool = False +) -> List[Tuple[int, int]]: + # count only unique events + # Duplicate events will be eventually removed by ClickHouse and likely came from our library or pipeline. + # We shouldn't bill for these. However counting unique events is more expensive, and likely to fail on longer time ranges. + # So, we count uniques in small time periods only, controlled by the count_distinct parameter. + if count_distinct: + # Uses the same expression as the one used to de-duplicate events on the merge tree: + # https://github.com/PostHog/posthog/blob/master/posthog/models/event/sql.py#L92 + distinct_expression = "distinct toDate(timestamp), event, cityHash64(distinct_id), cityHash64(uuid)" + else: + distinct_expression = "1" + + # TODO: enhanced_persons: update this query to filter on enhanced_persons column + result = sync_execute( + f""" + SELECT team_id, count({distinct_expression}) as count + FROM events + WHERE timestamp between %(begin)s AND %(end)s AND event != '$feature_flag_called' AND event NOT IN ('survey sent', 'survey shown', 'survey dismissed') + GROUP BY team_id + """, + {"begin": begin, "end": end}, + workload=Workload.OFFLINE, + settings=CH_BILLING_SETTINGS, + ) + return result + + @timed_log() @retry(tries=QUERY_RETRIES, delay=QUERY_RETRY_DELAY, backoff=QUERY_RETRY_BACKOFF) def get_teams_with_event_count_with_groups_in_period(begin: datetime, end: datetime) -> List[Tuple[int, int]]: @@ -652,6 +688,7 @@ def capture_report( def has_non_zero_usage(report: FullUsageReport) -> bool: return ( report.event_count_in_period > 0 + or report.enhanced_persons_event_count_in_period > 0 or report.recording_count_in_period > 0 or report.decide_requests_count_in_period > 0 or report.local_evaluation_requests_count_in_period > 0 @@ -682,6 +719,9 @@ def _get_all_usage_data(period_start: datetime, period_end: datetime) -> Dict[st teams_with_event_count_in_period=get_teams_with_billable_event_count_in_period( period_start, period_end, count_distinct=True ), + teams_with_enhanced_persons_event_count_in_period=get_teams_with_billable_enhanced_persons_event_count_in_period( + period_start, period_end, count_distinct=True + ), teams_with_event_count_in_month=get_teams_with_billable_event_count_in_period( period_start.replace(day=1), period_end ), @@ -858,6 +898,9 @@ def _get_team_report(all_data: Dict[str, Any], team: Team) -> UsageReportCounter return UsageReportCounters( event_count_lifetime=all_data["teams_with_event_count_lifetime"].get(team.id, 0), event_count_in_period=all_data["teams_with_event_count_in_period"].get(team.id, 0), + enhanced_persons_event_count_in_period=all_data["teams_with_enhanced_persons_event_count_in_period"].get( + team.id, 0 + ), event_count_in_month=all_data["teams_with_event_count_in_month"].get(team.id, 0), event_count_with_groups_in_period=all_data["teams_with_event_count_with_groups_in_period"].get(team.id, 0), # event_count_by_lib: Di all_data["teams_with_#"].get(team.id, 0),