From 65e5ba7143055d4dee08f58b235728fb46973a95 Mon Sep 17 00:00:00 2001
From: Julian Bez
Date: Wed, 6 Dec 2023 16:42:31 +0000
Subject: [PATCH] chore(hogql): Add retention queries in HogQL (#18831)

* Set up retention query runner

* Use and adapt existing code

* Streamline things by using HogQL AST

* Add feature flag for insights retention to frontend

---
 frontend/src/lib/constants.tsx                |    1 +
 .../queries/nodes/DataNode/dataNodeLogic.ts   |    4 +
 .../utils/filtersToQueryNode.test.ts          |   14 +-
 frontend/src/queries/query.ts                 |    5 +
 frontend/src/queries/schema.json              |  100 +-
 frontend/src/queries/schema.ts                |   19 +-
 .../insights/InsightNav/insightNavLogic.tsx   |    4 +-
 .../scenes/insights/summarizeInsight.test.ts  |   16 +-
 frontend/src/types.ts                         |   18 +-
 posthog/api/services/query.py                 |    1 +
 posthog/hogql/property.py                     |   30 +-
 posthog/hogql/test/test_property.py           |   38 +-
 .../insights/retention_query_runner.py        |  320 ++++
 .../test_retention_query_runner.ambr          |  567 ++++++
 .../test/test_retention_query_runner.py       | 1630 +++++++++++++++++
 .../test/test_filter_to_query.py              |    2 +-
 posthog/hogql_queries/query_runner.py         |   12 +
 .../hogql_queries/utils/query_date_range.py   |   65 +-
 .../utils/test/test_query_date_range.py       |   75 +-
 posthog/schema.py                             |   61 +-
 20 files changed, 2942 insertions(+), 40 deletions(-)
 create mode 100644 posthog/hogql_queries/insights/retention_query_runner.py
 create mode 100644 posthog/hogql_queries/insights/test/__snapshots__/test_retention_query_runner.ambr
 create mode 100644 posthog/hogql_queries/insights/test/test_retention_query_runner.py

diff --git a/frontend/src/lib/constants.tsx b/frontend/src/lib/constants.tsx
index 4771ac42c9ab2..3051a6e395dc4 100644
--- a/frontend/src/lib/constants.tsx
+++ b/frontend/src/lib/constants.tsx
@@ -165,6 +165,7 @@ export const FEATURE_FLAGS = {
     APPS_AND_EXPORTS_UI: 'apps-and-exports-ui', // owner: @benjackwhite
     SESSION_REPLAY_CORS_PROXY: 'session-replay-cors-proxy', // owner: #team-replay
     HOGQL_INSIGHTS_LIFECYCLE: 'hogql-insights-lifecycle', // owner: @mariusandra
+    HOGQL_INSIGHTS_RETENTION: 'hogql-insights-retention', // owner: @webjunkie
     HOGQL_INSIGHTS_TRENDS: 'hogql-insights-trends', // owner: @Gilbert09
     HOGQL_INSIGHT_LIVE_COMPARE: 'hogql-insight-live-compare', // owner: @mariusandra
     BI_VIZ: 'bi_viz', // owner: @Gilbert09
diff --git a/frontend/src/queries/nodes/DataNode/dataNodeLogic.ts b/frontend/src/queries/nodes/DataNode/dataNodeLogic.ts
index f6cb431fe51c2..0914fdf73fd33 100644
--- a/frontend/src/queries/nodes/DataNode/dataNodeLogic.ts
+++ b/frontend/src/queries/nodes/DataNode/dataNodeLogic.ts
@@ -345,6 +345,10 @@ export const dataNodeLogic = kea<dataNodeLogicType>([
             (s) => [s.featureFlags],
             (featureFlags) => !!featureFlags[FEATURE_FLAGS.HOGQL_INSIGHTS_LIFECYCLE],
         ],
+        hogQLInsightsRetentionFlagEnabled: [
+            (s) => [s.featureFlags],
+            (featureFlags) => !!featureFlags[FEATURE_FLAGS.HOGQL_INSIGHTS_RETENTION],
+        ],
         hogQLInsightsTrendsFlagEnabled: [
             (s) => [s.featureFlags],
             (featureFlags) => !!featureFlags[FEATURE_FLAGS.HOGQL_INSIGHTS_TRENDS],
diff --git a/frontend/src/queries/nodes/InsightQuery/utils/filtersToQueryNode.test.ts b/frontend/src/queries/nodes/InsightQuery/utils/filtersToQueryNode.test.ts
index d06bf3451adf9..6c92792b37412 100644
--- a/frontend/src/queries/nodes/InsightQuery/utils/filtersToQueryNode.test.ts
+++ b/frontend/src/queries/nodes/InsightQuery/utils/filtersToQueryNode.test.ts
@@ -167,7 +167,7 @@ describe('filtersToQueryNode', () => {
             const query: InsightQueryNode = {
                 kind: NodeKind.RetentionQuery,
                 filterTestAccounts: true,
-            }
+            } as InsightQueryNode
expect(result).toEqual(query) }) @@ -212,7 +212,7 @@ describe('filtersToQueryNode', () => { }, ], }, - } + } as InsightQueryNode expect(result).toEqual(query) }) @@ -231,7 +231,7 @@ describe('filtersToQueryNode', () => { date_to: '2021-12-08', date_from: '2021-12-08', }, - } + } as InsightQueryNode expect(result).toEqual(query) }) }) @@ -401,8 +401,8 @@ describe('filtersToQueryNode', () => { retention_type: 'retention_first_time', retention_reference: 'total', total_intervals: 2, - returning_entity: [{ a: 1 }], - target_entity: [{ b: 1 }], + returning_entity: { id: '1' }, + target_entity: { id: '1' }, period: RetentionPeriod.Day, } @@ -414,8 +414,8 @@ describe('filtersToQueryNode', () => { retention_type: 'retention_first_time', retention_reference: 'total', total_intervals: 2, - returning_entity: [{ a: 1 }], - target_entity: [{ b: 1 }], + returning_entity: { id: '1' }, + target_entity: { id: '1' }, period: RetentionPeriod.Day, }, } diff --git a/frontend/src/queries/query.ts b/frontend/src/queries/query.ts index 6a45e77ed7660..a6caddffdbe9b 100644 --- a/frontend/src/queries/query.ts +++ b/frontend/src/queries/query.ts @@ -30,6 +30,7 @@ import { isLifecycleQuery, isPersonsNode, isPersonsQuery, + isRetentionQuery, isTimeToSeeDataQuery, isTimeToSeeDataSessionsNode, isTimeToSeeDataSessionsQuery, @@ -148,6 +149,9 @@ export async function query( const hogQLInsightsLifecycleFlagEnabled = Boolean( featureFlagLogic.findMounted()?.values.featureFlags?.[FEATURE_FLAGS.HOGQL_INSIGHTS_LIFECYCLE] ) + const hogQLInsightsRetentionFlagEnabled = Boolean( + featureFlagLogic.findMounted()?.values.featureFlags?.[FEATURE_FLAGS.HOGQL_INSIGHTS_RETENTION] + ) const hogQLInsightsTrendsFlagEnabled = Boolean( featureFlagLogic.findMounted()?.values.featureFlags?.[FEATURE_FLAGS.HOGQL_INSIGHTS_TRENDS] ) @@ -193,6 +197,7 @@ export async function query( } else if (isInsightQueryNode(queryNode)) { if ( (hogQLInsightsLifecycleFlagEnabled && isLifecycleQuery(queryNode)) || + (hogQLInsightsRetentionFlagEnabled && isRetentionQuery(queryNode)) || (hogQLInsightsTrendsFlagEnabled && isTrendsQuery(queryNode)) ) { if (hogQLInsightsLiveCompareEnabled) { diff --git a/frontend/src/queries/schema.json b/frontend/src/queries/schema.json index 673c50177c4aa..c5333f60524e0 100644 --- a/frontend/src/queries/schema.json +++ b/frontend/src/queries/schema.json @@ -2536,6 +2536,34 @@ "required": ["key", "operator", "type", "value"], "type": "object" }, + "RetentionEntity": { + "additionalProperties": false, + "properties": { + "custom_name": { + "type": "string" + }, + "id": { + "type": ["string", "number"] + }, + "kind": { + "enum": ["ActionsNode", "EventsNode"], + "type": "string" + }, + "name": { + "type": "string" + }, + "order": { + "type": "number" + }, + "type": { + "$ref": "#/definitions/EntityType" + }, + "uuid": { + "type": "string" + } + }, + "type": "object" + }, "RetentionFilter": { "additionalProperties": false, "description": "`RetentionFilterType` minus everything inherited from `FilterType`", @@ -2551,13 +2579,13 @@ "$ref": "#/definitions/RetentionType" }, "returning_entity": { - "type": "object" + "$ref": "#/definitions/RetentionEntity" }, "target_entity": { - "type": "object" + "$ref": "#/definitions/RetentionEntity" }, "total_intervals": { - "type": "number" + "type": "integer" } }, "type": "object" @@ -2599,6 +2627,9 @@ ], "description": "Property filters for all series" }, + "response": { + "$ref": "#/definitions/RetentionQueryResponse" + }, "retentionFilter": { "$ref": "#/definitions/RetentionFilter", "description": 
"Properties specific to the retention insight" @@ -2608,13 +2639,74 @@ "type": ["number", "null"] } }, - "required": ["kind"], + "required": ["kind", "retentionFilter"], + "type": "object" + }, + "RetentionQueryResponse": { + "additionalProperties": false, + "properties": { + "hogql": { + "type": "string" + }, + "is_cached": { + "type": "boolean" + }, + "last_refresh": { + "type": "string" + }, + "next_allowed_client_refresh": { + "type": "string" + }, + "results": { + "items": { + "$ref": "#/definitions/RetentionResult" + }, + "type": "array" + }, + "timings": { + "items": { + "$ref": "#/definitions/QueryTiming" + }, + "type": "array" + } + }, + "required": ["results"], + "type": "object" + }, + "RetentionResult": { + "additionalProperties": false, + "properties": { + "date": { + "format": "date-time", + "type": "string" + }, + "label": { + "type": "string" + }, + "values": { + "items": { + "$ref": "#/definitions/RetentionValue" + }, + "type": "array" + } + }, + "required": ["values", "label", "date"], "type": "object" }, "RetentionType": { "enum": ["retention_recurring", "retention_first_time"], "type": "string" }, + "RetentionValue": { + "additionalProperties": false, + "properties": { + "count": { + "type": "integer" + } + }, + "required": ["count"], + "type": "object" + }, "SavedInsightNode": { "additionalProperties": false, "properties": { diff --git a/frontend/src/queries/schema.ts b/frontend/src/queries/schema.ts index fba21f8bc1d53..4fef6be10ee8f 100644 --- a/frontend/src/queries/schema.ts +++ b/frontend/src/queries/schema.ts @@ -512,10 +512,27 @@ export interface FunnelsQuery extends InsightsQueryBase { /** `RetentionFilterType` minus everything inherited from `FilterType` */ export type RetentionFilter = Omit + +export interface RetentionValue { + /** @asType integer */ + count: number +} + +export interface RetentionResult { + values: RetentionValue[] + label: string + /** @format date-time */ + date: string +} + +export interface RetentionQueryResponse extends QueryResponse { + results: RetentionResult[] +} export interface RetentionQuery extends InsightsQueryBase { kind: NodeKind.RetentionQuery + response?: RetentionQueryResponse /** Properties specific to the retention insight */ - retentionFilter?: RetentionFilter + retentionFilter: RetentionFilter } /** `PathsFilterType` minus everything inherited from `FilterType` and persons modal related params */ diff --git a/frontend/src/scenes/insights/InsightNav/insightNavLogic.tsx b/frontend/src/scenes/insights/InsightNav/insightNavLogic.tsx index 45738f06c0e44..ab72670df3483 100644 --- a/frontend/src/scenes/insights/InsightNav/insightNavLogic.tsx +++ b/frontend/src/scenes/insights/InsightNav/insightNavLogic.tsx @@ -57,9 +57,9 @@ export interface CommonInsightFilter Partial {} export interface QueryPropertyCache - extends Omit, 'kind'>, + extends Omit, 'kind' | 'response'>, Omit, 'kind'>, - Omit, 'kind'>, + Omit, 'kind' | 'response'>, Omit, 'kind'>, Omit, 'kind'>, Omit, 'kind'> { diff --git a/frontend/src/scenes/insights/summarizeInsight.test.ts b/frontend/src/scenes/insights/summarizeInsight.test.ts index 8bc060941642f..8ed20b2fdc3c1 100644 --- a/frontend/src/scenes/insights/summarizeInsight.test.ts +++ b/frontend/src/scenes/insights/summarizeInsight.test.ts @@ -308,12 +308,12 @@ describe('summarizing insights', () => { target_entity: { id: '$autocapture', name: '$autocapture', - type: 'event', + type: 'events', }, returning_entity: { id: '$autocapture', name: '$autocapture', - type: 'event', + type: 'events', }, retention_type: 
            } as RetentionFilterType,
@@ -333,12 +333,12 @@
                 target_entity: {
                     id: 'purchase',
                     name: 'purchase',
-                    type: 'event',
+                    type: 'events',
                 },
                 returning_entity: {
                     id: '$pageview',
                     name: '$pageview',
-                    type: 'event',
+                    type: 'events',
                 },
                 retention_type: RETENTION_RECURRING,
                 aggregation_group_type_index: 0,
@@ -731,12 +731,12 @@
                 target_entity: {
                     id: '$autocapture',
                     name: '$autocapture',
-                    type: 'event',
+                    type: 'events',
                 },
                 returning_entity: {
                     id: '$autocapture',
                     name: '$autocapture',
-                    type: 'event',
+                    type: 'events',
                 },
                 retention_type: RETENTION_FIRST_TIME,
             },
@@ -760,12 +760,12 @@
                 target_entity: {
                     id: 'purchase',
                     name: 'purchase',
-                    type: 'event',
+                    type: 'events',
                 },
                 returning_entity: {
                     id: '$pageview',
                     name: '$pageview',
-                    type: 'event',
+                    type: 'events',
                 },
                 retention_type: RETENTION_RECURRING,
             },
diff --git a/frontend/src/types.ts b/frontend/src/types.ts
index a1b8240bab7d6..964452da01fea 100644
--- a/frontend/src/types.ts
+++ b/frontend/src/types.ts
@@ -37,6 +37,7 @@ import type {
     InsightVizNode,
     Node,
 } from './queries/schema'
+import { NodeKind } from './queries/schema'
 
 export type Optional<T, K extends string | number | symbol> = Omit<T, K> & { [K in keyof T]?: T[K] }
 
@@ -1857,12 +1858,25 @@ export interface PathsFilterType extends FilterType {
     path_end_key?: string // Paths People End Key
     path_dropoff_key?: string // Paths People Dropoff Key
 }
+
+export interface RetentionEntity {
+    id?: string | number // TODO: Fix weird typing issues
+    kind?: NodeKind.ActionsNode | NodeKind.EventsNode
+    name?: string
+    type?: EntityType
+    // @asType integer
+    order?: number
+    uuid?: string
+    custom_name?: string
+}
+
 export interface RetentionFilterType extends FilterType {
     retention_type?: RetentionType
     retention_reference?: 'total' | 'previous' // retention wrt cohort size or previous period
+    /** @asType integer */
     total_intervals?: number // retention total intervals
-    returning_entity?: Record<string, any>
-    target_entity?: Record<string, any>
+    returning_entity?: RetentionEntity
+    target_entity?: RetentionEntity
     period?: RetentionPeriod
 }
 export interface LifecycleFilterType extends FilterType {
diff --git a/posthog/api/services/query.py b/posthog/api/services/query.py
index 48339aa38bad9..283d1ac7ab9ae 100644
--- a/posthog/api/services/query.py
+++ b/posthog/api/services/query.py
@@ -19,6 +19,7 @@
 QUERY_WITH_RUNNER = [
     "LifecycleQuery",
+    "RetentionQuery",
     "TrendsQuery",
     "WebOverviewQuery",
     "WebTopSourcesQuery",
diff --git a/posthog/hogql/property.py b/posthog/hogql/property.py
index 2617f36ddbb5d..ce4ea3bdfe14f 100644
--- a/posthog/hogql/property.py
+++ b/posthog/hogql/property.py
@@ -3,7 +3,13 @@
 
 from pydantic import BaseModel
 
-from posthog.constants import AUTOCAPTURE_EVENT, PropertyOperatorType
+from posthog.constants import (
+    AUTOCAPTURE_EVENT,
+    PropertyOperatorType,
+    TREND_FILTER_TYPE_ACTIONS,
+    TREND_FILTER_TYPE_EVENTS,
+    PAGEVIEW_EVENT,
+)
 from posthog.hogql import ast
 from posthog.hogql.base import AST
 from posthog.hogql.functions import HOGQL_AGGREGATIONS
@@ -27,6 +33,7 @@
     PropertyGroupFilter,
     PropertyGroupFilterValue,
     FilterLogicalOperator,
+    RetentionEntity,
 )
 
 
@@ -373,6 +380,27 @@ def action_to_expr(action: Action) -> ast.Expr:
     return ast.Or(exprs=or_queries)
 
 
+def entity_to_expr(entity: RetentionEntity, default_event=PAGEVIEW_EVENT) -> ast.Expr:
+    if entity.type == TREND_FILTER_TYPE_ACTIONS and entity.id is not None:
+        action = Action.objects.get(pk=entity.id)
+        return action_to_expr(action)
+    elif
entity.type == TREND_FILTER_TYPE_EVENTS: + if entity.id is None: + return ast.Constant(value=True) + + return ast.CompareOperation( + op=ast.CompareOperationOp.Eq, + left=ast.Field(chain=["events", "event"]), + right=ast.Constant(value=entity.id), + ) + + return ast.CompareOperation( + op=ast.CompareOperationOp.Eq, + left=ast.Field(chain=["events", "event"]), + right=ast.Constant(value=default_event), + ) + + def element_chain_key_filter(key: str, text: str, operator: PropertyOperator): escaped = text.replace('"', r"\"") if operator == PropertyOperator.is_set or operator == PropertyOperator.is_not_set: diff --git a/posthog/hogql/test/test_property.py b/posthog/hogql/test/test_property.py index 538ff3daf0ac1..52e57d9c61c76 100644 --- a/posthog/hogql/test/test_property.py +++ b/posthog/hogql/test/test_property.py @@ -1,6 +1,7 @@ from typing import List, Union, cast, Optional, Dict, Any, Literal +from unittest.mock import MagicMock, patch -from posthog.constants import PropertyOperatorType +from posthog.constants import PropertyOperatorType, TREND_FILTER_TYPE_ACTIONS, TREND_FILTER_TYPE_EVENTS from posthog.hogql import ast from posthog.hogql.parser import parse_expr from posthog.hogql.property import ( @@ -10,6 +11,7 @@ property_to_expr, selector_to_expr, tag_name_to_expr, + entity_to_expr, ) from posthog.hogql.visitor import clear_locations from posthog.models import ( @@ -22,7 +24,7 @@ ) from posthog.models.property import PropertyGroup from posthog.models.property_definition import PropertyType -from posthog.schema import HogQLPropertyFilter, PropertyOperator +from posthog.schema import HogQLPropertyFilter, PropertyOperator, RetentionEntity from posthog.test.base import BaseTest elements_chain_match = lambda x: parse_expr("elements_chain =~ {regex}", {"regex": ast.Constant(value=str(x))}) @@ -604,3 +606,35 @@ def test_person_scope(self): str(e.exception), "The 'event' property filter only works in 'event' scope, not in 'person' scope", ) + + def test_entity_to_expr_actions_type_with_id(self): + action_mock = MagicMock() + with patch("posthog.models.Action.objects.get", return_value=action_mock): + entity = RetentionEntity(**{"type": TREND_FILTER_TYPE_ACTIONS, "id": 123}) + result = entity_to_expr(entity) + self.assertIsInstance(result, ast.Expr) + + def test_entity_to_expr_events_type_with_id(self): + entity = RetentionEntity(**{"type": TREND_FILTER_TYPE_EVENTS, "id": "event_id"}) + result = entity_to_expr(entity) + expected = ast.CompareOperation( + op=ast.CompareOperationOp.Eq, + left=ast.Field(chain=["events", "event"]), + right=ast.Constant(value="event_id"), + ) + self.assertEqual(result, expected) + + def test_entity_to_expr_events_type_without_id(self): + entity = RetentionEntity(**{"type": TREND_FILTER_TYPE_EVENTS, "id": None}) + result = entity_to_expr(entity) + self.assertEqual(result, ast.Constant(value=True)) + + def test_entity_to_expr_default_case(self): + entity = RetentionEntity() + result = entity_to_expr(entity, default_event="default_event") + expected = ast.CompareOperation( + op=ast.CompareOperationOp.Eq, + left=ast.Field(chain=["events", "event"]), + right=ast.Constant(value="default_event"), + ) + self.assertEqual(result, expected) diff --git a/posthog/hogql_queries/insights/retention_query_runner.py b/posthog/hogql_queries/insights/retention_query_runner.py new file mode 100644 index 0000000000000..b1c05f650e938 --- /dev/null +++ b/posthog/hogql_queries/insights/retention_query_runner.py @@ -0,0 +1,320 @@ +from datetime import datetime, timedelta +from math import 
ceil +from typing import Any, Dict +from typing import Optional + +from posthog.caching.insights_api import BASE_MINIMUM_INSIGHT_REFRESH_INTERVAL, REDUCED_MINIMUM_INSIGHT_REFRESH_INTERVAL +from posthog.caching.utils import is_stale +from posthog.constants import ( + TREND_FILTER_TYPE_EVENTS, + RetentionQueryType, +) +from posthog.hogql import ast +from posthog.hogql.constants import LimitContext +from posthog.hogql.parser import parse_select +from posthog.hogql.printer import to_printed_hogql +from posthog.hogql.property import property_to_expr, entity_to_expr +from posthog.hogql.query import execute_hogql_query +from posthog.hogql.timings import HogQLTimings +from posthog.hogql_queries.query_runner import QueryRunner +from posthog.hogql_queries.utils.query_date_range import QueryDateRangeWithIntervals +from posthog.models import Team +from posthog.models.filters.mixins.utils import cached_property +from posthog.queries.util import correct_result_for_sampling +from posthog.schema import ( + HogQLQueryModifiers, + RetentionQueryResponse, + IntervalType, + RetentionEntity, +) +from posthog.schema import RetentionQuery, RetentionType + +DEFAULT_INTERVAL = IntervalType("day") +DEFAULT_TOTAL_INTERVALS = 11 + + +class RetentionQueryRunner(QueryRunner): + query: RetentionQuery + query_type = RetentionQuery + + def __init__( + self, + query: RetentionQuery | Dict[str, Any], + team: Team, + timings: Optional[HogQLTimings] = None, + modifiers: Optional[HogQLQueryModifiers] = None, + limit_context: Optional[LimitContext] = None, + ): + super().__init__(query, team=team, timings=timings, modifiers=modifiers, limit_context=limit_context) + + def get_applicable_entity(self, event_query_type): + default_entity = RetentionEntity( + **{ + "id": "$pageview", + "type": TREND_FILTER_TYPE_EVENTS, + } + ) + target_entity = self.query.retentionFilter.target_entity or default_entity + if event_query_type in [RetentionQueryType.TARGET, RetentionQueryType.TARGET_FIRST_TIME]: + return target_entity + + return self.query.retentionFilter.returning_entity or target_entity + + def retention_events_query(self, event_query_type) -> ast.SelectQuery: + start_of_interval_sql = self.query_date_range.get_start_of_interval_hogql( + source=ast.Field(chain=["events", "timestamp"]) + ) + + if event_query_type == RetentionQueryType.TARGET_FIRST_TIME: + event_date_expr = ast.Call(name="min", args=[start_of_interval_sql]) + else: + event_date_expr = start_of_interval_sql + + fields = [ + ast.Alias(alias="event_date", expr=event_date_expr), + ast.Alias(alias="target", expr=ast.Field(chain=["events", "person_id"])), + ] + + if event_query_type in [RetentionQueryType.TARGET, RetentionQueryType.TARGET_FIRST_TIME]: + source_timestamp = ast.Field(chain=["events", "timestamp"]) + if event_query_type == RetentionQueryType.TARGET_FIRST_TIME: + source_timestamp = ast.Call( + name="min", + args=[source_timestamp], + ) + + datediff_call = ast.Call( + name="dateDiff", + args=[ + ast.Constant(value=self.query_date_range.interval_name), + self.query_date_range.get_start_of_interval_hogql(), + self.query_date_range.get_start_of_interval_hogql( + source=source_timestamp, + ), + ], + ) + fields.append( + ast.Alias(alias="breakdown_values", expr=ast.Array(exprs=[datediff_call])), + ) + + event_filters = [ + entity_to_expr(entity=self.get_applicable_entity(event_query_type)), + ] + + if self.query.properties is not None and self.query.properties != []: + event_filters.append(property_to_expr(self.query.properties, self.team)) + + if ( + 
self.query.filterTestAccounts + and isinstance(self.team.test_account_filters, list) + and len(self.team.test_account_filters) > 0 + ): + for prop in self.team.test_account_filters: + event_filters.append(property_to_expr(prop, self.team)) + + date_filter_expr = self.date_filter_expr(event_query_type) + if event_query_type != RetentionQueryType.TARGET_FIRST_TIME: + event_filters.append(date_filter_expr) + + group_by_fields = None + having_expr = None + if event_query_type == RetentionQueryType.TARGET_FIRST_TIME: + group_by_fields = [ast.Field(chain=["target"])] + having_expr = date_filter_expr + elif event_query_type == RetentionQueryType.RETURNING: + group_by_fields = [ast.Field(chain=["target"]), ast.Field(chain=["event_date"])] + + result = ast.SelectQuery( + select=fields, + distinct=event_query_type == RetentionQueryType.TARGET, + select_from=ast.JoinExpr(table=ast.Field(chain=["events"])), + where=ast.And(exprs=event_filters), + group_by=group_by_fields, + having=having_expr, + ) + + if self.query.samplingFactor is not None and isinstance(self.query.samplingFactor, float): + result.select_from.sample = ast.SampleExpr( + sample_value=ast.RatioExpr(left=ast.Constant(value=self.query.samplingFactor)) + ) + + return result + + def date_filter_expr(self, event_query_type) -> ast.Expr: + field_to_compare = ( + ast.Field(chain=["event_date"]) + if event_query_type == RetentionQueryType.TARGET_FIRST_TIME + else ast.Field(chain=["events", "timestamp"]) + ) + return ast.And( + exprs=[ + ast.CompareOperation( + op=ast.CompareOperationOp.GtEq, + left=field_to_compare, + right=ast.Constant(value=self.query_date_range.date_from()), + ), + ast.CompareOperation( + op=ast.CompareOperationOp.LtEq, + left=field_to_compare, + right=ast.Constant(value=self.query_date_range.date_to()), + ), + ] + ) + + def build_target_event_query(self) -> ast.SelectQuery: + event_query_type = ( + RetentionQueryType.TARGET_FIRST_TIME + if self.query.retentionFilter.retention_type == RetentionType.retention_first_time + else RetentionQueryType.TARGET + ) + return self.retention_events_query(event_query_type=event_query_type) + + def build_returning_event_query(self) -> ast.SelectQuery: + return self.retention_events_query(event_query_type=RetentionQueryType.RETURNING) + + def actor_query(self) -> ast.SelectQuery: + placeholders = { + **self.query_date_range.to_placeholders(), + "returning_event_query": self.build_returning_event_query(), + "target_event_query": self.build_target_event_query(), + "breakdown_values_filter": ast.Constant(value=None), + "selected_interval": ast.Constant(value=None), + } + return parse_select( + """ + SELECT DISTINCT breakdown_values, + intervals_from_base, + actor_id + + FROM ( + SELECT target_event.breakdown_values AS breakdown_values, + dateDiff( + {interval}, + target_event.event_date, + returning_event.event_date + ) AS intervals_from_base, + returning_event.target AS actor_id + + FROM {target_event_query} AS target_event + JOIN {returning_event_query} AS returning_event + ON returning_event.target = target_event.target + + WHERE returning_event.event_date > target_event.event_date + + UNION ALL + + SELECT target_event.breakdown_values AS breakdown_values, + 0 AS intervals_from_base, + target_event.target AS actor_id + + FROM {target_event_query} AS target_event + ) + + WHERE ({breakdown_values_filter} is NULL OR breakdown_values = {breakdown_values_filter}) + AND ({selected_interval} is NULL OR intervals_from_base = {selected_interval}) + """, + placeholders, + ) + + def to_query(self) -> 
ast.SelectQuery | ast.SelectUnionQuery: + placeholders = { + "actor_query": self.actor_query(), + } + with self.timings.measure("retention_query"): + retention_query = parse_select( + """ + SELECT actor_activity.breakdown_values AS breakdown_values, + actor_activity.intervals_from_base AS intervals_from_base, + COUNT(DISTINCT actor_activity.actor_id) AS count + + FROM {actor_query} AS actor_activity + + GROUP BY breakdown_values, + intervals_from_base + + ORDER BY breakdown_values, + intervals_from_base + + LIMIT 10000 + """, + placeholders, + timings=self.timings, + ) + return retention_query + + @cached_property + def query_date_range(self) -> QueryDateRangeWithIntervals: + total_intervals = self.query.retentionFilter.total_intervals or DEFAULT_TOTAL_INTERVALS + interval = ( + IntervalType(self.query.retentionFilter.period.lower()) + if self.query.retentionFilter.period + else DEFAULT_INTERVAL + ) + + return QueryDateRangeWithIntervals( + date_range=self.query.dateRange, + total_intervals=total_intervals, + team=self.team, + interval=interval, + now=datetime.now(), + ) + + def _is_stale(self, cached_result_package): + date_to = self.query_date_range.date_to() + interval = self.query_date_range.interval_name + return is_stale(self.team, date_to, interval, cached_result_package) + + def _refresh_frequency(self): + date_to = self.query_date_range.date_to() + date_from = self.query_date_range.date_from() + interval = self.query_date_range.interval_name + + delta_days: Optional[int] = None + if date_from and date_to: + delta = date_to - date_from + delta_days = ceil(delta.total_seconds() / timedelta(days=1).total_seconds()) + + refresh_frequency = BASE_MINIMUM_INSIGHT_REFRESH_INTERVAL + if interval == "hour" or (delta_days is not None and delta_days <= 7): + # The interval is shorter for short-term insights + refresh_frequency = REDUCED_MINIMUM_INSIGHT_REFRESH_INTERVAL + + return refresh_frequency + + def calculate(self) -> RetentionQueryResponse: + query = self.to_query() + hogql = to_printed_hogql(query, self.team.pk) + + response = execute_hogql_query( + query_type="RetentionQuery", + query=query, + team=self.team, + timings=self.timings, + modifiers=self.modifiers, + ) + + result_dict = { + (tuple(breakdown_values), intervals_from_base): { + "count": correct_result_for_sampling(count, self.query.samplingFactor), + } + for (breakdown_values, intervals_from_base, count) in response.results + } + + results = [ + { + "values": [ + result_dict.get(((first_interval,), return_interval), {"count": 0}) + for return_interval in range(self.query_date_range.total_intervals - first_interval) + ], + "label": f"{self.query_date_range.interval_name.title()} {first_interval}", + "date": ( + self.query_date_range.date_from() + + self.query_date_range.determine_time_delta( + first_interval, self.query_date_range.interval_name.title() + ) + ), + } + for first_interval in range(self.query_date_range.total_intervals) + ] + + return RetentionQueryResponse(results=results, timings=response.timings, hogql=hogql) diff --git a/posthog/hogql_queries/insights/test/__snapshots__/test_retention_query_runner.ambr b/posthog/hogql_queries/insights/test/__snapshots__/test_retention_query_runner.ambr new file mode 100644 index 0000000000000..bd6e5436ad1d5 --- /dev/null +++ b/posthog/hogql_queries/insights/test/__snapshots__/test_retention_query_runner.ambr @@ -0,0 +1,567 @@ +# name: TestRetention.test_day_interval_sampled + ' + SELECT actor_activity.breakdown_values AS breakdown_values, + actor_activity.intervals_from_base 
AS intervals_from_base, + count(DISTINCT actor_activity.actor_id) AS count + FROM + (SELECT DISTINCT breakdown_values, + intervals_from_base, + actor_id + FROM + (SELECT target_event.breakdown_values AS breakdown_values, + dateDiff('day', target_event.event_date, returning_event.event_date) AS intervals_from_base, + returning_event.target AS actor_id + FROM + (SELECT DISTINCT toStartOfDay(toTimeZone(events.timestamp, 'UTC')) AS event_date, + events__pdi.person_id AS target, + [dateDiff('day', toStartOfDay(toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), toStartOfDay(toTimeZone(events.timestamp, 'UTC')))] AS breakdown_values + FROM events SAMPLE 1.0 + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$pageview'), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-21 00:00:00.000000', 6, 'UTC'))))) AS target_event + JOIN + (SELECT toStartOfDay(toTimeZone(events.timestamp, 'UTC')) AS event_date, + events__pdi.person_id AS target + FROM events SAMPLE 1.0 + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$pageview'), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-21 00:00:00.000000', 6, 'UTC')))) + GROUP BY target, + event_date) AS returning_event ON equals(returning_event.target, target_event.target) + WHERE ifNull(greater(returning_event.event_date, target_event.event_date), 0) + UNION ALL SELECT target_event.breakdown_values AS breakdown_values, + 0 AS intervals_from_base, + target_event.target AS actor_id + FROM + (SELECT DISTINCT toStartOfDay(toTimeZone(events.timestamp, 'UTC')) AS event_date, + events__pdi.person_id AS target, + [dateDiff('day', toStartOfDay(toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), toStartOfDay(toTimeZone(events.timestamp, 'UTC')))] AS breakdown_values + FROM events SAMPLE 1.0 + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$pageview'), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), 
lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-21 00:00:00.000000', 6, 'UTC'))))) AS target_event) + WHERE and(or(1, isNull(breakdown_values)), or(1, isNull(intervals_from_base)))) AS actor_activity + GROUP BY breakdown_values, + intervals_from_base + ORDER BY breakdown_values ASC, + intervals_from_base ASC + LIMIT 10000 SETTINGS readonly=2, + max_execution_time=60, + allow_experimental_object_type=1 + ' +--- +# name: TestRetention.test_month_interval_with_person_on_events_v2 + ' + + SELECT distinct_id, + person_id + FROM events + WHERE team_id = 2 + AND distinct_id IN ('person1', + 'person2') + GROUP BY distinct_id, + person_id + ORDER BY if(distinct_id = 'person1', -1, 0) + ' +--- +# name: TestRetention.test_month_interval_with_person_on_events_v2.1 + ' + SELECT actor_activity.breakdown_values AS breakdown_values, + actor_activity.intervals_from_base AS intervals_from_base, + count(DISTINCT actor_activity.actor_id) AS count + FROM + (SELECT DISTINCT breakdown_values, + intervals_from_base, + actor_id + FROM + (SELECT target_event.breakdown_values AS breakdown_values, + dateDiff('month', target_event.event_date, returning_event.event_date) AS intervals_from_base, + returning_event.target AS actor_id + FROM + (SELECT DISTINCT toStartOfMonth(toTimeZone(events.timestamp, 'UTC')) AS event_date, + ifNull(nullIf(events__override.override_person_id, '00000000-0000-0000-0000-000000000000'), events.person_id) AS target, + [dateDiff('month', toStartOfMonth(toDateTime64('2020-01-10 00:00:00.000000', 6, 'UTC')), toStartOfMonth(toTimeZone(events.timestamp, 'UTC')))] AS breakdown_values + FROM events + LEFT OUTER JOIN + (SELECT argMax(person_overrides.override_person_id, person_overrides.version) AS override_person_id, + person_overrides.old_person_id AS old_person_id + FROM person_overrides + WHERE equals(person_overrides.team_id, 2) + GROUP BY person_overrides.old_person_id) AS events__override ON equals(events.person_id, events__override.old_person_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$pageview'), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-01-10 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-12-10 00:00:00.000000', 6, 'UTC'))))) AS target_event + JOIN + (SELECT toStartOfMonth(toTimeZone(events.timestamp, 'UTC')) AS event_date, + ifNull(nullIf(events__override.override_person_id, '00000000-0000-0000-0000-000000000000'), events.person_id) AS target + FROM events + LEFT OUTER JOIN + (SELECT argMax(person_overrides.override_person_id, person_overrides.version) AS override_person_id, + person_overrides.old_person_id AS old_person_id + FROM person_overrides + WHERE equals(person_overrides.team_id, 2) + GROUP BY person_overrides.old_person_id) AS events__override ON equals(events.person_id, events__override.old_person_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$pageview'), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-01-10 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-12-10 00:00:00.000000', 6, 'UTC')))) + GROUP BY target, + event_date) AS returning_event ON equals(returning_event.target, target_event.target) + WHERE ifNull(greater(returning_event.event_date, target_event.event_date), 0) + UNION ALL SELECT target_event.breakdown_values AS breakdown_values, + 0 AS intervals_from_base, + target_event.target AS actor_id + FROM + (SELECT DISTINCT 
toStartOfMonth(toTimeZone(events.timestamp, 'UTC')) AS event_date, + ifNull(nullIf(events__override.override_person_id, '00000000-0000-0000-0000-000000000000'), events.person_id) AS target, + [dateDiff('month', toStartOfMonth(toDateTime64('2020-01-10 00:00:00.000000', 6, 'UTC')), toStartOfMonth(toTimeZone(events.timestamp, 'UTC')))] AS breakdown_values + FROM events + LEFT OUTER JOIN + (SELECT argMax(person_overrides.override_person_id, person_overrides.version) AS override_person_id, + person_overrides.old_person_id AS old_person_id + FROM person_overrides + WHERE equals(person_overrides.team_id, 2) + GROUP BY person_overrides.old_person_id) AS events__override ON equals(events.person_id, events__override.old_person_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$pageview'), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-01-10 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-12-10 00:00:00.000000', 6, 'UTC'))))) AS target_event) + WHERE and(or(1, isNull(breakdown_values)), or(1, isNull(intervals_from_base)))) AS actor_activity + GROUP BY breakdown_values, + intervals_from_base + ORDER BY breakdown_values ASC, + intervals_from_base ASC + LIMIT 10000 SETTINGS readonly=2, + max_execution_time=60, + allow_experimental_object_type=1 + ' +--- +# name: TestRetention.test_retention_event_action + ' + SELECT actor_activity.breakdown_values AS breakdown_values, + actor_activity.intervals_from_base AS intervals_from_base, + count(DISTINCT actor_activity.actor_id) AS count + FROM + (SELECT DISTINCT breakdown_values, + intervals_from_base, + actor_id + FROM + (SELECT target_event.breakdown_values AS breakdown_values, + dateDiff('day', target_event.event_date, returning_event.event_date) AS intervals_from_base, + returning_event.target AS actor_id + FROM + (SELECT DISTINCT toStartOfDay(toTimeZone(events.timestamp, 'UTC')) AS event_date, + events__pdi.person_id AS target, + [dateDiff('day', toStartOfDay(toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), toStartOfDay(toTimeZone(events.timestamp, 'UTC')))] AS breakdown_values + FROM events + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + WHERE and(equals(events.team_id, 2), equals(events.event, 'sign up'), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-17 00:00:00.000000', 6, 'UTC'))))) AS target_event + JOIN + (SELECT toStartOfDay(toTimeZone(events.timestamp, 'UTC')) AS event_date, + events__pdi.person_id AS target + FROM events + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$some_event'), 
and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-17 00:00:00.000000', 6, 'UTC')))) + GROUP BY target, + event_date) AS returning_event ON equals(returning_event.target, target_event.target) + WHERE ifNull(greater(returning_event.event_date, target_event.event_date), 0) + UNION ALL SELECT target_event.breakdown_values AS breakdown_values, + 0 AS intervals_from_base, + target_event.target AS actor_id + FROM + (SELECT DISTINCT toStartOfDay(toTimeZone(events.timestamp, 'UTC')) AS event_date, + events__pdi.person_id AS target, + [dateDiff('day', toStartOfDay(toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), toStartOfDay(toTimeZone(events.timestamp, 'UTC')))] AS breakdown_values + FROM events + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + WHERE and(equals(events.team_id, 2), equals(events.event, 'sign up'), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-17 00:00:00.000000', 6, 'UTC'))))) AS target_event) + WHERE and(or(1, isNull(breakdown_values)), or(1, isNull(intervals_from_base)))) AS actor_activity + GROUP BY breakdown_values, + intervals_from_base + ORDER BY breakdown_values ASC, + intervals_from_base ASC + LIMIT 10000 SETTINGS readonly=2, + max_execution_time=60, + allow_experimental_object_type=1 + ' +--- +# name: TestRetention.test_retention_with_user_properties_via_action + ' + SELECT actor_activity.breakdown_values AS breakdown_values, + actor_activity.intervals_from_base AS intervals_from_base, + count(DISTINCT actor_activity.actor_id) AS count + FROM + (SELECT DISTINCT breakdown_values, + intervals_from_base, + actor_id + FROM + (SELECT target_event.breakdown_values AS breakdown_values, + dateDiff('day', target_event.event_date, returning_event.event_date) AS intervals_from_base, + returning_event.target AS actor_id + FROM + (SELECT DISTINCT toStartOfDay(toTimeZone(events.timestamp, 'UTC')) AS event_date, + events__pdi.person_id AS target, + [dateDiff('day', toStartOfDay(toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), toStartOfDay(toTimeZone(events.timestamp, 'UTC')))] AS breakdown_values + FROM events + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + INNER JOIN + (SELECT person.id, + replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, 'email'), ''), 'null'), '^"|"$', '') AS properties___email + FROM person + WHERE and(equals(person.team_id, 2), ifNull(in(tuple(person.id, person.version), + (SELECT person.id, max(person.version) AS version + FROM person + WHERE equals(person.team_id, 2) + GROUP BY person.id + HAVING 
ifNull(equals(argMax(person.is_deleted, person.version), 0), 0))), 0)) SETTINGS optimize_aggregation_in_order=1) AS events__pdi__person ON equals(events__pdi.person_id, events__pdi__person.id) + WHERE and(equals(events.team_id, 2), and(equals(events.event, '$pageview'), ifNull(equals(events__pdi__person.properties___email, 'person1@test.com'), 0)), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-17 00:00:00.000000', 6, 'UTC'))))) AS target_event + JOIN + (SELECT toStartOfDay(toTimeZone(events.timestamp, 'UTC')) AS event_date, + events__pdi.person_id AS target + FROM events + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$pageview'), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-17 00:00:00.000000', 6, 'UTC')))) + GROUP BY target, + event_date) AS returning_event ON equals(returning_event.target, target_event.target) + WHERE ifNull(greater(returning_event.event_date, target_event.event_date), 0) + UNION ALL SELECT target_event.breakdown_values AS breakdown_values, + 0 AS intervals_from_base, + target_event.target AS actor_id + FROM + (SELECT DISTINCT toStartOfDay(toTimeZone(events.timestamp, 'UTC')) AS event_date, + events__pdi.person_id AS target, + [dateDiff('day', toStartOfDay(toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), toStartOfDay(toTimeZone(events.timestamp, 'UTC')))] AS breakdown_values + FROM events + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + INNER JOIN + (SELECT person.id, + replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(person.properties, 'email'), ''), 'null'), '^"|"$', '') AS properties___email + FROM person + WHERE and(equals(person.team_id, 2), ifNull(in(tuple(person.id, person.version), + (SELECT person.id, max(person.version) AS version + FROM person + WHERE equals(person.team_id, 2) + GROUP BY person.id + HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0))), 0)) SETTINGS optimize_aggregation_in_order=1) AS events__pdi__person ON equals(events__pdi.person_id, events__pdi__person.id) + WHERE and(equals(events.team_id, 2), and(equals(events.event, '$pageview'), ifNull(equals(events__pdi__person.properties___email, 'person1@test.com'), 0)), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-17 00:00:00.000000', 6, 'UTC'))))) AS target_event) + WHERE and(or(1, isNull(breakdown_values)), or(1, 
isNull(intervals_from_base)))) AS actor_activity + GROUP BY breakdown_values, + intervals_from_base + ORDER BY breakdown_values ASC, + intervals_from_base ASC + LIMIT 10000 SETTINGS readonly=2, + max_execution_time=60, + allow_experimental_object_type=1 + ' +--- +# name: TestRetention.test_timezones + ' + SELECT actor_activity.breakdown_values AS breakdown_values, + actor_activity.intervals_from_base AS intervals_from_base, + count(DISTINCT actor_activity.actor_id) AS count + FROM + (SELECT DISTINCT breakdown_values, + intervals_from_base, + actor_id + FROM + (SELECT target_event.breakdown_values AS breakdown_values, + dateDiff('day', target_event.event_date, returning_event.event_date) AS intervals_from_base, + returning_event.target AS actor_id + FROM + (SELECT DISTINCT toStartOfDay(toTimeZone(events.timestamp, 'UTC')) AS event_date, + events__pdi.person_id AS target, + [dateDiff('day', toStartOfDay(toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), toStartOfDay(toTimeZone(events.timestamp, 'UTC')))] AS breakdown_values + FROM events + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$pageview'), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-21 00:00:00.000000', 6, 'UTC'))))) AS target_event + JOIN + (SELECT toStartOfDay(toTimeZone(events.timestamp, 'UTC')) AS event_date, + events__pdi.person_id AS target + FROM events + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$pageview'), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-21 00:00:00.000000', 6, 'UTC')))) + GROUP BY target, + event_date) AS returning_event ON equals(returning_event.target, target_event.target) + WHERE ifNull(greater(returning_event.event_date, target_event.event_date), 0) + UNION ALL SELECT target_event.breakdown_values AS breakdown_values, + 0 AS intervals_from_base, + target_event.target AS actor_id + FROM + (SELECT DISTINCT toStartOfDay(toTimeZone(events.timestamp, 'UTC')) AS event_date, + events__pdi.person_id AS target, + [dateDiff('day', toStartOfDay(toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), toStartOfDay(toTimeZone(events.timestamp, 'UTC')))] AS breakdown_values + FROM events + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id 
+ HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$pageview'), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-10 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-21 00:00:00.000000', 6, 'UTC'))))) AS target_event) + WHERE and(or(1, isNull(breakdown_values)), or(1, isNull(intervals_from_base)))) AS actor_activity + GROUP BY breakdown_values, + intervals_from_base + ORDER BY breakdown_values ASC, + intervals_from_base ASC + LIMIT 10000 SETTINGS readonly=2, + max_execution_time=60, + allow_experimental_object_type=1 + ' +--- +# name: TestRetention.test_timezones.1 + ' + SELECT actor_activity.breakdown_values AS breakdown_values, + actor_activity.intervals_from_base AS intervals_from_base, + count(DISTINCT actor_activity.actor_id) AS count + FROM + (SELECT DISTINCT breakdown_values, + intervals_from_base, + actor_id + FROM + (SELECT target_event.breakdown_values AS breakdown_values, + dateDiff('day', target_event.event_date, returning_event.event_date) AS intervals_from_base, + returning_event.target AS actor_id + FROM + (SELECT DISTINCT toStartOfDay(toTimeZone(events.timestamp, 'US/Pacific')) AS event_date, + events__pdi.person_id AS target, + [dateDiff('day', toStartOfDay(toDateTime64('2020-06-10 00:00:00.000000', 6, 'US/Pacific')), toStartOfDay(toTimeZone(events.timestamp, 'US/Pacific')))] AS breakdown_values + FROM events + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$pageview'), and(greaterOrEquals(toTimeZone(events.timestamp, 'US/Pacific'), toDateTime64('2020-06-10 00:00:00.000000', 6, 'US/Pacific')), lessOrEquals(toTimeZone(events.timestamp, 'US/Pacific'), toDateTime64('2020-06-21 00:00:00.000000', 6, 'US/Pacific'))))) AS target_event + JOIN + (SELECT toStartOfDay(toTimeZone(events.timestamp, 'US/Pacific')) AS event_date, + events__pdi.person_id AS target + FROM events + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$pageview'), and(greaterOrEquals(toTimeZone(events.timestamp, 'US/Pacific'), toDateTime64('2020-06-10 00:00:00.000000', 6, 'US/Pacific')), lessOrEquals(toTimeZone(events.timestamp, 'US/Pacific'), toDateTime64('2020-06-21 00:00:00.000000', 6, 'US/Pacific')))) + GROUP BY target, + event_date) AS returning_event ON equals(returning_event.target, target_event.target) + WHERE ifNull(greater(returning_event.event_date, target_event.event_date), 0) + UNION ALL SELECT target_event.breakdown_values AS breakdown_values, + 0 AS 
intervals_from_base, + target_event.target AS actor_id + FROM + (SELECT DISTINCT toStartOfDay(toTimeZone(events.timestamp, 'US/Pacific')) AS event_date, + events__pdi.person_id AS target, + [dateDiff('day', toStartOfDay(toDateTime64('2020-06-10 00:00:00.000000', 6, 'US/Pacific')), toStartOfDay(toTimeZone(events.timestamp, 'US/Pacific')))] AS breakdown_values + FROM events + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$pageview'), and(greaterOrEquals(toTimeZone(events.timestamp, 'US/Pacific'), toDateTime64('2020-06-10 00:00:00.000000', 6, 'US/Pacific')), lessOrEquals(toTimeZone(events.timestamp, 'US/Pacific'), toDateTime64('2020-06-21 00:00:00.000000', 6, 'US/Pacific'))))) AS target_event) + WHERE and(or(1, isNull(breakdown_values)), or(1, isNull(intervals_from_base)))) AS actor_activity + GROUP BY breakdown_values, + intervals_from_base + ORDER BY breakdown_values ASC, + intervals_from_base ASC + LIMIT 10000 SETTINGS readonly=2, + max_execution_time=60, + allow_experimental_object_type=1 + ' +--- +# name: TestRetention.test_week_interval + ' + SELECT actor_activity.breakdown_values AS breakdown_values, + actor_activity.intervals_from_base AS intervals_from_base, + count(DISTINCT actor_activity.actor_id) AS count + FROM + (SELECT DISTINCT breakdown_values, + intervals_from_base, + actor_id + FROM + (SELECT target_event.breakdown_values AS breakdown_values, + dateDiff('week', target_event.event_date, returning_event.event_date) AS intervals_from_base, + returning_event.target AS actor_id + FROM + (SELECT DISTINCT toStartOfWeek(toTimeZone(events.timestamp, 'UTC'), 0) AS event_date, + events__pdi.person_id AS target, + [dateDiff('week', toStartOfWeek(toDateTime64('2020-06-07 00:00:00.000000', 6, 'UTC'), 0), toStartOfWeek(toTimeZone(events.timestamp, 'UTC'), 0))] AS breakdown_values + FROM events + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$pageview'), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-07 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-07-27 00:00:00.000000', 6, 'UTC'))))) AS target_event + JOIN + (SELECT toStartOfWeek(toTimeZone(events.timestamp, 'UTC'), 0) AS event_date, + events__pdi.person_id AS target + FROM events + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON 
equals(events.distinct_id, events__pdi.distinct_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$pageview'), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-07 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-07-27 00:00:00.000000', 6, 'UTC')))) + GROUP BY target, + event_date) AS returning_event ON equals(returning_event.target, target_event.target) + WHERE ifNull(greater(returning_event.event_date, target_event.event_date), 0) + UNION ALL SELECT target_event.breakdown_values AS breakdown_values, + 0 AS intervals_from_base, + target_event.target AS actor_id + FROM + (SELECT DISTINCT toStartOfWeek(toTimeZone(events.timestamp, 'UTC'), 0) AS event_date, + events__pdi.person_id AS target, + [dateDiff('week', toStartOfWeek(toDateTime64('2020-06-07 00:00:00.000000', 6, 'UTC'), 0), toStartOfWeek(toTimeZone(events.timestamp, 'UTC'), 0))] AS breakdown_values + FROM events + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$pageview'), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-07 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-07-27 00:00:00.000000', 6, 'UTC'))))) AS target_event) + WHERE and(or(1, isNull(breakdown_values)), or(1, isNull(intervals_from_base)))) AS actor_activity + GROUP BY breakdown_values, + intervals_from_base + ORDER BY breakdown_values ASC, + intervals_from_base ASC + LIMIT 10000 SETTINGS readonly=2, + max_execution_time=60, + allow_experimental_object_type=1 + ' +--- +# name: TestRetention.test_week_interval.1 + ' + SELECT actor_activity.breakdown_values AS breakdown_values, + actor_activity.intervals_from_base AS intervals_from_base, + count(DISTINCT actor_activity.actor_id) AS count + FROM + (SELECT DISTINCT breakdown_values, + intervals_from_base, + actor_id + FROM + (SELECT target_event.breakdown_values AS breakdown_values, + dateDiff('week', target_event.event_date, returning_event.event_date) AS intervals_from_base, + returning_event.target AS actor_id + FROM + (SELECT DISTINCT toStartOfWeek(toTimeZone(events.timestamp, 'UTC'), 3) AS event_date, + events__pdi.person_id AS target, + [dateDiff('week', toStartOfWeek(toDateTime64('2020-06-08 00:00:00.000000', 6, 'UTC'), 3), toStartOfWeek(toTimeZone(events.timestamp, 'UTC'), 3))] AS breakdown_values + FROM events + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$pageview'), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-08 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-07-27 
00:00:00.000000', 6, 'UTC'))))) AS target_event + JOIN + (SELECT toStartOfWeek(toTimeZone(events.timestamp, 'UTC'), 3) AS event_date, + events__pdi.person_id AS target + FROM events + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$pageview'), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-08 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-07-27 00:00:00.000000', 6, 'UTC')))) + GROUP BY target, + event_date) AS returning_event ON equals(returning_event.target, target_event.target) + WHERE ifNull(greater(returning_event.event_date, target_event.event_date), 0) + UNION ALL SELECT target_event.breakdown_values AS breakdown_values, + 0 AS intervals_from_base, + target_event.target AS actor_id + FROM + (SELECT DISTINCT toStartOfWeek(toTimeZone(events.timestamp, 'UTC'), 3) AS event_date, + events__pdi.person_id AS target, + [dateDiff('week', toStartOfWeek(toDateTime64('2020-06-08 00:00:00.000000', 6, 'UTC'), 3), toStartOfWeek(toTimeZone(events.timestamp, 'UTC'), 3))] AS breakdown_values + FROM events + INNER JOIN + (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, + person_distinct_id2.distinct_id AS distinct_id + FROM person_distinct_id2 + WHERE equals(person_distinct_id2.team_id, 2) + GROUP BY person_distinct_id2.distinct_id + HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id) + WHERE and(equals(events.team_id, 2), equals(events.event, '$pageview'), and(greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-06-08 00:00:00.000000', 6, 'UTC')), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), toDateTime64('2020-07-27 00:00:00.000000', 6, 'UTC'))))) AS target_event) + WHERE and(or(1, isNull(breakdown_values)), or(1, isNull(intervals_from_base)))) AS actor_activity + GROUP BY breakdown_values, + intervals_from_base + ORDER BY breakdown_values ASC, + intervals_from_base ASC + LIMIT 10000 SETTINGS readonly=2, + max_execution_time=60, + allow_experimental_object_type=1 + ' +--- diff --git a/posthog/hogql_queries/insights/test/test_retention_query_runner.py b/posthog/hogql_queries/insights/test/test_retention_query_runner.py new file mode 100644 index 0000000000000..7a6f076e43385 --- /dev/null +++ b/posthog/hogql_queries/insights/test/test_retention_query_runner.py @@ -0,0 +1,1630 @@ +import json +import uuid +from datetime import datetime +from typing import Any +from unittest import skip + +from zoneinfo import ZoneInfo +from django.test import override_settings +from rest_framework import status + +from posthog.constants import ( + RETENTION_FIRST_TIME, + RETENTION_TYPE, + TREND_FILTER_TYPE_ACTIONS, + TREND_FILTER_TYPE_EVENTS, +) +from posthog.hogql_queries.insights.retention_query_runner import RetentionQueryRunner +from posthog.models import Action, ActionStep +from posthog.models.filters import RetentionFilter as OldRetentionFilter +from posthog.test.base import ( + APIBaseTest, + 
ClickhouseTestMixin, + _create_event, + _create_person, + create_person_id_override_by_distinct_id, + snapshot_clickhouse_queries, +) + + +def _create_action(**kwargs): + team = kwargs.pop("team") + name = kwargs.pop("name") + action = Action.objects.create(team=team, name=name) + ActionStep.objects.create(action=action, event=name) + return action + + +def _create_signup_actions(team, user_and_timestamps): + for distinct_id, timestamp in user_and_timestamps: + _create_event(team=team, event="sign up", distinct_id=distinct_id, timestamp=timestamp) + sign_up_action = _create_action(team=team, name="sign up") + return sign_up_action + + +def _date(day, hour=5, month=0, minute=0): + return datetime(2020, 6 + month, 10 + day, hour, minute).isoformat() + + +def pluck(list_of_dicts, key, child_key=None): + return [pluck(d[key], child_key) if child_key else d[key] for d in list_of_dicts] + + +def _create_events(team, user_and_timestamps, event="$pageview"): + i = 0 + for distinct_id, timestamp, *properties_args in user_and_timestamps: + properties = {"$some_property": "value"} if i % 2 == 0 else {} + if len(properties_args) == 1: + properties.update(properties_args[0]) + + _create_event( + team=team, + event=event, + distinct_id=distinct_id, + timestamp=timestamp, + properties=properties, + ) + i += 1 + + +class TestRetention(ClickhouseTestMixin, APIBaseTest): + def run_query(self, query): + if not query.get("retentionFilter"): + query["retentionFilter"] = {} + runner = RetentionQueryRunner(team=self.team, query=query) + return runner.calculate().model_dump()["results"] + + def actors_in_period(self, *args, **kwargs) -> Any: + return args, kwargs + + def test_retention_default(self): + _create_person(team_id=self.team.pk, distinct_ids=["person1", "alias1"]) + _create_person(team_id=self.team.pk, distinct_ids=["person2"]) + + _create_events( + self.team, + [ + ("person1", _date(0)), + ("person1", _date(1)), + ("person1", _date(2)), + ("person1", _date(5)), + ("alias1", _date(5, 9)), + ("person1", _date(6)), + ("person2", _date(1)), + ("person2", _date(2)), + ("person2", _date(3)), + ("person2", _date(6)), + ], + ) + + result = self.run_query(query={}) + self.assertEqual( + pluck(result, "values", "count"), + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0], + [0, 0], + [0], + ], + ) + + def test_day_interval(self): + _create_person(team_id=self.team.pk, distinct_ids=["person1", "alias1"]) + _create_person(team_id=self.team.pk, distinct_ids=["person2"]) + + _create_events( + self.team, + [ + ("person1", _date(0)), + ("person1", _date(1)), + ("person1", _date(2)), + ("person1", _date(5)), + ("alias1", _date(5, 9)), + ("person1", _date(6)), + ("person2", _date(1)), + ("person2", _date(2)), + ("person2", _date(3)), + ("person2", _date(6)), + ], + ) + + # even if set to hour 6 it should default to beginning of day and include all pageviews above + result = self.run_query(query={"dateRange": {"date_to": _date(10, hour=6)}}) + self.assertEqual(len(result), 11) + self.assertEqual( + pluck(result, "label"), + [ + "Day 0", + "Day 1", + "Day 2", + "Day 3", + "Day 4", + "Day 5", + "Day 6", + "Day 7", + "Day 8", + "Day 9", + "Day 10", + ], + ) + self.assertEqual(result[0]["date"], datetime(2020, 6, 10, 0, tzinfo=ZoneInfo("UTC"))) + + self.assertEqual( + pluck(result, "values", "count"), + [ + [1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0], + [2, 2, 
1, 0, 1, 2, 0, 0, 0, 0], + [2, 1, 0, 1, 2, 0, 0, 0, 0], + [1, 0, 0, 1, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [2, 0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0], + [0, 0], + [0], + ], + ) + + def test_month_interval(self): + _create_person( + team=self.team, + distinct_ids=["person1", "alias1"], + properties={"email": "person1@test.com"}, + ) + _create_person( + team=self.team, + distinct_ids=["person2"], + properties={"email": "person2@test.com"}, + ) + + _create_events( + self.team, + [ + ("person1", _date(day=0, month=-5)), + ("person2", _date(day=0, month=-5)), + ("person1", _date(day=0, month=-4)), + ("person2", _date(day=0, month=-4)), + ("person1", _date(day=0, month=-3)), + ("person2", _date(day=0, month=-3)), + ("person1", _date(day=0, month=-1)), + ("person1", _date(day=0, month=0)), + ("person2", _date(day=0, month=0)), + ("person2", _date(day=0, month=1)), + ("person1", _date(day=0, month=3)), + ("person2", _date(day=0, month=5)), + ], + ) + + result = self.run_query( + query={ + "dateRange": {"date_to": _date(0, month=5, hour=0)}, + "retentionFilter": { + "period": "Month", + "total_intervals": 11, + }, + } + ) + + self.assertEqual( + pluck(result, "label"), + [ + "Month 0", + "Month 1", + "Month 2", + "Month 3", + "Month 4", + "Month 5", + "Month 6", + "Month 7", + "Month 8", + "Month 9", + "Month 10", + ], + ) + + self.assertEqual( + pluck(result, "values", "count"), + [ + [2, 2, 2, 0, 1, 2, 1, 0, 1, 0, 1], + [2, 2, 0, 1, 2, 1, 0, 1, 0, 1], + [2, 0, 1, 2, 1, 0, 1, 0, 1], + [0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 1, 0, 0], + [2, 1, 0, 1, 0, 1], + [1, 0, 0, 0, 1], + [0, 0, 0, 0], + [1, 0, 0], + [0, 0], + [1], + ], + ) + + self.assertEqual( + pluck(result, "date"), + [ + datetime(2020, 1, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 2, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 3, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 4, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 5, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 7, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 8, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 9, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 10, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 11, 10, 0, tzinfo=ZoneInfo("UTC")), + ], + ) + + @override_settings(PERSON_ON_EVENTS_V2_OVERRIDE=True) + @snapshot_clickhouse_queries + def test_month_interval_with_person_on_events_v2(self): + _create_person( + team=self.team, + distinct_ids=["person1", "alias1"], + properties={"email": "person1@test.com"}, + ) + _create_person( + team=self.team, + distinct_ids=["person2"], + properties={"email": "person2@test.com"}, + ) + + person_id1 = str(uuid.uuid4()) + person_id2 = str(uuid.uuid4()) + _create_event( + event="$pageview", + team=self.team, + distinct_id="person1", + person_id=person_id1, + timestamp=_date(day=0, month=-5), + ) + _create_event( + event="$pageview", + team=self.team, + distinct_id="person2", + person_id=person_id2, + timestamp=_date(day=0, month=-4), + ) + _create_event( + event="$pageview", + team=self.team, + distinct_id="person1", + person_id=person_id1, + timestamp=_date(day=0, month=-3), + ) + _create_event( + event="$pageview", + team=self.team, + distinct_id="person2", + person_id=person_id2, + timestamp=_date(day=0, month=-2), + ) + _create_event( + event="$pageview", + team=self.team, + distinct_id="person1", + person_id=person_id1, + timestamp=_date(day=0, month=-1), + ) + _create_event( + event="$pageview", + team=self.team, + 
distinct_id="person2", + person_id=person_id2, + timestamp=_date(day=0, month=0), + ) + _create_event( + event="$pageview", + team=self.team, + distinct_id="person1", + person_id=person_id1, + timestamp=_date(day=0, month=1), + ) + _create_event( + event="$pageview", + team=self.team, + distinct_id="person2", + person_id=person_id2, + timestamp=_date(day=0, month=2), + ) + _create_event( + event="$pageview", + team=self.team, + distinct_id="person1", + person_id=person_id1, + timestamp=_date(day=0, month=3), + ) + _create_event( + event="$pageview", + team=self.team, + distinct_id="person2", + person_id=person_id2, + timestamp=_date(day=0, month=4), + ) + _create_event( + event="$pageview", + team=self.team, + distinct_id="person1", + person_id=person_id1, + timestamp=_date(day=0, month=5), + ) + + _create_events( + self.team, + [ + ("person1", _date(day=0, month=-5)), + ("person2", _date(day=0, month=-4)), + ("person1", _date(day=0, month=-3)), + ("person2", _date(day=0, month=-2)), + ("person1", _date(day=0, month=-1)), + ("person2", _date(day=0, month=0)), + ("person1", _date(day=0, month=1)), + ("person2", _date(day=0, month=2)), + ("person1", _date(day=0, month=3)), + ("person2", _date(day=0, month=4)), + ("person1", _date(day=0, month=5)), + ], + ) + + create_person_id_override_by_distinct_id("person1", "person2", self.team.pk) + + result = self.run_query( + query={ + "dateRange": {"date_to": _date(0, month=5, hour=0)}, + "retentionFilter": { + "period": "Month", + "total_intervals": 11, + }, + } + ) + + self.assertEqual( + pluck(result, "label"), + [ + "Month 0", + "Month 1", + "Month 2", + "Month 3", + "Month 4", + "Month 5", + "Month 6", + "Month 7", + "Month 8", + "Month 9", + "Month 10", + ], + ) + + # We expect 1s across the board due to the override set up from person1 to person2, making them the same person + self.assertEqual( + pluck(result, "values", "count"), + [ + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1], + [1, 1, 1], + [1, 1], + [1], + ], + ) + + self.assertEqual( + pluck(result, "date"), + [ + datetime(2020, 1, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 2, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 3, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 4, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 5, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 7, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 8, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 9, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 10, 10, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 11, 10, 0, tzinfo=ZoneInfo("UTC")), + ], + ) + + @snapshot_clickhouse_queries + def test_week_interval(self): + _create_person( + team=self.team, + distinct_ids=["person1", "alias1"], + properties={"email": "person1@test.com"}, + ) + _create_person( + team=self.team, + distinct_ids=["person2"], + properties={"email": "person2@test.com"}, + ) + + _create_events( + self.team, + [ + ("person1", _date(0)), + ("person2", _date(0)), + ("person1", _date(1)), + ("person2", _date(1)), + ("person1", _date(7)), + ("person2", _date(7)), + ("person1", _date(14)), + ("person1", _date(month=1, day=-6)), + ("person2", _date(month=1, day=-6)), + ("person2", _date(month=1, day=1)), + ("person1", _date(month=1, day=1)), + ("person2", _date(month=1, day=15)), + ], + ) + + # Starting with Sunday + query = { + 
"dateRange": {"date_to": _date(10, month=1, hour=0)}, + "retentionFilter": { + "period": "Week", + "total_intervals": 7, + }, + } + result_sunday = self.run_query(query=query) + + self.assertEqual( + pluck(result_sunday, "label"), + ["Week 0", "Week 1", "Week 2", "Week 3", "Week 4", "Week 5", "Week 6"], + ) + + self.assertEqual( + pluck(result_sunday, "values", "count"), + [ + [2, 2, 1, 2, 2, 0, 1], + [2, 1, 2, 2, 0, 1], + [1, 1, 1, 0, 0], + [2, 2, 0, 1], + [2, 0, 1], + [0, 0], + [1], + ], + ) + + self.assertEqual( + pluck(result_sunday, "date"), + [ + datetime(2020, 6, 7, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 14, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 21, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 28, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 7, 5, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 7, 12, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 7, 19, 0, tzinfo=ZoneInfo("UTC")), + ], + ) + + # Starting with Monday + self.team.week_start_day = 1 # WeekStartDay.MONDAY's concrete value + self.team.save() + + result_monday = self.run_query(query=query) + + self.assertEqual( + pluck(result_monday, "label"), + ["Week 0", "Week 1", "Week 2", "Week 3", "Week 4", "Week 5", "Week 6"], + ) + + self.assertEqual( + pluck(result_monday, "values", "count"), + [ + [2, 2, 1, 2, 2, 0, 1], + [2, 1, 2, 2, 0, 1], + [1, 1, 1, 0, 0], + [2, 2, 0, 1], + [2, 0, 1], + [0, 0], + [1], + ], + ) + + self.assertEqual( + pluck(result_monday, "date"), + [ + datetime(2020, 6, 8, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 15, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 22, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 29, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 7, 6, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 7, 13, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 7, 20, 0, tzinfo=ZoneInfo("UTC")), + ], + ) + + def test_hour_interval(self): + _create_person( + team=self.team, + distinct_ids=["person1", "alias1"], + properties={"email": "person1@test.com"}, + ) + _create_person( + team=self.team, + distinct_ids=["person2"], + properties={"email": "person2@test.com"}, + ) + + _create_events( + self.team, + [ + ("person1", _date(day=0, hour=6)), + ("person2", _date(day=0, hour=6)), + ("person1", _date(day=0, hour=7)), + ("person2", _date(day=0, hour=7)), + ("person1", _date(day=0, hour=8)), + ("person2", _date(day=0, hour=8)), + ("person1", _date(day=0, hour=10)), + ("person1", _date(day=0, hour=11)), + ("person2", _date(day=0, hour=11)), + ("person2", _date(day=0, hour=12)), + ("person1", _date(day=0, hour=14)), + ("person2", _date(day=0, hour=16)), + ], + ) + + result = self.run_query( + query={ + "dateRange": {"date_to": _date(0, hour=16, minute=13)}, + "retentionFilter": { + "period": "Hour", + "total_intervals": 11, + }, + } + ) + + self.assertEqual( + pluck(result, "label"), + [ + "Hour 0", + "Hour 1", + "Hour 2", + "Hour 3", + "Hour 4", + "Hour 5", + "Hour 6", + "Hour 7", + "Hour 8", + "Hour 9", + "Hour 10", + ], + ) + + self.assertEqual( + pluck(result, "values", "count"), + [ + [2, 2, 2, 0, 1, 2, 1, 0, 1, 0, 1], + [2, 2, 0, 1, 2, 1, 0, 1, 0, 1], + [2, 0, 1, 2, 1, 0, 1, 0, 1], + [0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 1, 0, 0], + [2, 1, 0, 1, 0, 1], + [1, 0, 0, 0, 1], + [0, 0, 0, 0], + [1, 0, 0], + [0, 0], + [1], + ], + ) + + self.assertEqual( + pluck(result, "date"), + [ + datetime(2020, 6, 10, 6, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 10, 7, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 10, 8, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 10, 9, tzinfo=ZoneInfo("UTC")), + 
datetime(2020, 6, 10, 10, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 10, 11, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 10, 12, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 10, 13, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 10, 14, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 10, 15, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 10, 16, tzinfo=ZoneInfo("UTC")), + ], + ) + + # ensure that the first interval is properly rounded according to the specified period + def test_interval_rounding(self): + _create_person( + team=self.team, + distinct_ids=["person1", "alias1"], + properties={"email": "person1@test.com"}, + ) + _create_person( + team=self.team, + distinct_ids=["person2"], + properties={"email": "person2@test.com"}, + ) + + _create_events( + self.team, + [ + ("person1", _date(0)), + ("person2", _date(0)), + ("person1", _date(1)), + ("person2", _date(1)), + ("person1", _date(7)), + ("person2", _date(7)), + ("person1", _date(14)), + ("person1", _date(month=1, day=-6)), + ("person2", _date(month=1, day=-6)), + ("person2", _date(month=1, day=1)), + ("person1", _date(month=1, day=1)), + ("person2", _date(month=1, day=15)), + ], + ) + + result = self.run_query( + query={ + "dateRange": {"date_to": _date(14, month=1, hour=0)}, + "retentionFilter": { + "period": "Week", + "total_intervals": 7, + }, + } + ) + + self.assertEqual( + pluck(result, "label"), + ["Week 0", "Week 1", "Week 2", "Week 3", "Week 4", "Week 5", "Week 6"], + ) + + self.assertEqual( + pluck(result, "values", "count"), + [ + [2, 2, 1, 2, 2, 0, 1], + [2, 1, 2, 2, 0, 1], + [1, 1, 1, 0, 0], + [2, 2, 0, 1], + [2, 0, 1], + [0, 0], + [1], + ], + ) + + self.assertEqual( + pluck(result, "date"), + [ + datetime(2020, 6, 7, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 14, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 21, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 6, 28, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 7, 5, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 7, 12, 0, tzinfo=ZoneInfo("UTC")), + datetime(2020, 7, 19, 0, tzinfo=ZoneInfo("UTC")), + ], + ) + + @skip("TODO: Bring back when working on actors_in_period") + def test_retention_people_basic(self): + person1 = _create_person(team_id=self.team.pk, distinct_ids=["person1", "alias1"]) + _create_person(team_id=self.team.pk, distinct_ids=["person2"]) + + _create_events( + self.team, + [ + ("person1", _date(0)), + ("person1", _date(1)), + ("person1", _date(2)), + ("person1", _date(5)), + ("alias1", _date(5, 9)), + ("person1", _date(6)), + ("person2", _date(1)), + ("person2", _date(2)), + ("person2", _date(3)), + ("person2", _date(6)), + ], + ) + + # even if set to hour 6 it should default to beginning of day and include all pageviews above + result, _ = self.actors_in_period( + OldRetentionFilter( + data={"date_to": _date(10, hour=6), "selected_interval": 0}, + team=self.team, + ), + self.team, + ) + self.assertEqual(len(result), 1) + self.assertTrue(result[0]["person"]["id"] == person1.uuid, person1.uuid) + + @skip("TODO: Bring back when working on actors_in_period") + def test_retention_people_first_time(self): + _, _, p3, _ = self._create_first_time_retention_events() + # even if set to hour 6 it should default to beginning of day and include all pageviews above + + target_entity = json.dumps({"id": "$user_signed_up", "type": TREND_FILTER_TYPE_EVENTS}) + result, _ = self.actors_in_period( + OldRetentionFilter( + data={ + "date_to": _date(10, hour=6), + RETENTION_TYPE: RETENTION_FIRST_TIME, + "target_entity": target_entity, + "returning_entity": {"id": "$pageview", 
"type": "events"}, + "selected_interval": 0, + }, + team=self.team, + ), + self.team, + ) + + self.assertEqual(len(result), 1) + self.assertIn(result[0]["person"]["id"], [p3.uuid, p3.pk]) + + result, _ = self.actors_in_period( + OldRetentionFilter( + data={ + "date_to": _date(14, hour=6), + RETENTION_TYPE: RETENTION_FIRST_TIME, + "target_entity": target_entity, + "returning_entity": {"id": "$pageview", "type": "events"}, + "selected_interval": 0, + }, + team=self.team, + ), + self.team, + ) + + self.assertEqual(len(result), 0) + + def test_retention_people_paginated(self): + for i in range(150): + person_id = "person{}".format(i) + _create_person(team_id=self.team.pk, distinct_ids=[person_id]) + _create_events( + self.team, + [ + (person_id, _date(0)), + (person_id, _date(1)), + (person_id, _date(2)), + (person_id, _date(5)), + ], + ) + + # even if set to hour 6 it should default to beginning of day and include all pageviews above + result = self.client.get( + "/api/person/retention", + data={"date_to": _date(10, hour=6), "selected_interval": 2}, + ).json() + + self.assertEqual(len(result["result"]), 100) + + second_result = self.client.get(result["next"]).json() + self.assertEqual(len(second_result["result"]), 50) + + def test_retention_invalid_properties(self): + response = self.client.get("/api/person/retention", data={"properties": "invalid_json"}) + + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertDictEqual( + response.json(), + self.validation_error_response("Properties are unparsable!", "invalid_input"), + ) + + @skip("TODO: Bring back when working on actors_in_period") + def test_retention_people_in_period(self): + person1 = _create_person(team_id=self.team.pk, distinct_ids=["person1", "alias1"]) + person2 = _create_person(team_id=self.team.pk, distinct_ids=["person2"]) + + _create_events( + self.team, + [ + ("person1", _date(0)), + ("person1", _date(1)), + ("person1", _date(2)), + ("person1", _date(5)), + ("alias1", _date(5, 9)), + ("person1", _date(6)), + ("person2", _date(1)), + ("person2", _date(2)), + ("person2", _date(3)), + ("person2", _date(6)), + ("person2", _date(7)), + ], + ) + + # even if set to hour 6 it should default to beginning of day and include all pageviews above + result, _ = self.actors_in_period( + OldRetentionFilter( + data={"date_to": _date(10, hour=6), "selected_interval": 2}, + team=self.team, + ), + self.team, + ) + + # should be descending order on number of appearances + self.assertIn(result[0]["person"]["id"], [person2.pk, person2.uuid]) + self.assertEqual(result[0]["appearances"], [1, 1, 0, 0, 1, 1, 0, 0, 0]) + + self.assertIn(result[1]["person"]["id"], [person1.pk, person1.uuid]) + self.assertEqual(result[1]["appearances"], [1, 0, 0, 1, 1, 0, 0, 0, 0]) + + @skip("TODO: Bring back when working on actors_in_period") + def test_retention_people_in_perieod_first_time(self): + p1, p2, p3, p4 = self._create_first_time_retention_events() + # even if set to hour 6 it should default to beginning of day and include all pageviews above + target_entity = json.dumps({"id": "$user_signed_up", "type": TREND_FILTER_TYPE_EVENTS}) + result1, _ = self.actors_in_period( + OldRetentionFilter( + data={ + "date_to": _date(10, hour=6), + RETENTION_TYPE: RETENTION_FIRST_TIME, + "target_entity": target_entity, + "returning_entity": {"id": "$pageview", "type": "events"}, + "selected_interval": 0, + }, + team=self.team, + ), + self.team, + ) + + self.assertEqual(len(result1), 1) + self.assertTrue(result1[0]["person"]["id"] == p3.pk or 
result1[0]["person"]["id"] == p3.uuid) + self.assertEqual(result1[0]["appearances"], [1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0]) + + def test_retention_multiple_events(self): + _create_person(team_id=self.team.pk, distinct_ids=["person1", "alias1"]) + _create_person(team_id=self.team.pk, distinct_ids=["person2"]) + _create_person(team_id=self.team.pk, distinct_ids=["person3"]) + _create_person(team_id=self.team.pk, distinct_ids=["person4"]) + + first_event = "$some_event" + _create_events( + self.team, + [ + ("person1", _date(0)), + ("person1", _date(1)), + ("person1", _date(2)), + ("person1", _date(3)), + ("person2", _date(0)), + ("person2", _date(1)), + ("person2", _date(2)), + ("person2", _date(3)), + ("person3", _date(5)), + ], + first_event, + ) + + _create_events( + self.team, + [("person1", _date(5)), ("person1", _date(6)), ("person2", _date(5))], + "$pageview", + ) + + result = self.run_query( + query={ + "dateRange": {"date_to": _date(6, hour=6)}, + "retentionFilter": { + "period": "Day", + "total_intervals": 7, + "target_entity": {"id": first_event, "name": first_event, "type": TREND_FILTER_TYPE_EVENTS}, + "returning_entity": {"id": "$pageview", "name": "$pageview", "type": "events"}, + }, + } + ) + self.assertEqual(len(result), 7) + self.assertEqual( + pluck(result, "label"), + ["Day 0", "Day 1", "Day 2", "Day 3", "Day 4", "Day 5", "Day 6"], + ) + + self.assertEqual( + pluck(result, "values", "count"), + [ + [2, 0, 0, 0, 0, 2, 1], + [2, 0, 0, 0, 2, 1], + [2, 0, 0, 2, 1], + [2, 0, 2, 1], + [0, 0, 0], + [1, 0], + [0], + ], + ) + + def test_retention_any_event(self): + _create_person(team_id=self.team.pk, distinct_ids=["person1", "alias1"]) + _create_person(team_id=self.team.pk, distinct_ids=["person2"]) + _create_person(team_id=self.team.pk, distinct_ids=["person3"]) + _create_person(team_id=self.team.pk, distinct_ids=["person4"]) + + _create_events( + self.team, + [ + ("person1", _date(0)), + ("person1", _date(1)), + ("person1", _date(2)), + ("person1", _date(3)), + ("person2", _date(0)), + ("person2", _date(1)), + ("person2", _date(2)), + ("person2", _date(3)), + ("person3", _date(5)), + ], + "$some_event", + ) + + _create_events( + self.team, + [("person1", _date(5)), ("person1", _date(6)), ("person2", _date(5))], + "$pageview", + ) + + result = self.run_query( + query={ + "dateRange": {"date_to": _date(6, hour=6)}, + "retentionFilter": { + "period": "Day", + "total_intervals": 7, + "target_entity": {"id": None, "type": "events"}, + "returning_entity": {"id": None, "type": "events"}, + }, + } + ) + self.assertEqual(len(result), 7) + self.assertEqual( + pluck(result, "label"), + ["Day 0", "Day 1", "Day 2", "Day 3", "Day 4", "Day 5", "Day 6"], + ) + + self.assertEqual( + pluck(result, "values", "count"), + [ + [2, 2, 2, 2, 0, 2, 1], + [2, 2, 2, 0, 2, 1], + [2, 2, 0, 2, 1], + [2, 0, 2, 1], + [0, 0, 0], + [3, 1], + [1], + ], + ) + + @snapshot_clickhouse_queries + def test_retention_event_action(self): + _create_person(team=self.team, distinct_ids=["person1", "alias1"]) + _create_person(team=self.team, distinct_ids=["person2"]) + + action = _create_signup_actions( + self.team, + [ + ("person1", _date(0)), + ("person1", _date(1)), + ("person1", _date(2)), + ("person1", _date(3)), + ("person2", _date(0)), + ("person2", _date(1)), + ("person2", _date(2)), + ("person2", _date(3)), + ], + ) + + some_event = "$some_event" + _create_events(self.team, [("person1", _date(3)), ("person2", _date(5))], some_event) + + result = self.run_query( + query={ + "dateRange": {"date_to": _date(6, hour=0)}, + 
"retentionFilter": { + "total_intervals": 7, + "target_entity": { + "id": action.pk, + "name": action.name, + "type": TREND_FILTER_TYPE_ACTIONS, + }, + "returning_entity": { + "id": some_event, + "name": some_event, + "type": TREND_FILTER_TYPE_EVENTS, + }, + }, + } + ) + + self.assertEqual(len(result), 7) + self.assertEqual( + pluck(result, "label"), + ["Day 0", "Day 1", "Day 2", "Day 3", "Day 4", "Day 5", "Day 6"], + ) + self.assertEqual(result[0]["date"], datetime(2020, 6, 10, 0, tzinfo=ZoneInfo("UTC"))) + + self.assertEqual( + pluck(result, "values", "count"), + [ + [2, 0, 0, 1, 0, 1, 0], + [2, 0, 1, 0, 1, 0], + [2, 1, 0, 1, 0], + [2, 0, 1, 0], + [0, 0, 0], + [0, 0], + [0], + ], + ) + + def test_first_time_retention(self): + self._create_first_time_retention_events() + + result = self.run_query( + query={ + "dateRange": {"date_to": _date(5, hour=6)}, + "retentionFilter": { + "period": "Day", + "total_intervals": 7, + "retention_type": RETENTION_FIRST_TIME, + "target_entity": { + "id": "$user_signed_up", + "name": "$user_signed_up", + "type": TREND_FILTER_TYPE_EVENTS, + }, + "returning_entity": {"id": "$pageview", "name": "$pageview", "type": "events"}, + }, + } + ) + + self.assertEqual(len(result), 7) + self.assertEqual( + pluck(result, "label"), + ["Day 0", "Day 1", "Day 2", "Day 3", "Day 4", "Day 5", "Day 6"], + ) + + self.assertEqual( + pluck(result, "values", "count"), + [ + [2, 1, 2, 2, 1, 0, 1], + [1, 1, 0, 1, 1, 1], + [0, 0, 0, 0, 0], + [1, 1, 0, 1], + [0, 0, 0], + [0, 0], + [0], + ], + ) + + def test_retention_with_properties(self): + _create_person(team_id=self.team.pk, distinct_ids=["person1", "alias1"]) + _create_person(team_id=self.team.pk, distinct_ids=["person2"]) + + _create_events( + self.team, + [ + ("person1", _date(0)), + ("person1", _date(1)), + ("person1", _date(2)), + ("person1", _date(5)), + ("alias1", _date(5, 9)), + ("person1", _date(6)), + ("person2", _date(1)), + ("person2", _date(2)), + ("person2", _date(3)), + ("person2", _date(6)), + ], + ) + + result = self.run_query( + query={ + "dateRange": {"date_to": _date(10, hour=0)}, + "properties": { + "type": "AND", + "values": [ + { + "type": "AND", + "values": [ + { + "key": "$some_property", + "operator": "exact", + "value": ["value"], + } + ], + } + ], + }, + } + ) + self.assertEqual(len(result), 11) + self.assertEqual( + pluck(result, "label"), + [ + "Day 0", + "Day 1", + "Day 2", + "Day 3", + "Day 4", + "Day 5", + "Day 6", + "Day 7", + "Day 8", + "Day 9", + "Day 10", + ], + ) + self.assertEqual(result[0]["date"], datetime(2020, 6, 10, 0, tzinfo=ZoneInfo("UTC"))) + + self.assertEqual( + pluck(result, "values", "count"), + [ + [1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0], + [1, 0, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 0, 0, 1, 0, 0, 0, 0, 0], + [1, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0], + [1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0], + [0, 0], + [0], + ], + ) + + def test_retention_with_user_properties(self): + _create_person( + team_id=self.team.pk, + distinct_ids=["person1", "alias1"], + properties={"email": "person1@test.com"}, + ) + _create_person( + team_id=self.team.pk, + distinct_ids=["person2"], + properties={"email": "person2@test.com"}, + ) + + _create_events( + self.team, + [ + ("person1", _date(0)), + ("person1", _date(1)), + ("person1", _date(2)), + ("person1", _date(5)), + ("alias1", _date(5, 9)), + ("person1", _date(6)), + ("person2", _date(1)), + ("person2", _date(2)), + ("person2", _date(3)), + ("person2", _date(6)), + ], + ) + + result = self.run_query( + query={ + "dateRange": 
{"date_to": _date(6, hour=0)}, + "properties": { + "type": "AND", + "values": [ + { + "type": "AND", + "values": [ + { + "key": "email", + "operator": "exact", + "type": "person", + "value": ["person1@test.com"], + } + ], + } + ], + }, + "retentionFilter": { + "total_intervals": 7, + }, + } + ) + + self.assertEqual(len(result), 7) + self.assertEqual( + pluck(result, "label"), + ["Day 0", "Day 1", "Day 2", "Day 3", "Day 4", "Day 5", "Day 6"], + ) + self.assertEqual(result[0]["date"], datetime(2020, 6, 10, 0, tzinfo=ZoneInfo("UTC"))) + self.assertEqual( + pluck(result, "values", "count"), + [ + [1, 1, 1, 0, 0, 1, 1], + [1, 1, 0, 0, 1, 1], + [1, 0, 0, 1, 1], + [0, 0, 0, 0], + [0, 0, 0], + [1, 1], + [1], + ], + ) + + @snapshot_clickhouse_queries + def test_retention_with_user_properties_via_action(self): + action = Action.objects.create(team=self.team) + ActionStep.objects.create( + action=action, + event="$pageview", + properties=[{"key": "email", "value": "person1@test.com", "type": "person"}], + ) + + _create_person( + team_id=self.team.pk, + distinct_ids=["person1", "alias1"], + properties={"email": "person1@test.com"}, + ) + _create_person( + team_id=self.team.pk, + distinct_ids=["person2"], + properties={"email": "person2@test.com"}, + ) + + _create_events( + self.team, + [ + ("person1", _date(0)), + ("person1", _date(1)), + ("person1", _date(2)), + ("person1", _date(5)), + ("alias1", _date(5, 9)), + ("person1", _date(6)), + ("person2", _date(1)), + ("person2", _date(2)), + ("person2", _date(3)), + ("person2", _date(6)), + ], + ) + + result = self.run_query( + query={ + "dateRange": {"date_to": _date(6, hour=0)}, + "retentionFilter": { + "total_intervals": 7, + "target_entity": {"id": action.pk, "name": action.name, "type": TREND_FILTER_TYPE_ACTIONS}, + "returning_entity": {"id": "$pageview", "name": "$pageview", "type": "events"}, + }, + } + ) + + self.assertEqual(len(result), 7) + self.assertEqual( + pluck(result, "label"), + ["Day 0", "Day 1", "Day 2", "Day 3", "Day 4", "Day 5", "Day 6"], + ) + self.assertEqual(result[0]["date"], datetime(2020, 6, 10, 0, tzinfo=ZoneInfo("UTC"))) + self.assertEqual( + pluck(result, "values", "count"), + [ + [1, 1, 1, 0, 0, 1, 1], + [1, 1, 0, 0, 1, 1], + [1, 0, 0, 1, 1], + [0, 0, 0, 0], + [0, 0, 0], + [1, 1], + [1], + ], + ) + + def test_retention_action_start_point(self): + _create_person(team=self.team, distinct_ids=["person1", "alias1"]) + _create_person(team=self.team, distinct_ids=["person2"]) + + action = _create_signup_actions( + self.team, + [ + ("person1", _date(0)), + ("person1", _date(1)), + ("person1", _date(2)), + ("person1", _date(5)), + ("alias1", _date(5, 9)), + ("person1", _date(6)), + ("person2", _date(1)), + ("person2", _date(2)), + ("person2", _date(3)), + ("person2", _date(6)), + ], + ) + + result = self.run_query( + query={ + "dateRange": {"date_to": _date(6, hour=0)}, + "retentionFilter": { + "period": "Day", + "total_intervals": 7, + "target_entity": {"id": action.pk, "name": action.name, "type": TREND_FILTER_TYPE_ACTIONS}, + "returning_entity": {"id": action.pk, "name": action.name, "type": TREND_FILTER_TYPE_ACTIONS}, + }, + } + ) + + self.assertEqual(len(result), 7) + self.assertEqual( + pluck(result, "label"), + ["Day 0", "Day 1", "Day 2", "Day 3", "Day 4", "Day 5", "Day 6"], + ) + self.assertEqual(result[0]["date"], datetime(2020, 6, 10, 0, tzinfo=ZoneInfo("UTC"))) + + self.assertEqual( + pluck(result, "values", "count"), + [ + [1, 1, 1, 0, 0, 1, 1], + [2, 2, 1, 0, 1, 2], + [2, 1, 0, 1, 2], + [1, 0, 0, 1], + [0, 0, 0], + [1, 
1], + [2], + ], + ) + + def test_filter_test_accounts(self): + _create_person( + team_id=self.team.pk, + distinct_ids=["person1", "alias1"], + properties={"email": "test@posthog.com"}, + ) + _create_person(team_id=self.team.pk, distinct_ids=["person2"]) + + _create_events( + self.team, + [ + ("person1", _date(0)), + ("person1", _date(1)), + ("person1", _date(2)), + ("person1", _date(5)), + ("alias1", _date(5, 9)), + ("person1", _date(6)), + ("person2", _date(1)), + ("person2", _date(2)), + ("person2", _date(3)), + ("person2", _date(6)), + ], + ) + + result = self.run_query( + query={ + "dateRange": {"date_to": _date(10, hour=6)}, + "filterTestAccounts": True, + } + ) + self.assertEqual(len(result), 11) + self.assertEqual( + pluck(result, "label"), + [ + "Day 0", + "Day 1", + "Day 2", + "Day 3", + "Day 4", + "Day 5", + "Day 6", + "Day 7", + "Day 8", + "Day 9", + "Day 10", + ], + ) + self.assertEqual(result[0]["date"], datetime(2020, 6, 10, 0, tzinfo=ZoneInfo("UTC"))) + + self.assertEqual( + pluck(result, "values", "count"), + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 1, 0, 0, 0, 0], + [1, 0, 0, 1, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [1, 0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0], + [0, 0], + [0], + ], + ) + + def _create_first_time_retention_events(self): + p1 = _create_person(team_id=self.team.pk, distinct_ids=["person1", "alias1"]) + p2 = _create_person(team_id=self.team.pk, distinct_ids=["person2"]) + p3 = _create_person(team_id=self.team.pk, distinct_ids=["person3"]) + p4 = _create_person(team_id=self.team.pk, distinct_ids=["person4"]) + _create_person(team_id=self.team.pk, distinct_ids=["shouldnt_include"]) + + _create_events( + self.team, + [ + ("shouldnt_include", _date(-5)), + ("shouldnt_include", _date(-1)), + ("person1", _date(-1)), + ("person1", _date(1)), + ("person1", _date(2)), + ("person1", _date(3)), + ("person1", _date(4)), + ("person2", _date(-1)), + ], + "$user_signed_up", + ) + + _create_events( + self.team, + [ + ("person1", _date(0)), + ("person1", _date(1)), + ("person1", _date(2)), + ("person1", _date(5)), + ("alias1", _date(5, 9)), + ("person1", _date(6)), + ("person2", _date(1)), + ("person2", _date(2)), + ("person2", _date(3)), + ("person2", _date(6)), + ], + ) + + _create_events(self.team, [("person3", _date(0))], "$user_signed_up") + + _create_events( + self.team, + [ + ("person3", _date(1)), + ("person3", _date(3)), + ("person3", _date(4)), + ("person3", _date(5)), + ], + ) + + _create_events(self.team, [("person4", _date(2))], "$user_signed_up") + + _create_events(self.team, [("person4", _date(3)), ("person4", _date(5))]) + + return p1, p2, p3, p4 + + @snapshot_clickhouse_queries + def test_timezones(self): + _create_person(team_id=self.team.pk, distinct_ids=["person1", "alias1"]) + _create_person(team_id=self.team.pk, distinct_ids=["person2"]) + + _create_events( + self.team, + [ + ("person1", _date(-1, 1)), + ("person1", _date(0, 1)), + ( + "person1", + _date(1, 1), + ), # this is the only event in US Pacific on the first day + ("person2", _date(6, 1)), + ("person2", _date(6, 9)), + ], + ) + + result = self.run_query(query={"dateRange": {"date_to": _date(10, hour=6)}}) + + self.team.timezone = "US/Pacific" + self.team.save() + + result_pacific = self.run_query(query={"dateRange": {"date_to": _date(10, hour=6)}}) + + self.assertEqual( + pluck(result_pacific, "label"), + [ + "Day 0", + "Day 1", + "Day 2", + "Day 3", + "Day 4", + "Day 5", + "Day 6", + "Day 7", + "Day 8", + "Day 9", + "Day 
10", + ], + ) + + self.assertEqual( + result_pacific[0]["date"], + datetime(2020, 6, 10, tzinfo=ZoneInfo("US/Pacific")), + ) + self.assertEqual(result_pacific[0]["date"].isoformat(), "2020-06-10T00:00:00-07:00") + + self.assertEqual( + pluck(result, "values", "count"), + [ + [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [1, 0, 0, 0, 0], # person 2 + [0, 0, 0, 0], + [0, 0, 0], + [0, 0], + [0], + ], + ) + + self.assertEqual( + pluck(result_pacific, "values", "count"), + [ + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], # person 2 is across two dates in US/Pacific + [1, 0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0], + [0, 0], + [0], + ], + ) + + @snapshot_clickhouse_queries + def test_day_interval_sampled(self): + _create_person(team_id=self.team.pk, distinct_ids=["person1", "alias1"]) + _create_person(team_id=self.team.pk, distinct_ids=["person2"]) + + _create_events( + self.team, + [ + ("person1", _date(0)), + ("person1", _date(1)), + ("person1", _date(2)), + ("person1", _date(5)), + ("alias1", _date(5, 9)), + ("person1", _date(6)), + ("person2", _date(1)), + ("person2", _date(2)), + ("person2", _date(3)), + ("person2", _date(6)), + ], + ) + + # even if set to hour 6 it should default to beginning of day and include all pageviews above + result = self.run_query( + query={ + "dateRange": {"date_to": _date(10, hour=6)}, + "samplingFactor": 1, + } + ) + self.assertEqual(len(result), 11) + self.assertEqual( + pluck(result, "label"), + [ + "Day 0", + "Day 1", + "Day 2", + "Day 3", + "Day 4", + "Day 5", + "Day 6", + "Day 7", + "Day 8", + "Day 9", + "Day 10", + ], + ) + self.assertEqual(result[0]["date"], datetime(2020, 6, 10, 0, tzinfo=ZoneInfo("UTC"))) + + self.assertEqual( + pluck(result, "values", "count"), + [ + [1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0], + [2, 2, 1, 0, 1, 2, 0, 0, 0, 0], + [2, 1, 0, 1, 2, 0, 0, 0, 0], + [1, 0, 0, 1, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [2, 0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0], + [0, 0], + [0], + ], + ) diff --git a/posthog/hogql_queries/legacy_compatibility/test/test_filter_to_query.py b/posthog/hogql_queries/legacy_compatibility/test/test_filter_to_query.py index 43c87373e8255..5906b3278d9da 100644 --- a/posthog/hogql_queries/legacy_compatibility/test/test_filter_to_query.py +++ b/posthog/hogql_queries/legacy_compatibility/test/test_filter_to_query.py @@ -920,7 +920,7 @@ def test_base_funnel(self): self.assertEqual(query.kind, "FunnelsQuery") def test_base_retention_query(self): - filter = {"insight": "RETENTION"} + filter = {"insight": "RETENTION", "retention_type": "retention_first_time"} query = filter_to_query(filter) diff --git a/posthog/hogql_queries/query_runner.py b/posthog/hogql_queries/query_runner.py index a7fba00b4d455..85c2d29372676 100644 --- a/posthog/hogql_queries/query_runner.py +++ b/posthog/hogql_queries/query_runner.py @@ -30,6 +30,7 @@ InsightPersonsQuery, DashboardFilter, HogQLQueryModifiers, + RetentionQuery, ) from posthog.utils import generate_cache_key, get_safe_cache @@ -78,6 +79,7 @@ class CachedQueryResponse(QueryResponse): InsightPersonsQuery, EventsQuery, PersonsQuery, + RetentionQuery, SessionsTimelineQuery, WebOverviewQuery, WebTopClicksQuery, @@ -120,6 +122,16 @@ def get_query_runner( limit_context=limit_context, modifiers=modifiers, ) + if kind == 
"RetentionQuery": + from .insights.retention_query_runner import RetentionQueryRunner + + return RetentionQueryRunner( + query=cast(RetentionQuery | Dict[str, Any], query), + team=team, + timings=timings, + limit_context=limit_context, + modifiers=modifiers, + ) if kind == "EventsQuery": from .events_query_runner import EventsQueryRunner diff --git a/posthog/hogql_queries/utils/query_date_range.py b/posthog/hogql_queries/utils/query_date_range.py index 99e0d99470707..b3003a0197ee0 100644 --- a/posthog/hogql_queries/utils/query_date_range.py +++ b/posthog/hogql_queries/utils/query_date_range.py @@ -1,5 +1,5 @@ import re -from datetime import datetime +from datetime import datetime, timedelta from functools import cached_property from typing import Literal, Optional, Dict, List from zoneinfo import ZoneInfo @@ -9,8 +9,8 @@ from posthog.hogql.ast import CompareOperationOp from posthog.hogql.errors import HogQLException from posthog.hogql.parser import ast -from posthog.models.team import Team -from posthog.queries.util import get_earliest_timestamp +from posthog.models.team import Team, WeekStartDay +from posthog.queries.util import get_earliest_timestamp, get_trunc_func_ch from posthog.schema import DateRange, IntervalType from posthog.utils import ( DEFAULT_DATE_FROM_DAYS, @@ -233,3 +233,62 @@ def to_properties(self, field: Optional[List[str]] = None) -> List[ast.Expr]: right=self.date_to_as_hogql(), ), ] + + +class QueryDateRangeWithIntervals(QueryDateRange): + def __init__( + self, + date_range: Optional[DateRange], + total_intervals: int, + team: Team, + interval: Optional[IntervalType], + now: datetime, + ) -> None: + self.total_intervals = total_intervals + super().__init__(date_range, team, interval, now) + + @staticmethod + def determine_time_delta(total_intervals: int, period: str) -> timedelta: + period_map = { + "hour": timedelta(hours=1), + "day": timedelta(days=1), + "week": timedelta(weeks=1), + "month": relativedelta(months=1), + } + + if period.lower() not in period_map: + raise ValueError(f"Period {period} is unsupported.") + + return period_map[period.lower()] * total_intervals + + def date_from(self) -> datetime: + delta = self.determine_time_delta(self.total_intervals, self._interval.name) + + if self._interval == IntervalType.hour: + return self.date_to() - delta + elif self._interval == IntervalType.week: + date_from = self.date_to() - delta + week_start_alignment_days = date_from.isoweekday() % 7 + if self._team.week_start_day == WeekStartDay.MONDAY: + week_start_alignment_days = date_from.weekday() + return date_from - timedelta(days=week_start_alignment_days) + else: + date_to = self.date_to().replace(hour=0, minute=0, second=0, microsecond=0) + return date_to - delta + + def date_to(self) -> datetime: + delta = self.determine_time_delta(1, self._interval.name) + date_to = super().date_to() + delta + + if self.is_hourly: + return date_to.replace(minute=0, second=0, microsecond=0) + return date_to.replace(hour=0, minute=0, second=0, microsecond=0) + + def get_start_of_interval_hogql(self, *, source: ast.Expr = None) -> ast.Expr: + trunc_func = get_trunc_func_ch(self._interval.name.lower()) + trunc_func_args = [source] if source else [ast.Constant(value=self.date_from())] + if trunc_func == "toStartOfWeek": + trunc_func_args.append( + ast.Constant(value=int((WeekStartDay(self._team.week_start_day or 0)).clickhouse_mode)) + ) + return ast.Call(name=trunc_func, args=trunc_func_args) diff --git a/posthog/hogql_queries/utils/test/test_query_date_range.py 
b/posthog/hogql_queries/utils/test/test_query_date_range.py index 0ab8467567a50..8444538d3d578 100644 --- a/posthog/hogql_queries/utils/test/test_query_date_range.py +++ b/posthog/hogql_queries/utils/test/test_query_date_range.py @@ -1,6 +1,10 @@ +from datetime import timedelta + from dateutil import parser -from posthog.hogql_queries.utils.query_date_range import QueryDateRange +from posthog.hogql import ast +from posthog.hogql_queries.utils.query_date_range import QueryDateRange, QueryDateRangeWithIntervals +from posthog.models.team import WeekStartDay from posthog.schema import DateRange, IntervalType from posthog.test.base import APIBaseTest @@ -50,3 +54,75 @@ def test_is_hourly(self): query_date_range = QueryDateRange(team=self.team, date_range=date_range, interval=IntervalType.hour, now=now) + self.assertTrue(query_date_range.is_hourly) + + +class TestQueryDateRangeWithIntervals(APIBaseTest): + def setUp(self): + super().setUp() + self.now = parser.isoparse("2021-08-25T00:00:00.000Z") + self.total_intervals = 5 + + def test_constructor_initialization(self): + query = QueryDateRangeWithIntervals(None, self.total_intervals, self.team, IntervalType.day, self.now) + self.assertEqual(query.total_intervals, self.total_intervals) + + def test_determine_time_delta_valid(self): + delta = QueryDateRangeWithIntervals.determine_time_delta(5, "day") + self.assertEqual(delta, timedelta(days=5)) + + def test_determine_time_delta_invalid_period(self): + with self.assertRaises(ValueError): + QueryDateRangeWithIntervals.determine_time_delta(5, "decade") + + def test_date_from_day_interval(self): + query = QueryDateRangeWithIntervals(None, 2, self.team, IntervalType.day, self.now) + self.assertEqual(query.date_from(), parser.isoparse("2021-08-24T00:00:00Z")) + + def test_date_from_hour_interval(self): + query = QueryDateRangeWithIntervals(None, 48, self.team, IntervalType.hour, self.now) + self.assertEqual(query.date_from(), parser.isoparse("2021-08-23T01:00:00Z")) + + def test_date_from_week_interval_starting_monday(self): + self.team.week_start_day = WeekStartDay.MONDAY + query = QueryDateRangeWithIntervals(None, 1, self.team, IntervalType.week, self.now) + self.assertEqual(query.date_from(), parser.isoparse("2021-08-23T00:00:00Z")) + + def test_date_from_week_interval_starting_sunday(self): + self.team.week_start_day = WeekStartDay.SUNDAY + query = QueryDateRangeWithIntervals(None, 1, self.team, IntervalType.week, self.now) + self.assertEqual(query.date_from(), parser.isoparse("2021-08-22T00:00:00Z")) + + def test_date_to_day_interval(self): + query = QueryDateRangeWithIntervals(None, 1, self.team, IntervalType.day, self.now) + self.assertEqual(query.date_to(), parser.isoparse("2021-08-26T00:00:00Z")) + + def test_date_to_hour_interval(self): + query = QueryDateRangeWithIntervals(None, 1, self.team, IntervalType.hour, self.now) + self.assertEqual(query.date_to(), parser.isoparse("2021-08-25T01:00:00Z")) + + def test_get_start_of_interval_hogql_day_interval(self): + query = QueryDateRangeWithIntervals(None, 1, self.team, IntervalType.day, self.now) + expected_expr = ast.Call(name="toStartOfDay", args=[ast.Constant(value=query.date_from())]) + self.assertEqual(query.get_start_of_interval_hogql(), expected_expr) + + def test_get_start_of_interval_hogql_hour_interval(self): + query = QueryDateRangeWithIntervals(None, 1, self.team, IntervalType.hour, self.now) + expected_expr = ast.Call(name="toStartOfHour", args=[ast.Constant(value=query.date_from())]) + self.assertEqual(query.get_start_of_interval_hogql(), expected_expr) 
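+ + # WeekStartDay.clickhouse_mode feeds toStartOfWeek's mode argument: mode 0 keeps weeks starting on Sunday and mode 3 starts them on Monday (ISO 8601), matching the test_week_interval snapshots in test_retention_query_runner.ambr above.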
+ + def test_get_start_of_interval_hogql_week_interval(self): + self.team.week_start_day = WeekStartDay.MONDAY + query = QueryDateRangeWithIntervals(None, 1, self.team, IntervalType.week, self.now) + week_mode = WeekStartDay(self.team.week_start_day or 0).clickhouse_mode + expected_expr = ast.Call( + name="toStartOfWeek", args=[ast.Constant(value=query.date_from()), ast.Constant(value=int(week_mode))] + ) + self.assertEqual(query.get_start_of_interval_hogql(), expected_expr) + + def test_get_start_of_interval_hogql_with_source(self): + source_expr = ast.Constant(value="2021-08-25T00:00:00.000Z") + query = QueryDateRangeWithIntervals(None, 1, self.team, IntervalType.day, self.now) + expected_expr = ast.Call(name="toStartOfDay", args=[source_expr]) + self.assertEqual(query.get_start_of_interval_hogql(source=source_expr), expected_expr) diff --git a/posthog/schema.py b/posthog/schema.py index 5fe5a8ab37c70..75ff596013c8d 100644 --- a/posthog/schema.py +++ b/posthog/schema.py @@ -485,6 +485,24 @@ class RecordingDurationFilter(BaseModel): value: float +class Kind(str, Enum): + ActionsNode = "ActionsNode" + EventsNode = "EventsNode" + + +class RetentionEntity(BaseModel): + model_config = ConfigDict( + extra="forbid", + ) + custom_name: Optional[str] = None + id: Optional[Union[str, float]] = None + kind: Optional[Kind] = None + name: Optional[str] = None + order: Optional[float] = None + type: Optional[EntityType] = None + uuid: Optional[str] = None + + class RetentionReference(str, Enum): total = "total" previous = "previous" @@ -502,6 +520,13 @@ class RetentionType(str, Enum): retention_first_time = "retention_first_time" +class RetentionValue(BaseModel): + model_config = ConfigDict( + extra="forbid", + ) + count: int + + class SessionPropertyFilter(BaseModel): model_config = ConfigDict( extra="forbid", @@ -624,7 +649,7 @@ class VizSpecificOptions(BaseModel): RETENTION: Optional[RETENTION] = None -class Kind(str, Enum): +class Kind1(str, Enum): unit = "unit" duration_s = "duration_s" percentage = "percentage" @@ -637,7 +662,7 @@ class WebOverviewItem(BaseModel): changeFromPreviousPct: Optional[float] = None isIncreaseBad: Optional[bool] = None key: str - kind: Kind + kind: Kind1 previous: Optional[float] = None value: Optional[float] = None @@ -927,9 +952,18 @@ class RetentionFilter(BaseModel): period: Optional[RetentionPeriod] = None retention_reference: Optional[RetentionReference] = None retention_type: Optional[RetentionType] = None - returning_entity: Optional[Dict[str, Any]] = None - target_entity: Optional[Dict[str, Any]] = None - total_intervals: Optional[float] = None + returning_entity: Optional[RetentionEntity] = None + target_entity: Optional[RetentionEntity] = None + total_intervals: Optional[int] = None + + +class RetentionResult(BaseModel): + model_config = ConfigDict( + extra="forbid", + ) + date: datetime + label: str + values: List[RetentionValue] class SavedInsightNode(BaseModel): @@ -1383,6 +1417,18 @@ class PropertyGroupFilterValue(BaseModel): ] +class RetentionQueryResponse(BaseModel): + model_config = ConfigDict( + extra="forbid", + ) + hogql: Optional[str] = None + is_cached: Optional[bool] = None + last_refresh: Optional[str] = None + next_allowed_client_refresh: Optional[str] = None + results: List[RetentionResult] + timings: Optional[List[QueryTiming]] = None + + class SessionsTimelineQuery(BaseModel): model_config = ConfigDict( extra="forbid", @@ -1501,9 +1547,8 @@ class RetentionQuery(BaseModel): PropertyGroupFilter, ] ] = Field(default=None, description="Property 
filters for all series") - retentionFilter: Optional[RetentionFilter] = Field( - default=None, description="Properties specific to the retention insight" - ) + response: Optional[RetentionQueryResponse] = None + retentionFilter: RetentionFilter = Field(..., description="Properties specific to the retention insight") samplingFactor: Optional[float] = Field(default=None, description="Sampling rate")