diff --git a/.github/workflows/ci-backend-depot.yml b/.github/workflows/ci-backend-depot.yml index 3cf935ced141e..928886d44cf52 100644 --- a/.github/workflows/ci-backend-depot.yml +++ b/.github/workflows/ci-backend-depot.yml @@ -5,15 +5,7 @@ name: Backend CI (depot) on: - push: - branches: - - master pull_request: - workflow_dispatch: - inputs: - clickhouseServerVersion: - description: ClickHouse server version. Leave blank for default - type: string concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -41,7 +33,7 @@ jobs: changes: runs-on: depot-ubuntu-latest-4 timeout-minutes: 5 - if: github.repository == 'PostHog/posthog' + if: ${{ contains(github.event.pull_request.labels.*.name, 'test-depot') }} name: Determine need to run backend checks # Set job outputs to values from filter step outputs: diff --git a/.github/workflows/ci-e2e-depot.yml b/.github/workflows/ci-e2e-depot.yml index 4985dac9d746a..697d42e97f945 100644 --- a/.github/workflows/ci-e2e-depot.yml +++ b/.github/workflows/ci-e2e-depot.yml @@ -16,7 +16,7 @@ jobs: changes: runs-on: depot-ubuntu-latest-4 timeout-minutes: 5 - if: github.repository == 'PostHog/posthog' + if: ${{ contains(github.event.pull_request.labels.*.name, 'test-depot') }} name: Determine need to run E2E checks # Set job outputs to values from filter step outputs: diff --git a/ee/frontend/mobile-replay/__mocks__/increment-with-child-duplication.json b/ee/frontend/mobile-replay/__mocks__/increment-with-child-duplication.json index c17efc6d9e246..7ffc2e5f38e5c 100644 --- a/ee/frontend/mobile-replay/__mocks__/increment-with-child-duplication.json +++ b/ee/frontend/mobile-replay/__mocks__/increment-with-child-duplication.json @@ -191,6 +191,13 @@ "x": 66, "y": 556 } + }, + { + "parentId": 209272202, + "wireframe": { + "id": 52129787123, + "type": "text" + } } ], "removes": [ diff --git a/ee/frontend/mobile-replay/__snapshots__/transform.test.ts.snap b/ee/frontend/mobile-replay/__snapshots__/transform.test.ts.snap index a421f7ff220bf..bbde91f8defc6 100644 --- a/ee/frontend/mobile-replay/__snapshots__/transform.test.ts.snap +++ b/ee/frontend/mobile-replay/__snapshots__/transform.test.ts.snap @@ -160,6 +160,147 @@ exports[`replay/transform transform can convert images 1`] = ` ] `; +exports[`replay/transform transform can convert invalid text wireframe 1`] = ` +[ + { + "data": { + "height": 600, + "href": "", + "width": 300, + }, + "timestamp": 1, + "type": 4, + }, + { + "data": { + "initialOffset": { + "left": 0, + "top": 0, + }, + "node": { + "childNodes": [ + { + "id": 2, + "name": "html", + "publicId": "", + "systemId": "", + "type": 1, + }, + { + "attributes": { + "data-rrweb-id": 3, + "style": "height: 100vh; width: 100vw;", + }, + "childNodes": [ + { + "attributes": { + "data-rrweb-id": 4, + }, + "childNodes": [ + { + "attributes": { + "type": "text/css", + }, + "childNodes": [ + { + "id": 102, + "textContent": " + body { + margin: unset; + } + input, button, select, textarea { + font: inherit; + margin: 0; + padding: 0; + border: 0; + outline: 0; + background: transparent; + padding-block: 0 !important; + } + .input:focus { + outline: none; + } + img { + border-style: none; + } + ", + "type": 3, + }, + ], + "id": 101, + "tagName": "style", + "type": 2, + }, + ], + "id": 4, + "tagName": "head", + "type": 2, + }, + { + "attributes": { + "data-rrweb-id": 5, + "style": "height: 100vh; width: 100vw;", + }, + "childNodes": [ + { + "attributes": { + "data-rrweb-id": 12345, + "style": "border-width: 4px;border-radius: 10px;border-color: 
#ee3ee4;border-style: solid;color: #ee3ee4;width: 100px;height: 30px;position: fixed;left: 11px;top: 12px;overflow:hidden;white-space:normal;", + }, + "childNodes": [], + "id": 12345, + "tagName": "div", + "type": 2, + }, + { + "attributes": { + "data-render-reason": "a fixed placeholder to contain the keyboard in the correct stacking position", + "data-rrweb-id": 9, + }, + "childNodes": [], + "id": 9, + "tagName": "div", + "type": 2, + }, + { + "attributes": { + "data-rrweb-id": 7, + }, + "childNodes": [], + "id": 7, + "tagName": "div", + "type": 2, + }, + { + "attributes": { + "data-rrweb-id": 11, + }, + "childNodes": [], + "id": 11, + "tagName": "div", + "type": 2, + }, + ], + "id": 5, + "tagName": "body", + "type": 2, + }, + ], + "id": 3, + "tagName": "html", + "type": 2, + }, + ], + "id": 1, + "type": 0, + }, + }, + "timestamp": 1, + "type": 2, + }, +] +`; + exports[`replay/transform transform can convert navigation bar 1`] = ` [ { @@ -1453,6 +1594,20 @@ exports[`replay/transform transform incremental mutations de-duplicate the tree }, "parentId": 52129787, }, + { + "nextId": null, + "node": { + "attributes": { + "data-rrweb-id": 52129787123, + "style": "position: fixed;left: 0px;top: 0px;overflow:hidden;white-space:normal;", + }, + "childNodes": [], + "id": 52129787123, + "tagName": "div", + "type": 2, + }, + "parentId": 209272202, + }, ], "attributes": [], "removes": [ @@ -1689,6 +1844,13 @@ AAAAAAAAAAAAAAAAAAAAAAAAgCN/AW0xMqHnNQceAAAAAElFTkSuQmCC "y": 556, }, }, + { + "parentId": 209272202, + "wireframe": { + "id": 52129787123, + "type": "text", + }, + }, ], "removes": [ { diff --git a/ee/frontend/mobile-replay/transform.test.ts b/ee/frontend/mobile-replay/transform.test.ts index 788bb65655d3d..92384e48b2986 100644 --- a/ee/frontend/mobile-replay/transform.test.ts +++ b/ee/frontend/mobile-replay/transform.test.ts @@ -480,6 +480,43 @@ describe('replay/transform', () => { expect(converted).toMatchSnapshot() }) + test('can convert invalid text wireframe', () => { + const converted = posthogEEModule.mobileReplay?.transformToWeb([ + { + data: { + width: 300, + height: 600, + }, + timestamp: 1, + type: 4, + }, + { + type: 2, + data: { + wireframes: [ + { + id: 12345, + type: 'text', + x: 11, + y: 12, + width: 100, + height: 30, + style: { + color: '#ee3ee4', + borderColor: '#ee3ee4', + borderWidth: '4', + borderRadius: '10px', + }, + // text property is missing + }, + ], + }, + timestamp: 1, + }, + ]) + expect(converted).toMatchSnapshot() + }) + test('can set background image to base64 png', () => { const converted = posthogEEModule.mobileReplay?.transformToWeb([ { diff --git a/ee/frontend/mobile-replay/transformer/transformers.ts b/ee/frontend/mobile-replay/transformer/transformers.ts index 1527a24d7dbeb..f2b7324a475fa 100644 --- a/ee/frontend/mobile-replay/transformer/transformers.ts +++ b/ee/frontend/mobile-replay/transformer/transformers.ts @@ -105,6 +105,10 @@ export function _isPositiveInteger(id: unknown): id is number { return typeof id === 'number' && id > 0 && id % 1 === 0 } +function _isNullish(x: unknown): x is null | undefined { + return x === null || x === undefined +} + function isRemovedNodeMutation(x: addedNodeMutation | removedNodeMutation): x is removedNodeMutation { return isObject(x) && 'id' in x } @@ -218,6 +222,17 @@ function makeTextElement( // because we might have to style the text, we always wrap it in a div // and apply styles to that const id = context.idSequence.next().value + + const childNodes = [...children] + if (!_isNullish(wireframe.text)) { + 
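// only wrap the text in a child node when the wireframe actually carries text;
+            // a wireframe with a missing `text` property still renders its styled container
+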
childNodes.unshift({ + type: NodeType.Text, + textContent: wireframe.text, + // since the text node is wrapped, we assign it a synthetic id + id, + }) + } + return { result: { type: NodeType.Element, @@ -227,15 +242,7 @@ function makeTextElement( 'data-rrweb-id': wireframe.id, }, id: wireframe.id, - childNodes: [ - { - type: NodeType.Text, - textContent: wireframe.text, - // since the text node is wrapped, we assign it a synthetic id - id: id, - }, - ...children, - ], + childNodes, }, context, } @@ -983,6 +990,7 @@ function isMobileIncrementalSnapshotEvent(x: unknown): x is MobileIncrementalSna function makeIncrementalAdd(add: MobileNodeMutation, context: ConversionContext): addedNodeMutation[] | null { const converted = convertWireframe(add.wireframe, context) + if (!converted) { return null } diff --git a/ee/session_recordings/ai/error_clustering.py b/ee/session_recordings/ai/error_clustering.py index edc06ff471355..7a3c12c44dec0 100644 --- a/ee/session_recordings/ai/error_clustering.py +++ b/ee/session_recordings/ai/error_clustering.py @@ -6,6 +6,7 @@ import pandas as pd import numpy as np from posthog.session_recordings.models.session_recording_event import SessionRecordingViewed +from datetime import date CLUSTER_REPLAY_ERRORS_TIMING = Histogram( "posthog_session_recordings_cluster_replay_errors", @@ -30,7 +31,7 @@ def error_clustering(team: Team, user: User): if not results: return [] - df = pd.DataFrame(results, columns=["session_id", "input", "embeddings"]) + df = pd.DataFrame(results, columns=["session_id", "error", "embeddings", "timestamp"]) df["cluster"] = cluster_embeddings(df["embeddings"].tolist()) @@ -42,7 +43,7 @@ def error_clustering(team: Team, user: User): def fetch_error_embeddings(team_id: int): query = """ SELECT - session_id, input, embeddings + session_id, input, embeddings, generation_timestamp FROM session_replay_embeddings WHERE @@ -76,13 +77,21 @@ def construct_response(df: pd.DataFrame, team: Team, user: User): clusters = [] for cluster, rows in df.groupby("cluster"): session_ids = rows["session_id"].unique() - sample = rows.sample(n=1)[["session_id", "input"]].rename(columns={"input": "error"}).to_dict("records")[0] + sample = rows.sample(n=1)[["session_id", "error"]].to_dict("records")[0] + + date_series = ( + df.groupby([df["timestamp"].dt.date]) + .size() + .reindex(pd.date_range(end=date.today(), periods=7), fill_value=0) + ) + sparkline = dict(zip(date_series.index.astype(str), date_series)) clusters.append( { "cluster": cluster, "sample": sample.get("error"), "session_ids": np.random.choice(session_ids, size=DBSCAN_MIN_SAMPLES - 1), "occurrences": rows.size, + "sparkline": sparkline, "unique_sessions": len(session_ids), "viewed": len(np.intersect1d(session_ids, viewed_session_ids, assume_unique=True)), } diff --git a/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-breakdown--light.png b/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-breakdown--light.png index 7067293aa1a1e..2ed59b8746a73 100644 Binary files a/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-breakdown--light.png and b/frontend/__snapshots__/scenes-app-insights--funnel-top-to-bottom-breakdown--light.png differ diff --git a/frontend/__snapshots__/scenes-other-signup--cloud--light.png b/frontend/__snapshots__/scenes-other-signup--cloud--light.png index b839308feb12b..05ee6352a0fad 100644 Binary files a/frontend/__snapshots__/scenes-other-signup--cloud--light.png and b/frontend/__snapshots__/scenes-other-signup--cloud--light.png differ diff 
--git a/frontend/__snapshots__/scenes-other-signup--self-hosted--light.png b/frontend/__snapshots__/scenes-other-signup--self-hosted--light.png
index 2f2b83c6a83fb..cccbf29d688de 100644
Binary files a/frontend/__snapshots__/scenes-other-signup--self-hosted--light.png and b/frontend/__snapshots__/scenes-other-signup--self-hosted--light.png differ
diff --git a/frontend/__snapshots__/scenes-other-signup--self-hosted-sso--light.png b/frontend/__snapshots__/scenes-other-signup--self-hosted-sso--light.png
index b496f531dbe5d..d2e55de38afac 100644
Binary files a/frontend/__snapshots__/scenes-other-signup--self-hosted-sso--light.png and b/frontend/__snapshots__/scenes-other-signup--self-hosted-sso--light.png differ
diff --git a/frontend/__snapshots__/scenes-other-toolbar--unauthenticated--dark.png b/frontend/__snapshots__/scenes-other-toolbar--unauthenticated--dark.png
index f5283e47bdf43..dc149beabe203 100644
Binary files a/frontend/__snapshots__/scenes-other-toolbar--unauthenticated--dark.png and b/frontend/__snapshots__/scenes-other-toolbar--unauthenticated--dark.png differ
diff --git a/frontend/__snapshots__/scenes-other-toolbar--unauthenticated--light.png b/frontend/__snapshots__/scenes-other-toolbar--unauthenticated--light.png
index 15b96030b8a25..ace3a2b7f9373 100644
Binary files a/frontend/__snapshots__/scenes-other-toolbar--unauthenticated--light.png and b/frontend/__snapshots__/scenes-other-toolbar--unauthenticated--light.png differ
diff --git a/frontend/src/queries/query.ts b/frontend/src/queries/query.ts
index 78778cec4322a..f866b2f336d31 100644
--- a/frontend/src/queries/query.ts
+++ b/frontend/src/queries/query.ts
@@ -220,7 +220,13 @@ export async function query(
        (hogQLInsightsFunnelsFlagEnabled && isFunnelsQuery(queryNode))
    ) {
        if (hogQLInsightsLiveCompareEnabled) {
-            const legacyFunction = legacyUrl ? fetchLegacyUrl : fetchLegacyInsights
+            const legacyFunction = (): any => {
+                try {
+                    return legacyUrl ? fetchLegacyUrl : fetchLegacyInsights
+                } catch (e) {
+                    console.error('Error fetching legacy insights', e)
+                }
+            }
            let legacyResponse: any
            ;[response, legacyResponse] = await Promise.all([
                executeQuery(queryNode, methodOptions, refresh, queryId),
diff --git a/frontend/src/queries/schema.json b/frontend/src/queries/schema.json
index 898d4e0974004..c00c309797421 100644
--- a/frontend/src/queries/schema.json
+++ b/frontend/src/queries/schema.json
@@ -2567,7 +2567,7 @@
                "type": "string"
            },
            "personsOnEventsMode": {
-                "enum": ["disabled", "v1_enabled", "v1_mixed", "v2_enabled"],
+                "enum": ["disabled", "v1_enabled", "v1_mixed", "v2_enabled", "v3_enabled"],
                "type": "string"
            }
        },
diff --git a/frontend/src/queries/schema.ts b/frontend/src/queries/schema.ts
index 7eaa1734ea463..e07838648f4af 100644
--- a/frontend/src/queries/schema.ts
+++ b/frontend/src/queries/schema.ts
@@ -176,7 +176,7 @@ export interface DataNode extends Node {
 /** HogQL Query Options are automatically set per team. However, they can be overridden in the query.
*/ export interface HogQLQueryModifiers { - personsOnEventsMode?: 'disabled' | 'v1_enabled' | 'v1_mixed' | 'v2_enabled' + personsOnEventsMode?: 'disabled' | 'v1_enabled' | 'v1_mixed' | 'v2_enabled' | 'v3_enabled' personsArgMaxVersion?: 'auto' | 'v1' | 'v2' inCohortVia?: 'auto' | 'leftjoin' | 'subquery' | 'leftjoin_conjoined' materializationMode?: 'auto' | 'legacy_null_as_string' | 'legacy_null_as_null' | 'disabled' diff --git a/frontend/src/scenes/authentication/signup/SignupContainer.tsx b/frontend/src/scenes/authentication/signup/SignupContainer.tsx index ff4b65c9b88ce..3113cde8b3702 100644 --- a/frontend/src/scenes/authentication/signup/SignupContainer.tsx +++ b/frontend/src/scenes/authentication/signup/SignupContainer.tsx @@ -22,12 +22,8 @@ export function SignupContainer(): JSX.Element | null { const { user } = useValues(userLogic) const footerHighlights = { - cloud: ['Hosted & managed by PostHog', 'Pay per event, cancel anytime', 'Community, Slack & email support'], - selfHosted: [ - 'Fully featured product, unlimited events', - 'Data in your own infrastructure', - 'Community, Slack & email support', - ], + cloud: ['Hosted & managed by PostHog', 'Pay per event, cancel anytime', 'Fast and reliable support'], + selfHosted: ['Fully featured product, unlimited events', 'Data in your own infrastructure', 'Community forum'], } return !user ? ( diff --git a/frontend/src/scenes/batch_exports/BatchExportEditForm.tsx b/frontend/src/scenes/batch_exports/BatchExportEditForm.tsx index 7fbbc8cc29d69..a2a9f9968f82c 100644 --- a/frontend/src/scenes/batch_exports/BatchExportEditForm.tsx +++ b/frontend/src/scenes/batch_exports/BatchExportEditForm.tsx @@ -242,6 +242,15 @@ export function BatchExportsEditFields({ ]} /> + + + +
@@ -375,17 +384,21 @@ export function BatchExportsEditFields({ - - Does your Postgres instance have a self-signed SSL certificate? - - - - - } - /> + {({ value, onChange }) => ( + + Does your Postgres instance have a self-signed SSL certificate? + + + + + } + checked={!!value} + onChange={onChange} + /> + )} diff --git a/frontend/src/scenes/batch_exports/BatchExports.stories.tsx b/frontend/src/scenes/batch_exports/BatchExports.stories.tsx index 0dd616c44982a..dbd6779cb208d 100644 --- a/frontend/src/scenes/batch_exports/BatchExports.stories.tsx +++ b/frontend/src/scenes/batch_exports/BatchExports.stories.tsx @@ -42,6 +42,7 @@ export default { include_events: [], encryption: null, kms_key_id: null, + file_format: 'JSONLines', }, }, start_at: null, diff --git a/frontend/src/scenes/batch_exports/batchExportEditLogic.ts b/frontend/src/scenes/batch_exports/batchExportEditLogic.ts index bc86d1618fe4f..30c123256d81a 100644 --- a/frontend/src/scenes/batch_exports/batchExportEditLogic.ts +++ b/frontend/src/scenes/batch_exports/batchExportEditLogic.ts @@ -90,6 +90,7 @@ export const batchExportFormFields = ( aws_secret_access_key: isNew ? (!config.aws_secret_access_key ? 'This field is required' : '') : '', compression: '', encryption: '', + file_format: isNew ? (!config.file_format ? 'This field is required' : '') : '', kms_key_id: !config.kms_key_id && config.encryption == 'aws:kms' ? 'This field is required' : '', exclude_events: '', include_events: '', diff --git a/frontend/src/scenes/data-warehouse/ViewLinkModal.tsx b/frontend/src/scenes/data-warehouse/ViewLinkModal.tsx index 2116d2da6e74d..11f50ca4f0d27 100644 --- a/frontend/src/scenes/data-warehouse/ViewLinkModal.tsx +++ b/frontend/src/scenes/data-warehouse/ViewLinkModal.tsx @@ -1,6 +1,6 @@ import './ViewLinkModal.scss' -import { IconTrash } from '@posthog/icons' +import { IconCollapse, IconExpand, IconTrash } from '@posthog/icons' import { LemonButton, LemonDivider, @@ -35,7 +35,7 @@ export function ViewLinkModal(): JSX.Element { } isOpen={isJoinTableModalOpen} onClose={toggleJoinTableModal} - width={600} + width={700} > @@ -57,6 +57,7 @@ export function ViewLinkForm(): JSX.Element { selectedJoiningKey, sourceIsUsingHogQLExpression, joiningIsUsingHogQLExpression, + isViewLinkSubmitting, } = useValues(viewLinkLogic) const { selectJoiningTable, @@ -66,12 +67,13 @@ export function ViewLinkForm(): JSX.Element { selectSourceKey, selectJoiningKey, } = useActions(viewLinkLogic) + const [advancedSettingsExpanded, setAdvancedSettingsExpanded] = useState(false) return (
-
+
Source Table {isNewJoin ? ( @@ -86,7 +88,7 @@ export function ViewLinkForm(): JSX.Element { selectedSourceTableName ?? '' )}
-
+
Joining Table
-
-
+
+
Source Table Key <> @@ -124,7 +126,7 @@ export function ViewLinkForm(): JSX.Element {
-
+
Joining Table Key <> @@ -148,8 +150,22 @@ export function ViewLinkForm(): JSX.Element {
{sqlCodeSnippet && ( - <> +
+ setAdvancedSettingsExpanded(!advancedSettingsExpanded)} + sideIcon={advancedSettingsExpanded ? : } + > +
+

Advanced settings

+
Customize how the fields are accessed
+
+
+
+ )} + {sqlCodeSnippet && advancedSettingsExpanded && ( + <>
Field Name @@ -185,7 +201,7 @@ export function ViewLinkForm(): JSX.Element { Close - + Save
diff --git a/frontend/src/scenes/debug/HogQLDebug.tsx b/frontend/src/scenes/debug/HogQLDebug.tsx index 5defca51309b1..e2c4a08341d13 100644 --- a/frontend/src/scenes/debug/HogQLDebug.tsx +++ b/frontend/src/scenes/debug/HogQLDebug.tsx @@ -71,6 +71,7 @@ export function HogQLDebug({ query, setQuery, queryKey }: HogQLDebugProps): JSX. { value: 'v1_enabled', label: 'V1 Enabled' }, { value: 'v1_mixed', label: 'V1 Mixed' }, { value: 'v2_enabled', label: 'V2 Enabled' }, + { value: 'v3_enabled', label: 'V3 Enabled (Join)' }, ]} onChange={(value) => setQuery({ diff --git a/frontend/src/scenes/insights/views/InsightsTable/InsightsTable.tsx b/frontend/src/scenes/insights/views/InsightsTable/InsightsTable.tsx index d426ca87f525c..c6d037c3d1792 100644 --- a/frontend/src/scenes/insights/views/InsightsTable/InsightsTable.tsx +++ b/frontend/src/scenes/insights/views/InsightsTable/InsightsTable.tsx @@ -3,6 +3,7 @@ import './InsightsTable.scss' import { useActions, useValues } from 'kea' import { getSeriesColor } from 'lib/colors' import { LemonTable, LemonTableColumn } from 'lib/lemon-ui/LemonTable' +import { compare as compareFn } from 'natural-orderby' import { insightLogic } from 'scenes/insights/insightLogic' import { insightSceneLogic } from 'scenes/insights/insightSceneLogic' import { isTrendsFilter } from 'scenes/insights/sharedUtils' @@ -157,7 +158,7 @@ export function InsightsTable({ } const labelA = formatItemBreakdownLabel(a) const labelB = formatItemBreakdownLabel(b) - return labelA.localeCompare(labelB) + return compareFn()(labelA, labelB) }, }) if (isTrends && display === ChartDisplayType.WorldMap) { diff --git a/frontend/src/scenes/session-recordings/errors/SessionRecordingErrors.tsx b/frontend/src/scenes/session-recordings/errors/SessionRecordingErrors.tsx index 4b2dd2d1abed3..8b73fbcc1f924 100644 --- a/frontend/src/scenes/session-recordings/errors/SessionRecordingErrors.tsx +++ b/frontend/src/scenes/session-recordings/errors/SessionRecordingErrors.tsx @@ -2,6 +2,7 @@ import { IconFeatures } from '@posthog/icons' import { LemonButton, LemonTable, LemonTabs, Spinner } from '@posthog/lemon-ui' import { useActions, useValues } from 'kea' import { JSONViewer } from 'lib/components/JSONViewer' +import { Sparkline } from 'lib/lemon-ui/Sparkline' import { useState } from 'react' import { urls } from 'scenes/urls' @@ -45,6 +46,17 @@ export function SessionRecordingErrors(): JSX.Element { }, width: '50%', }, + { + title: '', + render: (_, cluster) => { + return ( + + ) + }, + }, { title: 'Occurrences', dataIndex: 'occurrences', diff --git a/frontend/src/toolbar/bar/Toolbar.scss b/frontend/src/toolbar/bar/Toolbar.scss index ff83b7c9d922a..ec98167332fbd 100644 --- a/frontend/src/toolbar/bar/Toolbar.scss +++ b/frontend/src/toolbar/bar/Toolbar.scss @@ -144,8 +144,4 @@ transform: var(--toolbar-translate) scale(0); } } - - &--unauthenticated { - width: calc(5rem + 1px); // Account for border - } } diff --git a/frontend/src/toolbar/bar/Toolbar.tsx b/frontend/src/toolbar/bar/Toolbar.tsx index f255ccc26800d..8ed031f904d86 100644 --- a/frontend/src/toolbar/bar/Toolbar.tsx +++ b/frontend/src/toolbar/bar/Toolbar.tsx @@ -77,15 +77,18 @@ function MoreMenu(): JSX.Element { } maxContentWidth={true} > - } title="More options" /> + + + ) } -export function ToolbarInfoMenu(): JSX.Element { +export function ToolbarInfoMenu(): JSX.Element | null { const ref = useRef(null) const { visibleMenu, isDragging, menuProperties, minimized, isBlurred } = useValues(toolbarLogic) const { setMenu } = useActions(toolbarLogic) + const { 
isAuthenticated } = useValues(toolbarConfigLogic) const content = minimized ? null : visibleMenu === 'flags' ? ( @@ -102,6 +105,10 @@ export function ToolbarInfoMenu(): JSX.Element { return () => setMenu(null) }, [ref.current]) + if (!isAuthenticated) { + return null + } + return (
} onClick={isAuthenticated ? toggleMinimized : authenticate} title={isAuthenticated ? 'Minimize' : 'Authenticate the PostHog Toolbar'} titleMinimized={isAuthenticated ? 'Expand the toolbar' : 'Authenticate the PostHog Toolbar'} - /> + > + + {isAuthenticated ? ( <> - } menuId="inspect" /> - } menuId="heatmap" /> - } menuId="actions" /> - } menuId="flags" title="Feature flags" /> + + + + + + + + + + + + - ) : null} + ) : ( + + Authenticate + + )}
diff --git a/frontend/src/toolbar/bar/ToolbarButton.scss b/frontend/src/toolbar/bar/ToolbarButton.scss index 0d0bb666fa540..ce480f3fbab35 100644 --- a/frontend/src/toolbar/bar/ToolbarButton.scss +++ b/frontend/src/toolbar/bar/ToolbarButton.scss @@ -15,6 +15,8 @@ width: 2rem; height: 2rem; min-height: var(--lemon-button-height); + margin: 0.25rem; + font-weight: 600; color: var(--muted-alt); appearance: none !important; // Important as this gets overridden by Ant styles... cursor: pointer; @@ -43,4 +45,13 @@ } } } + + &--flex { + flex-grow: 1; + width: auto; + + button { + width: 100%; + } + } } diff --git a/frontend/src/toolbar/bar/ToolbarButton.tsx b/frontend/src/toolbar/bar/ToolbarButton.tsx index add0e5f2580ce..f5dfc755be469 100644 --- a/frontend/src/toolbar/bar/ToolbarButton.tsx +++ b/frontend/src/toolbar/bar/ToolbarButton.tsx @@ -10,17 +10,18 @@ import React from 'react' import { MenuState, toolbarLogic } from './toolbarLogic' export type ToolbarButtonProps = { - icon: React.ReactElement | null + children: React.ReactNode onClick?: () => void title?: string titleMinimized?: JSX.Element | string menuId?: MenuState + flex?: boolean } export const ToolbarButton: FunctionComponent = React.forwardRef< HTMLDivElement, ToolbarButtonProps ->(({ icon, title, onClick, titleMinimized, menuId, ...props }, ref): JSX.Element => { +>(({ children, title, onClick, titleMinimized, menuId, flex, ...props }, ref): JSX.Element => { const { visibleMenu, minimized, isDragging } = useValues(toolbarLogic) const { setVisibleMenu } = useActions(toolbarLogic) @@ -54,9 +55,13 @@ export const ToolbarButton: FunctionComponent = React.forwar } const theButton = ( -
+
) diff --git a/frontend/src/toolbar/flags/flagsToolbarLogic.ts b/frontend/src/toolbar/flags/flagsToolbarLogic.ts index 60c1f568f45a6..e1f41cabca73c 100644 --- a/frontend/src/toolbar/flags/flagsToolbarLogic.ts +++ b/frontend/src/toolbar/flags/flagsToolbarLogic.ts @@ -40,11 +40,6 @@ export const flagsToolbarLogic = kea([ `/api/projects/@current/feature_flags/my_flags${encodeParams(params, '?')}` ) - if (response.status >= 400) { - toolbarConfigLogic.actions.tokenExpired() - return [] - } - breakpoint() if (!response.ok) { return [] diff --git a/frontend/src/toolbar/toolbarConfigLogic.ts b/frontend/src/toolbar/toolbarConfigLogic.ts index 1b4638b8f39f8..853b03bdeea32 100644 --- a/frontend/src/toolbar/toolbarConfigLogic.ts +++ b/frontend/src/toolbar/toolbarConfigLogic.ts @@ -119,10 +119,12 @@ export async function toolbarFetch( }) if (response.status === 403) { const responseData = await response.json() - // Do not try to authenticate if the user has no project access altogether - if (responseData.detail !== "You don't have access to the project.") { + if (responseData.detail === "You don't have access to the project.") { toolbarConfigLogic.actions.authenticate() } } + if (response.status == 401) { + toolbarConfigLogic.actions.tokenExpired() + } return response } diff --git a/frontend/src/toolbar/toolbarLogic.ts b/frontend/src/toolbar/toolbarLogic.ts deleted file mode 100644 index d5183a6734f20..0000000000000 --- a/frontend/src/toolbar/toolbarLogic.ts +++ /dev/null @@ -1,87 +0,0 @@ -import { actions, afterMount, kea, listeners, path, props, reducers, selectors } from 'kea' -import { lemonToast } from 'lib/lemon-ui/LemonToast/LemonToast' - -import { actionsTabLogic } from '~/toolbar/actions/actionsTabLogic' -import { posthog } from '~/toolbar/posthog' -import { clearSessionToolbarToken } from '~/toolbar/utils' -import { ToolbarProps } from '~/types' - -import type { toolbarLogicType } from './toolbarLogicType' - -export const toolbarLogic = kea([ - path(['toolbar', 'toolbarLogic']), - props({} as ToolbarProps), - - actions({ - authenticate: true, - logout: true, - tokenExpired: true, - processUserIntent: true, - clearUserIntent: true, - showButton: true, - hideButton: true, - }), - - reducers(({ props }) => ({ - rawApiURL: [props.apiURL as string], - rawJsURL: [(props.jsURL || props.apiURL) as string], - temporaryToken: [props.temporaryToken || null, { logout: () => null, tokenExpired: () => null }], - actionId: [props.actionId || null, { logout: () => null, clearUserIntent: () => null }], - userIntent: [props.userIntent || null, { logout: () => null, clearUserIntent: () => null }], - source: [props.source || null, { logout: () => null }], - buttonVisible: [true, { showButton: () => true, hideButton: () => false, logout: () => false }], - dataAttributes: [props.dataAttributes || []], - posthog: [props.posthog ?? null], - })), - - selectors({ - apiURL: [(s) => [s.rawApiURL], (apiURL) => `${apiURL.endsWith('/') ? apiURL.replace(/\/+$/, '') : apiURL}`], - jsURL: [ - (s) => [s.rawJsURL, s.apiURL], - (rawJsURL, apiUrl) => - `${rawJsURL ? (rawJsURL.endsWith('/') ? 
rawJsURL.replace(/\/+$/, '') : rawJsURL) : apiUrl}`, - ], - isAuthenticated: [(s) => [s.temporaryToken], (temporaryToken) => !!temporaryToken], - }), - - listeners(({ values, props }) => ({ - authenticate: () => { - posthog.capture('toolbar authenticate', { is_authenticated: values.isAuthenticated }) - const encodedUrl = encodeURIComponent(window.location.href) - window.location.href = `${values.apiURL}/authorize_and_redirect/?redirect=${encodedUrl}` - clearSessionToolbarToken() - }, - logout: () => { - posthog.capture('toolbar logout') - clearSessionToolbarToken() - }, - tokenExpired: () => { - posthog.capture('toolbar token expired') - console.warn('PostHog Toolbar API token expired. Clearing session.') - if (values.source !== 'localstorage') { - lemonToast.error('PostHog Toolbar API token expired.') - } - clearSessionToolbarToken() - }, - processUserIntent: () => { - if (props.userIntent === 'add-action' || props.userIntent === 'edit-action') { - actionsTabLogic.actions.showButtonActions() - // the right view will next be opened in `actionsTabLogic` on `getActionsSuccess` - } - }, - })), - - afterMount(({ props, actions, values }) => { - if (props.instrument) { - const distinctId = props.distinctId - if (distinctId) { - posthog.identify(distinctId, props.userEmail ? { email: props.userEmail } : {}) - } - posthog.optIn() - } - if (props.userIntent) { - actions.processUserIntent() - } - posthog.capture('toolbar loaded', { is_authenticated: values.isAuthenticated }) - }), -]) diff --git a/frontend/src/types.ts b/frontend/src/types.ts index a7f65e84c5473..58f54b786a7df 100644 --- a/frontend/src/types.ts +++ b/frontend/src/types.ts @@ -906,6 +906,7 @@ export type ErrorCluster = { sample: string occurrences: number session_ids: string[] + sparkline: Record unique_sessions: number viewed: number } @@ -3596,6 +3597,7 @@ export type BatchExportDestinationS3 = { encryption: string | null kms_key_id: string | null endpoint_url: string | null + file_format: string } } diff --git a/mypy-baseline.txt b/mypy-baseline.txt index f0a464d26c546..b8d2d1c94da64 100644 --- a/mypy-baseline.txt +++ b/mypy-baseline.txt @@ -85,6 +85,7 @@ posthog/hogql/parser.py:0: error: "None" has no attribute "text" [attr-defined] posthog/hogql/parser.py:0: error: "None" has no attribute "text" [attr-defined] posthog/hogql/parser.py:0: error: Statement is unreachable [unreachable] posthog/hogql/database/schema/person_distinct_ids.py:0: error: Argument 1 to "select_from_person_distinct_ids_table" has incompatible type "dict[str, list[str]]"; expected "dict[str, list[str | int]]" [arg-type] +posthog/hogql/database/schema/person_distinct_id_overrides.py:0: error: Argument 1 to "select_from_person_distinct_id_overrides_table" has incompatible type "dict[str, list[str]]"; expected "dict[str, list[str | int]]" [arg-type] posthog/hogql/database/schema/cohort_people.py:0: error: Argument "chain" to "Field" has incompatible type "list[str]"; expected "list[str | int]" [arg-type] posthog/hogql/database/schema/cohort_people.py:0: note: "List" is invariant -- see https://mypy.readthedocs.io/en/stable/common_issues.html#variance posthog/hogql/database/schema/cohort_people.py:0: note: Consider using "Sequence" instead, which is covariant @@ -105,6 +106,7 @@ posthog/hogql/database/database.py:0: error: "FieldOrTable" has no attribute "fi posthog/hogql/database/database.py:0: error: "FieldOrTable" has no attribute "fields" [attr-defined] posthog/hogql/database/database.py:0: error: "FieldOrTable" has no attribute "fields" [attr-defined] 
posthog/hogql/database/database.py:0: error: "FieldOrTable" has no attribute "fields" [attr-defined] +posthog/hogql/database/database.py:0: error: "FieldOrTable" has no attribute "fields" [attr-defined] posthog/hogql/database/database.py:0: error: Incompatible types (expression has type "Literal['view', 'lazy_table']", TypedDict item "type" has type "Literal['integer', 'float', 'string', 'datetime', 'date', 'boolean', 'array', 'json', 'lazy_table', 'virtual_table', 'field_traverser', 'expression']") [typeddict-item] posthog/warehouse/models/datawarehouse_saved_query.py:0: error: Argument 1 to "create_hogql_database" has incompatible type "int | None"; expected "int" [arg-type] posthog/warehouse/models/datawarehouse_saved_query.py:0: error: Incompatible types in assignment (expression has type "Expr", variable has type "SelectQuery | SelectUnionQuery") [assignment] @@ -346,20 +348,12 @@ posthog/hogql_queries/sessions_timeline_query_runner.py:0: error: Statement is u posthog/hogql_queries/insights/trends/breakdown.py:0: error: Item "None" of "BreakdownFilter | None" has no attribute "breakdown_type" [union-attr] posthog/hogql_queries/insights/trends/breakdown.py:0: error: Item "None" of "BreakdownFilter | None" has no attribute "breakdown_histogram_bin_count" [union-attr] posthog/hogql_queries/insights/trends/breakdown.py:0: error: Item "None" of "BreakdownFilter | None" has no attribute "breakdown_type" [union-attr] -posthog/hogql_queries/insights/trends/breakdown.py:0: error: Item "None" of "BreakdownFilter | None" has no attribute "breakdown" [union-attr] -posthog/hogql_queries/insights/trends/breakdown.py:0: error: Argument 1 to "parse_expr" has incompatible type "str | float | list[str | float] | Any | None"; expected "str" [arg-type] -posthog/hogql_queries/insights/trends/breakdown.py:0: error: Item "None" of "BreakdownFilter | None" has no attribute "breakdown_type" [union-attr] -posthog/hogql_queries/insights/trends/breakdown.py:0: error: Item "None" of "BreakdownFilter | None" has no attribute "breakdown_type" [union-attr] -posthog/hogql_queries/insights/trends/breakdown.py:0: error: Item "None" of "BreakdownFilter | None" has no attribute "breakdown" [union-attr] -posthog/hogql_queries/insights/trends/breakdown.py:0: error: Argument 1 to "parse_expr" has incompatible type "str | float | list[str | float] | Any | None"; expected "str" [arg-type] -posthog/hogql_queries/insights/trends/breakdown.py:0: error: Statement is unreachable [unreachable] posthog/hogql_queries/insights/trends/breakdown.py:0: error: Argument "exprs" to "Or" has incompatible type "list[CompareOperation]"; expected "list[Expr]" [arg-type] posthog/hogql_queries/insights/trends/breakdown.py:0: note: "List" is invariant -- see https://mypy.readthedocs.io/en/stable/common_issues.html#variance posthog/hogql_queries/insights/trends/breakdown.py:0: note: Consider using "Sequence" instead, which is covariant -posthog/hogql_queries/insights/trends/breakdown.py:0: error: Incompatible types in assignment (expression has type "float", variable has type "int") [assignment] -posthog/hogql_queries/insights/trends/breakdown.py:0: error: Incompatible types in assignment (expression has type "float", variable has type "int") [assignment] -posthog/hogql_queries/insights/trends/breakdown.py:0: error: Incompatible types in assignment (expression has type "str", variable has type "int") [assignment] -posthog/hogql_queries/insights/trends/breakdown.py:0: error: Incompatible types in assignment (expression has type "str", variable has 
type "int") [assignment] +posthog/hogql_queries/insights/trends/breakdown.py:0: error: Item "None" of "BreakdownFilter | None" has no attribute "breakdown_type" [union-attr] +posthog/hogql_queries/insights/trends/breakdown.py:0: error: Item "None" of "BreakdownFilter | None" has no attribute "breakdown" [union-attr] +posthog/hogql_queries/insights/trends/breakdown.py:0: error: Argument 1 to "parse_expr" has incompatible type "str | float | list[str | float] | Any | None"; expected "str" [arg-type] posthog/hogql_queries/insights/trends/breakdown.py:0: error: Item "None" of "BreakdownFilter | None" has no attribute "breakdown_type" [union-attr] posthog/hogql_queries/insights/trends/breakdown.py:0: error: Item "None" of "BreakdownFilter | None" has no attribute "breakdown" [union-attr] posthog/hogql_queries/insights/trends/breakdown.py:0: error: Argument "breakdown_field" to "get_properties_chain" has incompatible type "str | float | list[str | float] | Any | None"; expected "str" [arg-type] diff --git a/package.json b/package.json index 770be74997198..2128fa3207a76 100644 --- a/package.json +++ b/package.json @@ -140,11 +140,12 @@ "maplibre-gl": "^3.5.1", "md5": "^2.3.0", "monaco-editor": "^0.39.0", + "natural-orderby": "^3.0.2", "papaparse": "^5.4.1", "pmtiles": "^2.11.0", "postcss": "^8.4.31", "postcss-preset-env": "^9.3.0", - "posthog-js": "1.116.3", + "posthog-js": "1.116.4", "posthog-js-lite": "2.5.0", "prettier": "^2.8.8", "prop-types": "^15.7.2", diff --git a/plugin-server/src/config/config.ts b/plugin-server/src/config/config.ts index dcaebe4c1097a..def72eea474bb 100644 --- a/plugin-server/src/config/config.ts +++ b/plugin-server/src/config/config.ts @@ -164,8 +164,8 @@ export function getDefaultConfig(): PluginsServerConfig { SESSION_RECORDING_KAFKA_DEBUG: undefined, SESSION_RECORDING_MAX_PARALLEL_FLUSHES: 10, SESSION_RECORDING_OVERFLOW_ENABLED: false, - SESSION_RECORDING_OVERFLOW_BUCKET_REPLENISH_RATE: 2_000_000, // 2MB/second uncompressed, sustained - SESSION_RECORDING_OVERFLOW_BUCKET_CAPACITY: 100_000_000, // 100MB burst + SESSION_RECORDING_OVERFLOW_BUCKET_REPLENISH_RATE: 5_000_000, // 5MB/second uncompressed, sustained + SESSION_RECORDING_OVERFLOW_BUCKET_CAPACITY: 200_000_000, // 200MB burst } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 5f157ec8b039e..04806adc67be8 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -238,6 +238,9 @@ dependencies: monaco-editor: specifier: ^0.39.0 version: 0.39.0 + natural-orderby: + specifier: ^3.0.2 + version: 3.0.2 papaparse: specifier: ^5.4.1 version: 5.4.1 @@ -251,8 +254,8 @@ dependencies: specifier: ^9.3.0 version: 9.3.0(postcss@8.4.31) posthog-js: - specifier: 1.116.3 - version: 1.116.3 + specifier: 1.116.4 + version: 1.116.4 posthog-js-lite: specifier: 2.5.0 version: 2.5.0 @@ -6793,7 +6796,7 @@ packages: '@storybook/csf': 0.1.3 '@storybook/global': 5.0.0 '@storybook/types': 7.6.17 - '@types/qs': 6.9.13 + '@types/qs': 6.9.14 dequal: 2.0.3 lodash: 4.17.21 memoizerific: 1.11.3 @@ -8197,8 +8200,8 @@ packages: resolution: {integrity: sha512-bZcOkJ6uWrL0Qb2NAWKa7TBU+mJHPzhx9jjLL1KHF+XpzEcR7EXHvjbHlGtR/IsP1vyPrehuS6XqkmaePy//mg==} dev: false - /@types/qs@6.9.13: - resolution: {integrity: sha512-iLR+1vTTJ3p0QaOUq6ACbY1mzKTODFDT/XedZI8BksOotFmL4ForwDfRQ/DZeuTHR7/2i4lI1D203gdfxuqTlA==} + /@types/qs@6.9.14: + resolution: {integrity: sha512-5khscbd3SwWMhFqylJBLQ0zIu7c1K6Vz0uBIt915BI3zV0q1nfjRQD3RqSBcPaO6PHEF4ov/t9y89fSiyThlPA==} dev: true /@types/query-selector-shadow-dom@1.0.0: @@ -15917,6 +15920,11 @@ packages: resolution: {integrity: 
sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==}
    dev: true

+  /natural-orderby@3.0.2:
+    resolution: {integrity: sha512-x7ZdOwBxZCEm9MM7+eQCjkrNLrW3rkBKNHVr78zbtqnMGVNlnDi6C/eUEYgxHNrcbu0ymvjzcwIL/6H1iHri9g==}
+    engines: {node: '>=18'}
+    dev: false
+
  /needle@3.3.1:
    resolution: {integrity: sha512-6k0YULvhpw+RoLNiQCRKOl09Rv1dPLr8hHnVjHqdolKwDrdNyk+Hmrthi4lIGPPz3r39dLx0hsF5s40sZ3Us4Q==}
    engines: {node: '>= 4.4.x'}
@@ -17446,8 +17454,8 @@
    resolution: {integrity: sha512-Urvlp0Vu9h3td0BVFWt0QXFJDoOZcaAD83XM9d91NKMKTVPZtfU0ysoxstIf5mw/ce9ZfuMgpWPaagrZI4rmSg==}
    dev: false

-  /posthog-js@1.116.3:
-    resolution: {integrity: sha512-KakGsQ8rS/K/U5Q/tiBrRrFRCgGrR0oI9VSYw9hwNCY00EClwAU3EuykUuQTFdQ1EuYMrZDIMWDD4NW6zgf7wQ==}
+  /posthog-js@1.116.4:
+    resolution: {integrity: sha512-PZg208/k5OZRQbd9tnGvUgtyRl1IAYyyh74teyIDIH3EnlsAolBlVM4gcoyEYoVkUi5sZLKitj9gTX3/vnEG4Q==}
    dependencies:
      fflate: 0.4.8
      preact: 10.20.0
diff --git a/posthog/batch_exports/service.py b/posthog/batch_exports/service.py
index c26be9a77ed1a..d51dfdb2fbc3c 100644
--- a/posthog/batch_exports/service.py
+++ b/posthog/batch_exports/service.py
@@ -90,6 +90,7 @@ class S3BatchExportInputs:
    kms_key_id: str | None = None
    batch_export_schema: BatchExportSchema | None = None
    endpoint_url: str | None = None
+    file_format: str = "JSONLines"

@dataclass
@@ -439,8 +440,11 @@ def create_batch_export_run(
    return run

-def update_batch_export_run_status(
-    run_id: UUID, status: str, latest_error: str | None, records_completed: int = 0
+def update_batch_export_run(
+    run_id: UUID,
+    status: str,
+    latest_error: str | None,
+    records_completed: int = 0,
+) -> BatchExportRun:
    """Update the status of a BatchExportRun with the given id.

    Arguments:
        id: The id of the BatchExportRun to update.
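        status: The new status for the run.
        latest_error: The latest error encountered during the run, if any.
        records_completed: The number of records exported by the run.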
""" model = BatchExportRun.objects.filter(id=run_id) - updated = model.update(status=status, latest_error=latest_error, records_completed=records_completed) + update_at = dt.datetime.now() + + updated = model.update( + status=status, + latest_error=latest_error, + records_completed=records_completed, + last_updated_at=update_at, + ) if not updated: raise ValueError(f"BatchExportRun with id {run_id} not found.") diff --git a/posthog/hogql/ast.py b/posthog/hogql/ast.py index a459514f2524f..806226b8f1b9e 100644 --- a/posthog/hogql/ast.py +++ b/posthog/hogql/ast.py @@ -46,8 +46,17 @@ def resolve_constant_type(self, context: HogQLContext): def resolve_database_field(self, context: HogQLContext): if isinstance(self.type, FieldType): return self.type.resolve_database_field(context) + if isinstance(self.type, PropertyType): + return self.type.field_type.resolve_database_field(context) raise NotImplementedException("FieldAliasType.resolve_database_field not implemented") + def resolve_table_type(self, context: HogQLContext): + if isinstance(self.type, FieldType): + return self.type.table_type + if isinstance(self.type, PropertyType): + return self.type.field_type.table_type + raise NotImplementedException("FieldAliasType.resolve_table_type not implemented") + @dataclass(kw_only=True) class BaseTableType(Type): @@ -339,6 +348,9 @@ def get_child(self, name: str | int, context: HogQLContext) -> Type: f'Can not access property "{name}" on field "{self.name}" of type: {type(database_field).__name__}' ) + def resolve_table_type(self, context: HogQLContext): + return self.table_type + @dataclass(kw_only=True) class PropertyType(Type): diff --git a/posthog/hogql/base.py b/posthog/hogql/base.py index fbdafffb2d08c..e8a74025b78be 100644 --- a/posthog/hogql/base.py +++ b/posthog/hogql/base.py @@ -32,7 +32,7 @@ def accept(self, visitor): return visit(self) if hasattr(visitor, "visit_unknown"): return visitor.visit_unknown(self) - raise NotImplementedException(f"Visitor has no method {method_name}") + raise NotImplementedException(f"{visitor.__class__.__name__} has no method {method_name}") @dataclass(kw_only=True) diff --git a/posthog/hogql/database/__init__.py b/posthog/hogql/database/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/posthog/hogql/database/database.py b/posthog/hogql/database/database.py index 6909211070e59..de12a8267d911 100644 --- a/posthog/hogql/database/database.py +++ b/posthog/hogql/database/database.py @@ -31,6 +31,11 @@ from posthog.hogql.database.schema.events import EventsTable from posthog.hogql.database.schema.groups import GroupsTable, RawGroupsTable from posthog.hogql.database.schema.numbers import NumbersTable +from posthog.hogql.database.schema.person_distinct_id_overrides import ( + PersonDistinctIdOverridesTable, + RawPersonDistinctIdOverridesTable, + join_with_person_distinct_id_overrides_table, +) from posthog.hogql.database.schema.person_distinct_ids import ( PersonDistinctIdsTable, RawPersonDistinctIdsTable, @@ -66,6 +71,7 @@ class Database(BaseModel): groups: GroupsTable = GroupsTable() persons: PersonsTable = PersonsTable() person_distinct_ids: PersonDistinctIdsTable = PersonDistinctIdsTable() + person_distinct_id_overrides: PersonDistinctIdOverridesTable = PersonDistinctIdOverridesTable() person_overrides: PersonOverridesTable = PersonOverridesTable() session_replay_events: SessionReplayEventsTable = SessionReplayEventsTable() @@ -81,6 +87,7 @@ class Database(BaseModel): raw_persons: RawPersonsTable = RawPersonsTable() raw_groups: 
RawGroupsTable = RawGroupsTable()
    raw_cohort_people: RawCohortPeople = RawCohortPeople()
+    raw_person_distinct_id_overrides: RawPersonDistinctIdOverridesTable = RawPersonDistinctIdOverridesTable()
    raw_person_overrides: RawPersonOverridesTable = RawPersonOverridesTable()
    raw_sessions: RawSessionsTable = RawSessionsTable()
@@ -186,6 +193,24 @@ def create_hogql_database(
        database.events.fields["poe"].fields["id"] = database.events.fields["person_id"]
        database.events.fields["person"] = FieldTraverser(chain=["poe"])

+    elif modifiers.personsOnEventsMode == PersonsOnEventsMode.v3_enabled:
+        database.events.fields["event_person_id"] = StringDatabaseField(name="person_id")
+        database.events.fields["override"] = LazyJoin(
+            from_field=["distinct_id"],  # ???
+            join_table=PersonDistinctIdOverridesTable(),
+            join_function=join_with_person_distinct_id_overrides_table,
+        )
+        database.events.fields["person_id"] = ExpressionField(
+            name="person_id",
+            expr=parse_expr(
+                # NOTE: assumes `join_use_nulls = 0` (the default), as ``override.distinct_id`` is not Nullable
+                "if(not(empty(override.distinct_id)), override.person_id, event_person_id)",
+                start=None,
+            ),
+        )
+        database.events.fields["poe"].fields["id"] = database.events.fields["person_id"]
+        database.events.fields["person"] = FieldTraverser(chain=["poe"])
+
    database.persons.fields["$virt_initial_referring_domain_type"] = create_initial_domain_type(
        "$virt_initial_referring_domain_type"
    )
diff --git a/posthog/hogql/database/models.py b/posthog/hogql/database/models.py
index 95a00595c6472..d2da7868a7f9c 100644
--- a/posthog/hogql/database/models.py
+++ b/posthog/hogql/database/models.py
@@ -3,7 +3,6 @@
from posthog.hogql.base import Expr
from posthog.hogql.errors import HogQLException, NotImplementedException
-from posthog.schema import HogQLQueryModifiers

if TYPE_CHECKING:
    from posthog.hogql.context import HogQLContext
@@ -126,12 +125,14 @@ def resolve_table(self, context: "HogQLContext") -> Table:

class LazyTable(Table):
    """
-    A table that is replaced with a subquery returned from `lazy_select(requested_fields: Dict[name, chain], modifiers: HogQLQueryModifiers)`
+    A table that is replaced with a subquery returned from `lazy_select(requested_fields: Dict[name, chain], context: HogQLContext, node: SelectQuery)`
    """

    model_config = ConfigDict(extra="forbid")

-    def lazy_select(self, requested_fields: Dict[str, List[str | int]], modifiers: HogQLQueryModifiers) -> Any:
+    def lazy_select(
+        self, requested_fields: Dict[str, List[str | int]], context: "HogQLContext", node: "SelectQuery"
+    ) -> Any:
        raise NotImplementedException("LazyTable.lazy_select not overridden")
diff --git a/posthog/hogql/database/schema/__init__.py b/posthog/hogql/database/schema/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/posthog/hogql/database/schema/cohort_people.py b/posthog/hogql/database/schema/cohort_people.py
index 72080419b7355..11723f0194619 100644
--- a/posthog/hogql/database/schema/cohort_people.py
+++ b/posthog/hogql/database/schema/cohort_people.py
@@ -9,7 +9,6 @@
    FieldOrTable,
)
from posthog.hogql.database.schema.persons import join_with_persons_table
-from posthog.schema import HogQLQueryModifiers

COHORT_PEOPLE_FIELDS = {
    "person_id": StringDatabaseField(name="person_id"),
@@ -67,7 +66,7 @@ def to_printed_hogql(self):
class CohortPeople(LazyTable):
    fields: Dict[str, FieldOrTable] = COHORT_PEOPLE_FIELDS

-    def lazy_select(self, requested_fields: Dict[str, List[str | int]], modifiers: HogQLQueryModifiers):
+    def 
lazy_select(self, requested_fields: Dict[str, List[str | int]], context, node): return select_from_cohort_people_table(requested_fields) def to_printed_clickhouse(self, context): diff --git a/posthog/hogql/database/schema/groups.py b/posthog/hogql/database/schema/groups.py index bb237d68e8070..3b9de7f08befc 100644 --- a/posthog/hogql/database/schema/groups.py +++ b/posthog/hogql/database/schema/groups.py @@ -13,7 +13,6 @@ FieldOrTable, ) from posthog.hogql.errors import HogQLException -from posthog.schema import HogQLQueryModifiers GROUPS_TABLE_FIELDS = { "index": IntegerDatabaseField(name="group_type_index"), @@ -83,7 +82,7 @@ def to_printed_hogql(self): class GroupsTable(LazyTable): fields: Dict[str, FieldOrTable] = GROUPS_TABLE_FIELDS - def lazy_select(self, requested_fields: Dict[str, List[str | int]], modifiers: HogQLQueryModifiers): + def lazy_select(self, requested_fields: Dict[str, List[str | int]], context, node): return select_from_groups_table(requested_fields) def to_printed_clickhouse(self, context): diff --git a/posthog/hogql/database/schema/log_entries.py b/posthog/hogql/database/schema/log_entries.py index c14e90e26da50..9f5dc816ac4b0 100644 --- a/posthog/hogql/database/schema/log_entries.py +++ b/posthog/hogql/database/schema/log_entries.py @@ -9,7 +9,6 @@ LazyTable, FieldOrTable, ) -from posthog.schema import HogQLQueryModifiers LOG_ENTRIES_FIELDS: Dict[str, FieldOrTable] = { "team_id": IntegerDatabaseField(name="team_id"), @@ -35,7 +34,7 @@ def to_printed_hogql(self): class ReplayConsoleLogsLogEntriesTable(LazyTable): fields: Dict[str, FieldOrTable] = LOG_ENTRIES_FIELDS - def lazy_select(self, requested_fields: Dict[str, List[str | int]], modifiers: HogQLQueryModifiers): + def lazy_select(self, requested_fields: Dict[str, List[str | int]], context, node): fields: List[ast.Expr] = [ast.Field(chain=["log_entries"] + chain) for name, chain in requested_fields.items()] return ast.SelectQuery( @@ -58,7 +57,7 @@ def to_printed_hogql(self): class BatchExportLogEntriesTable(LazyTable): fields: Dict[str, FieldOrTable] = LOG_ENTRIES_FIELDS - def lazy_select(self, requested_fields: Dict[str, List[str | int]], modifiers: HogQLQueryModifiers): + def lazy_select(self, requested_fields: Dict[str, List[str | int]], context, node): fields: List[ast.Expr] = [ast.Field(chain=["log_entries"] + chain) for name, chain in requested_fields.items()] return ast.SelectQuery( diff --git a/posthog/hogql/database/schema/person_distinct_id_overrides.py b/posthog/hogql/database/schema/person_distinct_id_overrides.py new file mode 100644 index 0000000000000..34df59655c24d --- /dev/null +++ b/posthog/hogql/database/schema/person_distinct_id_overrides.py @@ -0,0 +1,91 @@ +from typing import Dict, List +from posthog.hogql.ast import SelectQuery +from posthog.hogql.context import HogQLContext + +from posthog.hogql.database.argmax import argmax_select +from posthog.hogql.database.models import ( + Table, + IntegerDatabaseField, + StringDatabaseField, + BooleanDatabaseField, + LazyJoin, + LazyTable, + FieldOrTable, +) +from posthog.hogql.database.schema.persons import join_with_persons_table +from posthog.hogql.errors import HogQLException + +PERSON_DISTINCT_ID_OVERRIDES_FIELDS = { + "team_id": IntegerDatabaseField(name="team_id"), + "distinct_id": StringDatabaseField(name="distinct_id"), + "person_id": StringDatabaseField(name="person_id"), + "person": LazyJoin( + from_field=["person_id"], + join_table="persons", + join_function=join_with_persons_table, + ), +} + + +def 
select_from_person_distinct_id_overrides_table(requested_fields: Dict[str, List[str | int]]): + # Always include "person_id", as it's the key we use to make further joins, and it'd be great if it's available + if "person_id" not in requested_fields: + requested_fields = {**requested_fields, "person_id": ["person_id"]} + return argmax_select( + table_name="raw_person_distinct_id_overrides", + select_fields=requested_fields, + group_fields=["distinct_id"], + argmax_field="version", + deleted_field="is_deleted", + ) + + +def join_with_person_distinct_id_overrides_table( + from_table: str, + to_table: str, + requested_fields: Dict[str, List[str]], + context: HogQLContext, + node: SelectQuery, +): + from posthog.hogql import ast + + if not requested_fields: + raise HogQLException("No fields requested from person_distinct_id_overrides") + join_expr = ast.JoinExpr(table=select_from_person_distinct_id_overrides_table(requested_fields)) + join_expr.join_type = "LEFT OUTER JOIN" + join_expr.alias = to_table + join_expr.constraint = ast.JoinConstraint( + expr=ast.CompareOperation( + op=ast.CompareOperationOp.Eq, + left=ast.Field(chain=[from_table, "distinct_id"]), + right=ast.Field(chain=[to_table, "distinct_id"]), + ) + ) + return join_expr + + +class RawPersonDistinctIdOverridesTable(Table): + fields: Dict[str, FieldOrTable] = { + **PERSON_DISTINCT_ID_OVERRIDES_FIELDS, + "is_deleted": BooleanDatabaseField(name="is_deleted"), + "version": IntegerDatabaseField(name="version"), + } + + def to_printed_clickhouse(self, context): + return "person_distinct_id_overrides" + + def to_printed_hogql(self): + return "raw_person_distinct_id_overrides" + + +class PersonDistinctIdOverridesTable(LazyTable): + fields: Dict[str, FieldOrTable] = PERSON_DISTINCT_ID_OVERRIDES_FIELDS + + def lazy_select(self, requested_fields: Dict[str, List[str | int]], context: HogQLContext, node: SelectQuery): + return select_from_person_distinct_id_overrides_table(requested_fields) + + def to_printed_clickhouse(self, context): + return "person_distinct_id_overrides" + + def to_printed_hogql(self): + return "person_distinct_id_overrides" diff --git a/posthog/hogql/database/schema/person_distinct_ids.py b/posthog/hogql/database/schema/person_distinct_ids.py index 02144b35fc3d8..3304eccda862e 100644 --- a/posthog/hogql/database/schema/person_distinct_ids.py +++ b/posthog/hogql/database/schema/person_distinct_ids.py @@ -14,7 +14,6 @@ ) from posthog.hogql.database.schema.persons import join_with_persons_table from posthog.hogql.errors import HogQLException -from posthog.schema import HogQLQueryModifiers PERSON_DISTINCT_IDS_FIELDS = { "team_id": IntegerDatabaseField(name="team_id"), @@ -82,7 +81,7 @@ def to_printed_hogql(self): class PersonDistinctIdsTable(LazyTable): fields: Dict[str, FieldOrTable] = PERSON_DISTINCT_IDS_FIELDS - def lazy_select(self, requested_fields: Dict[str, List[str | int]], modifiers: HogQLQueryModifiers): + def lazy_select(self, requested_fields: Dict[str, List[str | int]], context, node): return select_from_person_distinct_ids_table(requested_fields) def to_printed_clickhouse(self, context): diff --git a/posthog/hogql/database/schema/persons.py b/posthog/hogql/database/schema/persons.py index a248da56b7307..c7abdd89e14c6 100644 --- a/posthog/hogql/database/schema/persons.py +++ b/posthog/hogql/database/schema/persons.py @@ -123,8 +123,8 @@ def to_printed_hogql(self): class PersonsTable(LazyTable): fields: Dict[str, FieldOrTable] = PERSONS_FIELDS - def lazy_select(self, requested_fields: Dict[str, List[str | int]], 
modifiers: HogQLQueryModifiers): - return select_from_persons_table(requested_fields, modifiers) + def lazy_select(self, requested_fields: Dict[str, List[str | int]], context, node): + return select_from_persons_table(requested_fields, context.modifiers) def to_printed_clickhouse(self, context): return "person" diff --git a/posthog/hogql/database/schema/persons_pdi.py b/posthog/hogql/database/schema/persons_pdi.py index 9f476f407b4d2..195643b90c08c 100644 --- a/posthog/hogql/database/schema/persons_pdi.py +++ b/posthog/hogql/database/schema/persons_pdi.py @@ -10,7 +10,6 @@ FieldOrTable, ) from posthog.hogql.errors import HogQLException -from posthog.schema import HogQLQueryModifiers # :NOTE: We already have person_distinct_ids.py, which most tables link to. This persons_pdi.py is a hack to @@ -63,7 +62,7 @@ class PersonsPDITable(LazyTable): "person_id": StringDatabaseField(name="person_id"), } - def lazy_select(self, requested_fields: Dict[str, List[str | int]], modifiers: HogQLQueryModifiers): + def lazy_select(self, requested_fields: Dict[str, List[str | int]], context, node): return persons_pdi_select(requested_fields) def to_printed_clickhouse(self, context): diff --git a/posthog/hogql/database/schema/session_replay_events.py b/posthog/hogql/database/schema/session_replay_events.py index c9d564c7d4588..baaecef89e049 100644 --- a/posthog/hogql/database/schema/session_replay_events.py +++ b/posthog/hogql/database/schema/session_replay_events.py @@ -15,7 +15,6 @@ PersonDistinctIdsTable, join_with_person_distinct_ids_table, ) -from posthog.schema import HogQLQueryModifiers RAW_ONLY_FIELDS = ["min_first_timestamp", "max_last_timestamp"] @@ -115,7 +114,7 @@ class SessionReplayEventsTable(LazyTable): "first_url": StringDatabaseField(name="first_url"), } - def lazy_select(self, requested_fields: Dict[str, List[str | int]], modifiers: HogQLQueryModifiers): + def lazy_select(self, requested_fields: Dict[str, List[str | int]], context, node): return select_from_session_replay_events_table(requested_fields) def to_printed_clickhouse(self, context): diff --git a/posthog/hogql/database/schema/sessions.py b/posthog/hogql/database/schema/sessions.py index 2a4865798eeb8..770daceaa23c5 100644 --- a/posthog/hogql/database/schema/sessions.py +++ b/posthog/hogql/database/schema/sessions.py @@ -1,5 +1,7 @@ from typing import Dict, List, cast +from posthog.hogql import ast +from posthog.hogql.context import HogQLContext from posthog.hogql.database.models import ( StringDatabaseField, DateTimeDatabaseField, @@ -11,7 +13,7 @@ LazyTable, ) from posthog.hogql.database.schema.channel_type import create_channel_type_expr -from posthog.schema import HogQLQueryModifiers +from posthog.hogql.database.schema.util.session_where_clause_extractor import SessionMinTimestampWhereClauseExtractor SESSIONS_COMMON_FIELDS: Dict[str, FieldOrTable] = { @@ -62,7 +64,9 @@ def avoid_asterisk_fields(self) -> List[str]: ] -def select_from_sessions_table(requested_fields: Dict[str, List[str | int]]): +def select_from_sessions_table( + requested_fields: Dict[str, List[str | int]], node: ast.SelectQuery, context: HogQLContext +): from posthog.hogql import ast table_name = "raw_sessions" @@ -134,10 +138,13 @@ def select_from_sessions_table(requested_fields: Dict[str, List[str | int]]): ) group_by_fields.append(ast.Field(chain=cast(list[str | int], [table_name]) + chain)) + where = SessionMinTimestampWhereClauseExtractor(context).get_inner_where(node) + return ast.SelectQuery( select=select_fields, 
select_from=ast.JoinExpr(table=ast.Field(chain=[table_name])),
         group_by=group_by_fields,
+        where=where,
     )
 
 
@@ -148,8 +155,8 @@ class SessionsTable(LazyTable):
         "channel_type": StringDatabaseField(name="channel_type"),
     }
 
-    def lazy_select(self, requested_fields: Dict[str, List[str | int]], modifiers: HogQLQueryModifiers):
-        return select_from_sessions_table(requested_fields)
+    def lazy_select(self, requested_fields: Dict[str, List[str | int]], context, node: ast.SelectQuery):
+        return select_from_sessions_table(requested_fields, node, context)
 
     def to_printed_clickhouse(self, context):
         return "sessions"
diff --git a/posthog/hogql/database/schema/util/__init__.py b/posthog/hogql/database/schema/util/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/posthog/hogql/database/schema/util/session_where_clause_extractor.py b/posthog/hogql/database/schema/util/session_where_clause_extractor.py
new file mode 100644
index 0000000000000..83933bdde8b85
--- /dev/null
+++ b/posthog/hogql/database/schema/util/session_where_clause_extractor.py
@@ -0,0 +1,398 @@
+from dataclasses import dataclass
+from typing import Optional
+
+from posthog.hogql import ast
+from posthog.hogql.ast import CompareOperationOp, ArithmeticOperationOp
+from posthog.hogql.context import HogQLContext
+from posthog.hogql.database.models import DatabaseField
+
+from posthog.hogql.visitor import clone_expr, CloningVisitor, Visitor
+
+SESSION_BUFFER_DAYS = 3
+
+
+@dataclass
+class SessionMinTimestampWhereClauseExtractor(CloningVisitor):
+    """This class extracts the WHERE clause from queries against the lazy sessions table, so that it can be applied to
+    the underlying ClickHouse sessions table.
+
+    The sessions table in ClickHouse is an AggregatingMergeTree, and will have one row per session per day. This means
+    that when we want to query sessions, we need to pre-group these rows, so that we only have one row per session.
+
+    We hide this detail using a lazy table, but to make querying the underlying ClickHouse table faster, we can inline
+    the min_timestamp where conditions from the select on the outer lazy table to the select on the inner real table.
+
+    This class is called on the select query of the lazy table, and will return the where clause that should be
+    applied to the inner table.
+
+    As a query can be unreasonably complex, we only handle simple cases, but this class is designed to fail safe. If it
+    can't reason about a particular expression, it will just return a constant True, i.e. fetch more rows than
+    necessary.
+
+    This means that we can incrementally add support for more complex queries, without breaking existing queries, by
+    handling more cases.
+
+    Some examples of failing safe:
+
+    `SELECT * FROM sessions WHERE min_timestamp > '2022-01-01' AND f(session_id)`
+    only the `min_timestamp > '2022-01-01'` part is relevant, so we can ignore the `f(session_id)` part, and it is safe
+    to replace it with a constant True, which collapses the AND to just the `min_timestamp > '2022-01-01'` part.
+
+    `SELECT * FROM sessions WHERE min_timestamp > '2022-01-01' OR f(session_id)`
+    only the `min_timestamp > '2022-01-01'` part is relevant, and turning the `f(session_id)` part into a constant True
+    would collapse the OR to True. In this case we return None, as no pre-filtering is possible.
+
+    All min_timestamp comparisons are given a buffer of SESSION_BUFFER_DAYS each side, to ensure that we collect all
+    the relevant rows for each session.
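+
+    As a concrete example of the buffer (with SESSION_BUFFER_DAYS = 3), the clause
+    `min_timestamp = '2022-01-10'` is rewritten for the inner table as
+    `(raw_sessions.min_timestamp - toIntervalDay(3)) <= '2022-01-10' AND
+    (raw_sessions.min_timestamp + toIntervalDay(3)) >= '2022-01-10'`,
+    so every daily row within 3 days of the requested date is read, then merged per session.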
+ """ + + context: HogQLContext + clear_types: bool = False + clear_locations: bool = False + + def get_inner_where(self, parsed_query: ast.SelectQuery) -> Optional[ast.Expr]: + if not parsed_query.where: + return None + + # visit the where clause + where = self.visit(parsed_query.where) + + if isinstance(where, ast.Constant): + return None + + return clone_expr(where, clear_types=True, clear_locations=True) + + def visit_compare_operation(self, node: ast.CompareOperation) -> ast.Expr: + is_left_constant = is_time_or_interval_constant(node.left) + is_right_constant = is_time_or_interval_constant(node.right) + is_left_timestamp_field = is_simple_timestamp_field_expression(node.left, self.context) + is_right_timestamp_field = is_simple_timestamp_field_expression(node.right, self.context) + + if is_left_constant and is_right_constant: + # just ignore this comparison + return ast.Constant(value=True) + + # handle the left side being a min_timestamp expression and the right being constant + if is_left_timestamp_field and is_right_constant: + if node.op == CompareOperationOp.Eq: + return ast.And( + exprs=[ + ast.CompareOperation( + op=ast.CompareOperationOp.LtEq, + left=ast.ArithmeticOperation( + op=ast.ArithmeticOperationOp.Sub, + left=rewrite_timestamp_field(node.left, self.context), + right=ast.Call(name="toIntervalDay", args=[ast.Constant(value=SESSION_BUFFER_DAYS)]), + ), + right=node.right, + ), + ast.CompareOperation( + op=ast.CompareOperationOp.GtEq, + left=ast.ArithmeticOperation( + op=ast.ArithmeticOperationOp.Add, + left=rewrite_timestamp_field(node.left, self.context), + right=ast.Call(name="toIntervalDay", args=[ast.Constant(value=SESSION_BUFFER_DAYS)]), + ), + right=node.right, + ), + ] + ) + elif node.op == CompareOperationOp.Gt or node.op == CompareOperationOp.GtEq: + return ast.CompareOperation( + op=ast.CompareOperationOp.GtEq, + left=ast.ArithmeticOperation( + op=ast.ArithmeticOperationOp.Add, + left=rewrite_timestamp_field(node.left, self.context), + right=ast.Call(name="toIntervalDay", args=[ast.Constant(value=SESSION_BUFFER_DAYS)]), + ), + right=node.right, + ) + elif node.op == CompareOperationOp.Lt or node.op == CompareOperationOp.LtEq: + return ast.CompareOperation( + op=ast.CompareOperationOp.LtEq, + left=ast.ArithmeticOperation( + op=ast.ArithmeticOperationOp.Sub, + left=rewrite_timestamp_field(node.left, self.context), + right=ast.Call(name="toIntervalDay", args=[ast.Constant(value=SESSION_BUFFER_DAYS)]), + ), + right=node.right, + ) + elif is_right_timestamp_field and is_left_constant: + # let's not duplicate the logic above, instead just flip and it and recurse + if node.op in [ + CompareOperationOp.Eq, + CompareOperationOp.Lt, + CompareOperationOp.LtEq, + CompareOperationOp.Gt, + CompareOperationOp.GtEq, + ]: + return self.visit( + ast.CompareOperation( + op=CompareOperationOp.Eq + if node.op == CompareOperationOp.Eq + else CompareOperationOp.Lt + if node.op == CompareOperationOp.Gt + else CompareOperationOp.LtEq + if node.op == CompareOperationOp.GtEq + else CompareOperationOp.Gt + if node.op == CompareOperationOp.Lt + else CompareOperationOp.GtEq, + left=node.right, + right=node.left, + ) + ) + + return ast.Constant(value=True) + + def visit_arithmetic_operation(self, node: ast.ArithmeticOperation) -> ast.Expr: + # don't even try to handle complex logic + return ast.Constant(value=True) + + def visit_not(self, node: ast.Not) -> ast.Expr: + return ast.Constant(value=True) + + def visit_call(self, node: ast.Call) -> ast.Expr: + if node.name == "and": + return 
self.visit_and(ast.And(exprs=node.args))
+        elif node.name == "or":
+            return self.visit_or(ast.Or(exprs=node.args))
+        return ast.Constant(value=True)
+
+    def visit_field(self, node: ast.Field) -> ast.Expr:
+        return ast.Constant(value=True)
+
+    def visit_constant(self, node: ast.Constant) -> ast.Expr:
+        return ast.Constant(value=True)
+
+    def visit_placeholder(self, node: ast.Placeholder) -> ast.Expr:
+        # this should never happen, as placeholders should be resolved before this runs
+        raise Exception("placeholders should be resolved before the where clause extractor runs")
+
+    def visit_and(self, node: ast.And) -> ast.Expr:
+        exprs = [self.visit(expr) for expr in node.exprs]
+
+        flattened = []
+        for expr in exprs:
+            if isinstance(expr, ast.And):
+                flattened.extend(expr.exprs)
+            else:
+                flattened.append(expr)
+
+        if any(isinstance(expr, ast.Constant) and expr.value is False for expr in flattened):
+            return ast.Constant(value=False)
+
+        filtered = [expr for expr in flattened if not isinstance(expr, ast.Constant) or expr.value is not True]
+        if len(filtered) == 0:
+            return ast.Constant(value=True)
+        elif len(filtered) == 1:
+            return filtered[0]
+        else:
+            return ast.And(exprs=filtered)
+
+    def visit_or(self, node: ast.Or) -> ast.Expr:
+        exprs = [self.visit(expr) for expr in node.exprs]
+
+        flattened = []
+        for expr in exprs:
+            if isinstance(expr, ast.Or):
+                flattened.extend(expr.exprs)
+            else:
+                flattened.append(expr)
+
+        if any(isinstance(expr, ast.Constant) and expr.value is True for expr in flattened):
+            return ast.Constant(value=True)
+
+        filtered = [expr for expr in flattened if not isinstance(expr, ast.Constant) or expr.value is not False]
+        if len(filtered) == 0:
+            return ast.Constant(value=False)
+        elif len(filtered) == 1:
+            return filtered[0]
+        else:
+            return ast.Or(exprs=filtered)
+
+    def visit_alias(self, node: ast.Alias) -> ast.Expr:
+        return self.visit(node.expr)
+
+
+def is_time_or_interval_constant(expr: ast.Expr) -> bool:
+    return IsTimeOrIntervalConstantVisitor().visit(expr)
+
+
+class IsTimeOrIntervalConstantVisitor(Visitor[bool]):
+    def visit_constant(self, node: ast.Constant) -> bool:
+        return True
+
+    def visit_compare_operation(self, node: ast.CompareOperation) -> bool:
+        return self.visit(node.left) and self.visit(node.right)
+
+    def visit_arithmetic_operation(self, node: ast.ArithmeticOperation) -> bool:
+        return self.visit(node.left) and self.visit(node.right)
+
+    def visit_call(self, node: ast.Call) -> bool:
+        # some functions just return a constant
+        if node.name in ["today", "now"]:
+            return True
+        # some functions return a constant if the first argument is a constant
+        if node.name in [
+            "parseDateTime64BestEffortOrNull",
+            "toDateTime",
+            "toTimeZone",
+            "assumeNotNull",
+            "toIntervalYear",
+            "toIntervalMonth",
+            "toIntervalWeek",
+            "toIntervalDay",
+            "toIntervalHour",
+            "toIntervalMinute",
+            "toIntervalSecond",
+            "toStartOfDay",
+            "toStartOfWeek",
+            "toStartOfMonth",
+            "toStartOfQuarter",
+            "toStartOfYear",
+        ]:
+            return self.visit(node.args[0])
+
+        if node.name in ["minus", "add"]:
+            return all(self.visit(arg) for arg in node.args)
+
+        # otherwise we don't know, so return False
+        return False
+
+    def visit_field(self, node: ast.Field) -> bool:
+        return False
+
+    def visit_and(self, node: ast.And) -> bool:
+        return False
+
+    def visit_or(self, node: ast.Or) -> bool:
+        return False
+
+    def visit_not(self, node: ast.Not) -> bool:
+        return False
+
+    def visit_placeholder(self, node: ast.Placeholder) -> bool:
+        raise Exception("placeholders should be resolved before the where clause extractor runs")
+
+    def visit_alias(self, node: ast.Alias) -> bool:
+        return self.visit(node.expr)
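+
+
+# A rough guide to what the visitor above accepts (derived from its allow-lists):
+# '2021-01-01', now(), today() - 2, toIntervalDay(3) and toStartOfDay(now()) all
+# count as time constants, while a bare field such as min_timestamp does not.
+# Anything not explicitly allow-listed returns False, keeping the extractor fail-safe.
+
+
+def 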
is_simple_timestamp_field_expression(expr: ast.Expr, context: HogQLContext) -> bool:
+    return IsSimpleTimestampFieldExpressionVisitor(context).visit(expr)
+
+
+@dataclass
+class IsSimpleTimestampFieldExpressionVisitor(Visitor[bool]):
+    context: HogQLContext
+
+    def visit_constant(self, node: ast.Constant) -> bool:
+        return False
+
+    def visit_field(self, node: ast.Field) -> bool:
+        if node.type and isinstance(node.type, ast.FieldType):
+            resolved_field = node.type.resolve_database_field(self.context)
+            if resolved_field and isinstance(resolved_field, DatabaseField):
+                return resolved_field.name in ["min_timestamp", "timestamp"]
+        # no type information, so just use the name of the field
+        return node.chain[-1] in ["min_timestamp", "timestamp"]
+
+    def visit_arithmetic_operation(self, node: ast.ArithmeticOperation) -> bool:
+        # only allow the min_timestamp field to be used on one side of the arithmetic operation
+        return (self.visit(node.left) and is_time_or_interval_constant(node.right)) or (
+            self.visit(node.right) and is_time_or_interval_constant(node.left)
+        )
+
+    def visit_call(self, node: ast.Call) -> bool:
+        # some functions count as a timestamp field expression if their first argument is
+        if node.name in [
+            "parseDateTime64BestEffortOrNull",
+            "toDateTime",
+            "toTimeZone",
+            "assumeNotNull",
+            "toStartOfDay",
+            "toStartOfWeek",
+            "toStartOfMonth",
+            "toStartOfQuarter",
+            "toStartOfYear",
+        ]:
+            return self.visit(node.args[0])
+
+        if node.name in ["minus", "add"]:
+            return self.visit_arithmetic_operation(
+                ast.ArithmeticOperation(
+                    op=ArithmeticOperationOp.Sub if node.name == "minus" else ArithmeticOperationOp.Add,
+                    left=node.args[0],
+                    right=node.args[1],
+                )
+            )
+
+        # otherwise we don't know, so return False
+        return False
+
+    def visit_compare_operation(self, node: ast.CompareOperation) -> bool:
+        return False
+
+    def visit_and(self, node: ast.And) -> bool:
+        return False
+
+    def visit_or(self, node: ast.Or) -> bool:
+        return False
+
+    def visit_not(self, node: ast.Not) -> bool:
+        return False
+
+    def visit_placeholder(self, node: ast.Placeholder) -> bool:
+        raise Exception("placeholders should be resolved before the where clause extractor runs")
+
+    def visit_alias(self, node: ast.Alias) -> bool:
+        from posthog.hogql.database.schema.events import EventsTable
+        from posthog.hogql.database.schema.sessions import SessionsTable
+
+        if node.type and isinstance(node.type, ast.FieldAliasType):
+            resolved_field = node.type.resolve_database_field(self.context)
+            table_type = node.type.resolve_table_type(self.context)
+            if not table_type:
+                return False
+            return (
+                isinstance(table_type, ast.TableType)
+                and isinstance(table_type.table, EventsTable)
+                and resolved_field.name == "timestamp"
+            ) or (
+                isinstance(table_type, ast.LazyTableType)
+                and isinstance(table_type.table, SessionsTable)
+                and resolved_field.name == "min_timestamp"
+            )
+
+        return self.visit(node.expr)
+
+
+def rewrite_timestamp_field(expr: ast.Expr, context: HogQLContext) -> ast.Expr:
+    return RewriteTimestampFieldVisitor(context).visit(expr)
+
+
+class RewriteTimestampFieldVisitor(CloningVisitor):
+    context: HogQLContext
+
+    def __init__(self, context: HogQLContext, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.context = context
+
+    def visit_field(self, node: ast.Field) -> ast.Field:
+        from posthog.hogql.database.schema.events import EventsTable
+        from posthog.hogql.database.schema.sessions import SessionsTable
+
+        if node.type and isinstance(node.type, ast.FieldType):
+            resolved_field = node.type.resolve_database_field(self.context)
+            table = 
node.type.resolve_table_type(self.context).table
+            if resolved_field and isinstance(resolved_field, DatabaseField):
+                if (isinstance(table, EventsTable) and resolved_field.name == "timestamp") or (
+                    isinstance(table, SessionsTable) and resolved_field.name == "min_timestamp"
+                ):
+                    return ast.Field(chain=["raw_sessions", "min_timestamp"])
+        # no type information, so just use the name of the field
+        if node.chain[-1] in ["min_timestamp", "timestamp"]:
+            return ast.Field(chain=["raw_sessions", "min_timestamp"])
+        return node
+
+    def visit_alias(self, node: ast.Alias) -> ast.Expr:
+        return self.visit(node.expr)
diff --git a/posthog/hogql/database/schema/util/test/test_session_where_clause_extractor.py b/posthog/hogql/database/schema/util/test/test_session_where_clause_extractor.py
new file mode 100644
index 0000000000000..bc5324e739ad9
--- /dev/null
+++ b/posthog/hogql/database/schema/util/test/test_session_where_clause_extractor.py
@@ -0,0 +1,284 @@
+from typing import Union, Optional, Dict
+
+from posthog.hogql import ast
+from posthog.hogql.context import HogQLContext
+from posthog.hogql.database.schema.util.session_where_clause_extractor import SessionMinTimestampWhereClauseExtractor
+from posthog.hogql.modifiers import create_default_modifiers_for_team
+from posthog.hogql.parser import parse_select, parse_expr
+from posthog.hogql.printer import prepare_ast_for_printing, print_prepared_ast
+from posthog.hogql.visitor import clone_expr
+from posthog.test.base import ClickhouseTestMixin, APIBaseTest
+
+
+def f(s: Union[str, ast.Expr, None], placeholders: Optional[Dict[str, ast.Expr]] = None) -> Union[ast.Expr, None]:
+    if s is None:
+        return None
+    if isinstance(s, str):
+        expr = parse_expr(s, placeholders=placeholders)
+    else:
+        expr = s
+    return clone_expr(expr, clear_types=True, clear_locations=True)
+
+
+def parse(
+    s: str,
+    placeholders: Optional[Dict[str, ast.Expr]] = None,
+) -> ast.SelectQuery:
+    parsed = parse_select(s, placeholders=placeholders)
+    assert isinstance(parsed, ast.SelectQuery)
+    return parsed
+
+
+class TestSessionTimestampInliner(ClickhouseTestMixin, APIBaseTest):
+    @property
+    def inliner(self):
+        team = self.team
+        modifiers = create_default_modifiers_for_team(team)
+        context = HogQLContext(
+            team_id=team.pk,
+            team=team,
+            enable_select_queries=True,
+            modifiers=modifiers,
+        )
+        return SessionMinTimestampWhereClauseExtractor(context)
+
+    def test_handles_select_with_no_where_clause(self):
+        inner_where = self.inliner.get_inner_where(parse("SELECT * FROM sessions"))
+        assert inner_where is None
+
+    def test_handles_select_with_eq(self):
+        actual = f(self.inliner.get_inner_where(parse("SELECT * FROM sessions WHERE min_timestamp = '2021-01-01'")))
+        expected = f(
+            "((raw_sessions.min_timestamp - toIntervalDay(3)) <= '2021-01-01') AND ((raw_sessions.min_timestamp + toIntervalDay(3)) >= '2021-01-01')"
+        )
+        assert expected == actual
+
+    def test_handles_select_with_eq_flipped(self):
+        actual = f(self.inliner.get_inner_where(parse("SELECT * FROM sessions WHERE '2021-01-01' = min_timestamp")))
+        expected = f(
+            "((raw_sessions.min_timestamp - toIntervalDay(3)) <= '2021-01-01') AND ((raw_sessions.min_timestamp + toIntervalDay(3)) >= '2021-01-01')"
+        )
+        assert expected == actual
+
+    def test_handles_select_with_simple_gt(self):
+        actual = f(self.inliner.get_inner_where(parse("SELECT * FROM sessions WHERE min_timestamp > '2021-01-01'")))
+        expected = f("((raw_sessions.min_timestamp + toIntervalDay(3)) >= '2021-01-01')")
+        assert expected == actual
+
+    def 
test_handles_select_with_simple_gte(self):
+        actual = f(self.inliner.get_inner_where(parse("SELECT * FROM sessions WHERE min_timestamp >= '2021-01-01'")))
+        expected = f("((raw_sessions.min_timestamp + toIntervalDay(3)) >= '2021-01-01')")
+        assert expected == actual
+
+    def test_handles_select_with_simple_lt(self):
+        actual = f(self.inliner.get_inner_where(parse("SELECT * FROM sessions WHERE min_timestamp < '2021-01-01'")))
+        expected = f("((raw_sessions.min_timestamp - toIntervalDay(3)) <= '2021-01-01')")
+        assert expected == actual
+
+    def test_handles_select_with_simple_lte(self):
+        actual = f(self.inliner.get_inner_where(parse("SELECT * FROM sessions WHERE min_timestamp <= '2021-01-01'")))
+        expected = f("((raw_sessions.min_timestamp - toIntervalDay(3)) <= '2021-01-01')")
+        assert expected == actual
+
+    def test_select_with_placeholder(self):
+        actual = f(
+            self.inliner.get_inner_where(
+                parse(
+                    "SELECT * FROM sessions WHERE min_timestamp > {timestamp}",
+                    placeholders={"timestamp": ast.Constant(value="2021-01-01")},
+                )
+            )
+        )
+        expected = f("((raw_sessions.min_timestamp + toIntervalDay(3)) >= '2021-01-01')")
+        assert expected == actual
+
+    def test_unrelated_equals(self):
+        actual = self.inliner.get_inner_where(
+            parse("SELECT * FROM sessions WHERE initial_utm_campaign = initial_utm_source")
+        )
+        assert actual is None
+
+    def test_timestamp_and(self):
+        actual = f(
+            self.inliner.get_inner_where(
+                parse("SELECT * FROM sessions WHERE and(min_timestamp >= '2021-01-01', min_timestamp <= '2021-01-03')")
+            )
+        )
+        expected = f(
+            "((raw_sessions.min_timestamp + toIntervalDay(3)) >= '2021-01-01') AND ((raw_sessions.min_timestamp - toIntervalDay(3)) <= '2021-01-03')"
+        )
+        assert expected == actual
+
+    def test_timestamp_or(self):
+        actual = f(
+            self.inliner.get_inner_where(
+                parse("SELECT * FROM sessions WHERE or(min_timestamp <= '2021-01-01', min_timestamp >= '2021-01-03')")
+            )
+        )
+        expected = f(
+            "((raw_sessions.min_timestamp - toIntervalDay(3)) <= '2021-01-01') OR ((raw_sessions.min_timestamp + toIntervalDay(3)) >= '2021-01-03')"
+        )
+        assert expected == actual
+
+    def test_unrelated_function(self):
+        actual = f(self.inliner.get_inner_where(parse("SELECT * FROM sessions WHERE like('a', 'b')")))
+        assert actual is None
+
+    def test_timestamp_unrelated_function(self):
+        actual = f(
+            self.inliner.get_inner_where(parse("SELECT * FROM sessions WHERE like(toString(min_timestamp), 'b')"))
+        )
+        assert actual is None
+
+    def test_ambiguous_or(self):
+        actual = f(
+            self.inliner.get_inner_where(
+                parse(
+                    "SELECT * FROM sessions WHERE or(min_timestamp > '2021-01-03', like(toString(min_timestamp), 'b'))"
+                )
+            )
+        )
+        assert actual is None
+
+    def test_ambiguous_and(self):
+        actual = f(
+            self.inliner.get_inner_where(
+                parse(
+                    "SELECT * FROM sessions WHERE and(min_timestamp > '2021-01-03', like(toString(min_timestamp), 'b'))"
+                )
+            )
+        )
+        assert actual == f("(raw_sessions.min_timestamp + toIntervalDay(3)) >= '2021-01-03'")
+
+    def test_join(self):
+        actual = f(
+            self.inliner.get_inner_where(
+                parse(
+                    "SELECT * FROM events JOIN sessions ON events.session_id = raw_sessions.session_id WHERE min_timestamp > '2021-01-03'"
+                )
+            )
+        )
+        expected = f("((raw_sessions.min_timestamp + toIntervalDay(3)) >= '2021-01-03')")
+        assert expected == actual
+
+    def 
test_join_using_events_timestamp_filter(self):
+        actual = f(
+            self.inliner.get_inner_where(
+                parse(
+                    "SELECT * FROM events JOIN sessions ON events.session_id = raw_sessions.session_id WHERE timestamp > '2021-01-03'"
+                )
+            )
+        )
+        expected = f("((raw_sessions.min_timestamp + toIntervalDay(3)) >= '2021-01-03')")
+        assert expected == actual
+
+    def test_minus(self):
+        actual = f(self.inliner.get_inner_where(parse("SELECT * FROM sessions WHERE min_timestamp >= today() - 2")))
+        expected = f("((raw_sessions.min_timestamp + toIntervalDay(3)) >= (today() - 2))")
+        assert expected == actual
+
+    def test_minus_function(self):
+        actual = f(
+            self.inliner.get_inner_where(parse("SELECT * FROM sessions WHERE min_timestamp >= minus(today(), 2)"))
+        )
+        expected = f("((raw_sessions.min_timestamp + toIntervalDay(3)) >= minus(today(), 2))")
+        assert expected == actual
+
+    def test_real_example(self):
+        actual = f(
+            self.inliner.get_inner_where(
+                parse(
+                    "SELECT * FROM events JOIN sessions ON events.session_id = raw_sessions.session_id WHERE event = '$pageview' AND toTimeZone(timestamp, 'US/Pacific') >= toDateTime('2024-03-12 00:00:00', 'US/Pacific') AND toTimeZone(timestamp, 'US/Pacific') <= toDateTime('2024-03-19 23:59:59', 'US/Pacific')"
+                )
+            )
+        )
+        expected = f(
+            "(toTimeZone(raw_sessions.min_timestamp, 'US/Pacific') + toIntervalDay(3)) >= toDateTime('2024-03-12 00:00:00', 'US/Pacific') AND (toTimeZone(raw_sessions.min_timestamp, 'US/Pacific') - toIntervalDay(3)) <= toDateTime('2024-03-19 23:59:59', 'US/Pacific')"
+        )
+        assert expected == actual
+
+    def test_collapse_and(self):
+        actual = f(
+            self.inliner.get_inner_where(
+                parse(
+                    "SELECT * FROM sessions WHERE event = '$pageview' AND (TRUE AND (TRUE AND TRUE AND (timestamp >= '2024-03-12' AND TRUE)))"
+                )
+            )
+        )
+        expected = f("(raw_sessions.min_timestamp + toIntervalDay(3)) >= '2024-03-12'")
+        assert expected == actual
+
+
+class TestSessionsQueriesHogQLToClickhouse(ClickhouseTestMixin, APIBaseTest):
+    def print_query(self, query: str) -> str:
+        team = self.team
+        modifiers = create_default_modifiers_for_team(team)
+        context = HogQLContext(
+            team_id=team.pk,
+            team=team,
+            enable_select_queries=True,
+            modifiers=modifiers,
+        )
+        prepared_ast = prepare_ast_for_printing(node=parse(query), context=context, dialect="clickhouse")
+        pretty = print_prepared_ast(prepared_ast, context=context, dialect="clickhouse", pretty=True)
+        return pretty
+
+    def test_select_with_timestamp(self):
+        actual = self.print_query("SELECT session_id FROM sessions WHERE min_timestamp > '2021-01-01'")
+        expected = f"""SELECT
+    sessions.session_id AS session_id
+FROM
+    (SELECT
+        sessions.session_id AS session_id,
+        min(sessions.min_timestamp) AS min_timestamp
+    FROM
+        sessions
+    WHERE
+        and(equals(sessions.team_id, {self.team.id}), ifNull(greaterOrEquals(plus(toTimeZone(sessions.min_timestamp, %(hogql_val_0)s), toIntervalDay(3)), %(hogql_val_1)s), 0))
+    GROUP BY
+        sessions.session_id,
+        sessions.session_id) AS sessions
+WHERE
+    ifNull(greater(toTimeZone(sessions.min_timestamp, %(hogql_val_2)s), %(hogql_val_3)s), 0)
+LIMIT 10000"""
+        assert expected == actual
+
+    def test_join_with_events(self):
+        actual = self.print_query(
+            """
+SELECT
+    sessions.session_id,
+    uniq(uuid)
+FROM events
+JOIN sessions
+ON events.$session_id = sessions.session_id
+WHERE events.timestamp > '2021-01-01'
+GROUP BY sessions.session_id
+"""
+        )
+        expected = f"""SELECT
+    sessions.session_id AS session_id,
+    uniq(events.uuid)
+FROM
+    events
+    JOIN (SELECT
+        sessions.session_id AS session_id
+    
FROM + sessions + WHERE + and(equals(sessions.team_id, {self.team.id}), ifNull(greaterOrEquals(plus(toTimeZone(sessions.min_timestamp, %(hogql_val_0)s), toIntervalDay(3)), %(hogql_val_1)s), 0)) + GROUP BY + sessions.session_id, + sessions.session_id) AS sessions ON equals(events.`$session_id`, sessions.session_id) +WHERE + and(equals(events.team_id, {self.team.id}), greater(toTimeZone(events.timestamp, %(hogql_val_2)s), %(hogql_val_3)s)) +GROUP BY + sessions.session_id +LIMIT 10000""" + assert expected == actual diff --git a/posthog/hogql/database/test/__snapshots__/test_database.ambr b/posthog/hogql/database/test/__snapshots__/test_database.ambr index db4dfc8f6df9f..63c2d16ce87aa 100644 --- a/posthog/hogql/database/test/__snapshots__/test_database.ambr +++ b/posthog/hogql/database/test/__snapshots__/test_database.ambr @@ -304,6 +304,31 @@ ] } ], + "person_distinct_id_overrides": [ + { + "key": "distinct_id", + "type": "string" + }, + { + "key": "person_id", + "type": "string" + }, + { + "key": "person", + "type": "lazy_table", + "table": "persons", + "fields": [ + "id", + "created_at", + "team_id", + "properties", + "is_identified", + "pdi", + "$virt_initial_referring_domain_type", + "$virt_initial_channel_type" + ] + } + ], "person_overrides": [ { "key": "old_person_id", @@ -790,6 +815,39 @@ "type": "integer" } ], + "raw_person_distinct_id_overrides": [ + { + "key": "distinct_id", + "type": "string" + }, + { + "key": "person_id", + "type": "string" + }, + { + "key": "person", + "type": "lazy_table", + "table": "persons", + "fields": [ + "id", + "created_at", + "team_id", + "properties", + "is_identified", + "pdi", + "$virt_initial_referring_domain_type", + "$virt_initial_channel_type" + ] + }, + { + "key": "is_deleted", + "type": "boolean" + }, + { + "key": "version", + "type": "integer" + } + ], "raw_person_overrides": [ { "key": "old_person_id", @@ -1155,6 +1213,31 @@ ] } ], + "person_distinct_id_overrides": [ + { + "key": "distinct_id", + "type": "string" + }, + { + "key": "person_id", + "type": "string" + }, + { + "key": "person", + "type": "lazy_table", + "table": "persons", + "fields": [ + "id", + "created_at", + "team_id", + "properties", + "is_identified", + "pdi", + "$virt_initial_referring_domain_type", + "$virt_initial_channel_type" + ] + } + ], "person_overrides": [ { "key": "old_person_id", @@ -1641,6 +1724,39 @@ "type": "integer" } ], + "raw_person_distinct_id_overrides": [ + { + "key": "distinct_id", + "type": "string" + }, + { + "key": "person_id", + "type": "string" + }, + { + "key": "person", + "type": "lazy_table", + "table": "persons", + "fields": [ + "id", + "created_at", + "team_id", + "properties", + "is_identified", + "pdi", + "$virt_initial_referring_domain_type", + "$virt_initial_channel_type" + ] + }, + { + "key": "is_deleted", + "type": "boolean" + }, + { + "key": "version", + "type": "integer" + } + ], "raw_person_overrides": [ { "key": "old_person_id", diff --git a/posthog/hogql/test/test_bytecode.py b/posthog/hogql/test/test_bytecode.py index cf0b8113b574d..f7d810700e74a 100644 --- a/posthog/hogql/test/test_bytecode.py +++ b/posthog/hogql/test/test_bytecode.py @@ -130,7 +130,7 @@ def test_bytecode_create(self): def test_bytecode_create_error(self): with self.assertRaises(NotImplementedException) as e: to_bytecode("(select 1)") - self.assertEqual(str(e.exception), "Visitor has no method visit_select_query") + self.assertEqual(str(e.exception), "BytecodeBuilder has no method visit_select_query") with self.assertRaises(NotImplementedException) as e: 
to_bytecode("1 in cohort 2") diff --git a/posthog/hogql/test/test_modifiers.py b/posthog/hogql/test/test_modifiers.py index eba1f5195ab3d..b2b0ef1e40630 100644 --- a/posthog/hogql/test/test_modifiers.py +++ b/posthog/hogql/test/test_modifiers.py @@ -74,6 +74,13 @@ def test_modifiers_persons_on_events_mode_mapping(self): "events.person_properties AS properties", "toTimeZone(events.person_created_at, %(hogql_val_1)s) AS created_at", ), + ( + PersonsOnEventsMode.v3_enabled, + "events.event AS event", + "if(not(empty(events__override.distinct_id)), events__override.person_id, events.person_id) AS id", + "events.person_properties AS properties", + "toTimeZone(events.person_created_at, %(hogql_val_0)s) AS created_at", + ), ] for mode, *expected in test_cases: diff --git a/posthog/hogql/test/test_visitor.py b/posthog/hogql/test/test_visitor.py index 8aa6689328fbf..a01193f788d5f 100644 --- a/posthog/hogql/test/test_visitor.py +++ b/posthog/hogql/test/test_visitor.py @@ -125,7 +125,7 @@ def visit_arithmetic_operation(self, node: ast.ArithmeticOperation): with self.assertRaises(HogQLException) as e: UnknownNotDefinedVisitor().visit(parse_expr("1 + 3 / 'asd2'")) - self.assertEqual(str(e.exception), "Visitor has no method visit_constant") + self.assertEqual(str(e.exception), "UnknownNotDefinedVisitor has no method visit_constant") def test_hogql_exception_start_end(self): class EternalVisitor(TraversingVisitor): diff --git a/posthog/hogql/transforms/lazy_tables.py b/posthog/hogql/transforms/lazy_tables.py index bdbb322d54397..df8ce6962259c 100644 --- a/posthog/hogql/transforms/lazy_tables.py +++ b/posthog/hogql/transforms/lazy_tables.py @@ -309,7 +309,7 @@ def create_override(table_name: str, field_chain: List[str | int]) -> None: # For all the collected tables, create the subqueries, and add them to the table. 
for table_name, table_to_add in tables_to_add.items(): - subquery = table_to_add.lazy_table.lazy_select(table_to_add.fields_accessed, self.context.modifiers) + subquery = table_to_add.lazy_table.lazy_select(table_to_add.fields_accessed, self.context, node=node) subquery = cast(ast.SelectQuery, clone_expr(subquery, clear_locations=True)) subquery = cast(ast.SelectQuery, resolve_types(subquery, self.context, self.dialect, [node.type])) old_table_type = select_type.tables[table_name] diff --git a/posthog/hogql/transforms/test/__snapshots__/test_in_cohort.ambr b/posthog/hogql/transforms/test/__snapshots__/test_in_cohort.ambr index 9ff7f8ee0ab49..e0f5ea847110d 100644 --- a/posthog/hogql/transforms/test/__snapshots__/test_in_cohort.ambr +++ b/posthog/hogql/transforms/test/__snapshots__/test_in_cohort.ambr @@ -31,7 +31,7 @@ FROM events LEFT JOIN ( SELECT person_static_cohort.person_id AS cohort_person_id, 1 AS matched, person_static_cohort.cohort_id AS cohort_id FROM person_static_cohort - WHERE and(equals(person_static_cohort.team_id, 420), in(person_static_cohort.cohort_id, [12]))) AS __in_cohort ON equals(__in_cohort.cohort_person_id, events.person_id) + WHERE and(equals(person_static_cohort.team_id, 420), in(person_static_cohort.cohort_id, [11]))) AS __in_cohort ON equals(__in_cohort.cohort_person_id, events.person_id) WHERE and(equals(events.team_id, 420), 1, ifNull(equals(__in_cohort.matched, 1), 0)) LIMIT 100 SETTINGS readonly=2, max_execution_time=60, allow_experimental_object_type=1 @@ -42,7 +42,7 @@ FROM events LEFT JOIN ( SELECT person_id AS cohort_person_id, 1 AS matched, cohort_id FROM static_cohort_people - WHERE in(cohort_id, [12])) AS __in_cohort ON equals(__in_cohort.cohort_person_id, person_id) + WHERE in(cohort_id, [11])) AS __in_cohort ON equals(__in_cohort.cohort_person_id, person_id) WHERE and(1, equals(__in_cohort.matched, 1)) LIMIT 100 ''' @@ -55,7 +55,7 @@ FROM events LEFT JOIN ( SELECT person_static_cohort.person_id AS cohort_person_id, 1 AS matched, person_static_cohort.cohort_id AS cohort_id FROM person_static_cohort - WHERE and(equals(person_static_cohort.team_id, 420), in(person_static_cohort.cohort_id, [13]))) AS __in_cohort ON equals(__in_cohort.cohort_person_id, events.person_id) + WHERE and(equals(person_static_cohort.team_id, 420), in(person_static_cohort.cohort_id, [12]))) AS __in_cohort ON equals(__in_cohort.cohort_person_id, events.person_id) WHERE and(equals(events.team_id, 420), 1, ifNull(equals(__in_cohort.matched, 1), 0)) LIMIT 100 SETTINGS readonly=2, max_execution_time=60, allow_experimental_object_type=1 @@ -66,7 +66,7 @@ FROM events LEFT JOIN ( SELECT person_id AS cohort_person_id, 1 AS matched, cohort_id FROM static_cohort_people - WHERE in(cohort_id, [13])) AS __in_cohort ON equals(__in_cohort.cohort_person_id, person_id) + WHERE in(cohort_id, [12])) AS __in_cohort ON equals(__in_cohort.cohort_person_id, person_id) WHERE and(1, equals(__in_cohort.matched, 1)) LIMIT 100 ''' diff --git a/posthog/hogql/visitor.py b/posthog/hogql/visitor.py index c11856169297f..2bf968abf2ab0 100644 --- a/posthog/hogql/visitor.py +++ b/posthog/hogql/visitor.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, TypeVar, Generic, Any from posthog.hogql import ast from posthog.hogql.base import AST, Expr @@ -14,8 +14,11 @@ def clear_locations(expr: Expr) -> Expr: return CloningVisitor(clear_locations=True).visit(expr) -class Visitor(object): - def visit(self, node: AST): +T = TypeVar("T") + + +class Visitor(Generic[T]): + def visit(self, node: AST) 
-> T: if node is None: return node @@ -28,7 +31,7 @@ def visit(self, node: AST): raise e -class TraversingVisitor(Visitor): +class TraversingVisitor(Visitor[None]): """Visitor that traverses the AST tree without returning anything""" def visit_expr(self, node: Expr): @@ -258,7 +261,7 @@ def visit_hogqlx_attribute(self, node: ast.HogQLXAttribute): self.visit(node.value) -class CloningVisitor(Visitor): +class CloningVisitor(Visitor[Any]): """Visitor that traverses and clones the AST tree. Clears types.""" def __init__( diff --git a/posthog/hogql_queries/insights/trends/breakdown.py b/posthog/hogql_queries/insights/trends/breakdown.py index 45a3a8421e8d8..bde2cd807b6a7 100644 --- a/posthog/hogql_queries/insights/trends/breakdown.py +++ b/posthog/hogql_queries/insights/trends/breakdown.py @@ -3,9 +3,7 @@ from posthog.hogql.parser import parse_expr from posthog.hogql.timings import HogQLTimings from posthog.hogql_queries.insights.trends.breakdown_values import ( - BREAKDOWN_NULL_NUMERIC_LABEL, BREAKDOWN_NULL_STRING_LABEL, - BREAKDOWN_OTHER_NUMERIC_LABEL, BREAKDOWN_OTHER_STRING_LABEL, BreakdownValues, ) @@ -19,6 +17,10 @@ from posthog.schema import ActionsNode, EventsNode, DataWarehouseNode, HogQLQueryModifiers, InCohortVia, TrendsQuery +def hogql_to_string(expr: ast.Expr) -> ast.Call: + return ast.Call(name="toString", args=[expr]) + + class Breakdown: query: TrendsQuery team: Team @@ -27,7 +29,7 @@ class Breakdown: timings: HogQLTimings modifiers: HogQLQueryModifiers events_filter: ast.Expr - breakdown_values_override: Optional[List[str | int | float]] + breakdown_values_override: Optional[List[str]] def __init__( self, @@ -38,7 +40,7 @@ def __init__( timings: HogQLTimings, modifiers: HogQLQueryModifiers, events_filter: ast.Expr, - breakdown_values_override: Optional[List[str | int | float]] = None, + breakdown_values_override: Optional[List[str]] = None, ): self.team = team self.query = query @@ -70,19 +72,15 @@ def placeholders(self) -> Dict[str, ast.Expr]: return {"cross_join_breakdown_values": ast.Alias(alias="breakdown_value", expr=values)} - def column_expr(self) -> ast.Expr: + def column_expr(self) -> ast.Alias: if self.is_histogram_breakdown: return ast.Alias(alias="breakdown_value", expr=self._get_breakdown_histogram_multi_if()) - elif self.query.breakdownFilter.breakdown_type == "hogql": - return ast.Alias( - alias="breakdown_value", - expr=parse_expr(self.query.breakdownFilter.breakdown), - ) - elif self.query.breakdownFilter.breakdown_type == "cohort": + + if self.query.breakdownFilter.breakdown_type == "cohort": if self.modifiers.inCohortVia == InCohortVia.leftjoin_conjoined: return ast.Alias( alias="breakdown_value", - expr=ast.Field(chain=["__in_cohort", "cohort_id"]), + expr=hogql_to_string(ast.Field(chain=["__in_cohort", "cohort_id"])), ) cohort_breakdown = ( @@ -90,19 +88,9 @@ def column_expr(self) -> ast.Expr: ) return ast.Alias( alias="breakdown_value", - expr=ast.Constant(value=cohort_breakdown), - ) - - if self.query.breakdownFilter.breakdown_type == "hogql": - return ast.Alias( - alias="breakdown_value", - expr=parse_expr(self.query.breakdownFilter.breakdown), + expr=hogql_to_string(ast.Constant(value=cohort_breakdown)), ) - # If there's no breakdown values - if len(self._breakdown_values) == 1 and self._breakdown_values[0] is None: - return ast.Alias(alias="breakdown_value", expr=ast.Field(chain=self._properties_chain)) - return ast.Alias(alias="breakdown_value", expr=self._get_breakdown_transform_func) def events_where_filter(self) -> ast.Expr | None: @@ -148,15 
+136,14 @@ def events_where_filter(self) -> ast.Expr | None: else: left = ast.Field(chain=self._properties_chain) + if not self.is_histogram_breakdown: + left = hogql_to_string(left) + compare_ops = [] for _value in self._breakdown_values: - value: Optional[str | int | float] = _value + value: Optional[str] = str(_value) # non-cohorts are always strings # If the value is one of the "other" values, then use the `transform()` func - if ( - value == BREAKDOWN_OTHER_STRING_LABEL - or value == BREAKDOWN_OTHER_NUMERIC_LABEL - or value == float(BREAKDOWN_OTHER_NUMERIC_LABEL) - ): + if value == BREAKDOWN_OTHER_STRING_LABEL: transform_func = self._get_breakdown_transform_func compare_ops.append( ast.CompareOperation( @@ -164,11 +151,7 @@ def events_where_filter(self) -> ast.Expr | None: ) ) else: - if ( - value == BREAKDOWN_NULL_STRING_LABEL - or value == BREAKDOWN_NULL_NUMERIC_LABEL - or value == float(BREAKDOWN_NULL_NUMERIC_LABEL) - ): + if value == BREAKDOWN_NULL_STRING_LABEL: value = None compare_ops.append( @@ -184,30 +167,25 @@ def events_where_filter(self) -> ast.Expr | None: @cached_property def _get_breakdown_transform_func(self) -> ast.Call: - values = self._breakdown_values - all_values_are_ints_or_none = all(isinstance(value, int) or value is None for value in values) - all_values_are_floats_or_none = all(isinstance(value, float) or value is None for value in values) - - if all_values_are_ints_or_none: - breakdown_other_value = BREAKDOWN_OTHER_NUMERIC_LABEL - breakdown_null_value = BREAKDOWN_NULL_NUMERIC_LABEL - elif all_values_are_floats_or_none: - breakdown_other_value = float(BREAKDOWN_OTHER_NUMERIC_LABEL) - breakdown_null_value = float(BREAKDOWN_NULL_NUMERIC_LABEL) - else: - breakdown_other_value = BREAKDOWN_OTHER_STRING_LABEL - breakdown_null_value = BREAKDOWN_NULL_STRING_LABEL + if self.query.breakdownFilter.breakdown_type == "hogql": + return self._get_breakdown_values_transform(parse_expr(self.query.breakdownFilter.breakdown)) + return self._get_breakdown_values_transform(ast.Field(chain=self._properties_chain)) + def _get_breakdown_values_transform(self, node: ast.Expr) -> ast.Call: + breakdown_values = self._breakdown_values_ast return ast.Call( name="transform", args=[ ast.Call( name="ifNull", - args=[ast.Field(chain=self._properties_chain), ast.Constant(value=breakdown_null_value)], + args=[ + hogql_to_string(node), + ast.Constant(value=BREAKDOWN_NULL_STRING_LABEL), + ], ), - self._breakdown_values_ast, - self._breakdown_values_ast, - ast.Constant(value=breakdown_other_value), + breakdown_values, + breakdown_values, + ast.Constant(value=BREAKDOWN_OTHER_STRING_LABEL), ], ) @@ -220,15 +198,21 @@ def _breakdown_buckets_ast(self) -> ast.Array: return ast.Array(exprs=list(map(lambda v: ast.Constant(value=v), values))) - @cached_property + @property def _breakdown_values_ast(self) -> ast.Array: - return ast.Array(exprs=[ast.Constant(value=v) for v in self._breakdown_values]) + exprs: list[ast.Expr] = [] + for value in self._breakdown_values: + if isinstance(value, str): + exprs.append(ast.Constant(value=value)) + else: + exprs.append(hogql_to_string(ast.Constant(value=value))) + return ast.Array(exprs=exprs) @cached_property - def _all_breakdown_values(self) -> List[str | int | float | None]: + def _all_breakdown_values(self) -> List[str | int | None]: # Used in the actors query if self.breakdown_values_override is not None: - return cast(List[str | int | float | None], self.breakdown_values_override) + return cast(List[str | int | None], self.breakdown_values_override) if 
self.query.breakdownFilter is None: return [] @@ -243,25 +227,12 @@ def _all_breakdown_values(self) -> List[str | int | float | None]: query_date_range=self.query_date_range, modifiers=self.modifiers, ) - return cast(List[str | int | float | None], breakdown.get_breakdown_values()) + return cast(List[str | int | None], breakdown.get_breakdown_values()) @cached_property - def _breakdown_values(self) -> List[str | int | float]: - values = self._all_breakdown_values - if len(values) == 0 or all(value is None for value in values): - return [] - - if None in values: - all_values_are_ints_or_none = all(isinstance(value, int) or value is None for value in values) - all_values_are_floats_or_none = all(isinstance(value, float) or value is None for value in values) - - if all_values_are_ints_or_none: - values = [v if v is not None else BREAKDOWN_NULL_NUMERIC_LABEL for v in values] - elif all_values_are_floats_or_none: - values = [v if v is not None else float(BREAKDOWN_NULL_NUMERIC_LABEL) for v in values] - else: - values = [v if v is not None else BREAKDOWN_NULL_STRING_LABEL for v in values] - return cast(List[str | int | float], values) + def _breakdown_values(self) -> List[str | int]: + values = [BREAKDOWN_NULL_STRING_LABEL if v is None else v for v in self._all_breakdown_values] + return cast(List[str | int], values) @cached_property def has_breakdown_values(self) -> bool: diff --git a/posthog/hogql_queries/insights/trends/breakdown_values.py b/posthog/hogql_queries/insights/trends/breakdown_values.py index 7b1522d5f25c5..d9ab11891f210 100644 --- a/posthog/hogql_queries/insights/trends/breakdown_values.py +++ b/posthog/hogql_queries/insights/trends/breakdown_values.py @@ -97,6 +97,9 @@ def get_breakdown_values(self) -> List[str | int]: ), ) + if not self.histogram_bin_count: + select_field.expr = ast.Call(name="toString", args=[select_field.expr]) + if self.chart_display_type == ChartDisplayType.WorldMap: breakdown_limit = BREAKDOWN_VALUES_LIMIT_FOR_COUNTRIES else: @@ -211,23 +214,9 @@ def get_breakdown_values(self) -> List[str | int]: # Add "other" value if "other" is not hidden and we're not bucketing numeric values if self.hide_other_aggregation is not True and self.histogram_bin_count is None: - all_values_are_ints_or_none = all(isinstance(value, int) or value is None for value in values) - all_values_are_floats_or_none = all(isinstance(value, float) or value is None for value in values) - all_values_are_string_or_none = all(isinstance(value, str) or value is None for value in values) - - if all_values_are_string_or_none: - values = [BREAKDOWN_NULL_STRING_LABEL if value in (None, "") else value for value in values] - if needs_other: - values.insert(0, BREAKDOWN_OTHER_STRING_LABEL) - elif all_values_are_ints_or_none or all_values_are_floats_or_none: - if all_values_are_ints_or_none: - values = [BREAKDOWN_NULL_NUMERIC_LABEL if value is None else value for value in values] - if needs_other: - values.insert(0, BREAKDOWN_OTHER_NUMERIC_LABEL) - else: - values = [float(BREAKDOWN_NULL_NUMERIC_LABEL) if value is None else value for value in values] - if needs_other: - values.insert(0, float(BREAKDOWN_OTHER_NUMERIC_LABEL)) + values = [BREAKDOWN_NULL_STRING_LABEL if value in (None, "") else value for value in values] + if needs_other: + values = [BREAKDOWN_OTHER_STRING_LABEL] + values if len(values) == 0: values.insert(0, None) diff --git a/posthog/hogql_queries/insights/trends/test/__snapshots__/test_trends.ambr b/posthog/hogql_queries/insights/trends/test/__snapshots__/test_trends.ambr index 
f6eb3748afb2b..e2ec22fb9fb1c 100644 --- a/posthog/hogql_queries/insights/trends/test/__snapshots__/test_trends.ambr +++ b/posthog/hogql_queries/insights/trends/test/__snapshots__/test_trends.ambr @@ -187,7 +187,7 @@ # --- # name: TestTrends.test_breakdown_by_group_props_person_on_events ''' - SELECT e__group_0.properties___industry AS value, + SELECT toString(e__group_0.properties___industry) AS value, count(e.uuid) AS count FROM events AS e LEFT JOIN @@ -210,7 +210,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -231,7 +231,7 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(e.uuid) AS total, toStartOfDay(toTimeZone(e.timestamp, 'UTC')) AS day_start, - transform(ifNull(e__group_0.properties___industry, '$$_posthog_breakdown_null_$$'), ['finance', 'technology'], ['finance', 'technology'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(e__group_0.properties___industry), '$$_posthog_breakdown_null_$$'), ['finance', 'technology'], ['finance', 'technology'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 LEFT JOIN (SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(groups.group_properties, 'industry'), ''), 'null'), '^"|"$', ''), groups._timestamp) AS properties___industry, @@ -241,7 +241,7 @@ WHERE and(equals(groups.team_id, 2), ifNull(equals(index, 0), 0)) GROUP BY groups.group_type_index, groups.group_key) AS e__group_0 ON equals(e.`$group_0`, e__group_0.key) - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-01 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-12 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(e__group_0.properties___industry, 'finance'), 0), ifNull(equals(e__group_0.properties___industry, 'technology'), 0))) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-01 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-12 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(toString(e__group_0.properties___industry), 'finance'), 0), ifNull(equals(toString(e__group_0.properties___industry), 'technology'), 0))) GROUP BY day_start, breakdown_value) GROUP BY day_start, @@ -287,7 +287,7 @@ # --- # name: TestTrends.test_breakdown_by_group_props_with_person_filter_person_on_events ''' - SELECT e__group_0.properties___industry AS value, + SELECT toString(e__group_0.properties___industry) AS value, count(e.uuid) AS count FROM events AS e LEFT JOIN @@ -310,7 +310,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -331,7 +331,7 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(e.uuid) AS total, toStartOfDay(toTimeZone(e.timestamp, 'UTC')) AS day_start, - transform(ifNull(e__group_0.properties___industry, '$$_posthog_breakdown_null_$$'), 
['finance'], ['finance'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(e__group_0.properties___industry), '$$_posthog_breakdown_null_$$'), ['finance'], ['finance'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 LEFT JOIN (SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(groups.group_properties, 'industry'), ''), 'null'), '^"|"$', ''), groups._timestamp) AS properties___industry, @@ -341,7 +341,7 @@ WHERE and(equals(groups.team_id, 2), ifNull(equals(index, 0), 0)) GROUP BY groups.group_type_index, groups.group_key) AS e__group_0 ON equals(e.`$group_0`, e__group_0.key) - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-01 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-12 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.person_properties, 'key'), ''), 'null'), '^"|"$', ''), 'value'), 0), ifNull(equals(e__group_0.properties___industry, 'finance'), 0)) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-01 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-12 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.person_properties, 'key'), ''), 'null'), '^"|"$', ''), 'value'), 0), ifNull(equals(toString(e__group_0.properties___industry), 'finance'), 0)) GROUP BY day_start, breakdown_value) GROUP BY day_start, @@ -356,7 +356,7 @@ # --- # name: TestTrends.test_breakdown_filtering_with_properties_in_new_format ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$current_url'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$current_url'), ''), 'null'), '^"|"$', '')) AS value, count(e.uuid) AS count FROM events AS e WHERE and(equals(e.team_id, 2), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-22 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'UTC')))), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-22 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$browser'), ''), 'null'), '^"|"$', ''), 'Firefox'), 0), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', ''), 'Windows'), 0)), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', ''), 'Mac'), 0))) @@ -371,7 +371,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -392,9 +392,9 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT 
count(e.uuid) AS total, toStartOfDay(toTimeZone(e.timestamp, 'UTC')) AS day_start, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$current_url'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['second url'], ['second url'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$current_url'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['second url'], ['second url'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-22 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$browser'), ''), 'null'), '^"|"$', ''), 'Firefox'), 0), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', ''), 'Windows'), 0)), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', ''), 'Mac'), 0), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$current_url'), ''), 'null'), '^"|"$', ''), 'second url'), 0)) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-22 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$browser'), ''), 'null'), '^"|"$', ''), 'Firefox'), 0), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', ''), 'Windows'), 0)), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', ''), 'Mac'), 0), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$current_url'), ''), 'null'), '^"|"$', '')), 'second url'), 0)) GROUP BY day_start, breakdown_value) GROUP BY day_start, @@ -409,7 +409,7 @@ # --- # name: TestTrends.test_breakdown_filtering_with_properties_in_new_format.2 ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$current_url'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$current_url'), ''), 'null'), '^"|"$', '')) AS value, count(e.uuid) AS count FROM events AS e WHERE and(equals(e.team_id, 2), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-22 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'UTC')))), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-22 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), and(ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$browser'), ''), 'null'), '^"|"$', ''), 'Firefox'), 0), 
ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', ''), 'Windows'), 0)), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', ''), 'Mac'), 0))) @@ -423,24 +423,38 @@ # name: TestTrends.test_breakdown_filtering_with_properties_in_new_format.3 ''' SELECT groupArray(day_start) AS date, - groupArray(count) AS total + groupArray(count) AS total, + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, - day_start AS day_start + day_start AS day_start, + breakdown_value AS breakdown_value FROM (SELECT 0 AS total, - minus(toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'UTC'))), toIntervalDay(numbers.number)) AS day_start - FROM numbers(coalesce(dateDiff('day', assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-22 00:00:00', 6, 'UTC')), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'UTC'))), 0)) AS numbers - UNION ALL SELECT 0 AS total, - toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-22 00:00:00', 6, 'UTC'))) AS day_start + ticks.day_start AS day_start, + sec.breakdown_value AS breakdown_value + FROM + (SELECT minus(toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'UTC'))), toIntervalDay(numbers.number)) AS day_start + FROM numbers(coalesce(dateDiff('day', assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-22 00:00:00', 6, 'UTC')), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'UTC'))), 0)) AS numbers + UNION ALL SELECT toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-22 00:00:00', 6, 'UTC'))) AS day_start) AS ticks + CROSS JOIN + (SELECT breakdown_value + FROM + (SELECT ['$$_posthog_breakdown_null_$$'] AS breakdown_value) ARRAY + JOIN breakdown_value AS breakdown_value) AS sec + ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(e.uuid) AS total, - toStartOfDay(toTimeZone(e.timestamp, 'UTC')) AS day_start + toStartOfDay(toTimeZone(e.timestamp, 'UTC')) AS day_start, + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$current_url'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['$$_posthog_breakdown_null_$$'], ['$$_posthog_breakdown_null_$$'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-22 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), and(ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$browser'), ''), 'null'), '^"|"$', ''), 'Firefox'), 0), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', ''), 'Windows'), 0)), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', ''), 'Mac'), 0)) - GROUP BY day_start) - GROUP BY day_start - ORDER BY day_start ASC) - ORDER BY sum(count) DESC + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-22 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), 
assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), and(ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$browser'), ''), 'null'), '^"|"$', ''), 'Firefox'), 0), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', ''), 'Windows'), 0)), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', ''), 'Mac'), 0), isNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$current_url'), ''), 'null'), '^"|"$', '')))) + GROUP BY day_start, + breakdown_value) + GROUP BY day_start, + breakdown_value + ORDER BY day_start ASC, breakdown_value ASC) + GROUP BY breakdown_value + ORDER BY sum(count) DESC, breakdown_value ASC LIMIT 10000 SETTINGS readonly=2, max_execution_time=60, allow_experimental_object_type=1 @@ -448,7 +462,7 @@ # --- # name: TestTrends.test_breakdown_weekly_active_users_aggregated ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', '')) AS value, count(DISTINCT e__pdi.person_id) AS count FROM events AS e INNER JOIN @@ -480,7 +494,7 @@ CROSS JOIN (SELECT toTimeZone(e.timestamp, 'UTC') AS timestamp, e__pdi.person_id AS actor_id, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['val', 'bor'], ['val', 'bor'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['val', 'bor'], ['val', 'bor'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 INNER JOIN (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, @@ -489,7 +503,7 @@ WHERE equals(person_distinct_id2.team_id, 2) GROUP BY person_distinct_id2.distinct_id HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS e__pdi ON equals(e.distinct_id, e__pdi.distinct_id) - WHERE and(equals(e.team_id, 2), and(equals(e.event, '$pageview'), or(ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', ''), 'val'), 0), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', ''), 'bor'), 0))), ifNull(greaterOrEquals(timestamp, minus(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-11 00:00:00', 6, 'UTC')), toIntervalDay(7))), 0), ifNull(lessOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-11 23:59:59', 6, 'UTC'))), 0)) + WHERE and(equals(e.team_id, 2), and(equals(e.event, '$pageview'), or(ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', '')), 'val'), 0), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', '')), 'bor'), 0))), ifNull(greaterOrEquals(timestamp, minus(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-11 00:00:00', 6, 'UTC')), toIntervalDay(7))), 0), ifNull(lessOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-11 23:59:59', 6, 'UTC'))), 0)) GROUP BY timestamp, actor_id, breakdown_value) AS e WHERE 
and(ifNull(lessOrEquals(e.timestamp, plus(d.timestamp, toIntervalDay(1))), 0), ifNull(greater(e.timestamp, minus(d.timestamp, toIntervalDay(6))), 0)) @@ -506,7 +520,7 @@ # --- # name: TestTrends.test_breakdown_weekly_active_users_aggregated_materialized ''' - SELECT nullIf(nullIf(e.mat_key, ''), 'null') AS value, + SELECT toString(nullIf(nullIf(e.mat_key, ''), 'null')) AS value, count(DISTINCT e__pdi.person_id) AS count FROM events AS e INNER JOIN @@ -538,7 +552,7 @@ CROSS JOIN (SELECT toTimeZone(e.timestamp, 'UTC') AS timestamp, e__pdi.person_id AS actor_id, - transform(ifNull(nullIf(nullIf(e.mat_key, ''), 'null'), '$$_posthog_breakdown_null_$$'), ['val', 'bor'], ['val', 'bor'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(nullIf(nullIf(e.mat_key, ''), 'null')), '$$_posthog_breakdown_null_$$'), ['val', 'bor'], ['val', 'bor'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 INNER JOIN (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, @@ -547,7 +561,7 @@ WHERE equals(person_distinct_id2.team_id, 2) GROUP BY person_distinct_id2.distinct_id HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS e__pdi ON equals(e.distinct_id, e__pdi.distinct_id) - WHERE and(equals(e.team_id, 2), and(equals(e.event, '$pageview'), or(ifNull(equals(nullIf(nullIf(e.mat_key, ''), 'null'), 'val'), 0), ifNull(equals(nullIf(nullIf(e.mat_key, ''), 'null'), 'bor'), 0))), ifNull(greaterOrEquals(timestamp, minus(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-11 00:00:00', 6, 'UTC')), toIntervalDay(7))), 0), ifNull(lessOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-11 23:59:59', 6, 'UTC'))), 0)) + WHERE and(equals(e.team_id, 2), and(equals(e.event, '$pageview'), or(ifNull(equals(toString(nullIf(nullIf(e.mat_key, ''), 'null')), 'val'), 0), ifNull(equals(toString(nullIf(nullIf(e.mat_key, ''), 'null')), 'bor'), 0))), ifNull(greaterOrEquals(timestamp, minus(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-11 00:00:00', 6, 'UTC')), toIntervalDay(7))), 0), ifNull(lessOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-11 23:59:59', 6, 'UTC'))), 0)) GROUP BY timestamp, actor_id, breakdown_value) AS e WHERE and(ifNull(lessOrEquals(e.timestamp, plus(d.timestamp, toIntervalDay(1))), 0), ifNull(greater(e.timestamp, minus(d.timestamp, toIntervalDay(6))), 0)) @@ -584,7 +598,7 @@ # --- # name: TestTrends.test_breakdown_weekly_active_users_daily_based_on_action.2 ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', '')) AS value, count(DISTINCT e__pdi.person_id) AS count FROM events AS e INNER JOIN @@ -622,7 +636,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -654,7 +668,7 @@ CROSS JOIN (SELECT toTimeZone(e.timestamp, 'UTC') AS timestamp, e__pdi.person_id AS actor_id, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['val'], ['val'], '$$_posthog_breakdown_other_$$') AS breakdown_value + 
transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['val'], ['val'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 INNER JOIN (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, @@ -679,7 +693,7 @@ FROM cohortpeople WHERE and(equals(cohortpeople.team_id, 2), equals(cohortpeople.cohort_id, 2)) GROUP BY cohortpeople.person_id, cohortpeople.cohort_id, cohortpeople.version - HAVING ifNull(greater(sum(cohortpeople.sign), 0), 0))), 0))), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', ''), 'val'), 0)), ifNull(greaterOrEquals(timestamp, minus(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-01 00:00:00', 6, 'UTC')), toIntervalDay(7))), 0), ifNull(lessOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-12 23:59:59', 6, 'UTC'))), 0)) + HAVING ifNull(greater(sum(cohortpeople.sign), 0), 0))), 0))), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', '')), 'val'), 0)), ifNull(greaterOrEquals(timestamp, minus(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-01 00:00:00', 6, 'UTC')), toIntervalDay(7))), 0), ifNull(lessOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-12 23:59:59', 6, 'UTC'))), 0)) GROUP BY timestamp, actor_id, breakdown_value) AS e WHERE and(ifNull(lessOrEquals(e.timestamp, plus(d.timestamp, toIntervalDay(1))), 0), ifNull(greater(e.timestamp, minus(d.timestamp, toIntervalDay(6))), 0)) @@ -699,7 +713,7 @@ # --- # name: TestTrends.test_breakdown_with_filter_groups_person_on_events ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', '')) AS value, count(e.uuid) AS count FROM events AS e LEFT JOIN @@ -722,7 +736,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -743,7 +757,7 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(e.uuid) AS total, toStartOfDay(toTimeZone(e.timestamp, 'UTC')) AS day_start, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['uh', 'oh'], ['uh', 'oh'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['uh', 'oh'], ['uh', 'oh'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 LEFT JOIN (SELECT argMax(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(groups.group_properties, 'industry'), ''), 'null'), '^"|"$', ''), groups._timestamp) AS properties___industry, @@ -753,7 +767,7 @@ WHERE and(equals(groups.team_id, 2), ifNull(equals(index, 0), 0)) GROUP BY groups.group_type_index, groups.group_key) AS e__group_0 ON equals(e.`$group_0`, e__group_0.key) - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-01 00:00:00', 6, 
'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-12 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), ifNull(equals(e__group_0.properties___industry, 'finance'), 0), or(ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', ''), 'uh'), 0), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', ''), 'oh'), 0))) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-01 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-12 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), ifNull(equals(e__group_0.properties___industry, 'finance'), 0), or(ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', '')), 'uh'), 0), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', '')), 'oh'), 0))) GROUP BY day_start, breakdown_value) GROUP BY day_start, @@ -782,7 +796,7 @@ # --- # name: TestTrends.test_breakdown_with_filter_groups_person_on_events_v2.1 ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', '')) AS value, count(e.uuid) AS count FROM events AS e LEFT JOIN @@ -805,7 +819,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -826,7 +840,7 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(DISTINCT ifNull(nullIf(e__override.override_person_id, '00000000-0000-0000-0000-000000000000'), e.person_id)) AS total, toStartOfDay(toTimeZone(e.timestamp, 'UTC')) AS day_start, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['uh', 'oh'], ['uh', 'oh'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['uh', 'oh'], ['uh', 'oh'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 LEFT OUTER JOIN (SELECT argMax(person_overrides.override_person_id, person_overrides.version) AS override_person_id, @@ -842,7 +856,7 @@ WHERE and(equals(groups.team_id, 2), ifNull(equals(index, 0), 0)) GROUP BY groups.group_type_index, groups.group_key) AS e__group_0 ON equals(e.`$group_0`, e__group_0.key) - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-01 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-12 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), ifNull(equals(e__group_0.properties___industry, 'finance'), 0), or(ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', ''), 'uh'), 0), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 
'key'), ''), 'null'), '^"|"$', ''), 'oh'), 0))) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-01 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-12 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), ifNull(equals(e__group_0.properties___industry, 'finance'), 0), or(ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', '')), 'uh'), 0), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', '')), 'oh'), 0))) GROUP BY day_start, breakdown_value) GROUP BY day_start, @@ -857,7 +871,7 @@ # --- # name: TestTrends.test_dau_with_breakdown_filtering_with_sampling ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')) AS value, count(e.uuid) AS count FROM events AS e WHERE and(equals(e.team_id, 2), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC')))), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'))) @@ -872,7 +886,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -893,7 +907,7 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(DISTINCT e__pdi.person_id) AS total, toStartOfDay(toTimeZone(e.timestamp, 'UTC')) AS day_start, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['other_value', '$$_posthog_breakdown_null_$$', 'value'], ['other_value', '$$_posthog_breakdown_null_$$', 'value'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['other_value', '$$_posthog_breakdown_null_$$', 'value'], ['other_value', '$$_posthog_breakdown_null_$$', 'value'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1.0 INNER JOIN (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, @@ -902,7 +916,7 @@ WHERE equals(person_distinct_id2.team_id, 2) GROUP BY person_distinct_id2.distinct_id HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS e__pdi ON equals(e.distinct_id, e__pdi.distinct_id) - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), 
assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'other_value'), 0), isNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'value'), 0))) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'other_value'), 0), isNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''))), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'value'), 0))) GROUP BY day_start, breakdown_value) GROUP BY day_start, @@ -917,7 +931,7 @@ # --- # name: TestTrends.test_dau_with_breakdown_filtering_with_sampling.2 ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')) AS value, count(e.uuid) AS count FROM events AS e WHERE and(equals(e.team_id, 2), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC')))), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'))) @@ -932,7 +946,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -953,7 +967,7 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(DISTINCT e__pdi.person_id) AS total, toStartOfDay(toTimeZone(e.timestamp, 'UTC')) AS day_start, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['other_value', '$$_posthog_breakdown_null_$$', 'value'], ['other_value', '$$_posthog_breakdown_null_$$', 'value'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['other_value', '$$_posthog_breakdown_null_$$', 'value'], ['other_value', '$$_posthog_breakdown_null_$$', 'value'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1.0 INNER JOIN (SELECT 
argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, @@ -962,7 +976,7 @@ WHERE equals(person_distinct_id2.team_id, 2) GROUP BY person_distinct_id2.distinct_id HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS e__pdi ON equals(e.distinct_id, e__pdi.distinct_id) - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'other_value'), 0), isNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'value'), 0))) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'other_value'), 0), isNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''))), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'value'), 0))) GROUP BY day_start, breakdown_value) GROUP BY day_start, @@ -1242,7 +1256,7 @@ # --- # name: TestTrends.test_mau_with_breakdown_filtering_and_prop_filter ''' - SELECT e__pdi__person.`properties___$some_prop` AS value, + SELECT toString(e__pdi__person.`properties___$some_prop`) AS value, count(DISTINCT e__pdi.person_id) AS count FROM events AS e INNER JOIN @@ -1276,7 +1290,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -1308,7 +1322,7 @@ CROSS JOIN (SELECT toTimeZone(e.timestamp, 'UTC') AS timestamp, e__pdi.person_id AS actor_id, - transform(ifNull(e__pdi__person.`properties___$some_prop`, '$$_posthog_breakdown_null_$$'), ['some_val2', 'some_val'], ['some_val2', 'some_val'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(e__pdi__person.`properties___$some_prop`), '$$_posthog_breakdown_null_$$'), ['some_val2', 'some_val'], ['some_val2', 'some_val'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 INNER JOIN (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, @@ -1329,7 +1343,7 @@ WHERE equals(person.team_id, 2) GROUP BY person.id HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0))), 0)) SETTINGS optimize_aggregation_in_order=1) AS e__pdi__person ON equals(e__pdi.e__pdi___person_id, e__pdi__person.id) - WHERE and(equals(e.team_id, 2), and(equals(e.event, 'sign up'), ifNull(equals(e__pdi__person.properties___filter_prop, 'filter_val'), 0), 
or(ifNull(equals(e__pdi__person.`properties___$some_prop`, 'some_val2'), 0), ifNull(equals(e__pdi__person.`properties___$some_prop`, 'some_val'), 0))), ifNull(greaterOrEquals(timestamp, minus(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')), toIntervalDay(30))), 0), ifNull(lessOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), 0)) + WHERE and(equals(e.team_id, 2), and(equals(e.event, 'sign up'), ifNull(equals(e__pdi__person.properties___filter_prop, 'filter_val'), 0), or(ifNull(equals(toString(e__pdi__person.`properties___$some_prop`), 'some_val2'), 0), ifNull(equals(toString(e__pdi__person.`properties___$some_prop`), 'some_val'), 0))), ifNull(greaterOrEquals(timestamp, minus(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')), toIntervalDay(30))), 0), ifNull(lessOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), 0)) GROUP BY timestamp, actor_id, breakdown_value) AS e WHERE and(ifNull(lessOrEquals(e.timestamp, plus(d.timestamp, toIntervalDay(1))), 0), ifNull(greater(e.timestamp, minus(d.timestamp, toIntervalDay(29))), 0)) @@ -1349,7 +1363,7 @@ # --- # name: TestTrends.test_mau_with_breakdown_filtering_and_prop_filter_poe_v2 ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.person_properties, '$some_prop'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.person_properties, '$some_prop'), ''), 'null'), '^"|"$', '')) AS value, count(DISTINCT ifNull(nullIf(e__override.override_person_id, '00000000-0000-0000-0000-000000000000'), e.person_id)) AS count FROM events AS e LEFT OUTER JOIN @@ -1370,7 +1384,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -1402,7 +1416,7 @@ CROSS JOIN (SELECT toTimeZone(e.timestamp, 'UTC') AS timestamp, ifNull(nullIf(e__override.override_person_id, '00000000-0000-0000-0000-000000000000'), e.person_id) AS actor_id, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.person_properties, '$some_prop'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['some_val2', 'some_val'], ['some_val2', 'some_val'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.person_properties, '$some_prop'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['some_val2', 'some_val'], ['some_val2', 'some_val'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 LEFT OUTER JOIN (SELECT argMax(person_overrides.override_person_id, person_overrides.version) AS override_person_id, @@ -1410,7 +1424,7 @@ FROM person_overrides WHERE equals(person_overrides.team_id, 2) GROUP BY person_overrides.old_person_id) AS e__override ON equals(e.person_id, e__override.old_person_id) - WHERE and(equals(e.team_id, 2), and(equals(e.event, 'sign up'), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.person_properties, 'filter_prop'), ''), 'null'), '^"|"$', ''), 'filter_val'), 0), or(ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.person_properties, '$some_prop'), ''), 'null'), '^"|"$', ''), 'some_val2'), 0), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.person_properties, 
'$some_prop'), ''), 'null'), '^"|"$', ''), 'some_val'), 0))), ifNull(greaterOrEquals(timestamp, minus(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')), toIntervalDay(30))), 0), ifNull(lessOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), 0)) + WHERE and(equals(e.team_id, 2), and(equals(e.event, 'sign up'), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.person_properties, 'filter_prop'), ''), 'null'), '^"|"$', ''), 'filter_val'), 0), or(ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.person_properties, '$some_prop'), ''), 'null'), '^"|"$', '')), 'some_val2'), 0), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.person_properties, '$some_prop'), ''), 'null'), '^"|"$', '')), 'some_val'), 0))), ifNull(greaterOrEquals(timestamp, minus(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')), toIntervalDay(30))), 0), ifNull(lessOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), 0)) GROUP BY timestamp, actor_id, breakdown_value) AS e WHERE and(ifNull(lessOrEquals(e.timestamp, plus(d.timestamp, toIntervalDay(1))), 0), ifNull(greater(e.timestamp, minus(d.timestamp, toIntervalDay(29))), 0)) @@ -1476,7 +1490,7 @@ # --- # name: TestTrends.test_person_filtering_in_cohort_in_action.2 ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')) AS value, count(e.uuid) AS count FROM events AS e INNER JOIN @@ -1503,7 +1517,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -1524,7 +1538,7 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(e.uuid) AS total, toStartOfDay(toTimeZone(e.timestamp, 'UTC')) AS day_start, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['$$_posthog_breakdown_null_$$', 'value', 'other_value'], ['$$_posthog_breakdown_null_$$', 'value', 'other_value'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['$$_posthog_breakdown_null_$$', 'value', 'other_value'], ['$$_posthog_breakdown_null_$$', 'value', 'other_value'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 INNER JOIN (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, @@ -1538,7 +1552,7 @@ FROM cohortpeople WHERE and(equals(cohortpeople.team_id, 2), equals(cohortpeople.cohort_id, 2)) GROUP BY cohortpeople.person_id, cohortpeople.cohort_id, cohortpeople.version - HAVING ifNull(greater(sum(cohortpeople.sign), 0), 0))), 0)), or(isNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'value'), 0), 
ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'other_value'), 0))) + HAVING ifNull(greater(sum(cohortpeople.sign), 0), 0))), 0)), or(isNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''))), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'value'), 0), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'other_value'), 0))) GROUP BY day_start, breakdown_value) GROUP BY day_start, @@ -1573,7 +1587,7 @@ # --- # name: TestTrends.test_person_filtering_in_cohort_in_action_poe_v2.2 ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')) AS value, count(e.uuid) AS count FROM events AS e LEFT OUTER JOIN @@ -1599,7 +1613,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -1620,7 +1634,7 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(e.uuid) AS total, toStartOfDay(toTimeZone(e.timestamp, 'UTC')) AS day_start, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['$$_posthog_breakdown_null_$$', 'value', 'other_value'], ['$$_posthog_breakdown_null_$$', 'value', 'other_value'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['$$_posthog_breakdown_null_$$', 'value', 'other_value'], ['$$_posthog_breakdown_null_$$', 'value', 'other_value'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 LEFT OUTER JOIN (SELECT argMax(person_overrides.override_person_id, person_overrides.version) AS override_person_id, @@ -1633,7 +1647,7 @@ FROM cohortpeople WHERE and(equals(cohortpeople.team_id, 2), equals(cohortpeople.cohort_id, 2)) GROUP BY cohortpeople.person_id, cohortpeople.cohort_id, cohortpeople.version - HAVING ifNull(greater(sum(cohortpeople.sign), 0), 0))), 0)), or(isNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'value'), 0), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'other_value'), 0))) + HAVING ifNull(greater(sum(cohortpeople.sign), 0), 0))), 0)), or(isNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''))), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'value'), 0), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'other_value'), 0))) GROUP BY day_start, 
breakdown_value) GROUP BY day_start, @@ -2217,7 +2231,7 @@ # --- # name: TestTrends.test_timezones_daily.4 ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', '')) AS value, count(e.uuid) AS count FROM events AS e WHERE and(equals(e.team_id, 2), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-29 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'UTC')))), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-29 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'))) @@ -2232,7 +2246,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -2253,7 +2267,7 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(DISTINCT e__pdi.person_id) AS total, toStartOfDay(toTimeZone(e.timestamp, 'UTC')) AS day_start, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['Mac'], ['Mac'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['Mac'], ['Mac'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 INNER JOIN (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, @@ -2262,7 +2276,7 @@ WHERE equals(person_distinct_id2.team_id, 2) GROUP BY person_distinct_id2.distinct_id HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS e__pdi ON equals(e.distinct_id, e__pdi.distinct_id) - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-29 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', ''), 'Mac'), 0)) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-29 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', '')), 'Mac'), 0)) GROUP BY day_start, breakdown_value) GROUP BY day_start, @@ -2408,7 +2422,7 @@ # --- # name: TestTrends.test_timezones_daily_minus_utc.4 ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', '') AS value, + SELECT 
toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', '')) AS value, count(e.uuid) AS count FROM events AS e WHERE and(equals(e.team_id, 2), and(greaterOrEquals(toTimeZone(e.timestamp, 'America/Phoenix'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-29 00:00:00', 6, 'America/Phoenix')))), lessOrEquals(toTimeZone(e.timestamp, 'America/Phoenix'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'America/Phoenix')))), and(greaterOrEquals(toTimeZone(e.timestamp, 'America/Phoenix'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-29 00:00:00', 6, 'America/Phoenix')))), lessOrEquals(toTimeZone(e.timestamp, 'America/Phoenix'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'America/Phoenix'))), equals(e.event, 'sign up'))) @@ -2423,7 +2437,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -2444,7 +2458,7 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(DISTINCT e__pdi.person_id) AS total, toStartOfDay(toTimeZone(e.timestamp, 'America/Phoenix')) AS day_start, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['Mac'], ['Mac'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['Mac'], ['Mac'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 INNER JOIN (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, @@ -2453,7 +2467,7 @@ WHERE equals(person_distinct_id2.team_id, 2) GROUP BY person_distinct_id2.distinct_id HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS e__pdi ON equals(e.distinct_id, e__pdi.distinct_id) - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'America/Phoenix'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-29 00:00:00', 6, 'America/Phoenix')))), lessOrEquals(toTimeZone(e.timestamp, 'America/Phoenix'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'America/Phoenix'))), equals(e.event, 'sign up'), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', ''), 'Mac'), 0)) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'America/Phoenix'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-29 00:00:00', 6, 'America/Phoenix')))), lessOrEquals(toTimeZone(e.timestamp, 'America/Phoenix'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'America/Phoenix'))), equals(e.event, 'sign up'), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', '')), 'Mac'), 0)) GROUP BY day_start, breakdown_value) GROUP BY day_start, @@ -2599,7 +2613,7 @@ # --- # name: TestTrends.test_timezones_daily_plus_utc.4 ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 
'null'), '^"|"$', '')) AS value, count(e.uuid) AS count FROM events AS e WHERE and(equals(e.team_id, 2), and(greaterOrEquals(toTimeZone(e.timestamp, 'Asia/Tokyo'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-29 00:00:00', 6, 'Asia/Tokyo')))), lessOrEquals(toTimeZone(e.timestamp, 'Asia/Tokyo'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'Asia/Tokyo')))), and(greaterOrEquals(toTimeZone(e.timestamp, 'Asia/Tokyo'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-29 00:00:00', 6, 'Asia/Tokyo')))), lessOrEquals(toTimeZone(e.timestamp, 'Asia/Tokyo'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'Asia/Tokyo'))), equals(e.event, 'sign up'))) @@ -2614,7 +2628,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -2635,7 +2649,7 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(DISTINCT e__pdi.person_id) AS total, toStartOfDay(toTimeZone(e.timestamp, 'Asia/Tokyo')) AS day_start, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['Mac'], ['Mac'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['Mac'], ['Mac'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 INNER JOIN (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, @@ -2644,7 +2658,7 @@ WHERE equals(person_distinct_id2.team_id, 2) GROUP BY person_distinct_id2.distinct_id HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS e__pdi ON equals(e.distinct_id, e__pdi.distinct_id) - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'Asia/Tokyo'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-29 00:00:00', 6, 'Asia/Tokyo')))), lessOrEquals(toTimeZone(e.timestamp, 'Asia/Tokyo'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'Asia/Tokyo'))), equals(e.event, 'sign up'), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', ''), 'Mac'), 0)) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'Asia/Tokyo'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-29 00:00:00', 6, 'Asia/Tokyo')))), lessOrEquals(toTimeZone(e.timestamp, 'Asia/Tokyo'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-05 23:59:59', 6, 'Asia/Tokyo'))), equals(e.event, 'sign up'), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$os'), ''), 'null'), '^"|"$', '')), 'Mac'), 0)) GROUP BY day_start, breakdown_value) GROUP BY day_start, @@ -2992,7 +3006,7 @@ # --- # name: TestTrends.test_trend_breakdown_user_props_with_filter_with_partial_property_pushdowns ''' - SELECT e__pdi__person.properties___email AS value, + SELECT toString(e__pdi__person.properties___email) AS value, count(e.uuid) AS count FROM events AS e INNER JOIN @@ -3027,7 +3041,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + 
ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -3048,7 +3062,7 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(e.uuid) AS total, toStartOfDay(toTimeZone(e.timestamp, 'UTC')) AS day_start, - transform(ifNull(e__pdi__person.properties___email, '$$_posthog_breakdown_null_$$'), ['test2@posthog.com', 'test@gmail.com', 'test5@posthog.com', 'test4@posthog.com', 'test3@posthog.com'], ['test2@posthog.com', 'test@gmail.com', 'test5@posthog.com', 'test4@posthog.com', 'test3@posthog.com'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(e__pdi__person.properties___email), '$$_posthog_breakdown_null_$$'), ['test2@posthog.com', 'test@gmail.com', 'test5@posthog.com', 'test4@posthog.com', 'test3@posthog.com'], ['test2@posthog.com', 'test@gmail.com', 'test5@posthog.com', 'test4@posthog.com', 'test3@posthog.com'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 INNER JOIN (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS e__pdi___person_id, @@ -3070,7 +3084,7 @@ WHERE equals(person.team_id, 2) GROUP BY person.id HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0))), 0)) SETTINGS optimize_aggregation_in_order=1) AS e__pdi__person ON equals(e__pdi.e__pdi___person_id, e__pdi__person.id) - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-01 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-07-01 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), and(or(ifNull(notILike(e__pdi__person.properties___email, '%@posthog.com%'), 1), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', ''), 'val'), 0)), or(ifNull(equals(e__pdi__person.`properties___$os`, 'android'), 0), ifNull(equals(e__pdi__person.`properties___$browser`, 'safari'), 0))), or(ifNull(equals(e__pdi__person.properties___email, 'test2@posthog.com'), 0), ifNull(equals(e__pdi__person.properties___email, 'test@gmail.com'), 0), ifNull(equals(e__pdi__person.properties___email, 'test5@posthog.com'), 0), ifNull(equals(e__pdi__person.properties___email, 'test4@posthog.com'), 0), ifNull(equals(e__pdi__person.properties___email, 'test3@posthog.com'), 0))) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-01 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-07-01 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), and(or(ifNull(notILike(e__pdi__person.properties___email, '%@posthog.com%'), 1), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', ''), 'val'), 0)), or(ifNull(equals(e__pdi__person.`properties___$os`, 'android'), 0), ifNull(equals(e__pdi__person.`properties___$browser`, 'safari'), 0))), or(ifNull(equals(toString(e__pdi__person.properties___email), 'test2@posthog.com'), 0), ifNull(equals(toString(e__pdi__person.properties___email), 'test@gmail.com'), 0), ifNull(equals(toString(e__pdi__person.properties___email), 'test5@posthog.com'), 0), ifNull(equals(toString(e__pdi__person.properties___email), 'test4@posthog.com'), 0), ifNull(equals(toString(e__pdi__person.properties___email), 
'test3@posthog.com'), 0))) GROUP BY day_start, breakdown_value) GROUP BY day_start, @@ -3085,7 +3099,7 @@ # --- # name: TestTrends.test_trend_breakdown_user_props_with_filter_with_partial_property_pushdowns.2 ''' - SELECT e__pdi__person.properties___email AS value, + SELECT toString(e__pdi__person.properties___email) AS value, count(e.uuid) AS count FROM events AS e INNER JOIN @@ -3120,7 +3134,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -3141,7 +3155,7 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(e.uuid) AS total, toStartOfDay(toTimeZone(e.timestamp, 'UTC')) AS day_start, - transform(ifNull(e__pdi__person.properties___email, '$$_posthog_breakdown_null_$$'), ['test2@posthog.com'], ['test2@posthog.com'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(e__pdi__person.properties___email), '$$_posthog_breakdown_null_$$'), ['test2@posthog.com'], ['test2@posthog.com'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 INNER JOIN (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS e__pdi___person_id, @@ -3163,7 +3177,7 @@ WHERE equals(person.team_id, 2) GROUP BY person.id HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0))), 0)) SETTINGS optimize_aggregation_in_order=1) AS e__pdi__person ON equals(e__pdi.e__pdi___person_id, e__pdi__person.id) - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-01 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-07-01 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), and(ifNull(equals(e__pdi__person.`properties___$os`, 'android'), 0), ifNull(equals(e__pdi__person.`properties___$browser`, 'chrome'), 0)), and(ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', ''), 'val'), 0), ifNull(ilike(e__pdi__person.properties___email, '%@posthog.com%'), 0)), ifNull(equals(e__pdi__person.properties___email, 'test2@posthog.com'), 0)) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-01 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-07-01 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), and(ifNull(equals(e__pdi__person.`properties___$os`, 'android'), 0), ifNull(equals(e__pdi__person.`properties___$browser`, 'chrome'), 0)), and(ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'key'), ''), 'null'), '^"|"$', ''), 'val'), 0), ifNull(ilike(e__pdi__person.properties___email, '%@posthog.com%'), 0)), ifNull(equals(toString(e__pdi__person.properties___email), 'test2@posthog.com'), 0)) GROUP BY day_start, breakdown_value) GROUP BY day_start, @@ -3248,7 +3262,7 @@ # --- # name: TestTrends.test_trends_aggregate_by_distinct_id.2 ''' - SELECT e__pdi__person.`properties___$some_prop` AS value, + SELECT toString(e__pdi__person.`properties___$some_prop`) AS value, count(e.uuid) AS count FROM events AS e INNER JOIN @@ -3281,7 +3295,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - 
ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -3302,7 +3316,7 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(DISTINCT e.distinct_id) AS total, toStartOfDay(toTimeZone(e.timestamp, 'UTC')) AS day_start, - transform(ifNull(e__pdi__person.`properties___$some_prop`, '$$_posthog_breakdown_null_$$'), ['some_val', '$$_posthog_breakdown_null_$$'], ['some_val', '$$_posthog_breakdown_null_$$'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(e__pdi__person.`properties___$some_prop`), '$$_posthog_breakdown_null_$$'), ['some_val', '$$_posthog_breakdown_null_$$'], ['some_val', '$$_posthog_breakdown_null_$$'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 INNER JOIN (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS e__pdi___person_id, @@ -3322,7 +3336,7 @@ WHERE equals(person.team_id, 2) GROUP BY person.id HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0))), 0)) SETTINGS optimize_aggregation_in_order=1) AS e__pdi__person ON equals(e__pdi.e__pdi___person_id, e__pdi__person.id) - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-24 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-31 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(e__pdi__person.`properties___$some_prop`, 'some_val'), 0), isNull(e__pdi__person.`properties___$some_prop`))) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-24 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-31 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(toString(e__pdi__person.`properties___$some_prop`), 'some_val'), 0), isNull(toString(e__pdi__person.`properties___$some_prop`)))) GROUP BY day_start, breakdown_value) GROUP BY day_start, @@ -3415,7 +3429,7 @@ # --- # name: TestTrends.test_trends_aggregate_by_distinct_id.6 ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_prop'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_prop'), ''), 'null'), '^"|"$', '')) AS value, count(e.uuid) AS count FROM events AS e WHERE and(equals(e.team_id, 2), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-24 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-31 23:59:59', 6, 'UTC')))), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-24 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-31 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'))) @@ -3430,7 +3444,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS 
day_start, @@ -3451,9 +3465,9 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(DISTINCT e.distinct_id) AS total, toStartOfDay(toTimeZone(e.timestamp, 'UTC')) AS day_start, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_prop'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['$$_posthog_breakdown_null_$$'], ['$$_posthog_breakdown_null_$$'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_prop'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['$$_posthog_breakdown_null_$$'], ['$$_posthog_breakdown_null_$$'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-24 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-31 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), isNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_prop'), ''), 'null'), '^"|"$', ''))) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-24 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-31 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), isNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_prop'), ''), 'null'), '^"|"$', '')))) GROUP BY day_start, breakdown_value) GROUP BY day_start, @@ -3520,7 +3534,7 @@ # --- # name: TestTrends.test_trends_breakdown_cumulative ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')) AS value, count(e.uuid) AS count FROM events AS e WHERE and(equals(e.team_id, 2), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC')))), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'))) @@ -3535,7 +3549,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT day_start AS day_start, sum(count) OVER (PARTITION BY breakdown_value @@ -3561,7 +3575,7 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(DISTINCT e__pdi.person_id) AS total, min(toStartOfDay(toTimeZone(e.timestamp, 'UTC'))) AS day_start, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['$$_posthog_breakdown_null_$$', 'value', 'other_value'], ['$$_posthog_breakdown_null_$$', 'value', 'other_value'], 
'$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['$$_posthog_breakdown_null_$$', 'value', 'other_value'], ['$$_posthog_breakdown_null_$$', 'value', 'other_value'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 INNER JOIN (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, @@ -3570,7 +3584,7 @@ WHERE equals(person_distinct_id2.team_id, 2) GROUP BY person_distinct_id2.distinct_id HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS e__pdi ON equals(e.distinct_id, e__pdi.distinct_id) - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(isNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'value'), 0), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'other_value'), 0))) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(isNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''))), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'value'), 0), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'other_value'), 0))) GROUP BY e__pdi.person_id, breakdown_value) GROUP BY day_start, @@ -3585,7 +3599,7 @@ # --- # name: TestTrends.test_trends_breakdown_cumulative_poe_v2 ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')) AS value, count(e.uuid) AS count FROM events AS e WHERE and(equals(e.team_id, 2), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC')))), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'))) @@ -3600,7 +3614,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS 
breakdown_value FROM (SELECT day_start AS day_start, sum(count) OVER (PARTITION BY breakdown_value @@ -3626,7 +3640,7 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(DISTINCT ifNull(nullIf(e__override.override_person_id, '00000000-0000-0000-0000-000000000000'), e.person_id)) AS total, min(toStartOfDay(toTimeZone(e.timestamp, 'UTC'))) AS day_start, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['$$_posthog_breakdown_null_$$', 'value', 'other_value'], ['$$_posthog_breakdown_null_$$', 'value', 'other_value'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['$$_posthog_breakdown_null_$$', 'value', 'other_value'], ['$$_posthog_breakdown_null_$$', 'value', 'other_value'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 LEFT OUTER JOIN (SELECT argMax(person_overrides.override_person_id, person_overrides.version) AS override_person_id, @@ -3634,7 +3648,7 @@ FROM person_overrides WHERE equals(person_overrides.team_id, 2) GROUP BY person_overrides.old_person_id) AS e__override ON equals(e.person_id, e__override.old_person_id) - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(isNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'value'), 0), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'other_value'), 0))) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(isNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''))), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'value'), 0), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'other_value'), 0))) GROUP BY ifNull(nullIf(e__override.override_person_id, '00000000-0000-0000-0000-000000000000'), e.person_id), breakdown_value) GROUP BY day_start, @@ -3649,7 +3663,7 @@ # --- # name: TestTrends.test_trends_breakdown_with_session_property_single_aggregate_math_and_breakdown ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')) AS value, max(e__session.duration) AS count FROM events AS e INNER JOIN @@ -3672,7 +3686,7 @@ breakdown_value AS breakdown_value FROM (SELECT 
any(e__session.duration) AS session_duration, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['value2', 'value1', '$$_posthog_breakdown_null_$$'], ['value2', 'value1', '$$_posthog_breakdown_null_$$'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['value2', 'value1', '$$_posthog_breakdown_null_$$'], ['value2', 'value1', '$$_posthog_breakdown_null_$$'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 INNER JOIN (SELECT events.`$session_id` AS id, @@ -3680,7 +3694,7 @@ FROM events WHERE and(equals(events.team_id, 2), greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toStartOfWeek(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')), 0)), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), ifNull(notEquals(id, ''), 1)) GROUP BY id) AS e__session ON equals(e.`$session_id`, e__session.id) - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfWeek(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')), 0)), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'value2'), 0), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'value1'), 0), isNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')))) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfWeek(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')), 0)), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'value2'), 0), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'value1'), 0), isNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''))))) GROUP BY e__session.id, breakdown_value) GROUP BY breakdown_value @@ -3691,7 +3705,7 @@ # --- # name: TestTrends.test_trends_breakdown_with_session_property_single_aggregate_math_and_breakdown.2 ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')) AS value, max(e__session.duration) AS count FROM events AS e INNER JOIN @@ -3714,7 +3728,7 @@ breakdown_value AS breakdown_value FROM (SELECT any(e__session.duration) AS session_duration, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['value2', 'value1', 
'$$_posthog_breakdown_null_$$'], ['value2', 'value1', '$$_posthog_breakdown_null_$$'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['value2', 'value1', '$$_posthog_breakdown_null_$$'], ['value2', 'value1', '$$_posthog_breakdown_null_$$'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 INNER JOIN (SELECT events.`$session_id` AS id, @@ -3722,7 +3736,7 @@ FROM events WHERE and(equals(events.team_id, 2), greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), ifNull(notEquals(id, ''), 1)) GROUP BY id) AS e__session ON equals(e.`$session_id`, e__session.id) - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'value2'), 0), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'value1'), 0), isNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')))) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'value2'), 0), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'value1'), 0), isNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''))))) GROUP BY e__session.id, breakdown_value) GROUP BY breakdown_value @@ -3854,7 +3868,7 @@ # --- # name: TestTrends.test_trends_count_per_user_average_aggregated_with_event_property_breakdown_with_sampling ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'color'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'color'), ''), 'null'), '^"|"$', '')) AS value, count(e.uuid) AS count FROM events AS e WHERE and(equals(e.team_id, 2), and(greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-01 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-07 23:59:59', 6, 'UTC')))), equals(e.event, 'viewed video')) @@ -3874,7 +3888,7 @@ breakdown_value AS breakdown_value FROM (SELECT count(e.uuid) AS total, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'color'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['red', 
'blue', '$$_posthog_breakdown_null_$$'], ['red', 'blue', '$$_posthog_breakdown_null_$$'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'color'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['red', 'blue', '$$_posthog_breakdown_null_$$'], ['red', 'blue', '$$_posthog_breakdown_null_$$'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1.0 INNER JOIN (SELECT argMax(person_distinct_id2.person_id, person_distinct_id2.version) AS person_id, @@ -3883,7 +3897,7 @@ WHERE equals(person_distinct_id2.team_id, 2) GROUP BY person_distinct_id2.distinct_id HAVING ifNull(equals(argMax(person_distinct_id2.is_deleted, person_distinct_id2.version), 0), 0)) AS e__pdi ON equals(e.distinct_id, e__pdi.distinct_id) - WHERE and(equals(e.team_id, 2), and(equals(e.event, 'viewed video'), or(ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'color'), ''), 'null'), '^"|"$', ''), 'red'), 0), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'color'), ''), 'null'), '^"|"$', ''), 'blue'), 0), isNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'color'), ''), 'null'), '^"|"$', '')))), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), minus(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-01 00:00:00', 6, 'UTC')), toIntervalDay(0))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-07 23:59:59', 6, 'UTC')))) + WHERE and(equals(e.team_id, 2), and(equals(e.event, 'viewed video'), or(ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'color'), ''), 'null'), '^"|"$', '')), 'red'), 0), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'color'), ''), 'null'), '^"|"$', '')), 'blue'), 0), isNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, 'color'), ''), 'null'), '^"|"$', ''))))), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), minus(assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-01 00:00:00', 6, 'UTC')), toIntervalDay(0))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-07 23:59:59', 6, 'UTC')))) GROUP BY e__pdi.person_id, breakdown_value) GROUP BY breakdown_value) @@ -4098,7 +4112,7 @@ # --- # name: TestTrends.test_trends_person_breakdown_with_session_property_single_aggregate_math_and_breakdown ''' - SELECT e__pdi__person.`properties___$some_prop` AS value, + SELECT toString(e__pdi__person.`properties___$some_prop`) AS value, max(e__session.duration) AS count FROM events AS e INNER JOIN @@ -4139,7 +4153,7 @@ breakdown_value AS breakdown_value FROM (SELECT any(e__session.duration) AS session_duration, - transform(ifNull(e__pdi__person.`properties___$some_prop`, '$$_posthog_breakdown_null_$$'), ['some_val', 'another_val'], ['some_val', 'another_val'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(e__pdi__person.`properties___$some_prop`), '$$_posthog_breakdown_null_$$'), ['some_val', 'another_val'], ['some_val', 'another_val'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM events AS e SAMPLE 1 INNER JOIN (SELECT events.`$session_id` AS id, @@ -4165,7 +4179,7 @@ WHERE equals(person.team_id, 2) GROUP BY person.id HAVING ifNull(equals(argMax(person.is_deleted, person.version), 0), 0))), 0)) SETTINGS optimize_aggregation_in_order=1) AS e__pdi__person ON equals(e__pdi.e__pdi___person_id, 
e__pdi__person.id) - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfWeek(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')), 0)), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(e__pdi__person.`properties___$some_prop`, 'some_val'), 0), ifNull(equals(e__pdi__person.`properties___$some_prop`, 'another_val'), 0))) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfWeek(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')), 0)), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(toString(e__pdi__person.`properties___$some_prop`), 'some_val'), 0), ifNull(equals(toString(e__pdi__person.`properties___$some_prop`), 'another_val'), 0))) GROUP BY e__session.id, breakdown_value) GROUP BY breakdown_value @@ -4316,7 +4330,7 @@ # --- # name: TestTrends.test_trends_with_session_property_total_volume_math_with_breakdowns ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')) AS value, max(e__session.duration) AS count FROM events AS e INNER JOIN @@ -4337,7 +4351,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -4361,7 +4375,7 @@ breakdown_value AS breakdown_value FROM (SELECT any(e__session.duration) AS session_duration, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['value2', 'value1'], ['value2', 'value1'], '$$_posthog_breakdown_other_$$') AS breakdown_value, + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['value2', 'value1'], ['value2', 'value1'], '$$_posthog_breakdown_other_$$') AS breakdown_value, toStartOfWeek(toTimeZone(e.timestamp, 'UTC'), 0) AS day_start FROM events AS e SAMPLE 1 INNER JOIN @@ -4370,7 +4384,7 @@ FROM events WHERE and(equals(events.team_id, 2), greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toStartOfWeek(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')), 0)), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), ifNull(notEquals(id, ''), 1)) GROUP BY id) AS e__session ON equals(e.`$session_id`, e__session.id) - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfWeek(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')), 0)), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'value2'), 0), 
ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'value1'), 0))) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfWeek(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')), 0)), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'value2'), 0), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'value1'), 0))) GROUP BY day_start, e__session.id, breakdown_value, @@ -4389,7 +4403,7 @@ # --- # name: TestTrends.test_trends_with_session_property_total_volume_math_with_breakdowns.2 ''' - SELECT replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '') AS value, + SELECT toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')) AS value, max(e__session.duration) AS count FROM events AS e INNER JOIN @@ -4410,7 +4424,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -4434,7 +4448,7 @@ breakdown_value AS breakdown_value FROM (SELECT any(e__session.duration) AS session_duration, - transform(ifNull(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), '$$_posthog_breakdown_null_$$'), ['value2', 'value1'], ['value2', 'value1'], '$$_posthog_breakdown_other_$$') AS breakdown_value, + transform(ifNull(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), '$$_posthog_breakdown_null_$$'), ['value2', 'value1'], ['value2', 'value1'], '$$_posthog_breakdown_other_$$') AS breakdown_value, toStartOfDay(toTimeZone(e.timestamp, 'UTC')) AS day_start FROM events AS e SAMPLE 1 INNER JOIN @@ -4443,7 +4457,7 @@ FROM events WHERE and(equals(events.team_id, 2), greaterOrEquals(toTimeZone(events.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(events.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), ifNull(notEquals(id, ''), 1)) GROUP BY id) AS e__session ON equals(e.`$session_id`, e__session.id) - WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'value2'), 0), ifNull(equals(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', ''), 'value1'), 0))) + WHERE and(equals(e.team_id, 2), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2019-12-28 00:00:00', 6, 'UTC')))), 
lessOrEquals(toTimeZone(e.timestamp, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2020-01-04 23:59:59', 6, 'UTC'))), equals(e.event, 'sign up'), or(ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'value2'), 0), ifNull(equals(toString(replaceRegexpAll(nullIf(nullIf(JSONExtractRaw(e.properties, '$some_property'), ''), 'null'), '^"|"$', '')), 'value1'), 0))) GROUP BY day_start, e__session.id, breakdown_value, diff --git a/posthog/hogql_queries/insights/trends/test/__snapshots__/test_trends_data_warehouse_query.ambr b/posthog/hogql_queries/insights/trends/test/__snapshots__/test_trends_data_warehouse_query.ambr index db9e8e1d45000..1e3bc1b5cbad6 100644 --- a/posthog/hogql_queries/insights/trends/test/__snapshots__/test_trends_data_warehouse_query.ambr +++ b/posthog/hogql_queries/insights/trends/test/__snapshots__/test_trends_data_warehouse_query.ambr @@ -1,7 +1,7 @@ # serializer version: 1 # name: TestTrendsDataWarehouseQuery.test_trends_breakdown ''' - SELECT e.prop_1 AS value, + SELECT toString(e.prop_1) AS value, count(e.id) AS count FROM s3('http://host.docker.internal:19000/posthog/test_storage_bucket-posthog.hogql.datawarehouse.trendquery/*.parquet', 'object_storage_root_user', 'object_storage_root_password', 'Parquet', 'id String, prop_1 String, prop_2 String, created DateTime64(3, \'UTC\')') AS e WHERE and(and(ifNull(greaterOrEquals(toTimeZone(e.created, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-01 00:00:00', 6, 'UTC')))), 0), ifNull(lessOrEquals(toTimeZone(e.created, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-07 23:59:59', 6, 'UTC'))), 0)), and(ifNull(greaterOrEquals(toDateTime(toTimeZone(e.created, 'UTC'), 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-01 00:00:00', 6, 'UTC')))), 0), ifNull(lessOrEquals(toDateTime(toTimeZone(e.created, 'UTC'), 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-07 23:59:59', 6, 'UTC'))), 0))) @@ -16,7 +16,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -37,9 +37,9 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(e.id) AS total, toStartOfDay(toDateTime(toTimeZone(e.created, 'UTC'), 'UTC')) AS day_start, - transform(ifNull(e.prop_1, '$$_posthog_breakdown_null_$$'), ['d', 'c', 'b', 'a'], ['d', 'c', 'b', 'a'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(e.prop_1), '$$_posthog_breakdown_null_$$'), ['d', 'c', 'b', 'a'], ['d', 'c', 'b', 'a'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM s3('http://host.docker.internal:19000/posthog/test_storage_bucket-posthog.hogql.datawarehouse.trendquery/*.parquet', 'object_storage_root_user', 'object_storage_root_password', 'Parquet', 'id String, prop_1 String, prop_2 String, created DateTime64(3, \'UTC\')') AS e - WHERE and(ifNull(greaterOrEquals(toDateTime(toTimeZone(e.created, 'UTC'), 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-01 00:00:00', 6, 'UTC')))), 0), ifNull(lessOrEquals(toDateTime(toTimeZone(e.created, 'UTC'), 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-07 23:59:59', 6, 'UTC'))), 0), or(equals(e.prop_1, 'd'), equals(e.prop_1, 'c'), equals(e.prop_1, 'b'), equals(e.prop_1, 
'a'))) + WHERE and(ifNull(greaterOrEquals(toDateTime(toTimeZone(e.created, 'UTC'), 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-01 00:00:00', 6, 'UTC')))), 0), ifNull(lessOrEquals(toDateTime(toTimeZone(e.created, 'UTC'), 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-07 23:59:59', 6, 'UTC'))), 0), or(ifNull(equals(toString(e.prop_1), 'd'), 0), ifNull(equals(toString(e.prop_1), 'c'), 0), ifNull(equals(toString(e.prop_1), 'b'), 0), ifNull(equals(toString(e.prop_1), 'a'), 0))) GROUP BY day_start, breakdown_value) GROUP BY day_start, @@ -54,7 +54,7 @@ # --- # name: TestTrendsDataWarehouseQuery.test_trends_breakdown_with_property ''' - SELECT e.prop_1 AS value, + SELECT toString(e.prop_1) AS value, count(e.id) AS count FROM s3('http://host.docker.internal:19000/posthog/test_storage_bucket-posthog.hogql.datawarehouse.trendquery/*.parquet', 'object_storage_root_user', 'object_storage_root_password', 'Parquet', 'id String, prop_1 String, prop_2 String, created DateTime64(3, \'UTC\')') AS e WHERE and(and(ifNull(greaterOrEquals(toTimeZone(e.created, 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-01 00:00:00', 6, 'UTC')))), 0), ifNull(lessOrEquals(toTimeZone(e.created, 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-07 23:59:59', 6, 'UTC'))), 0)), and(ifNull(greaterOrEquals(toDateTime(toTimeZone(e.created, 'UTC'), 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-01 00:00:00', 6, 'UTC')))), 0), ifNull(lessOrEquals(toDateTime(toTimeZone(e.created, 'UTC'), 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-07 23:59:59', 6, 'UTC'))), 0), equals(e.prop_1, 'a'))) @@ -69,7 +69,7 @@ ''' SELECT groupArray(day_start) AS date, groupArray(count) AS total, - ifNull(toString(breakdown_value), '') AS breakdown_value + ifNull(toString(breakdown_value), '$$_posthog_breakdown_null_$$') AS breakdown_value FROM (SELECT sum(total) AS count, day_start AS day_start, @@ -90,9 +90,9 @@ ORDER BY sec.breakdown_value ASC, day_start ASC UNION ALL SELECT count(e.id) AS total, toStartOfDay(toDateTime(toTimeZone(e.created, 'UTC'), 'UTC')) AS day_start, - transform(ifNull(e.prop_1, '$$_posthog_breakdown_null_$$'), ['a'], ['a'], '$$_posthog_breakdown_other_$$') AS breakdown_value + transform(ifNull(toString(e.prop_1), '$$_posthog_breakdown_null_$$'), ['a'], ['a'], '$$_posthog_breakdown_other_$$') AS breakdown_value FROM s3('http://host.docker.internal:19000/posthog/test_storage_bucket-posthog.hogql.datawarehouse.trendquery/*.parquet', 'object_storage_root_user', 'object_storage_root_password', 'Parquet', 'id String, prop_1 String, prop_2 String, created DateTime64(3, \'UTC\')') AS e - WHERE and(ifNull(greaterOrEquals(toDateTime(toTimeZone(e.created, 'UTC'), 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-01 00:00:00', 6, 'UTC')))), 0), ifNull(lessOrEquals(toDateTime(toTimeZone(e.created, 'UTC'), 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-07 23:59:59', 6, 'UTC'))), 0), equals(e.prop_1, 'a'), equals(e.prop_1, 'a')) + WHERE and(ifNull(greaterOrEquals(toDateTime(toTimeZone(e.created, 'UTC'), 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-01 00:00:00', 6, 'UTC')))), 0), ifNull(lessOrEquals(toDateTime(toTimeZone(e.created, 'UTC'), 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-07 23:59:59', 6, 'UTC'))), 0), equals(e.prop_1, 'a'), ifNull(equals(toString(e.prop_1), 'a'), 0)) GROUP BY day_start, breakdown_value) GROUP BY 
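Every hunk in the trends and data-warehouse snapshots above applies the same two changes: the breakdown expression is wrapped in `toString(...)` before null handling, and the outer `ifNull` now falls back to `$$_posthog_breakdown_null_$$` rather than `''`. A minimal Python model of the resulting ClickHouse `transform(ifNull(toString(x), NULL_LABEL), keys, values, OTHER_LABEL)` semantics — the helper is hypothetical; only the sentinel strings are the real ones:

```python
# Hypothetical helper modelling the generated ClickHouse expression
#   transform(ifNull(toString(x), NULL_LABEL), keys, values, OTHER_LABEL)
BREAKDOWN_NULL_STRING_LABEL = "$$_posthog_breakdown_null_$$"
BREAKDOWN_OTHER_STRING_LABEL = "$$_posthog_breakdown_other_$$"


def bucket_breakdown_value(raw: object, keys: list[str]) -> str:
    # ifNull(toString(x), NULL_LABEL): stringify first so numeric and string
    # properties share one comparable domain, then map SQL NULL to the sentinel.
    value = BREAKDOWN_NULL_STRING_LABEL if raw is None else str(raw)
    # transform(value, keys, values, default): keys and values are identical in
    # these queries, so known buckets pass through and all else becomes "other".
    return value if value in keys else BREAKDOWN_OTHER_STRING_LABEL


keys = ["value1", "value2", BREAKDOWN_NULL_STRING_LABEL]
assert bucket_breakdown_value(None, keys) == BREAKDOWN_NULL_STRING_LABEL
assert bucket_breakdown_value("value1", keys) == "value1"
assert bucket_breakdown_value(42, keys) == BREAKDOWN_OTHER_STRING_LABEL
```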
day_start, diff --git a/posthog/hogql_queries/insights/trends/test/test_trends.py b/posthog/hogql_queries/insights/trends/test/test_trends.py index 1ac54e16de629..9e885fbadcc1d 100644 --- a/posthog/hogql_queries/insights/trends/test/test_trends.py +++ b/posthog/hogql_queries/insights/trends/test/test_trends.py @@ -5180,7 +5180,9 @@ def test_breakdown_filtering_with_properties_in_new_format(self): ) response = sorted(response, key=lambda x: x["label"]) - self.assertEqual(len(response), 0) + self.assertEqual(len(response), 1) + self.assertEqual(response[0]["label"], "$$_posthog_breakdown_null_$$") + self.assertEqual(response[0]["count"], 0) @also_test_with_person_on_events_v2 @snapshot_clickhouse_queries diff --git a/posthog/hogql_queries/insights/trends/test/test_trends_query_runner.py b/posthog/hogql_queries/insights/trends/test/test_trends_query_runner.py index 433df7c7df23b..8d14950ec23b2 100644 --- a/posthog/hogql_queries/insights/trends/test/test_trends_query_runner.py +++ b/posthog/hogql_queries/insights/trends/test/test_trends_query_runner.py @@ -381,7 +381,7 @@ def test_trends_query_formula_breakdown_no_data(self): TrendsFilter(formula="A+B"), BreakdownFilter(breakdown_type=BreakdownType.person, breakdown="$browser"), ) - self.assertEqual([], response.results) + self.assertEqual([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], response.results[0]["data"]) def test_trends_query_formula_aggregate(self): self._create_test_events() @@ -714,16 +714,7 @@ def test_trends_breakdowns_multiple_hogql(self): breakdown_labels = [result["breakdown_value"] for result in response.results] assert len(response.results) == 8 - assert breakdown_labels == [ - "Chrome", - "Firefox", - "Edge", - "Safari", - "Chrome", - "Edge", - "Firefox", - "Safari", - ] + assert breakdown_labels == ["Chrome", "Firefox", "Edge", "Safari", "Chrome", "Edge", "Firefox", "Safari"] assert response.results[0]["label"] == f"$pageview - Chrome" assert response.results[1]["label"] == f"$pageview - Firefox" assert response.results[2]["label"] == f"$pageview - Edge" @@ -823,6 +814,7 @@ def test_trends_breakdown_and_aggregation_query_orchestration(self): 10, 0, ] + assert response.results[1]["data"] == [ 20, 0, @@ -1606,9 +1598,8 @@ def test_to_actors_query_options_breakdowns_boolean(self): assert response.series == [InsightActorsQuerySeries(label="$pageview", value=0)] assert response.breakdown == [ - # BreakdownItem(label="Other", value="$$_posthog_breakdown_other_$$"), # TODO: Add when "Other" works - BreakdownItem(label="true", value=1), - BreakdownItem(label="false", value=0), + BreakdownItem(label="true", value="true"), + BreakdownItem(label="false", value="false"), ] def test_to_actors_query_options_breakdowns_histogram(self): diff --git a/posthog/hogql_queries/insights/trends/trends_query_builder.py b/posthog/hogql_queries/insights/trends/trends_query_builder.py index 7be735d3b0a8b..a911e4bf8302a 100644 --- a/posthog/hogql_queries/insights/trends/trends_query_builder.py +++ b/posthog/hogql_queries/insights/trends/trends_query_builder.py @@ -14,6 +14,7 @@ from posthog.models.action.action import Action from posthog.models.filters.mixins.utils import cached_property from posthog.models.team.team import Team +from posthog.queries.trends.breakdown import BREAKDOWN_NULL_STRING_LABEL from posthog.schema import ( ActionsNode, DataWarehouseNode, @@ -68,7 +69,7 @@ def build_query(self) -> ast.SelectQuery | ast.SelectUnionQuery: return full_query def build_actors_query( - self, time_frame: Optional[str] = None, breakdown_filter: Optional[str | int] 
= None + self, time_frame: Optional[str] = None, breakdown_filter: Optional[str] = None ) -> ast.SelectQuery | ast.SelectUnionQuery: breakdown = self._breakdown(is_actors_query=True, breakdown_values_override=breakdown_filter) @@ -292,7 +293,8 @@ def _get_events_subquery( # Just breakdowns elif breakdown.enabled: if not is_actors_query: - default_query.select.append(breakdown.column_expr()) + breakdown_expr = breakdown.column_expr() + default_query.select.append(breakdown_expr) default_query.group_by.append(ast.Field(chain=["breakdown_value"])) # Just session duration math property elif self._aggregation_operation.aggregating_on_session_duration(): @@ -369,7 +371,7 @@ def _outer_select_query(self, breakdown: Breakdown, inner_query: ast.SelectQuery name="ifNull", args=[ ast.Call(name="toString", args=[ast.Field(chain=["breakdown_value"])]), - ast.Constant(value=""), + ast.Constant(value=BREAKDOWN_NULL_STRING_LABEL), ], ), ) @@ -565,7 +567,7 @@ def session_duration_math_property_wrapper(self, default_query: ast.SelectQuery) query.group_by = [] return query - def _breakdown(self, is_actors_query: bool, breakdown_values_override: Optional[str | int] = None): + def _breakdown(self, is_actors_query: bool, breakdown_values_override: Optional[str] = None): return Breakdown( team=self.team, query=self.query, diff --git a/posthog/hogql_queries/insights/trends/trends_query_runner.py b/posthog/hogql_queries/insights/trends/trends_query_runner.py index d66110298ffab..d61720740f52b 100644 --- a/posthog/hogql_queries/insights/trends/trends_query_runner.py +++ b/posthog/hogql_queries/insights/trends/trends_query_runner.py @@ -23,9 +23,7 @@ from posthog.hogql.query import execute_hogql_query from posthog.hogql.timings import HogQLTimings from posthog.hogql_queries.insights.trends.breakdown_values import ( - BREAKDOWN_NULL_NUMERIC_LABEL, BREAKDOWN_NULL_STRING_LABEL, - BREAKDOWN_OTHER_NUMERIC_LABEL, BREAKDOWN_OTHER_STRING_LABEL, ) from posthog.hogql_queries.insights.trends.display import TrendsDisplay @@ -175,7 +173,7 @@ def to_actors_query( modifiers=self.modifiers, ) - query = query_builder.build_actors_query(time_frame=time_frame, breakdown_filter=breakdown_value) + query = query_builder.build_actors_query(time_frame=time_frame, breakdown_filter=str(breakdown_value)) return query @@ -240,14 +238,10 @@ def to_actors_query_options(self) -> InsightActorsQueryOptionsResponse: cohort_name = "all users" if str(value) == "0" else Cohort.objects.get(pk=value).name label = cohort_name value = value - elif value == BREAKDOWN_OTHER_STRING_LABEL or value == BREAKDOWN_OTHER_NUMERIC_LABEL: - # label = "Other" - # value = BREAKDOWN_OTHER_STRING_LABEL - continue # TODO: Add support for "other" breakdowns - elif value == BREAKDOWN_NULL_STRING_LABEL or value == BREAKDOWN_NULL_NUMERIC_LABEL: - # label = "Null" - # value = BREAKDOWN_NULL_STRING_LABEL - continue # TODO: Add support for "null" breakdowns + elif value == BREAKDOWN_OTHER_STRING_LABEL: + label = "Other (Groups all remaining values)" + elif value == BREAKDOWN_NULL_STRING_LABEL: + label = "None (No value)" elif is_boolean_breakdown: label = self._convert_boolean(value) else: @@ -501,18 +495,6 @@ def get_value(name: str, val: Any): series_object["breakdown_value"] = remapped_label - # If the breakdown value is the numeric "other", then set it to the string version - if ( - remapped_label == BREAKDOWN_OTHER_NUMERIC_LABEL - or remapped_label == str(BREAKDOWN_OTHER_NUMERIC_LABEL) - or remapped_label == float(BREAKDOWN_OTHER_NUMERIC_LABEL) - ): - 
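With the sentinel values now surfaced in `to_actors_query_options` instead of skipped, the mapping from stored breakdown value to display label reduces to something like this sketch, using the exact label strings from the diff (cohort and boolean handling elided):

```python
# Condensed sketch of the new option labelling in to_actors_query_options.
BREAKDOWN_OTHER_STRING_LABEL = "$$_posthog_breakdown_other_$$"
BREAKDOWN_NULL_STRING_LABEL = "$$_posthog_breakdown_null_$$"


def breakdown_option_label(value: str) -> str:
    # Sentinel buckets become selectable options with descriptive labels
    # rather than being dropped with `continue` as before.
    if value == BREAKDOWN_OTHER_STRING_LABEL:
        return "Other (Groups all remaining values)"
    if value == BREAKDOWN_NULL_STRING_LABEL:
        return "None (No value)"
    return value


assert breakdown_option_label("$$_posthog_breakdown_null_$$") == "None (No value)"
assert breakdown_option_label("Chrome") == "Chrome"
```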
series_object["breakdown_value"] = BREAKDOWN_OTHER_STRING_LABEL - if real_series_count > 1 or self._is_breakdown_field_boolean(): - series_object["label"] = "{} - {}".format(series_label or "All events", "Other") - else: - series_object["label"] = "Other" - res.append(series_object) return res diff --git a/posthog/schema.py b/posthog/schema.py index dc77da163db17..c88f8bf3f76de 100644 --- a/posthog/schema.py +++ b/posthog/schema.py @@ -418,6 +418,7 @@ class PersonsOnEventsMode(str, Enum): v1_enabled = "v1_enabled" v1_mixed = "v1_mixed" v2_enabled = "v2_enabled" + v3_enabled = "v3_enabled" class HogQLQueryModifiers(BaseModel): diff --git a/posthog/temporal/batch_exports/batch_exports.py b/posthog/temporal/batch_exports/batch_exports.py index c776e1f245ef3..88cf9e32f274f 100644 --- a/posthog/temporal/batch_exports/batch_exports.py +++ b/posthog/temporal/batch_exports/batch_exports.py @@ -1,15 +1,10 @@ import collections.abc -import csv import dataclasses import datetime as dt -import gzip -import tempfile import typing import uuid from string import Template -import brotli -import orjson import pyarrow as pa from asgiref.sync import sync_to_async from django.conf import settings @@ -22,7 +17,7 @@ create_batch_export_backfill, create_batch_export_run, update_batch_export_backfill_status, - update_batch_export_run_status, + update_batch_export_run, ) from posthog.temporal.batch_exports.metrics import ( get_export_finished_metric, @@ -286,202 +281,6 @@ def get_data_interval(interval: str, data_interval_end: str | None) -> tuple[dt. return (data_interval_start_dt, data_interval_end_dt) -def json_dumps_bytes(d) -> bytes: - return orjson.dumps(d, default=str) - - -class BatchExportTemporaryFile: - """A TemporaryFile used to as an intermediate step while exporting data. - - This class does not implement the file-like interface but rather passes any calls - to the underlying tempfile.NamedTemporaryFile. We do override 'write' methods - to allow tracking bytes and records. 
- """ - - def __init__( - self, - mode: str = "w+b", - buffering=-1, - compression: str | None = None, - encoding: str | None = None, - newline: str | None = None, - suffix: str | None = None, - prefix: str | None = None, - dir: str | None = None, - *, - errors: str | None = None, - ): - self._file = tempfile.NamedTemporaryFile( - mode=mode, - encoding=encoding, - newline=newline, - buffering=buffering, - suffix=suffix, - prefix=prefix, - dir=dir, - errors=errors, - ) - self.compression = compression - self.bytes_total = 0 - self.records_total = 0 - self.bytes_since_last_reset = 0 - self.records_since_last_reset = 0 - self._brotli_compressor = None - - def __getattr__(self, name): - """Pass get attr to underlying tempfile.NamedTemporaryFile.""" - return self._file.__getattr__(name) - - def __enter__(self): - """Context-manager protocol enter method.""" - self._file.__enter__() - return self - - def __exit__(self, exc, value, tb): - """Context-manager protocol exit method.""" - return self._file.__exit__(exc, value, tb) - - def __iter__(self): - yield from self._file - - @property - def brotli_compressor(self): - if self._brotli_compressor is None: - self._brotli_compressor = brotli.Compressor() - return self._brotli_compressor - - def compress(self, content: bytes | str) -> bytes: - if isinstance(content, str): - encoded = content.encode("utf-8") - else: - encoded = content - - match self.compression: - case "gzip": - return gzip.compress(encoded) - case "brotli": - self.brotli_compressor.process(encoded) - return self.brotli_compressor.flush() - case None: - return encoded - case _: - raise ValueError(f"Unsupported compression: '{self.compression}'") - - def write(self, content: bytes | str): - """Write bytes to underlying file keeping track of how many bytes were written.""" - compressed_content = self.compress(content) - - if "b" in self.mode: - result = self._file.write(compressed_content) - else: - result = self._file.write(compressed_content.decode("utf-8")) - - self.bytes_total += result - self.bytes_since_last_reset += result - - return result - - def write_record_as_bytes(self, record: bytes): - result = self.write(record) - - self.records_total += 1 - self.records_since_last_reset += 1 - - return result - - def write_records_to_jsonl(self, records): - """Write records to a temporary file as JSONL.""" - if len(records) == 1: - jsonl_dump = orjson.dumps(records[0], option=orjson.OPT_APPEND_NEWLINE, default=str) - else: - jsonl_dump = b"\n".join(map(json_dumps_bytes, records)) - - result = self.write(jsonl_dump) - - self.records_total += len(records) - self.records_since_last_reset += len(records) - - return result - - def write_records_to_csv( - self, - records, - fieldnames: None | collections.abc.Sequence[str] = None, - extrasaction: typing.Literal["raise", "ignore"] = "ignore", - delimiter: str = ",", - quotechar: str = '"', - escapechar: str | None = "\\", - lineterminator: str = "\n", - quoting=csv.QUOTE_NONE, - ): - """Write records to a temporary file as CSV.""" - if len(records) == 0: - return - - if fieldnames is None: - fieldnames = list(records[0].keys()) - - writer = csv.DictWriter( - self, - fieldnames=fieldnames, - extrasaction=extrasaction, - delimiter=delimiter, - quotechar=quotechar, - escapechar=escapechar, - quoting=quoting, - lineterminator=lineterminator, - ) - writer.writerows(records) - - self.records_total += len(records) - self.records_since_last_reset += len(records) - - def write_records_to_tsv( - self, - records, - fieldnames: None | list[str] = None, - 
extrasaction: typing.Literal["raise", "ignore"] = "ignore", - quotechar: str = '"', - escapechar: str | None = "\\", - lineterminator: str = "\n", - quoting=csv.QUOTE_NONE, - ): - """Write records to a temporary file as TSV.""" - return self.write_records_to_csv( - records, - fieldnames=fieldnames, - extrasaction=extrasaction, - delimiter="\t", - quotechar=quotechar, - escapechar=escapechar, - quoting=quoting, - lineterminator=lineterminator, - ) - - def rewind(self): - """Rewind the file before reading it.""" - if self.compression == "brotli": - result = self._file.write(self.brotli_compressor.finish()) - - self.bytes_total += result - self.bytes_since_last_reset += result - - self._brotli_compressor = None - - self._file.seek(0) - - def reset(self): - """Reset underlying file by truncating it. - - Also resets the tracker attributes for bytes and records since last reset. - """ - self._file.seek(0) - self._file.truncate() - - self.bytes_since_last_reset = 0 - self.records_since_last_reset = 0 - - @dataclasses.dataclass class CreateBatchExportRunInputs: """Inputs to the create_export_run activity. @@ -542,7 +341,7 @@ async def update_export_run_status(inputs: UpdateBatchExportRunStatusInputs) -> """Activity that updates the status of an BatchExportRun.""" logger = await bind_temporal_worker_logger(team_id=inputs.team_id) - batch_export_run = await sync_to_async(update_batch_export_run_status)( + batch_export_run = await sync_to_async(update_batch_export_run)( run_id=uuid.UUID(inputs.id), status=inputs.status, latest_error=inputs.latest_error, diff --git a/posthog/temporal/batch_exports/bigquery_batch_export.py b/posthog/temporal/batch_exports/bigquery_batch_export.py index a0469de79bb9e..b754a7add16b4 100644 --- a/posthog/temporal/batch_exports/bigquery_batch_export.py +++ b/posthog/temporal/batch_exports/bigquery_batch_export.py @@ -15,7 +15,6 @@ from posthog.batch_exports.service import BatchExportField, BatchExportSchema, BigQueryBatchExportInputs from posthog.temporal.batch_exports.base import PostHogWorkflow from posthog.temporal.batch_exports.batch_exports import ( - BatchExportTemporaryFile, CreateBatchExportRunInputs, UpdateBatchExportRunStatusInputs, create_export_run, @@ -29,6 +28,9 @@ get_bytes_exported_metric, get_rows_exported_metric, ) +from posthog.temporal.batch_exports.temporary_file import ( + BatchExportTemporaryFile, +) from posthog.temporal.batch_exports.utils import peek_first_and_rewind from posthog.temporal.common.clickhouse import get_client from posthog.temporal.common.logger import bind_temporal_worker_logger diff --git a/posthog/temporal/batch_exports/http_batch_export.py b/posthog/temporal/batch_exports/http_batch_export.py index 8aca65c80ff38..2866d50c99876 100644 --- a/posthog/temporal/batch_exports/http_batch_export.py +++ b/posthog/temporal/batch_exports/http_batch_export.py @@ -13,7 +13,6 @@ from posthog.models import BatchExportRun from posthog.temporal.batch_exports.base import PostHogWorkflow from posthog.temporal.batch_exports.batch_exports import ( - BatchExportTemporaryFile, CreateBatchExportRunInputs, UpdateBatchExportRunStatusInputs, create_export_run, @@ -21,12 +20,15 @@ get_data_interval, get_rows_count, iter_records, - json_dumps_bytes, ) from posthog.temporal.batch_exports.metrics import ( get_bytes_exported_metric, get_rows_exported_metric, ) +from posthog.temporal.batch_exports.temporary_file import ( + BatchExportTemporaryFile, + json_dumps_bytes, +) from posthog.temporal.common.clickhouse import get_client from 
posthog.temporal.common.logger import bind_temporal_worker_logger diff --git a/posthog/temporal/batch_exports/postgres_batch_export.py b/posthog/temporal/batch_exports/postgres_batch_export.py index 5dbfc6faa4acf..98969ee78de79 100644 --- a/posthog/temporal/batch_exports/postgres_batch_export.py +++ b/posthog/temporal/batch_exports/postgres_batch_export.py @@ -17,7 +17,6 @@ from posthog.batch_exports.service import BatchExportField, BatchExportSchema, PostgresBatchExportInputs from posthog.temporal.batch_exports.base import PostHogWorkflow from posthog.temporal.batch_exports.batch_exports import ( - BatchExportTemporaryFile, CreateBatchExportRunInputs, UpdateBatchExportRunStatusInputs, create_export_run, @@ -31,6 +30,9 @@ get_bytes_exported_metric, get_rows_exported_metric, ) +from posthog.temporal.batch_exports.temporary_file import ( + BatchExportTemporaryFile, +) from posthog.temporal.batch_exports.utils import peek_first_and_rewind from posthog.temporal.common.clickhouse import get_client from posthog.temporal.common.logger import bind_temporal_worker_logger diff --git a/posthog/temporal/batch_exports/s3_batch_export.py b/posthog/temporal/batch_exports/s3_batch_export.py index 4d99cbeffd7c3..e83fe3f12915d 100644 --- a/posthog/temporal/batch_exports/s3_batch_export.py +++ b/posthog/temporal/batch_exports/s3_batch_export.py @@ -1,4 +1,5 @@ import asyncio +import collections.abc import contextlib import datetime as dt import io @@ -8,6 +9,8 @@ from dataclasses import dataclass import aioboto3 +import orjson +import pyarrow as pa from django.conf import settings from temporalio import activity, workflow from temporalio.common import RetryPolicy @@ -16,7 +19,6 @@ from posthog.batch_exports.service import BatchExportField, BatchExportSchema, S3BatchExportInputs from posthog.temporal.batch_exports.base import PostHogWorkflow from posthog.temporal.batch_exports.batch_exports import ( - BatchExportTemporaryFile, CreateBatchExportRunInputs, UpdateBatchExportRunStatusInputs, create_export_run, @@ -30,6 +32,15 @@ get_bytes_exported_metric, get_rows_exported_metric, ) +from posthog.temporal.batch_exports.temporary_file import ( + BatchExportTemporaryFile, + BatchExportWriter, + FlushCallable, + JSONLBatchExportWriter, + ParquetBatchExportWriter, + UnsupportedFileFormatError, +) +from posthog.temporal.batch_exports.utils import peek_first_and_rewind from posthog.temporal.common.clickhouse import get_client from posthog.temporal.common.logger import bind_temporal_worker_logger @@ -50,19 +61,31 @@ def get_allowed_template_variables(inputs) -> dict[str, str]: } +FILE_FORMAT_EXTENSIONS = { + "Parquet": "parquet", + "JSONLines": "jsonl", +} + +COMPRESSION_EXTENSIONS = { + "gzip": "gz", + "snappy": "sz", + "brotli": "br", + "zstd": "zst", + "lz4": "lz4", +} + + def get_s3_key(inputs) -> str: """Return an S3 key given S3InsertInputs.""" template_variables = get_allowed_template_variables(inputs) key_prefix = inputs.prefix.format(**template_variables) + file_extension = FILE_FORMAT_EXTENSIONS[inputs.file_format] base_file_name = f"{inputs.data_interval_start}-{inputs.data_interval_end}" - match inputs.compression: - case "gzip": - file_name = base_file_name + ".jsonl.gz" - case "brotli": - file_name = base_file_name + ".jsonl.br" - case _: - file_name = base_file_name + ".jsonl" + if inputs.compression is not None: + file_name = base_file_name + f".{file_extension}.{COMPRESSION_EXTENSIONS[inputs.compression]}" + else: + file_name = base_file_name + f".{file_extension}" + key = posixpath.join(key_prefix, 
file_name) @@ -311,6 +334,8 @@ class S3InsertInputs: kms_key_id: str | None = None batch_export_schema: BatchExportSchema | None = None endpoint_url: str | None = None + # TODO: In Python 3.11, this could be an enum.StrEnum. + file_format: str = "JSONLines" async def initialize_and_resume_multipart_upload(inputs: S3InsertInputs) -> tuple[S3MultiPartUpload, str]: @@ -451,7 +476,7 @@ async def insert_into_s3_activity(inputs: S3InsertInputs) -> int: last_uploaded_part_timestamp: str | None = None - async def worker_shutdown_handler(): + async def worker_shutdown_handler() -> None: """Handle the Worker shutting down by heart-beating our latest status.""" await activity.wait_for_worker_shutdown() logger.warn( @@ -466,50 +491,147 @@ async def worker_shutdown_handler(): asyncio.create_task(worker_shutdown_handler()) - record = None - async with s3_upload as s3_upload: - with BatchExportTemporaryFile(compression=inputs.compression) as local_results_file: + + async def flush_to_s3( + local_results_file, + records_since_last_flush: int, + bytes_since_last_flush: int, + last_inserted_at: dt.datetime, + last: bool, + ): + nonlocal last_uploaded_part_timestamp + + logger.debug( + "Uploading %s part %s containing %s records with size %s bytes", + "last " if last else "", + s3_upload.part_number + 1, + records_since_last_flush, + bytes_since_last_flush, + ) + + await s3_upload.upload_part(local_results_file) + rows_exported.add(records_since_last_flush) + bytes_exported.add(bytes_since_last_flush) + + last_uploaded_part_timestamp = str(last_inserted_at) + activity.heartbeat(last_uploaded_part_timestamp, s3_upload.to_state()) + + first_record_batch, record_iterator = peek_first_and_rewind(record_iterator) + first_record_batch = cast_record_batch_json_columns(first_record_batch) + column_names = first_record_batch.column_names + column_names.pop(column_names.index("_inserted_at")) + + schema = pa.schema( + # NOTE: For some reason, some record batches mark fields as non-nullable, whereas other + # record batches have the same fields as nullable. + # Until we figure it out, we set all fields to nullable. There are some fields we know + # are not nullable, but I'm opting for the more flexible option until we figure out why schemas differ + # between batches. 
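`get_s3_key` now composes the object name from the configured file format plus an optional compression suffix instead of hard-coding `.jsonl`. A self-contained sketch of the same lookup, with the extension tables copied from the diff (Zstandard key spelled `zstd`, as corrected above):

```python
FILE_FORMAT_EXTENSIONS = {"Parquet": "parquet", "JSONLines": "jsonl"}
COMPRESSION_EXTENSIONS = {"gzip": "gz", "snappy": "sz", "brotli": "br", "zstd": "zst", "lz4": "lz4"}


def s3_file_name(start: str, end: str, file_format: str, compression: str | None) -> str:
    # Same composition as get_s3_key above, minus the key-prefix templating.
    name = f"{start}-{end}.{FILE_FORMAT_EXTENSIONS[file_format]}"
    if compression is not None:
        name = f"{name}.{COMPRESSION_EXTENSIONS[compression]}"
    return name


assert s3_file_name("2024-01-01", "2024-01-02", "Parquet", "brotli") == "2024-01-01-2024-01-02.parquet.br"
assert s3_file_name("2024-01-01", "2024-01-02", "JSONLines", None) == "2024-01-01-2024-01-02.jsonl"
```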
+ [field.with_nullable(True) for field in first_record_batch.select(column_names).schema] + ) + + writer = get_batch_export_writer( + inputs, + flush_callable=flush_to_s3, + max_bytes=settings.BATCH_EXPORT_S3_UPLOAD_CHUNK_SIZE_BYTES, + schema=schema, + ) + + async with writer.open_temporary_file(): rows_exported = get_rows_exported_metric() bytes_exported = get_bytes_exported_metric() - async def flush_to_s3(last_uploaded_part_timestamp: str, last=False): - logger.debug( - "Uploading %s part %s containing %s records with size %s bytes", - "last " if last else "", - s3_upload.part_number + 1, - local_results_file.records_since_last_reset, - local_results_file.bytes_since_last_reset, - ) + for record_batch in record_iterator: + record_batch = cast_record_batch_json_columns(record_batch) - await s3_upload.upload_part(local_results_file) - rows_exported.add(local_results_file.records_since_last_reset) - bytes_exported.add(local_results_file.bytes_since_last_reset) + await writer.write_record_batch(record_batch) - activity.heartbeat(last_uploaded_part_timestamp, s3_upload.to_state()) + await s3_upload.complete() - for record_batch in record_iterator: - for record in record_batch.to_pylist(): - for json_column in ("properties", "person_properties", "set", "set_once"): - if (json_str := record.get(json_column, None)) is not None: - record[json_column] = json.loads(json_str) + return writer.records_total - inserted_at = record.pop("_inserted_at") - local_results_file.write_records_to_jsonl([record]) +def get_batch_export_writer( + inputs: S3InsertInputs, flush_callable: FlushCallable, max_bytes: int, schema: pa.Schema | None = None +) -> BatchExportWriter: + """Return the `BatchExportWriter` corresponding to configured `file_format`. - if local_results_file.tell() > settings.BATCH_EXPORT_S3_UPLOAD_CHUNK_SIZE_BYTES: - last_uploaded_part_timestamp = str(inserted_at) - await flush_to_s3(last_uploaded_part_timestamp) - local_results_file.reset() + Raises: + UnsupportedFileFormatError: If no writer exists for given `file_format`. + """ + writer: BatchExportWriter - if local_results_file.tell() > 0 and record is not None: - last_uploaded_part_timestamp = str(inserted_at) - await flush_to_s3(last_uploaded_part_timestamp, last=True) + if inputs.file_format == "Parquet": + writer = ParquetBatchExportWriter( + max_bytes=max_bytes, + flush_callable=flush_callable, + compression=inputs.compression, + schema=schema, + ) + elif inputs.file_format == "JSONLines": + writer = JSONLBatchExportWriter( + max_bytes=settings.BATCH_EXPORT_S3_UPLOAD_CHUNK_SIZE_BYTES, + flush_callable=flush_callable, + compression=inputs.compression, + ) + else: + raise UnsupportedFileFormatError(inputs.file_format, "S3") - await s3_upload.complete() + return writer + + +def cast_record_batch_json_columns( + record_batch: pa.RecordBatch, + json_columns: collections.abc.Sequence = ("properties", "person_properties", "set", "set_once"), +) -> pa.RecordBatch: + """Cast json_columns in record_batch to JsonType. + + We return a new RecordBatch with any json_columns replaced by fields casted to JsonType. + Casting is not copying the underlying array buffers, so memory usage does not increase when creating + the new array or the new record batch. 
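The writers returned by `get_batch_export_writer` buffer record batches into the staging file and invoke the flush callback whenever `max_bytes` is crossed, plus once for the tail. A tiny runnable stand-in showing just the callback contract that `flush_to_s3` satisfies:

```python
import asyncio
import datetime as dt


# Stand-in for flush_to_s3 above: the real callback uploads one multipart part
# per flush, bumps the rows/bytes metrics, and heartbeats the last uploaded
# timestamp so a retried activity can resume mid-export.
async def fake_flush(local_file, records: int, size: int, last_inserted_at: dt.datetime, last: bool) -> None:
    print(f"part uploaded: {records} records / {size} bytes (last={last}, up to {last_inserted_at})")


asyncio.run(fake_flush(None, 5000, 1 << 20, dt.datetime(2024, 1, 1, tzinfo=dt.timezone.utc), last=True))
```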
+ """ + column_names = set(record_batch.column_names) + intersection = column_names & set(json_columns) + + casted_arrays = [] + for array in record_batch.select(intersection): + if pa.types.is_string(array.type): + casted_array = array.cast(JsonType()) + casted_arrays.append(casted_array) + + remaining_column_names = list(column_names - intersection) + return pa.RecordBatch.from_arrays( + record_batch.select(remaining_column_names).columns + casted_arrays, + names=remaining_column_names + list(intersection), + ) + + +class JsonScalar(pa.ExtensionScalar): + """Represents a JSON binary string.""" + + def as_py(self) -> dict | None: + if self.value: + return orjson.loads(self.value.as_py().encode("utf-8")) + else: + return None + + +class JsonType(pa.ExtensionType): + """Type for JSON binary strings.""" + + def __init__(self): + super().__init__(pa.string(), "json") + + def __arrow_ext_serialize__(self): + return b"" + + @classmethod + def __arrow_ext_deserialize__(self, storage_type, serialized): + return JsonType() - return local_results_file.records_total + def __arrow_ext_scalar_class__(self): + return JsonScalar @workflow.defn(name="s3-export") @@ -572,6 +694,7 @@ async def run(self, inputs: S3BatchExportInputs): encryption=inputs.encryption, kms_key_id=inputs.kms_key_id, batch_export_schema=inputs.batch_export_schema, + file_format=inputs.file_format, ) await execute_batch_export_insert_activity( diff --git a/posthog/temporal/batch_exports/snowflake_batch_export.py b/posthog/temporal/batch_exports/snowflake_batch_export.py index be94eca89a799..9053f3e1006ad 100644 --- a/posthog/temporal/batch_exports/snowflake_batch_export.py +++ b/posthog/temporal/batch_exports/snowflake_batch_export.py @@ -18,7 +18,6 @@ from posthog.batch_exports.service import BatchExportField, BatchExportSchema, SnowflakeBatchExportInputs from posthog.temporal.batch_exports.base import PostHogWorkflow from posthog.temporal.batch_exports.batch_exports import ( - BatchExportTemporaryFile, CreateBatchExportRunInputs, UpdateBatchExportRunStatusInputs, create_export_run, @@ -32,6 +31,9 @@ get_bytes_exported_metric, get_rows_exported_metric, ) +from posthog.temporal.batch_exports.temporary_file import ( + BatchExportTemporaryFile, +) from posthog.temporal.batch_exports.utils import peek_first_and_rewind from posthog.temporal.common.clickhouse import get_client from posthog.temporal.common.logger import bind_temporal_worker_logger diff --git a/posthog/temporal/batch_exports/temporary_file.py b/posthog/temporal/batch_exports/temporary_file.py new file mode 100644 index 0000000000000..f955f45553727 --- /dev/null +++ b/posthog/temporal/batch_exports/temporary_file.py @@ -0,0 +1,528 @@ +"""This module contains a temporary file to stage data in batch exports.""" +import abc +import collections.abc +import contextlib +import csv +import datetime as dt +import gzip +import tempfile +import typing + +import brotli +import orjson +import pyarrow as pa +import pyarrow.parquet as pq + + +def json_dumps_bytes(d) -> bytes: + return orjson.dumps(d, default=str) + + +class BatchExportTemporaryFile: + """A TemporaryFile used to as an intermediate step while exporting data. + + This class does not implement the file-like interface but rather passes any calls + to the underlying tempfile.NamedTemporaryFile. We do override 'write' methods + to allow tracking bytes and records. 
+ """ + + def __init__( + self, + mode: str = "w+b", + buffering=-1, + compression: str | None = None, + encoding: str | None = None, + newline: str | None = None, + suffix: str | None = None, + prefix: str | None = None, + dir: str | None = None, + *, + errors: str | None = None, + ): + self._file = tempfile.NamedTemporaryFile( + mode=mode, + encoding=encoding, + newline=newline, + buffering=buffering, + suffix=suffix, + prefix=prefix, + dir=dir, + errors=errors, + ) + self.compression = compression + self.bytes_total = 0 + self.records_total = 0 + self.bytes_since_last_reset = 0 + self.records_since_last_reset = 0 + self._brotli_compressor = None + + def __getattr__(self, name): + """Pass get attr to underlying tempfile.NamedTemporaryFile.""" + return self._file.__getattr__(name) + + def __enter__(self): + """Context-manager protocol enter method.""" + self._file.__enter__() + return self + + def __exit__(self, exc, value, tb): + """Context-manager protocol exit method.""" + return self._file.__exit__(exc, value, tb) + + def __iter__(self): + yield from self._file + + @property + def brotli_compressor(self): + if self._brotli_compressor is None: + self._brotli_compressor = brotli.Compressor() + return self._brotli_compressor + + def finish_brotli_compressor(self): + """Flush remaining brotli bytes.""" + # TODO: Move compression out of `BatchExportTemporaryFile` to a standard class for all writers. + if self.compression != "brotli": + raise ValueError(f"Compression is '{self.compression}', not 'brotli'") + + result = self._file.write(self.brotli_compressor.finish()) + self.bytes_total += result + self.bytes_since_last_reset += result + self._brotli_compressor = None + + def compress(self, content: bytes | str) -> bytes: + if isinstance(content, str): + encoded = content.encode("utf-8") + else: + encoded = content + + match self.compression: + case "gzip": + return gzip.compress(encoded) + case "brotli": + self.brotli_compressor.process(encoded) + return self.brotli_compressor.flush() + case None: + return encoded + case _: + raise ValueError(f"Unsupported compression: '{self.compression}'") + + def write(self, content: bytes | str): + """Write bytes to underlying file keeping track of how many bytes were written.""" + compressed_content = self.compress(content) + + if "b" in self.mode: + result = self._file.write(compressed_content) + else: + result = self._file.write(compressed_content.decode("utf-8")) + + self.bytes_total += result + self.bytes_since_last_reset += result + + return result + + def write_record_as_bytes(self, record: bytes): + result = self.write(record) + + self.records_total += 1 + self.records_since_last_reset += 1 + + return result + + def write_records_to_jsonl(self, records): + """Write records to a temporary file as JSONL.""" + if len(records) == 1: + jsonl_dump = orjson.dumps(records[0], option=orjson.OPT_APPEND_NEWLINE, default=str) + else: + jsonl_dump = b"\n".join(map(json_dumps_bytes, records)) + + result = self.write(jsonl_dump) + + self.records_total += len(records) + self.records_since_last_reset += len(records) + + return result + + def write_records_to_csv( + self, + records, + fieldnames: None | collections.abc.Sequence[str] = None, + extrasaction: typing.Literal["raise", "ignore"] = "ignore", + delimiter: str = ",", + quotechar: str = '"', + escapechar: str | None = "\\", + lineterminator: str = "\n", + quoting=csv.QUOTE_NONE, + ): + """Write records to a temporary file as CSV.""" + if len(records) == 0: + return + + if fieldnames is None: + fieldnames 
= list(records[0].keys())
+
+        writer = csv.DictWriter(
+            self,
+            fieldnames=fieldnames,
+            extrasaction=extrasaction,
+            delimiter=delimiter,
+            quotechar=quotechar,
+            escapechar=escapechar,
+            quoting=quoting,
+            lineterminator=lineterminator,
+        )
+        writer.writerows(records)
+
+        self.records_total += len(records)
+        self.records_since_last_reset += len(records)
+
+    def write_records_to_tsv(
+        self,
+        records,
+        fieldnames: None | list[str] = None,
+        extrasaction: typing.Literal["raise", "ignore"] = "ignore",
+        quotechar: str = '"',
+        escapechar: str | None = "\\",
+        lineterminator: str = "\n",
+        quoting=csv.QUOTE_NONE,
+    ):
+        """Write records to a temporary file as TSV."""
+        return self.write_records_to_csv(
+            records,
+            fieldnames=fieldnames,
+            extrasaction=extrasaction,
+            delimiter="\t",
+            quotechar=quotechar,
+            escapechar=escapechar,
+            quoting=quoting,
+            lineterminator=lineterminator,
+        )
+
+    def rewind(self):
+        """Rewind the file before reading it."""
+        self._file.seek(0)
+
+    def reset(self):
+        """Reset underlying file by truncating it.
+
+        Also resets the tracker attributes for bytes and records since last reset.
+        """
+        self._file.seek(0)
+        self._file.truncate()
+
+        self.bytes_since_last_reset = 0
+        self.records_since_last_reset = 0
+
+
+LastInsertedAt = dt.datetime
+IsLast = bool
+RecordsSinceLastFlush = int
+BytesSinceLastFlush = int
+FlushCallable = collections.abc.Callable[
+    [BatchExportTemporaryFile, RecordsSinceLastFlush, BytesSinceLastFlush, LastInsertedAt, IsLast],
+    collections.abc.Awaitable[None],
+]
+
+
+class UnsupportedFileFormatError(Exception):
+    """Raised when a writer for an unsupported file format is requested."""
+
+    def __init__(self, file_format: str, destination: str):
+        super().__init__(f"{file_format} is not a supported format for {destination} batch exports.")
+
+
+class BatchExportWriter(abc.ABC):
+    """A temporary file writer to be used by batch export workflows.
+
+    Subclasses should define `_write_record_batch` with the particular intricacies
+    of the format they are writing.
+
+    Actual writing calls are passed to the underlying `batch_export_file`.
+
+    Attributes:
+        _batch_export_file: The temporary file we are writing to.
+        max_bytes: Flush the temporary file with the provided `flush_callable`
+            upon reaching or surpassing this threshold. Keep in mind we write one RecordBatch
+            at a time, which means the threshold will be surpassed by at most the
+            size of a RecordBatch before a flush occurs.
+        flush_callable: A callback to flush the temporary file when `max_bytes` is reached.
+            The temporary file will be reset after calling `flush_callable`. When calling
+            `flush_callable` the following positional arguments will be passed: The temporary file
+            that must be flushed, the number of records since the last flush, the number of bytes
+            since the last flush, the latest recorded `_inserted_at`, and a `bool` indicating if
+            this is the last flush (when exiting the context manager).
+        file_kwargs: Optional keyword arguments passed when initializing `_batch_export_file`.
+        last_inserted_at: Latest `_inserted_at` written. This attribute leaks some implementation
+            details, as we assume `_inserted_at` is present, since it's added to all
+            batch export queries.
+        records_total: The total number of records (not RecordBatches!) written.
+        records_since_last_flush: The number of records written since last flush.
+        bytes_total: The total number of bytes written.
+        bytes_since_last_flush: The number of bytes written since last flush.
+ """ + + def __init__( + self, + flush_callable: FlushCallable, + max_bytes: int, + file_kwargs: collections.abc.Mapping[str, typing.Any] | None = None, + ): + self.flush_callable = flush_callable + self.max_bytes = max_bytes + self.file_kwargs: collections.abc.Mapping[str, typing.Any] = file_kwargs or {} + + self._batch_export_file: BatchExportTemporaryFile | None = None + self.reset_writer_tracking() + + def reset_writer_tracking(self): + """Reset this writer's tracking state.""" + self.last_inserted_at: dt.datetime | None = None + self.records_total = 0 + self.records_since_last_flush = 0 + self.bytes_total = 0 + self.bytes_since_last_flush = 0 + + @contextlib.asynccontextmanager + async def open_temporary_file(self): + """Explicitly open the temporary file this writer is writing to. + + The underlying `BatchExportTemporaryFile` is only accessible within this context manager. This helps + us separate the lifetime of the underlying temporary file from the writer: The writer may still be + accessed even after the temporary file is closed, while on the other hand we ensure the file and all + its data is flushed and not leaked outside the context. Any relevant tracking information is copied + to the writer. + """ + self.reset_writer_tracking() + + with BatchExportTemporaryFile(**self.file_kwargs) as temp_file: + self._batch_export_file = temp_file + + try: + yield + finally: + self.track_bytes_written(temp_file) + + if self.last_inserted_at is not None and self.bytes_since_last_flush > 0: + # `bytes_since_last_flush` should be 0 unless: + # 1. The last batch wasn't flushed as it didn't reach `max_bytes`. + # 2. The last batch was flushed but there was another write after the last call to + # `write_record_batch`. For example, footer bytes. + await self.flush(self.last_inserted_at, is_last=True) + + self._batch_export_file = None + + @property + def batch_export_file(self): + """Property for underlying temporary file. + + Raises: + ValueError: if attempting to access the temporary file before it has been opened. + """ + if self._batch_export_file is None: + raise ValueError("Batch export file is closed. Did you forget to call 'open_temporary_file'?") + return self._batch_export_file + + @abc.abstractmethod + def _write_record_batch(self, record_batch: pa.RecordBatch) -> None: + """Write a record batch to the underlying `BatchExportTemporaryFile`. + + Subclasses must override this to provide the actual implementation according to the supported + file format. 
+ """ + pass + + def track_records_written(self, record_batch: pa.RecordBatch) -> None: + """Update this writer's state with the number of records in `record_batch`.""" + self.records_total += record_batch.num_rows + self.records_since_last_flush += record_batch.num_rows + + def track_bytes_written(self, batch_export_file: BatchExportTemporaryFile) -> None: + """Update this writer's state with the bytes in `batch_export_file`.""" + self.bytes_total = batch_export_file.bytes_total + self.bytes_since_last_flush = batch_export_file.bytes_since_last_reset + + async def write_record_batch(self, record_batch: pa.RecordBatch) -> None: + """Issue a record batch write tracking progress and flushing if required.""" + record_batch = record_batch.sort_by("_inserted_at") + last_inserted_at = record_batch.column("_inserted_at")[-1].as_py() + + column_names = record_batch.column_names + column_names.pop(column_names.index("_inserted_at")) + + self._write_record_batch(record_batch.select(column_names)) + + self.last_inserted_at = last_inserted_at + self.track_records_written(record_batch) + self.track_bytes_written(self.batch_export_file) + + if self.bytes_since_last_flush >= self.max_bytes: + await self.flush(last_inserted_at) + + async def flush(self, last_inserted_at: dt.datetime, is_last: bool = False) -> None: + """Call the provided `flush_callable` and reset underlying file. + + The underlying batch export temporary file will be reset after calling `flush_callable`. + """ + if is_last is True and self.batch_export_file.compression == "brotli": + self.batch_export_file.finish_brotli_compressor() + + self.batch_export_file.seek(0) + + await self.flush_callable( + self.batch_export_file, + self.records_since_last_flush, + self.bytes_since_last_flush, + last_inserted_at, + is_last, + ) + self.batch_export_file.reset() + + self.records_since_last_flush = 0 + self.bytes_since_last_flush = 0 + + +class JSONLBatchExportWriter(BatchExportWriter): + """A `BatchExportWriter` for JSONLines format. + + Attributes: + default: The default function to use to cast non-serializable Python objects to serializable objects. + By default, non-serializable objects will be cast to string via `str()`. 
+ """ + + def __init__( + self, + max_bytes: int, + flush_callable: FlushCallable, + compression: None | str = None, + default: typing.Callable = str, + ): + super().__init__( + max_bytes=max_bytes, + flush_callable=flush_callable, + file_kwargs={"compression": compression}, + ) + + self.default = default + + def write(self, content: bytes) -> int: + """Write a single row of JSONL.""" + n = self.batch_export_file.write(orjson.dumps(content, default=str) + b"\n") + return n + + def _write_record_batch(self, record_batch: pa.RecordBatch) -> None: + """Write records to a temporary file as JSONL.""" + for record in record_batch.to_pylist(): + self.write(record) + + +class CSVBatchExportWriter(BatchExportWriter): + """A `BatchExportWriter` for CSV format.""" + + def __init__( + self, + max_bytes: int, + flush_callable: FlushCallable, + field_names: collections.abc.Sequence[str], + extras_action: typing.Literal["raise", "ignore"] = "ignore", + delimiter: str = ",", + quote_char: str = '"', + escape_char: str | None = "\\", + line_terminator: str = "\n", + quoting=csv.QUOTE_NONE, + compression: str | None = None, + ): + super().__init__( + max_bytes=max_bytes, + flush_callable=flush_callable, + file_kwargs={"compression": compression}, + ) + self.field_names = field_names + self.extras_action: typing.Literal["raise", "ignore"] = extras_action + self.delimiter = delimiter + self.quote_char = quote_char + self.escape_char = escape_char + self.line_terminator = line_terminator + self.quoting = quoting + + self._csv_writer: csv.DictWriter | None = None + + @property + def csv_writer(self) -> csv.DictWriter: + if self._csv_writer is None: + self._csv_writer = csv.DictWriter( + self.batch_export_file, + fieldnames=self.field_names, + extrasaction=self.extras_action, + delimiter=self.delimiter, + quotechar=self.quote_char, + escapechar=self.escape_char, + quoting=self.quoting, + lineterminator=self.line_terminator, + ) + + return self._csv_writer + + def _write_record_batch(self, record_batch: pa.RecordBatch) -> None: + """Write records to a temporary file as CSV.""" + self.csv_writer.writerows(record_batch.to_pylist()) + + +class ParquetBatchExportWriter(BatchExportWriter): + """A `BatchExportWriter` for Apache Parquet format. + + We utilize and wrap a `pyarrow.parquet.ParquetWriter` to do the actual writing. We default to their + defaults for most parameters; however this class could be extended with more attributes to pass along + to `pyarrow.parquet.ParquetWriter`. + + See the pyarrow docs for more details on what parameters can the writer be configured with: + https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html + + In contrast to other writers, instead of us handling compression we let `pyarrow.parquet.ParquetWriter` + handle it, so `BatchExportTemporaryFile` is always initialized with `compression=None`. + + Attributes: + schema: The schema used by the Parquet file. Should match the schema of written RecordBatches. + compression: Compression codec passed to underlying `pyarrow.parquet.ParquetWriter`. 
+ """ + + def __init__( + self, + max_bytes: int, + flush_callable: FlushCallable, + schema: pa.Schema, + compression: str | None = "snappy", + ): + super().__init__( + max_bytes=max_bytes, + flush_callable=flush_callable, + file_kwargs={"compression": None}, # ParquetWriter handles compression + ) + self.schema = schema + self.compression = compression + + self._parquet_writer: pq.ParquetWriter | None = None + + @property + def parquet_writer(self) -> pq.ParquetWriter: + if self._parquet_writer is None: + self._parquet_writer = pq.ParquetWriter( + self.batch_export_file, + schema=self.schema, + compression="none" if self.compression is None else self.compression, + ) + return self._parquet_writer + + @contextlib.asynccontextmanager + async def open_temporary_file(self): + """Ensure underlying Parquet writer is closed before flushing and closing temporary file.""" + async with super().open_temporary_file(): + try: + yield + finally: + if self._parquet_writer is not None: + self._parquet_writer.writer.close() + self._parquet_writer = None + + def _write_record_batch(self, record_batch: pa.RecordBatch) -> None: + """Write records to a temporary file as Parquet.""" + + self.parquet_writer.write_batch(record_batch.select(self.parquet_writer.schema.names)) diff --git a/posthog/temporal/tests/batch_exports/test_batch_exports.py b/posthog/temporal/tests/batch_exports/test_batch_exports.py index 0afbfcabb71cb..756c07e442e4f 100644 --- a/posthog/temporal/tests/batch_exports/test_batch_exports.py +++ b/posthog/temporal/tests/batch_exports/test_batch_exports.py @@ -1,6 +1,4 @@ -import csv import datetime as dt -import io import json import operator from random import randint @@ -9,11 +7,9 @@ from django.test import override_settings from posthog.temporal.batch_exports.batch_exports import ( - BatchExportTemporaryFile, get_data_interval, get_rows_count, iter_records, - json_dumps_bytes, ) from posthog.temporal.tests.utils.events import generate_test_events_in_clickhouse @@ -558,181 +554,3 @@ def test_get_data_interval(interval, data_interval_end, expected): """Test get_data_interval returns the expected data interval tuple.""" result = get_data_interval(interval, data_interval_end) assert result == expected - - -@pytest.mark.parametrize( - "to_write", - [ - (b"",), - (b"", b""), - (b"12345",), - (b"12345", b"12345"), - (b"abbcccddddeeeee",), - (b"abbcccddddeeeee", b"abbcccddddeeeee"), - ], -) -def test_batch_export_temporary_file_tracks_bytes(to_write): - """Test the bytes written by BatchExportTemporaryFile match expected.""" - with BatchExportTemporaryFile() as be_file: - for content in to_write: - be_file.write(content) - - assert be_file.bytes_total == sum(len(content) for content in to_write) - assert be_file.bytes_since_last_reset == sum(len(content) for content in to_write) - - be_file.reset() - - assert be_file.bytes_total == sum(len(content) for content in to_write) - assert be_file.bytes_since_last_reset == 0 - - -TEST_RECORDS = [ - [], - [ - {"id": "record-1", "property": "value", "property_int": 1}, - {"id": "record-2", "property": "another-value", "property_int": 2}, - { - "id": "record-3", - "property": {"id": "nested-record", "property": "nested-value"}, - "property_int": 3, - }, - ], -] - - -@pytest.mark.parametrize( - "records", - TEST_RECORDS, -) -def test_batch_export_temporary_file_write_records_to_jsonl(records): - """Test JSONL records written by BatchExportTemporaryFile match expected.""" - jsonl_dump = b"\n".join(map(json_dumps_bytes, records)) - - with 
BatchExportTemporaryFile() as be_file: - be_file.write_records_to_jsonl(records) - - assert be_file.bytes_total == len(jsonl_dump) - assert be_file.bytes_since_last_reset == len(jsonl_dump) - assert be_file.records_total == len(records) - assert be_file.records_since_last_reset == len(records) - - be_file.seek(0) - lines = be_file.readlines() - assert len(lines) == len(records) - - for line_index, jsonl_record in enumerate(lines): - json_loaded = json.loads(jsonl_record) - assert json_loaded == records[line_index] - - be_file.reset() - - assert be_file.bytes_total == len(jsonl_dump) - assert be_file.bytes_since_last_reset == 0 - assert be_file.records_total == len(records) - assert be_file.records_since_last_reset == 0 - - -@pytest.mark.parametrize( - "records", - TEST_RECORDS, -) -def test_batch_export_temporary_file_write_records_to_csv(records): - """Test CSV written by BatchExportTemporaryFile match expected.""" - in_memory_file_obj = io.StringIO() - writer = csv.DictWriter( - in_memory_file_obj, - fieldnames=records[0].keys() if len(records) > 0 else [], - delimiter=",", - quotechar='"', - escapechar="\\", - lineterminator="\n", - quoting=csv.QUOTE_NONE, - ) - writer.writerows(records) - - with BatchExportTemporaryFile(mode="w+") as be_file: - be_file.write_records_to_csv(records) - - assert be_file.bytes_total == in_memory_file_obj.tell() - assert be_file.bytes_since_last_reset == in_memory_file_obj.tell() - assert be_file.records_total == len(records) - assert be_file.records_since_last_reset == len(records) - - be_file.seek(0) - reader = csv.reader( - be_file._file, - delimiter=",", - quotechar='"', - escapechar="\\", - quoting=csv.QUOTE_NONE, - ) - - rows = [row for row in reader] - assert len(rows) == len(records) - - for row_index, csv_record in enumerate(rows): - for value_index, value in enumerate(records[row_index].values()): - # Everything returned by csv.reader is a str. - # This means type information is lost when writing to CSV - # but this just a limitation of the format. - assert csv_record[value_index] == str(value) - - be_file.reset() - - assert be_file.bytes_total == in_memory_file_obj.tell() - assert be_file.bytes_since_last_reset == 0 - assert be_file.records_total == len(records) - assert be_file.records_since_last_reset == 0 - - -@pytest.mark.parametrize( - "records", - TEST_RECORDS, -) -def test_batch_export_temporary_file_write_records_to_tsv(records): - """Test TSV written by BatchExportTemporaryFile match expected.""" - in_memory_file_obj = io.StringIO() - writer = csv.DictWriter( - in_memory_file_obj, - fieldnames=records[0].keys() if len(records) > 0 else [], - delimiter="\t", - quotechar='"', - escapechar="\\", - lineterminator="\n", - quoting=csv.QUOTE_NONE, - ) - writer.writerows(records) - - with BatchExportTemporaryFile(mode="w+") as be_file: - be_file.write_records_to_tsv(records) - - assert be_file.bytes_total == in_memory_file_obj.tell() - assert be_file.bytes_since_last_reset == in_memory_file_obj.tell() - assert be_file.records_total == len(records) - assert be_file.records_since_last_reset == len(records) - - be_file.seek(0) - reader = csv.reader( - be_file._file, - delimiter="\t", - quotechar='"', - escapechar="\\", - quoting=csv.QUOTE_NONE, - ) - - rows = [row for row in reader] - assert len(rows) == len(records) - - for row_index, csv_record in enumerate(rows): - for value_index, value in enumerate(records[row_index].values()): - # Everything returned by csv.reader is a str. 
-            # This means type information is lost when writing to CSV
-            # but this just a limitation of the format.
-            assert csv_record[value_index] == str(value)
-
-        be_file.reset()
-
-        assert be_file.bytes_total == in_memory_file_obj.tell()
-        assert be_file.bytes_since_last_reset == 0
-        assert be_file.records_total == len(records)
-        assert be_file.records_since_last_reset == 0
diff --git a/posthog/temporal/tests/batch_exports/test_s3_batch_export_workflow.py b/posthog/temporal/tests/batch_exports/test_s3_batch_export_workflow.py
index e04e345d11245..e6583d049e2a8 100644
--- a/posthog/temporal/tests/batch_exports/test_s3_batch_export_workflow.py
+++ b/posthog/temporal/tests/batch_exports/test_s3_batch_export_workflow.py
@@ -10,10 +10,12 @@
 import aioboto3
 import botocore.exceptions
 import brotli
+import pyarrow.parquet as pq
 import pytest
 import pytest_asyncio
 from django.conf import settings
 from django.test import override_settings
+from pyarrow import fs
 from temporalio import activity
 from temporalio.client import WorkflowFailureError
 from temporalio.common import RetryPolicy
@@ -27,6 +29,7 @@
     update_export_run_status,
 )
 from posthog.temporal.batch_exports.s3_batch_export import (
+    FILE_FORMAT_EXTENSIONS,
     HeartbeatDetails,
     S3BatchExportInputs,
     S3BatchExportWorkflow,
@@ -107,6 +110,15 @@ def s3_key_prefix():
     return f"posthog-events-{str(uuid4())}"


+@pytest.fixture
+def file_format(request) -> str:
+    """S3 file format."""
+    try:
+        return request.param
+    except AttributeError:
+        return "JSONLines"
+
+
 async def delete_all_from_s3(minio_client, bucket_name: str, key_prefix: str):
     """Delete all objects in bucket_name under key_prefix."""
     response = await minio_client.list_objects_v2(Bucket=bucket_name, Prefix=key_prefix)
@@ -138,6 +150,61 @@ async def minio_client(bucket_name):
         await minio_client.delete_bucket(Bucket=bucket_name)


+async def read_parquet_from_s3(bucket_name: str, key: str, json_columns) -> list:
+    async with aioboto3.Session().client("sts") as sts:
+        try:
+            await sts.get_caller_identity()
+        except botocore.exceptions.NoCredentialsError:
+            s3 = fs.S3FileSystem(
+                access_key="object_storage_root_user",
+                secret_key="object_storage_root_password",
+                endpoint_override=settings.OBJECT_STORAGE_ENDPOINT,
+            )
+
+        else:
+            if os.getenv("S3_TEST_BUCKET") is not None:
+                s3 = fs.S3FileSystem()
+            else:
+                s3 = fs.S3FileSystem(
+                    access_key="object_storage_root_user",
+                    secret_key="object_storage_root_password",
+                    endpoint_override=settings.OBJECT_STORAGE_ENDPOINT,
+                )
+
+    table = pq.read_table(f"{bucket_name}/{key}", filesystem=s3)
+
+    parquet_data = []
+    for batch in table.to_batches():
+        for record in batch.to_pylist():
+            casted_record = {}
+            for k, v in record.items():
+                if isinstance(v, dt.datetime):
+                    # We read data from ClickHouse as strings, but Parquet has already cast these to datetimes.
+                    # To facilitate comparison, we isoformat the dates.
+                    casted_record[k] = v.isoformat()
+                elif k in json_columns and v is not None:
+                    # Parquet doesn't have a variable map type, so JSON fields are just strings.
+ casted_record[k] = json.loads(v) + else: + casted_record[k] = v + parquet_data.append(casted_record) + + return parquet_data + + +def read_s3_data_as_json(data: bytes, compression: str | None) -> list: + match compression: + case "gzip": + data = gzip.decompress(data) + case "brotli": + data = brotli.decompress(data) + case _: + pass + + json_data = [json.loads(line) for line in data.decode("utf-8").split("\n") if line] + return json_data + + async def assert_clickhouse_records_in_s3( s3_compatible_client, clickhouse_client: ClickHouseClient, @@ -150,6 +217,7 @@ async def assert_clickhouse_records_in_s3( include_events: list[str] | None = None, batch_export_schema: BatchExportSchema | None = None, compression: str | None = None, + file_format: str = "JSONLines", ): """Assert ClickHouse records are written to JSON in key_prefix in S3 bucket_name. @@ -175,28 +243,24 @@ async def assert_clickhouse_records_in_s3( # Get the object. key = objects["Contents"][0].get("Key") assert key - s3_object = await s3_compatible_client.get_object(Bucket=bucket_name, Key=key) - data = await s3_object["Body"].read() - # Check that the data is correct. - match compression: - case "gzip": - data = gzip.decompress(data) - case "brotli": - data = brotli.decompress(data) - case _: - pass + json_columns = ("properties", "person_properties", "set", "set_once") - json_data = [json.loads(line) for line in data.decode("utf-8").split("\n") if line] - # Pull out the fields we inserted only + if file_format == "Parquet": + s3_data = await read_parquet_from_s3(bucket_name, key, json_columns) + + elif file_format == "JSONLines": + s3_object = await s3_compatible_client.get_object(Bucket=bucket_name, Key=key) + data = await s3_object["Body"].read() + s3_data = read_s3_data_as_json(data, compression) + else: + raise ValueError(f"Unsupported file format: {file_format}") if batch_export_schema is not None: schema_column_names = [field["alias"] for field in batch_export_schema["fields"]] else: schema_column_names = [field["alias"] for field in s3_default_fields()] - json_columns = ("properties", "person_properties", "set", "set_once") - expected_records = [] for record_batch in iter_records( client=clickhouse_client, @@ -225,9 +289,9 @@ async def assert_clickhouse_records_in_s3( expected_records.append(expected_record) - assert len(json_data) == len(expected_records) - assert json_data[0] == expected_records[0] - assert json_data == expected_records + assert len(s3_data) == len(expected_records) + assert s3_data[0] == expected_records[0] + assert s3_data == expected_records TEST_S3_SCHEMAS: list[BatchExportSchema | None] = [ @@ -255,6 +319,7 @@ async def assert_clickhouse_records_in_s3( @pytest.mark.parametrize("compression", [None, "gzip", "brotli"], indirect=True) @pytest.mark.parametrize("exclude_events", [None, ["test-exclude"]], indirect=True) @pytest.mark.parametrize("batch_export_schema", TEST_S3_SCHEMAS) +@pytest.mark.parametrize("file_format", FILE_FORMAT_EXTENSIONS.keys()) async def test_insert_into_s3_activity_puts_data_into_s3( clickhouse_client, bucket_name, @@ -262,6 +327,7 @@ async def test_insert_into_s3_activity_puts_data_into_s3( activity_environment, compression, exclude_events, + file_format, batch_export_schema: BatchExportSchema | None, ): """Test that the insert_into_s3_activity function ends up with data into S3. 
@@ -339,12 +405,15 @@ async def test_insert_into_s3_activity_puts_data_into_s3( compression=compression, exclude_events=exclude_events, batch_export_schema=batch_export_schema, + file_format=file_format, ) with override_settings( BATCH_EXPORT_S3_UPLOAD_CHUNK_SIZE_BYTES=5 * 1024**2 ): # 5MB, the minimum for Multipart uploads - await activity_environment.run(insert_into_s3_activity, insert_inputs) + records_total = await activity_environment.run(insert_into_s3_activity, insert_inputs) + + assert records_total == 10005 await assert_clickhouse_records_in_s3( s3_compatible_client=minio_client, @@ -358,6 +427,7 @@ async def test_insert_into_s3_activity_puts_data_into_s3( exclude_events=exclude_events, include_events=None, compression=compression, + file_format=file_format, ) @@ -371,6 +441,7 @@ async def s3_batch_export( exclude_events, temporal_client, encryption, + file_format, ): destination_data = { "type": "S3", @@ -385,6 +456,7 @@ async def s3_batch_export( "exclude_events": exclude_events, "encryption": encryption, "kms_key_id": os.getenv("S3_TEST_KMS_KEY_ID") if encryption == "aws:kms" else None, + "file_format": file_format, }, } @@ -410,6 +482,7 @@ async def s3_batch_export( @pytest.mark.parametrize("compression", [None, "gzip", "brotli"], indirect=True) @pytest.mark.parametrize("exclude_events", [None, ["test-exclude"]], indirect=True) @pytest.mark.parametrize("batch_export_schema", TEST_S3_SCHEMAS) +@pytest.mark.parametrize("file_format", FILE_FORMAT_EXTENSIONS.keys(), indirect=True) async def test_s3_export_workflow_with_minio_bucket( clickhouse_client, minio_client, @@ -421,6 +494,7 @@ async def test_s3_export_workflow_with_minio_bucket( exclude_events, s3_key_prefix, batch_export_schema, + file_format, ): """Test S3BatchExport Workflow end-to-end by using a local MinIO bucket instead of S3. @@ -508,6 +582,7 @@ async def test_s3_export_workflow_with_minio_bucket( batch_export_schema=batch_export_schema, exclude_events=exclude_events, compression=compression, + file_format=file_format, ) @@ -537,6 +612,7 @@ async def s3_client(bucket_name, s3_key_prefix): @pytest.mark.parametrize("encryption", [None, "AES256", "aws:kms"], indirect=True) @pytest.mark.parametrize("bucket_name", [os.getenv("S3_TEST_BUCKET")], indirect=True) @pytest.mark.parametrize("batch_export_schema", TEST_S3_SCHEMAS) +@pytest.mark.parametrize("file_format", FILE_FORMAT_EXTENSIONS.keys(), indirect=True) async def test_s3_export_workflow_with_s3_bucket( s3_client, clickhouse_client, @@ -549,6 +625,7 @@ async def test_s3_export_workflow_with_s3_bucket( exclude_events, ateam, batch_export_schema, + file_format, ): """Test S3 Export Workflow end-to-end by using an S3 bucket. 
@@ -646,6 +723,7 @@ async def test_s3_export_workflow_with_s3_bucket( exclude_events=exclude_events, include_events=None, compression=compression, + file_format=file_format, ) @@ -1206,6 +1284,49 @@ async def never_finish_activity(_: S3InsertInputs) -> str: ), "nested/prefix/2023-01-01 00:00:00-2023-01-01 01:00:00.jsonl.br", ), + ( + S3InsertInputs( + prefix="/nested/prefix/", + data_interval_start="2023-01-01 00:00:00", + data_interval_end="2023-01-01 01:00:00", + file_format="Parquet", + compression="snappy", + **base_inputs, # type: ignore + ), + "nested/prefix/2023-01-01 00:00:00-2023-01-01 01:00:00.parquet.sz", + ), + ( + S3InsertInputs( + prefix="/nested/prefix/", + data_interval_start="2023-01-01 00:00:00", + data_interval_end="2023-01-01 01:00:00", + file_format="Parquet", + **base_inputs, # type: ignore + ), + "nested/prefix/2023-01-01 00:00:00-2023-01-01 01:00:00.parquet", + ), + ( + S3InsertInputs( + prefix="/nested/prefix/", + data_interval_start="2023-01-01 00:00:00", + data_interval_end="2023-01-01 01:00:00", + compression="gzip", + file_format="Parquet", + **base_inputs, # type: ignore + ), + "nested/prefix/2023-01-01 00:00:00-2023-01-01 01:00:00.parquet.gz", + ), + ( + S3InsertInputs( + prefix="/nested/prefix/", + data_interval_start="2023-01-01 00:00:00", + data_interval_end="2023-01-01 01:00:00", + compression="brotli", + file_format="Parquet", + **base_inputs, # type: ignore + ), + "nested/prefix/2023-01-01 00:00:00-2023-01-01 01:00:00.parquet.br", + ), ], ) def test_get_s3_key(inputs, expected): @@ -1271,7 +1392,7 @@ def assert_heartbeat_details(*details): endpoint_url=settings.OBJECT_STORAGE_ENDPOINT, ) - with override_settings(BATCH_EXPORT_S3_UPLOAD_CHUNK_SIZE_BYTES=5 * 1024**2): + with override_settings(BATCH_EXPORT_S3_UPLOAD_CHUNK_SIZE_BYTES=1, CLICKHOUSE_MAX_BLOCK_SIZE_DEFAULT=1): await activity_environment.run(insert_into_s3_activity, insert_inputs) # This checks that the assert_heartbeat_details function was actually called. 
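Before the new test module below, it is worth seeing how the pieces introduced in temporary_file.py fit together: a destination supplies a flush_callable, and a BatchExportWriter invokes it whenever max_bytes is reached, plus once more on context exit if unflushed bytes remain. The following is a minimal sketch of driving a JSONLBatchExportWriter outside of a Temporal activity; the in-memory sink and the hand-built record batch are illustrative stand-ins, not part of this diff.

import asyncio
import io

import pyarrow as pa

from posthog.temporal.batch_exports.temporary_file import JSONLBatchExportWriter


async def main() -> None:
    sink = io.BytesIO()  # stands in for, e.g., an S3 multipart upload

    async def flush_to_sink(temp_file, records_since_last_flush, bytes_since_last_flush, last_inserted_at, is_last):
        # The writer seeks the temporary file back to the start before flushing, so read() drains it.
        sink.write(temp_file.read())

    writer = JSONLBatchExportWriter(max_bytes=1024 * 1024, flush_callable=flush_to_sink)

    # `_inserted_at` is required by `write_record_batch` for ordering; the writer strips it from the output.
    batch = pa.RecordBatch.from_pydict({"event": pa.array(["example"]), "_inserted_at": pa.array([0])})

    async with writer.open_temporary_file():
        await writer.write_record_batch(batch)  # buffered; flushed on exit since max_bytes wasn't reached

    print(sink.getvalue())  # b'{"event":"example"}\n'


asyncio.run(main())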
diff --git a/posthog/temporal/tests/batch_exports/test_temporary_file.py b/posthog/temporal/tests/batch_exports/test_temporary_file.py new file mode 100644 index 0000000000000..4fd7e69c0c12f --- /dev/null +++ b/posthog/temporal/tests/batch_exports/test_temporary_file.py @@ -0,0 +1,389 @@ +import csv +import datetime as dt +import io +import json + +import pyarrow as pa +import pyarrow.parquet as pq +import pytest + +from posthog.temporal.batch_exports.temporary_file import ( + BatchExportTemporaryFile, + CSVBatchExportWriter, + JSONLBatchExportWriter, + ParquetBatchExportWriter, + json_dumps_bytes, +) + + +@pytest.mark.parametrize( + "to_write", + [ + (b"",), + (b"", b""), + (b"12345",), + (b"12345", b"12345"), + (b"abbcccddddeeeee",), + (b"abbcccddddeeeee", b"abbcccddddeeeee"), + ], +) +def test_batch_export_temporary_file_tracks_bytes(to_write): + """Test the bytes written by BatchExportTemporaryFile match expected.""" + with BatchExportTemporaryFile() as be_file: + for content in to_write: + be_file.write(content) + + assert be_file.bytes_total == sum(len(content) for content in to_write) + assert be_file.bytes_since_last_reset == sum(len(content) for content in to_write) + + be_file.reset() + + assert be_file.bytes_total == sum(len(content) for content in to_write) + assert be_file.bytes_since_last_reset == 0 + + +TEST_RECORDS = [ + [], + [ + {"id": "record-1", "property": "value", "property_int": 1}, + {"id": "record-2", "property": "another-value", "property_int": 2}, + { + "id": "record-3", + "property": {"id": "nested-record", "property": "nested-value"}, + "property_int": 3, + }, + ], +] + + +@pytest.mark.parametrize( + "records", + TEST_RECORDS, +) +def test_batch_export_temporary_file_write_records_to_jsonl(records): + """Test JSONL records written by BatchExportTemporaryFile match expected.""" + jsonl_dump = b"\n".join(map(json_dumps_bytes, records)) + + with BatchExportTemporaryFile() as be_file: + be_file.write_records_to_jsonl(records) + + assert be_file.bytes_total == len(jsonl_dump) + assert be_file.bytes_since_last_reset == len(jsonl_dump) + assert be_file.records_total == len(records) + assert be_file.records_since_last_reset == len(records) + + be_file.seek(0) + lines = be_file.readlines() + assert len(lines) == len(records) + + for line_index, jsonl_record in enumerate(lines): + json_loaded = json.loads(jsonl_record) + assert json_loaded == records[line_index] + + be_file.reset() + + assert be_file.bytes_total == len(jsonl_dump) + assert be_file.bytes_since_last_reset == 0 + assert be_file.records_total == len(records) + assert be_file.records_since_last_reset == 0 + + +@pytest.mark.parametrize( + "records", + TEST_RECORDS, +) +def test_batch_export_temporary_file_write_records_to_csv(records): + """Test CSV written by BatchExportTemporaryFile match expected.""" + in_memory_file_obj = io.StringIO() + writer = csv.DictWriter( + in_memory_file_obj, + fieldnames=records[0].keys() if len(records) > 0 else [], + delimiter=",", + quotechar='"', + escapechar="\\", + lineterminator="\n", + quoting=csv.QUOTE_NONE, + ) + writer.writerows(records) + + with BatchExportTemporaryFile(mode="w+") as be_file: + be_file.write_records_to_csv(records) + + assert be_file.bytes_total == in_memory_file_obj.tell() + assert be_file.bytes_since_last_reset == in_memory_file_obj.tell() + assert be_file.records_total == len(records) + assert be_file.records_since_last_reset == len(records) + + be_file.seek(0) + reader = csv.reader( + be_file._file, + delimiter=",", + quotechar='"', + 
escapechar="\\", + quoting=csv.QUOTE_NONE, + ) + + rows = [row for row in reader] + assert len(rows) == len(records) + + for row_index, csv_record in enumerate(rows): + for value_index, value in enumerate(records[row_index].values()): + # Everything returned by csv.reader is a str. + # This means type information is lost when writing to CSV + # but this just a limitation of the format. + assert csv_record[value_index] == str(value) + + be_file.reset() + + assert be_file.bytes_total == in_memory_file_obj.tell() + assert be_file.bytes_since_last_reset == 0 + assert be_file.records_total == len(records) + assert be_file.records_since_last_reset == 0 + + +@pytest.mark.parametrize( + "records", + TEST_RECORDS, +) +def test_batch_export_temporary_file_write_records_to_tsv(records): + """Test TSV written by BatchExportTemporaryFile match expected.""" + in_memory_file_obj = io.StringIO() + writer = csv.DictWriter( + in_memory_file_obj, + fieldnames=records[0].keys() if len(records) > 0 else [], + delimiter="\t", + quotechar='"', + escapechar="\\", + lineterminator="\n", + quoting=csv.QUOTE_NONE, + ) + writer.writerows(records) + + with BatchExportTemporaryFile(mode="w+") as be_file: + be_file.write_records_to_tsv(records) + + assert be_file.bytes_total == in_memory_file_obj.tell() + assert be_file.bytes_since_last_reset == in_memory_file_obj.tell() + assert be_file.records_total == len(records) + assert be_file.records_since_last_reset == len(records) + + be_file.seek(0) + reader = csv.reader( + be_file._file, + delimiter="\t", + quotechar='"', + escapechar="\\", + quoting=csv.QUOTE_NONE, + ) + + rows = [row for row in reader] + assert len(rows) == len(records) + + for row_index, csv_record in enumerate(rows): + for value_index, value in enumerate(records[row_index].values()): + # Everything returned by csv.reader is a str. + # This means type information is lost when writing to CSV + # but this just a limitation of the format. 
+                assert csv_record[value_index] == str(value)
+
+        be_file.reset()
+
+        assert be_file.bytes_total == in_memory_file_obj.tell()
+        assert be_file.bytes_since_last_reset == 0
+        assert be_file.records_total == len(records)
+        assert be_file.records_since_last_reset == 0
+
+
+TEST_RECORD_BATCHES = [
+    pa.RecordBatch.from_pydict(
+        {
+            "event": pa.array(["test-event-0", "test-event-1", "test-event-2"]),
+            "properties": pa.array(['{"prop_0": 1, "prop_1": 2}', "{}", "null"]),
+            "_inserted_at": pa.array([0, 1, 2]),
+        }
+    )
+]
+
+
+@pytest.mark.parametrize(
+    "record_batch",
+    TEST_RECORD_BATCHES,
+)
+@pytest.mark.asyncio
+async def test_jsonl_writer_writes_record_batches(record_batch):
+    """Test record batches are written as valid JSONL."""
+    in_memory_file_obj = io.BytesIO()
+    inserted_ats_seen = []
+
+    async def store_in_memory_on_flush(
+        batch_export_file, records_since_last_flush, bytes_since_last_flush, last_inserted_at, is_last
+    ):
+        in_memory_file_obj.write(batch_export_file.read())
+        inserted_ats_seen.append(last_inserted_at)
+
+    writer = JSONLBatchExportWriter(max_bytes=1, flush_callable=store_in_memory_on_flush)
+
+    record_batch = record_batch.sort_by("_inserted_at")
+    async with writer.open_temporary_file():
+        await writer.write_record_batch(record_batch)
+
+    # Rewind to read back what the flush callable wrote.
+    in_memory_file_obj.seek(0)
+    lines = in_memory_file_obj.readlines()
+    for index, line in enumerate(lines):
+        written_jsonl = json.loads(line)
+
+        single_record_batch = record_batch.slice(offset=index, length=1)
+        expected_jsonl = single_record_batch.to_pylist()[0]
+        # The writer strips `_inserted_at` before writing, so drop it from the expected record too.
+        expected_jsonl.pop("_inserted_at")
+
+        assert "_inserted_at" not in written_jsonl
+        assert written_jsonl == expected_jsonl
+
+    assert inserted_ats_seen == [record_batch.column("_inserted_at")[-1].as_py()]
+
+
+@pytest.mark.parametrize(
+    "record_batch",
+    TEST_RECORD_BATCHES,
+)
+@pytest.mark.asyncio
+async def test_csv_writer_writes_record_batches(record_batch):
+    """Test record batches are written as valid CSV."""
+    in_memory_file_obj = io.StringIO()
+    inserted_ats_seen = []
+
+    async def store_in_memory_on_flush(
+        batch_export_file, records_since_last_flush, bytes_since_last_flush, last_inserted_at, is_last
+    ):
+        in_memory_file_obj.write(batch_export_file.read().decode("utf-8"))
+        inserted_ats_seen.append(last_inserted_at)
+
+    schema_columns = [column_name for column_name in record_batch.column_names if column_name != "_inserted_at"]
+    writer = CSVBatchExportWriter(max_bytes=1, field_names=schema_columns, flush_callable=store_in_memory_on_flush)
+
+    record_batch = record_batch.sort_by("_inserted_at")
+    async with writer.open_temporary_file():
+        await writer.write_record_batch(record_batch)
+
+    # Rewind to read back what the flush callable wrote.
+    in_memory_file_obj.seek(0)
+    reader = csv.reader(
+        in_memory_file_obj,
+        delimiter=",",
+        quotechar='"',
+        escapechar="\\",
+        quoting=csv.QUOTE_NONE,
+    )
+    for index, written_csv_row in enumerate(reader):
+        single_record_batch = record_batch.slice(offset=index, length=1)
+        expected_csv_record = single_record_batch.to_pylist()[0]
+        # The writer strips `_inserted_at`, and csv.reader returns every value as a str.
+        expected_csv_record.pop("_inserted_at")
+        expected_csv_row = [str(value) for value in expected_csv_record.values()]
+
+        assert "_inserted_at" not in written_csv_row
+        assert written_csv_row == expected_csv_row
+
+    assert inserted_ats_seen == [record_batch.column("_inserted_at")[-1].as_py()]
+
+
+@pytest.mark.parametrize(
+    "record_batch",
+    TEST_RECORD_BATCHES,
+)
+@pytest.mark.asyncio
+async def test_parquet_writer_writes_record_batches(record_batch):
+    """Test record batches are written as valid Parquet."""
+    in_memory_file_obj = io.BytesIO()
+    inserted_ats_seen = []
+
+    async def store_in_memory_on_flush(
+        batch_export_file, records_since_last_flush, bytes_since_last_flush, last_inserted_at, is_last
+    ):
+        in_memory_file_obj.write(batch_export_file.read())
+
inserted_ats_seen.append(last_inserted_at) + + schema_columns = [column_name for column_name in record_batch.column_names if column_name != "_inserted_at"] + + writer = ParquetBatchExportWriter( + max_bytes=1, + flush_callable=store_in_memory_on_flush, + schema=record_batch.select(schema_columns).schema, + ) + + record_batch = record_batch.sort_by("_inserted_at") + async with writer.open_temporary_file(): + await writer.write_record_batch(record_batch) + + written_parquet = pq.read_table(in_memory_file_obj) + + for index, written_row_as_dict in enumerate(written_parquet.to_pylist()): + single_record_batch = record_batch.slice(offset=index, length=1) + expected_row_as_dict = single_record_batch.select(schema_columns).to_pylist()[0] + + assert "_inserted_at" not in written_row_as_dict + assert written_row_as_dict == expected_row_as_dict + + # NOTE: Parquet gets flushed twice due to the extra flush at the end for footer bytes, so our mock function + # will see this value twice. + assert inserted_ats_seen == [ + record_batch.column("_inserted_at")[-1].as_py(), + record_batch.column("_inserted_at")[-1].as_py(), + ] + + +@pytest.mark.parametrize( + "record_batch", + TEST_RECORD_BATCHES, +) +@pytest.mark.asyncio +async def test_writing_out_of_scope_of_temporary_file_raises(record_batch): + """Test attempting a write out of temporary file scope raises a `ValueError`.""" + + async def do_nothing(*args, **kwargs): + pass + + schema_columns = [column_name for column_name in record_batch.column_names if column_name != "_inserted_at"] + writer = ParquetBatchExportWriter( + max_bytes=10, + flush_callable=do_nothing, + schema=record_batch.select(schema_columns).schema, + ) + + async with writer.open_temporary_file(): + pass + + with pytest.raises(ValueError, match="Batch export file is closed"): + await writer.write_record_batch(record_batch) + + +@pytest.mark.parametrize( + "record_batch", + TEST_RECORD_BATCHES, +) +@pytest.mark.asyncio +async def test_flushing_parquet_writer_resets_underlying_file(record_batch): + """Test flushing a writer resets underlying file.""" + flush_counter = 0 + + async def track_flushes(*args, **kwargs): + nonlocal flush_counter + flush_counter += 1 + + schema_columns = [column_name for column_name in record_batch.column_names if column_name != "_inserted_at"] + writer = ParquetBatchExportWriter( + max_bytes=10000000, + flush_callable=track_flushes, + schema=record_batch.select(schema_columns).schema, + ) + + async with writer.open_temporary_file(): + await writer.write_record_batch(record_batch) + + assert writer.batch_export_file.tell() > 0 + assert writer.bytes_since_last_flush > 0 + assert writer.bytes_since_last_flush == writer.batch_export_file.bytes_since_last_reset + assert writer.records_since_last_flush == record_batch.num_rows + + await writer.flush(dt.datetime.now()) + + assert flush_counter == 1 + assert writer.batch_export_file.tell() == 0 + assert writer.bytes_since_last_flush == 0 + assert writer.bytes_since_last_flush == writer.batch_export_file.bytes_since_last_reset + assert writer.records_since_last_flush == 0 + + assert flush_counter == 2
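One last note on the JsonType extension type defined in s3_batch_export.py: because its storage type is pa.string(), the cast performed by cast_record_batch_json_columns reuses the existing string buffers, and deserialization only happens when a scalar's as_py() is called. A rough sketch of that round trip, assuming an extension array built directly from string storage behaves like the cast's output:

import pyarrow as pa

from posthog.temporal.batch_exports.s3_batch_export import JsonType

storage = pa.array(['{"prop_0": 1}', None], type=pa.string())
json_array = pa.ExtensionArray.from_storage(JsonType(), storage)

assert json_array[0].as_py() == {"prop_0": 1}  # JsonScalar.as_py deserializes with orjson
assert json_array[1].as_py() is None  # null storage values map to None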