-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor: Support batch export models as views (#23052)
* refactor: Update metrics to fetch counts at request time * fix: Move import to method * fix: Add function * feat: Custom schemas for batch exports * feat: Frontend support for model field * fix: Clean-up * fix: Add missing migration * fix: Make new field nullable * Update UI snapshots for `chromium` (1) * Update UI snapshots for `chromium` (1) * Update UI snapshots for `chromium` (1) * Update UI snapshots for `chromium` (1) * Update UI snapshots for `chromium` (1) * Update UI snapshots for `chromium` (1) * fix: Bump migration number * fix: Bump migration number * refactor: Update metrics to fetch counts at request time * fix: Actually use include and exclude events * refactor: Switch to counting runs * refactor: Support batch export models as views * fix: Merge conflict * fix: Quality check fixes * refactor: Update metrics to fetch counts at request time * fix: Move import to method * fix: Add function * fix: Typing fixes * feat: Custom schemas for batch exports * feat: Frontend support for model field * fix: Clean-up * fix: Add missing migration * fix: Make new field nullable * Update UI snapshots for `chromium` (1) * Update UI snapshots for `chromium` (1) * Update UI snapshots for `chromium` (1) * Update UI snapshots for `chromium` (1) * Update UI snapshots for `chromium` (1) * Update UI snapshots for `chromium` (1) * fix: Bump migration number * fix: Clean-up unused code * chore: Clean-up unused function and tests * fix: Clean-up unused function * fix: HTTP Batch export default fields * fix: Remove test case on new column not present in base table * chore: Clean-up unused functions and queries * fix: Only run extra clickhouse queries in batch exports tests * refactor: Remove coalesce and use only inserted_at in queries At this point, inserted_at should always be set for all batch exports. 
Only historical exports require _timestamp, but backfills have already been switched over to query based on timestamp, so they also do not need to check for inserted_at/_timestamp. Removing the coalesce and using only inserted_at reduces the size of the data CH has to fetch by half. * fix: Remove deprecated test * fix: Add person_id to person model and enforce ordering * refactor: Also add version column --------- Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
- Loading branch information
Showing
22 changed files
with
593 additions
and
479 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
# ClickHouse DDL that (re)creates the `persons_batch_export` view. | ||
# | ||
# The inner query collapses `person_distinct_id2` to one row per | ||
# (team_id, distinct_id) for the requested team, keeping max(version), the | ||
# person_id found at the highest version (argMax) and max(_timestamp). | ||
# That result is INNER JOINed to `person` on id and team_id and filtered to the | ||
# half-open window [interval_start, interval_end) on the collapsed _timestamp, | ||
# which is exposed as `_inserted_at` and used as the sort key. | ||
# | ||
# Query parameters: team_id (Int64), interval_start and interval_end (DateTime64). | ||
# | ||
# NOTE(review): `person` rows are joined without collapsing by version; | ||
# presumably at most one row per person id is expected here -- confirm. | ||
CREATE_PERSONS_BATCH_EXPORT_VIEW = """ | ||
CREATE OR REPLACE VIEW persons_batch_export AS ( | ||
SELECT | ||
pd.team_id AS team_id, | ||
pd.distinct_id AS distinct_id, | ||
toString(p.id) AS person_id, | ||
p.properties AS properties, | ||
pd.version AS version, | ||
pd._timestamp AS _inserted_at | ||
FROM ( | ||
SELECT | ||
team_id, | ||
distinct_id, | ||
max(version) AS version, | ||
argMax(person_id, person_distinct_id2.version) AS person_id, | ||
max(_timestamp) AS _timestamp | ||
FROM | ||
person_distinct_id2 | ||
WHERE | ||
team_id = {team_id:Int64} | ||
GROUP BY | ||
team_id, | ||
distinct_id | ||
) AS pd | ||
INNER JOIN | ||
person p ON p.id = pd.person_id AND p.team_id = pd.team_id | ||
WHERE | ||
pd.team_id = {team_id:Int64} | ||
AND p.team_id = {team_id:Int64} | ||
AND pd._timestamp >= {interval_start:DateTime64} | ||
AND pd._timestamp < {interval_end:DateTime64} | ||
ORDER BY | ||
_inserted_at | ||
) | ||
""" | ||
|
||
# ClickHouse DDL that (re)creates the `events_batch_export` view. | ||
# | ||
# Rows are read from `events` with a PREWHERE on inserted_at in | ||
# [interval_start, interval_end), then narrowed by team_id and by an event | ||
# timestamp window of [interval_start - lookback_days days, interval_end + 1 day). | ||
# include_events / exclude_events are Array(String) parameters; passing an | ||
# empty array disables the corresponding filter. | ||
# | ||
# Rows are grouped by team_id, toDate(timestamp), event and the cityHash64 of | ||
# distinct_id and uuid, and ordered by (_inserted_at, event), where | ||
# _inserted_at is min(COALESCE(inserted_at, _timestamp)). | ||
# | ||
# Query parameters: team_id (Int64), interval_start / interval_end (DateTime64), | ||
# lookback_days (Int32), include_events / exclude_events (Array(String)). | ||
# | ||
# NOTE(review): `set` / `set_once` read `properties` outside an aggregate | ||
# function; presumably this resolves to the `properties` alias defined on the | ||
# preceding select expression -- confirm. | ||
CREATE_EVENTS_BATCH_EXPORT_VIEW = """ | ||
CREATE OR REPLACE VIEW events_batch_export AS ( | ||
SELECT | ||
team_id AS team_id, | ||
min(timestamp) AS timestamp, | ||
event AS event, | ||
any(distinct_id) AS distinct_id, | ||
any(toString(uuid)) AS uuid, | ||
min(COALESCE(inserted_at, _timestamp)) AS _inserted_at, | ||
any(created_at) AS created_at, | ||
any(elements_chain) AS elements_chain, | ||
any(toString(person_id)) AS person_id, | ||
any(nullIf(properties, '')) AS properties, | ||
any(nullIf(person_properties, '')) AS person_properties, | ||
nullIf(JSONExtractString(properties, '$set'), '') AS set, | ||
nullIf(JSONExtractString(properties, '$set_once'), '') AS set_once | ||
FROM | ||
events | ||
PREWHERE | ||
events.inserted_at >= {interval_start:DateTime64} | ||
AND events.inserted_at < {interval_end:DateTime64} | ||
WHERE | ||
team_id = {team_id:Int64} | ||
AND events.timestamp >= {interval_start:DateTime64} - INTERVAL {lookback_days:Int32} DAY | ||
AND events.timestamp < {interval_end:DateTime64} + INTERVAL 1 DAY | ||
AND (length({include_events:Array(String)}) = 0 OR event IN {include_events:Array(String)}) | ||
AND (length({exclude_events:Array(String)}) = 0 OR event NOT IN {exclude_events:Array(String)}) | ||
GROUP BY | ||
team_id, toDate(events.timestamp), event, cityHash64(events.distinct_id), cityHash64(events.uuid) | ||
ORDER BY | ||
_inserted_at, event | ||
SETTINGS optimize_aggregation_in_order=1 | ||
) | ||
""" | ||
|
||
# ClickHouse DDL that (re)creates the `events_batch_export_unbounded` view. | ||
# | ||
# Selects the same columns as `events_batch_export` and applies the same | ||
# PREWHERE on inserted_at in [interval_start, interval_end), but places no | ||
# bounds on the event `timestamp` column (there is no lookback_days parameter). | ||
# include_events / exclude_events are Array(String) parameters; passing an | ||
# empty array disables the corresponding filter. | ||
# | ||
# Rows are grouped by team_id, toDate(timestamp), event and the cityHash64 of | ||
# distinct_id and uuid, and ordered by (_inserted_at, event). | ||
# | ||
# Query parameters: team_id (Int64), interval_start / interval_end (DateTime64), | ||
# include_events / exclude_events (Array(String)). | ||
CREATE_EVENTS_BATCH_EXPORT_VIEW_UNBOUNDED = """ | ||
CREATE OR REPLACE VIEW events_batch_export_unbounded AS ( | ||
SELECT | ||
team_id AS team_id, | ||
min(timestamp) AS timestamp, | ||
event AS event, | ||
any(distinct_id) AS distinct_id, | ||
any(toString(uuid)) AS uuid, | ||
min(COALESCE(inserted_at, _timestamp)) AS _inserted_at, | ||
any(created_at) AS created_at, | ||
any(elements_chain) AS elements_chain, | ||
any(toString(person_id)) AS person_id, | ||
any(nullIf(properties, '')) AS properties, | ||
any(nullIf(person_properties, '')) AS person_properties, | ||
nullIf(JSONExtractString(properties, '$set'), '') AS set, | ||
nullIf(JSONExtractString(properties, '$set_once'), '') AS set_once | ||
FROM | ||
events | ||
PREWHERE | ||
events.inserted_at >= {interval_start:DateTime64} | ||
AND events.inserted_at < {interval_end:DateTime64} | ||
WHERE | ||
team_id = {team_id:Int64} | ||
AND (length({include_events:Array(String)}) = 0 OR event IN {include_events:Array(String)}) | ||
AND (length({exclude_events:Array(String)}) = 0 OR event NOT IN {exclude_events:Array(String)}) | ||
GROUP BY | ||
team_id, toDate(events.timestamp), event, cityHash64(events.distinct_id), cityHash64(events.uuid) | ||
ORDER BY | ||
_inserted_at, event | ||
SETTINGS optimize_aggregation_in_order=1 | ||
) | ||
""" | ||
|
||
# ClickHouse DDL that (re)creates the `events_batch_export_backfill` view. | ||
# | ||
# Selects the same columns as `events_batch_export`, but the window | ||
# [interval_start, interval_end) is applied to the event `timestamp` column | ||
# in WHERE; there is no PREWHERE on inserted_at. | ||
# include_events / exclude_events are Array(String) parameters; passing an | ||
# empty array disables the corresponding filter. | ||
# | ||
# Rows are grouped by team_id, toDate(timestamp), event and the cityHash64 of | ||
# distinct_id and uuid, and ordered by (_inserted_at, event), where | ||
# _inserted_at is still computed as min(COALESCE(inserted_at, _timestamp)). | ||
# | ||
# Query parameters: team_id (Int64), interval_start / interval_end (DateTime64), | ||
# include_events / exclude_events (Array(String)). | ||
CREATE_EVENTS_BATCH_EXPORT_VIEW_BACKFILL = """ | ||
CREATE OR REPLACE VIEW events_batch_export_backfill AS ( | ||
SELECT | ||
team_id AS team_id, | ||
min(timestamp) AS timestamp, | ||
event AS event, | ||
any(distinct_id) AS distinct_id, | ||
any(toString(uuid)) AS uuid, | ||
min(COALESCE(inserted_at, _timestamp)) AS _inserted_at, | ||
any(created_at) AS created_at, | ||
any(elements_chain) AS elements_chain, | ||
any(toString(person_id)) AS person_id, | ||
any(nullIf(properties, '')) AS properties, | ||
any(nullIf(person_properties, '')) AS person_properties, | ||
nullIf(JSONExtractString(properties, '$set'), '') AS set, | ||
nullIf(JSONExtractString(properties, '$set_once'), '') AS set_once | ||
FROM | ||
events | ||
WHERE | ||
team_id = {team_id:Int64} | ||
AND events.timestamp >= {interval_start:DateTime64} | ||
AND events.timestamp < {interval_end:DateTime64} | ||
AND (length({include_events:Array(String)}) = 0 OR event IN {include_events:Array(String)}) | ||
AND (length({exclude_events:Array(String)}) = 0 OR event NOT IN {exclude_events:Array(String)}) | ||
GROUP BY | ||
team_id, toDate(events.timestamp), event, cityHash64(events.distinct_id), cityHash64(events.uuid) | ||
ORDER BY | ||
_inserted_at, event | ||
SETTINGS optimize_aggregation_in_order=1 | ||
) | ||
""" |
17 changes: 17 additions & 0 deletions
17
posthog/clickhouse/migrations/0064_create_person_batch_export_view.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from posthog.batch_exports.sql import ( | ||
CREATE_EVENTS_BATCH_EXPORT_VIEW, | ||
CREATE_EVENTS_BATCH_EXPORT_VIEW_BACKFILL, | ||
CREATE_EVENTS_BATCH_EXPORT_VIEW_UNBOUNDED, | ||
CREATE_PERSONS_BATCH_EXPORT_VIEW, | ||
) | ||
from posthog.clickhouse.client.migration_tools import run_sql_with_exceptions | ||
|
||
# Migration operations: each CREATE OR REPLACE VIEW statement below is passed | ||
# to run_sql_with_exceptions, in the order persons, events, events (unbounded), | ||
# events (backfill). | ||
# NOTE(review): map() is lazy and single-pass, so run_sql_with_exceptions is | ||
# only called as `operations` is iterated, and a second iteration yields | ||
# nothing -- confirm the migration runner consumes it exactly once. | ||
operations = map( | ||
run_sql_with_exceptions, | ||
[ | ||
CREATE_PERSONS_BATCH_EXPORT_VIEW, | ||
CREATE_EVENTS_BATCH_EXPORT_VIEW, | ||
CREATE_EVENTS_BATCH_EXPORT_VIEW_UNBOUNDED, | ||
CREATE_EVENTS_BATCH_EXPORT_VIEW_BACKFILL, | ||
], | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.