feat: Start tracking records exported #21008
BigQuery batch export:

```diff
@@ -193,7 +193,7 @@ def bigquery_default_fields() -> list[BatchExportField]:


 @activity.defn
-async def insert_into_bigquery_activity(inputs: BigQueryInsertInputs):
+async def insert_into_bigquery_activity(inputs: BigQueryInsertInputs) -> int:
     """Activity streams data from ClickHouse to BigQuery."""
     logger = await bind_temporal_worker_logger(team_id=inputs.team_id, destination="BigQuery")
     logger.info(
@@ -230,7 +230,7 @@ async def insert_into_bigquery_activity(inputs: BigQueryInsertInputs):
             inputs.data_interval_start,
             inputs.data_interval_end,
         )
-        return
+        return 0

     logger.info("BatchExporting %s rows", count)
@@ -354,6 +354,8 @@ async def flush_to_bigquery(bigquery_table, table_schema):
                 jsonl_file.reset()

+    return jsonl_file.records_total
```
Review comment: This is de-dented outside of the `with` block.

Reply: Yeah, `with` statements do not introduce a new scope. I am fine with de-denting this though, and I do plan to address it more in future PRs.
```diff
 @workflow.defn(name="bigquery-export")
 class BigQueryBatchExportWorkflow(PostHogWorkflow):
```
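As background for the scoping exchange in the BigQuery thread above, here is a minimal, self-contained sketch (all names invented, not PostHog code) of why the de-dented `return` still sees the value bound inside the `with` block: `with` statements manage a resource but do not introduce a new scope.

```python
# Minimal sketch: a `with` block is not a new scope, so a name bound inside it
# remains visible after the block exits. `RecordCounter` stands in for something
# like a temporary export file that tracks how many records were written.
class RecordCounter:
    def __init__(self) -> None:
        self.records_total = 0

    def write(self, n: int) -> None:
        self.records_total += n

    def __enter__(self) -> "RecordCounter":
        return self

    def __exit__(self, *exc_info) -> bool:
        return False  # don't swallow exceptions; just "close" the resource


def export_batches(batches: list[int]) -> int:
    with RecordCounter() as counter:
        for batch in batches:
            counter.write(batch)

    # `counter` is still bound here: the `with` statement only closed the
    # resource, it did not end the variable's lifetime.
    return counter.records_total


assert export_batches([2, 3, 5]) == 10
```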
Redshift batch export:

```diff
@@ -171,7 +171,7 @@ async def insert_records_to_redshift(
     schema: str | None,
     table: str,
     batch_size: int = 100,
-):
+) -> int:
     """Execute an INSERT query with given Redshift connection.

     The recommended way to insert multiple values into Redshift is using a COPY statement (see:
@@ -206,15 +206,20 @@ async def insert_records_to_redshift(
     template = sql.SQL("({})").format(sql.SQL(", ").join(map(sql.Placeholder, columns)))
     rows_exported = get_rows_exported_metric()

+    total_rows_exported = 0
+
     async with async_client_cursor_from_connection(redshift_connection) as cursor:
         batch = []
         pre_query_str = pre_query.as_string(cursor).encode("utf-8")

         async def flush_to_redshift(batch):
+            nonlocal total_rows_exported
+
             values = b",".join(batch).replace(b" E'", b" '")

             await cursor.execute(pre_query_str + values)
             rows_exported.add(len(batch))
+            total_rows_exported += len(batch)
```
Comment on lines 221 to 222: Not sure if we could just read the counter instead of having this duplicated.
```diff
             # It would be nice to record BYTES_EXPORTED for Redshift, but it's not worth estimating
             # the byte size of each batch the way things are currently written. We can revisit this
             # in the future if we decide it's useful enough.
@@ -230,6 +235,8 @@ async def flush_to_redshift(batch):
         if len(batch) > 0:
             await flush_to_redshift(batch)

+    return total_rows_exported
+

 @contextlib.asynccontextmanager
 async def async_client_cursor_from_connection(
```
```diff
@@ -264,7 +271,7 @@ class RedshiftInsertInputs(PostgresInsertInputs):


 @activity.defn
-async def insert_into_redshift_activity(inputs: RedshiftInsertInputs):
+async def insert_into_redshift_activity(inputs: RedshiftInsertInputs) -> int:
     """Activity to insert data from ClickHouse to Redshift.

     This activity executes the following steps:
@@ -306,7 +313,7 @@ async def insert_into_redshift_activity(inputs: RedshiftInsertInputs):
             inputs.data_interval_start,
             inputs.data_interval_end,
         )
-        return
+        return 0

     logger.info("BatchExporting %s rows", count)
@@ -383,13 +390,15 @@ def map_to_record(row: dict) -> dict:
         return record

     async with postgres_connection(inputs) as connection:
-        await insert_records_to_redshift(
+        records_completed = await insert_records_to_redshift(
             (map_to_record(record) for record_batch in record_iterator for record in record_batch.to_pylist()),
             connection,
             inputs.schema,
             inputs.table_name,
         )

+    return records_completed
+

 @workflow.defn(name="redshift-export")
 class RedshiftBatchExportWorkflow(PostHogWorkflow):
```
S3 batch export:

```diff
@@ -382,7 +382,7 @@ def s3_default_fields() -> list[BatchExportField]:


 @activity.defn
-async def insert_into_s3_activity(inputs: S3InsertInputs):
+async def insert_into_s3_activity(inputs: S3InsertInputs) -> int:
     """Activity to batch export data from PostHog's ClickHouse to S3.

     It currently only creates a single file per run, and uploads as a multipart upload.
@@ -418,7 +418,7 @@ async def insert_into_s3_activity(inputs: S3InsertInputs):
             inputs.data_interval_start,
             inputs.data_interval_end,
         )
-        return
+        return 0

     logger.info("BatchExporting %s rows to S3", count)
@@ -503,6 +503,8 @@ async def flush_to_s3(last_uploaded_part_timestamp: str, last=False):

     await s3_upload.complete()

+    return local_results_file.records_total
```
Review comment: Also de-dented, but it competes with the […]. Maybe I'm being un-Pythonic? The use of important information outside of the `with` block […]

Reply: In the end, I think I randomly fixed the bug in another PR when I wasn't looking for it. Anyways, this remained outside of the `with` block.

Reply: When we move it we do have to account for exceptions: we don't want to complete the upload in case of an exception, and we have to be very careful if we are aborting, as a retry could continue the upload. Maybe the solution is to move this outside of the activity and into the workflow, as starting, completing, or aborting an upload should never fail except for wrong credentials, which we can very precisely wrap in a try/except. I can open a follow-up PR to deal with this.
```diff
 @workflow.defn(name="s3-export")
 class S3BatchExportWorkflow(PostHogWorkflow):
```
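To make the exception-handling concern in the thread above concrete, here is a hedged sketch of the complete-or-abort flow it describes. The uploader interface (`upload_part`, `complete`, `abort`) and the retryable/non-retryable split are assumptions for illustration, not PostHog's actual S3 upload class:

```python
class RetryableUploadError(Exception):
    """Assumed marker for failures where a retried activity could resume the upload."""


async def run_upload(uploader, produce_parts) -> int:
    """Only complete the multipart upload on success; think twice before aborting."""
    records_total = 0
    try:
        async for part, record_count in produce_parts():
            await uploader.upload_part(part)
            records_total += record_count
    except RetryableUploadError:
        # Deliberately do NOT abort: aborting would discard parts that a retry
        # of the activity could otherwise continue from.
        raise
    except Exception:
        # For failures we know won't be retried, aborting frees the partial upload.
        await uploader.abort()
        raise
    else:
        # Completing only in the success path ensures we never seal a partial export.
        await uploader.complete()

    return records_total
```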
Review comment: Could just be named `update_batch_export_run`, as it's doing more than setting the status.
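For illustration, this is roughly what the workflow side could look like once the insert activities report a record count: the final bookkeeping activity receives both a status and `records_completed`, which is why a name like `update_batch_export_run` fits better than a pure status setter. Activity names, arguments, and timeouts below are hypothetical, not the actual PostHog definitions:

```python
from datetime import timedelta

from temporalio import workflow


@workflow.defn(name="example-export")
class ExampleBatchExportWorkflow:
    @workflow.run
    async def run(self, inputs: dict) -> None:
        # The insert activity now returns how many records it exported.
        records_completed = await workflow.execute_activity(
            "insert-into-destination-activity",  # placeholder activity name
            inputs,
            start_to_close_timeout=timedelta(hours=1),
        )

        # ...so the run-update step can persist the count alongside the status.
        await workflow.execute_activity(
            "update-batch-export-run",  # placeholder activity name
            {"status": "Completed", "records_completed": records_completed},
            start_to_close_timeout=timedelta(minutes=5),
        )
```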
Review comment: On a broader note, I think we should get rid of the ORM (in batch exports) and move to something like aiosql.
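For reference, aiosql keeps the SQL as plain annotated strings or `.sql` files and generates query functions from them (async when paired with an async driver). The table, column names, and asyncpg adapter below are assumptions purely for illustration:

```python
import aiosql

# Plain SQL with `-- name:` annotations; the `!` suffix marks insert/update/delete queries.
SQL = """
-- name: update_batch_export_run!
update batch_export_runs
set status = :status, records_completed = :records_completed
where id = :id;
"""

queries = aiosql.from_str(SQL, "asyncpg")


async def finish_run(conn, run_id: str, records_completed: int) -> None:
    # With an async driver adapter the generated function is awaitable.
    await queries.update_batch_export_run(
        conn, id=run_id, status="Completed", records_completed=records_completed
    )
```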