From 7076cd4bc14e52b27bca99ba2419f600b6cc1649 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Far=C3=ADas=20Santana?= Date: Wed, 20 Mar 2024 10:58:35 +0100 Subject: [PATCH] feat: Support for s3 compatible destinations in batch exports (#20894) * feat: Backend support for endpoint_url * feat: Frontend support for endpoint_url * fix: Update story * fix: Remove deprecated comment Co-authored-by: ted kaemming <65315+tkaemming@users.noreply.github.com> * fix: Update S3 test and README * Update UI snapshots for `chromium` (2) * Update UI snapshots for `chromium` (2) * Update query snapshots * Update query snapshots --------- Co-authored-by: ted kaemming <65315+tkaemming@users.noreply.github.com> Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com> --- .../batch_exports/BatchExportEditForm.tsx | 9 ++ .../batch_exports/BatchExports.stories.tsx | 1 + .../batch_exports/batchExportEditLogic.ts | 1 + frontend/src/types.ts | 1 + posthog/batch_exports/service.py | 1 + .../temporal/batch_exports/s3_batch_export.py | 7 + .../temporal/tests/batch_exports/README.md | 11 +- .../test_s3_batch_export_workflow.py | 147 ++++++++---------- 8 files changed, 90 insertions(+), 88 deletions(-) diff --git a/frontend/src/scenes/batch_exports/BatchExportEditForm.tsx b/frontend/src/scenes/batch_exports/BatchExportEditForm.tsx index 106da039cd22d..7fbbc8cc29d69 100644 --- a/frontend/src/scenes/batch_exports/BatchExportEditForm.tsx +++ b/frontend/src/scenes/batch_exports/BatchExportEditForm.tsx @@ -267,6 +267,15 @@ export function BatchExportsEditFields({ )} + Only required if exporting to an S3-compatible blob storage (like MinIO)} + > + + + bool: @contextlib.asynccontextmanager async def s3_client(self): """Asynchronously yield an S3 client.""" + async with self._session.client( "s3", region_name=self.region_name, aws_access_key_id=self.aws_access_key_id, aws_secret_access_key=self.aws_secret_access_key, + endpoint_url=self.endpoint_url, ) as client: yield client @@ -306,6 +310,7 @@ class S3InsertInputs: encryption: str | None = None kms_key_id: str | None = None batch_export_schema: BatchExportSchema | None = None + endpoint_url: str | None = None async def initialize_and_resume_multipart_upload(inputs: S3InsertInputs) -> tuple[S3MultiPartUpload, str]: @@ -321,6 +326,7 @@ async def initialize_and_resume_multipart_upload(inputs: S3InsertInputs) -> tupl region_name=inputs.region, aws_access_key_id=inputs.aws_access_key_id, aws_secret_access_key=inputs.aws_secret_access_key, + endpoint_url=inputs.endpoint_url, ) details = activity.info().heartbeat_details @@ -555,6 +561,7 @@ async def run(self, inputs: S3BatchExportInputs): team_id=inputs.team_id, aws_access_key_id=inputs.aws_access_key_id, aws_secret_access_key=inputs.aws_secret_access_key, + endpoint_url=inputs.endpoint_url, data_interval_start=data_interval_start.isoformat(), data_interval_end=data_interval_end.isoformat(), compression=inputs.compression, diff --git a/posthog/temporal/tests/batch_exports/README.md b/posthog/temporal/tests/batch_exports/README.md index 12bf2f6177cd3..c3f63f176ebde 100644 --- a/posthog/temporal/tests/batch_exports/README.md +++ b/posthog/temporal/tests/batch_exports/README.md @@ -6,7 +6,8 @@ This module contains unit tests covering activities, workflows, and helper funct BigQuery batch exports can be tested against a real BigQuery instance, but doing so requires additional setup. 
For this reason, these tests are skipped unless an environment variable pointing to a BigQuery credentials file (`GOOGLE_APPLICATION_CREDENTIALS=/path/to/my/project-credentials.json`) is set.
-> :warning: Since BigQuery batch export tests require additional setup, we skip them by default and will not be ran by automated CI pipelines. Please ensure these tests pass when making changes that affect BigQuery batch exports.
+> [!WARNING]
+> Since BigQuery batch export tests require additional setup, we skip them by default and they will not be run by automated CI pipelines. Please ensure these tests pass when making changes that affect BigQuery batch exports.
To enable testing for BigQuery batch exports, we require:
1. A BigQuery project and dataset
@@ -24,7 +25,8 @@ DEBUG=1 GOOGLE_APPLICATION_CREDENTIALS=/path/to/my/project-credentials.json pyte
Redshift batch exports can be tested against a real Redshift (or Redshift Serverless) instance, with additional setup steps required. Due to this requirement, these tests are skipped unless Redshift credentials are specified in the environment.
-> :warning: Since Redshift batch export tests require additional setup, we skip them by default and will not be ran by automated CI pipelines. Please ensure these tests pass when making changes that affect Redshift batch exports.
+> [!WARNING]
+> Since Redshift batch export tests require additional setup, we skip them by default and they will not be run by automated CI pipelines. Please ensure these tests pass when making changes that affect Redshift batch exports.
To enable testing for Redshift batch exports, we require:
1. A Redshift (or Redshift Serverless) instance.
@@ -41,14 +43,15 @@ Replace the `REDSHIFT_*` environment variables with the values obtained from the
## Testing S3 batch exports
-S3 batch exports are tested against a MinIO bucket available in the local development stack. However there are also unit tests that specifically target an S3 bucket. Additional setup is required to run those specific tests:
+S3 batch exports are tested against a MinIO bucket available in the local development stack. However, there are also unit tests that specifically target an S3 bucket (like `test_s3_export_workflow_with_s3_bucket`). Additional setup is required to run those specific tests:
1. Ensure you are logged in to an AWS account.
2. Create or choose an existing S3 bucket from that AWS account to use as the test bucket.
3. Create or choose an existing KMS key id from that AWS account to use in tests.
4. Make sure the role/user you are logged in as has permissions to use the bucket and KMS key.
-For PostHog employees, check your password manager for these details.
+> [!NOTE]
+> For PostHog employees, your password manager contains a set of credentials for S3 batch exports development testing. You may populate your development environment with these credentials and use the provided test bucket and KMS key.
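The reason one test suite can target both MinIO and a real S3 bucket is the `endpoint_url` support this patch adds: the batch export's aioboto3 client now accepts an optional endpoint, and leaving it unset falls back to the default AWS S3 endpoint. A minimal sketch of that idea (not PostHog's actual helper; the endpoint URL, bucket, and prefix below are illustrative, and the credentials are the dev-stack MinIO ones used in the tests):

```python
import asyncio

import aioboto3


async def list_export_keys(bucket: str, prefix: str, endpoint_url: str | None = None) -> list[str]:
    """List exported objects, optionally against an S3-compatible endpoint such as MinIO.

    When endpoint_url is None, botocore uses the default AWS S3 endpoint, so one
    code path covers both AWS S3 and S3-compatible destinations.
    """
    async with aioboto3.Session().client(
        "s3",
        endpoint_url=endpoint_url,
        # Dev-stack MinIO credentials, as used throughout the tests in this patch.
        aws_access_key_id="object_storage_root_user",
        aws_secret_access_key="object_storage_root_password",
    ) as client:
        response = await client.list_objects_v2(Bucket=bucket, Prefix=prefix)
        return [obj["Key"] for obj in response.get("Contents", [])]


if __name__ == "__main__":
    # Hypothetical local MinIO endpoint, bucket, and prefix.
    print(asyncio.run(list_export_keys("posthog", "exports/", endpoint_url="http://localhost:19000")))
```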
With these setup steps done, we can run all tests (MinIO and S3 bucket) from the root of the `posthog` repo with: diff --git a/posthog/temporal/tests/batch_exports/test_s3_batch_export_workflow.py b/posthog/temporal/tests/batch_exports/test_s3_batch_export_workflow.py index 17b1aaaa94d4a..5e741f9223321 100644 --- a/posthog/temporal/tests/batch_exports/test_s3_batch_export_workflow.py +++ b/posthog/temporal/tests/batch_exports/test_s3_batch_export_workflow.py @@ -1,12 +1,10 @@ import asyncio -import contextlib import datetime as dt import functools import gzip import json import os from random import randint -from unittest import mock from uuid import uuid4 import aioboto3 @@ -337,6 +335,7 @@ async def test_insert_into_s3_activity_puts_data_into_s3( data_interval_end=data_interval_end.isoformat(), aws_access_key_id="object_storage_root_user", aws_secret_access_key="object_storage_root_password", + endpoint_url=settings.OBJECT_STORAGE_ENDPOINT, compression=compression, exclude_events=exclude_events, batch_export_schema=batch_export_schema, @@ -345,11 +344,7 @@ async def test_insert_into_s3_activity_puts_data_into_s3( with override_settings( BATCH_EXPORT_S3_UPLOAD_CHUNK_SIZE_BYTES=5 * 1024**2 ): # 5MB, the minimum for Multipart uploads - with mock.patch( - "posthog.temporal.batch_exports.s3_batch_export.aioboto3.Session.client", - side_effect=create_test_client, - ): - await activity_environment.run(insert_into_s3_activity, insert_inputs) + await activity_environment.run(insert_into_s3_activity, insert_inputs) await assert_clickhouse_records_in_s3( s3_compatible_client=minio_client, @@ -368,7 +363,14 @@ async def test_insert_into_s3_activity_puts_data_into_s3( @pytest_asyncio.fixture async def s3_batch_export( - ateam, s3_key_prefix, bucket_name, compression, interval, exclude_events, temporal_client, encryption + ateam, + s3_key_prefix, + bucket_name, + compression, + interval, + exclude_events, + temporal_client, + encryption, ): destination_data = { "type": "S3", @@ -378,6 +380,7 @@ async def s3_batch_export( "prefix": s3_key_prefix, "aws_access_key_id": "object_storage_root_user", "aws_secret_access_key": "object_storage_root_password", + "endpoint_url": settings.OBJECT_STORAGE_ENDPOINT, "compression": compression, "exclude_events": exclude_events, "encryption": encryption, @@ -479,19 +482,14 @@ async def test_s3_export_workflow_with_minio_bucket( ], workflow_runner=UnsandboxedWorkflowRunner(), ): - # We patch the S3 client to return our client that targets MinIO. - with mock.patch( - "posthog.temporal.batch_exports.s3_batch_export.aioboto3.Session.client", - side_effect=create_test_client, - ): - await activity_environment.client.execute_workflow( - S3BatchExportWorkflow.run, - inputs, - id=workflow_id, - task_queue=settings.TEMPORAL_TASK_QUEUE, - retry_policy=RetryPolicy(maximum_attempts=1), - execution_timeout=dt.timedelta(minutes=10), - ) + await activity_environment.client.execute_workflow( + S3BatchExportWorkflow.run, + inputs, + id=workflow_id, + task_queue=settings.TEMPORAL_TASK_QUEUE, + retry_policy=RetryPolicy(maximum_attempts=1), + execution_timeout=dt.timedelta(minutes=10), + ) runs = await afetch_batch_export_runs(batch_export_id=s3_batch_export.id) assert len(runs) == 1 @@ -523,10 +521,10 @@ async def s3_client(bucket_name, s3_key_prefix): using a disposable bucket to run these tests or sticking to other tests that use the local development MinIO. 
""" - async with aioboto3.Session().client("s3") as minio_client: - yield minio_client + async with aioboto3.Session().client("s3") as s3_client: + yield s3_client - await delete_all_from_s3(minio_client, bucket_name, key_prefix=s3_key_prefix) + await delete_all_from_s3(s3_client, bucket_name, key_prefix=s3_key_prefix) @pytest.mark.skipif( @@ -595,20 +593,20 @@ async def test_s3_export_workflow_with_s3_bucket( ) workflow_id = str(uuid4()) + destination_config = s3_batch_export.destination.config | { + "endpoint_url": None, + "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), + "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + } inputs = S3BatchExportInputs( team_id=ateam.pk, batch_export_id=str(s3_batch_export.id), data_interval_end=data_interval_end.isoformat(), interval=interval, batch_export_schema=batch_export_schema, - **s3_batch_export.destination.config, + **destination_config, ) - @contextlib.asynccontextmanager - async def create_minio_client(*args, **kwargs): - """Mock function to return an already initialized S3 client.""" - yield s3_client - async with await WorkflowEnvironment.start_time_skipping() as activity_environment: async with Worker( activity_environment.client, @@ -621,18 +619,14 @@ async def create_minio_client(*args, **kwargs): ], workflow_runner=UnsandboxedWorkflowRunner(), ): - with mock.patch( - "posthog.temporal.batch_exports.s3_batch_export.aioboto3.Session.client", - side_effect=create_minio_client, - ): - await activity_environment.client.execute_workflow( - S3BatchExportWorkflow.run, - inputs, - id=workflow_id, - task_queue=settings.TEMPORAL_TASK_QUEUE, - retry_policy=RetryPolicy(maximum_attempts=1), - execution_timeout=dt.timedelta(seconds=10), - ) + await activity_environment.client.execute_workflow( + S3BatchExportWorkflow.run, + inputs, + id=workflow_id, + task_queue=settings.TEMPORAL_TASK_QUEUE, + retry_policy=RetryPolicy(maximum_attempts=1), + execution_timeout=dt.timedelta(seconds=10), + ) runs = await afetch_batch_export_runs(batch_export_id=s3_batch_export.id) assert len(runs) == 1 @@ -641,7 +635,7 @@ async def create_minio_client(*args, **kwargs): assert run.status == "Completed" await assert_clickhouse_records_in_s3( - s3_compatible_client=minio_client, + s3_compatible_client=s3_client, clickhouse_client=clickhouse_client, bucket_name=bucket_name, key_prefix=s3_key_prefix, @@ -708,18 +702,14 @@ async def test_s3_export_workflow_with_minio_bucket_and_a_lot_of_data( ], workflow_runner=UnsandboxedWorkflowRunner(), ): - with mock.patch( - "posthog.temporal.batch_exports.s3_batch_export.aioboto3.Session.client", - side_effect=create_test_client, - ): - await activity_environment.client.execute_workflow( - S3BatchExportWorkflow.run, - inputs, - id=workflow_id, - task_queue=settings.TEMPORAL_TASK_QUEUE, - retry_policy=RetryPolicy(maximum_attempts=1), - execution_timeout=dt.timedelta(seconds=360), - ) + await activity_environment.client.execute_workflow( + S3BatchExportWorkflow.run, + inputs, + id=workflow_id, + task_queue=settings.TEMPORAL_TASK_QUEUE, + retry_policy=RetryPolicy(maximum_attempts=1), + execution_timeout=dt.timedelta(seconds=360), + ) runs = await afetch_batch_export_runs(batch_export_id=s3_batch_export.id) assert len(runs) == 1 @@ -787,18 +777,14 @@ async def test_s3_export_workflow_defaults_to_timestamp_on_null_inserted_at( ], workflow_runner=UnsandboxedWorkflowRunner(), ): - with mock.patch( - "posthog.temporal.batch_exports.s3_batch_export.aioboto3.Session.client", - side_effect=create_test_client, - ): - await 
activity_environment.client.execute_workflow( - S3BatchExportWorkflow.run, - inputs, - id=workflow_id, - task_queue=settings.TEMPORAL_TASK_QUEUE, - retry_policy=RetryPolicy(maximum_attempts=1), - execution_timeout=dt.timedelta(seconds=10), - ) + await activity_environment.client.execute_workflow( + S3BatchExportWorkflow.run, + inputs, + id=workflow_id, + task_queue=settings.TEMPORAL_TASK_QUEUE, + retry_policy=RetryPolicy(maximum_attempts=1), + execution_timeout=dt.timedelta(seconds=10), + ) runs = await afetch_batch_export_runs(batch_export_id=s3_batch_export.id) assert len(runs) == 1 @@ -875,18 +861,14 @@ async def test_s3_export_workflow_with_minio_bucket_and_custom_key_prefix( ], workflow_runner=UnsandboxedWorkflowRunner(), ): - with mock.patch( - "posthog.temporal.batch_exports.s3_batch_export.aioboto3.Session.client", - side_effect=create_test_client, - ): - await activity_environment.client.execute_workflow( - S3BatchExportWorkflow.run, - inputs, - id=workflow_id, - task_queue=settings.TEMPORAL_TASK_QUEUE, - retry_policy=RetryPolicy(maximum_attempts=1), - execution_timeout=dt.timedelta(seconds=10), - ) + await activity_environment.client.execute_workflow( + S3BatchExportWorkflow.run, + inputs, + id=workflow_id, + task_queue=settings.TEMPORAL_TASK_QUEUE, + retry_policy=RetryPolicy(maximum_attempts=1), + execution_timeout=dt.timedelta(seconds=10), + ) runs = await afetch_batch_export_runs(batch_export_id=s3_batch_export.id) assert len(runs) == 1 @@ -1283,14 +1265,11 @@ def assert_heartbeat_details(*details): data_interval_end=data_interval_end.isoformat(), aws_access_key_id="object_storage_root_user", aws_secret_access_key="object_storage_root_password", + endpoint_url=settings.OBJECT_STORAGE_ENDPOINT, ) with override_settings(BATCH_EXPORT_S3_UPLOAD_CHUNK_SIZE_BYTES=5 * 1024**2): - with mock.patch( - "posthog.temporal.batch_exports.s3_batch_export.aioboto3.Session.client", - side_effect=create_test_client, - ): - await activity_environment.run(insert_into_s3_activity, insert_inputs) + await activity_environment.run(insert_into_s3_activity, insert_inputs) # This checks that the assert_heartbeat_details function was actually called. # The '+ 1' is because we increment current_part_number one last time after we are done.
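Because `S3InsertInputs` and the batch export destination config now carry `endpoint_url`, the tests above no longer patch `aioboto3.Session.client` with `create_test_client`; passing `endpoint_url=settings.OBJECT_STORAGE_ENDPOINT` is enough to point both the activity and the assertions at the local development MinIO. A rough sketch of the resulting fixture pattern (illustrative only; the repository's actual `minio_client` fixture and settings import may differ):

```python
import aioboto3
import pytest_asyncio
from django.conf import settings


@pytest_asyncio.fixture
async def minio_client():
    """Yield an S3 client aimed at the local development MinIO.

    The same endpoint_url is handed to the activity via its inputs, so the test
    client and the code under test read and write the same bucket without any
    patching of aioboto3.Session.client.
    """
    async with aioboto3.Session().client(
        "s3",
        endpoint_url=settings.OBJECT_STORAGE_ENDPOINT,
        aws_access_key_id="object_storage_root_user",
        aws_secret_access_key="object_storage_root_password",
    ) as client:
        yield client
```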