From 74b8056eec9d27e21697e13ae9507e6fb862bcf8 Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 13 Feb 2024 16:40:50 -0500 Subject: [PATCH] add test --- .../test_data_warehouse_query_builder.ambr | 26 ++++ .../test/test_data_warehouse_query_builder.py | 137 ++++++++++++++++++ .../insights/trends/test/test_trends.py | 68 +-------- 3 files changed, 164 insertions(+), 67 deletions(-) create mode 100644 posthog/hogql_queries/insights/trends/test/__snapshots__/test_data_warehouse_query_builder.ambr create mode 100644 posthog/hogql_queries/insights/trends/test/test_data_warehouse_query_builder.py diff --git a/posthog/hogql_queries/insights/trends/test/__snapshots__/test_data_warehouse_query_builder.ambr b/posthog/hogql_queries/insights/trends/test/__snapshots__/test_data_warehouse_query_builder.ambr new file mode 100644 index 0000000000000..6b0e8f1be8e5a --- /dev/null +++ b/posthog/hogql_queries/insights/trends/test/__snapshots__/test_data_warehouse_query_builder.ambr @@ -0,0 +1,26 @@ +# name: TestDataWarehouseQueryBuilder.test_trends_data_warehouse + ' + SELECT groupArray(day_start) AS date, + groupArray(count) AS total + FROM + (SELECT sum(total) AS count, + day_start AS day_start + FROM + (SELECT 0 AS total, + minus(toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-07 23:59:59', 6, 'UTC'))), toIntervalDay(numbers.number)) AS day_start + FROM numbers(coalesce(dateDiff('day', assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-01 00:00:00', 6, 'UTC')), assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-07 23:59:59', 6, 'UTC'))), 0)) AS numbers + UNION ALL SELECT 0 AS total, + toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-01 00:00:00', 6, 'UTC'))) AS day_start + UNION ALL SELECT count(e.id) AS total, + toStartOfDay(toDateTime(toTimeZone(e.created, 'UTC'), 'UTC')) AS day_start + FROM s3Cluster('posthog', 'http://host.docker.internal:19000/posthog/test_storage_bucket-posthog.hogql.datawarehouse.trendquery/*.parquet', 'object_storage_root_user', 'object_storage_root_password', 'Parquet', 'id String, created DateTime64(3, \'UTC\')') AS e + WHERE and(ifNull(greaterOrEquals(toDateTime(toTimeZone(e.created, 'UTC'), 'UTC'), toStartOfDay(assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-01 00:00:00', 6, 'UTC')))), 0), ifNull(lessOrEquals(toDateTime(toTimeZone(e.created, 'UTC'), 'UTC'), assumeNotNull(parseDateTime64BestEffortOrNull('2023-01-07 23:59:59', 6, 'UTC'))), 0)) + GROUP BY day_start) + GROUP BY day_start + ORDER BY day_start ASC) + ORDER BY sum(count) DESC + LIMIT 100 SETTINGS readonly=2, + max_execution_time=60, + allow_experimental_object_type=1 + ' +--- diff --git a/posthog/hogql_queries/insights/trends/test/test_data_warehouse_query_builder.py b/posthog/hogql_queries/insights/trends/test/test_data_warehouse_query_builder.py new file mode 100644 index 0000000000000..04875a093fe12 --- /dev/null +++ b/posthog/hogql_queries/insights/trends/test/test_data_warehouse_query_builder.py @@ -0,0 +1,137 @@ +from datetime import datetime +from freezegun import freeze_time +from posthog.hogql.modifiers import create_default_modifiers_for_team + +from posthog.hogql.query import execute_hogql_query +from posthog.hogql.timings import HogQLTimings +from posthog.hogql_queries.insights.trends.data_warehouse_trends_query_builder import DataWarehouseTrendsQueryBuilder +from posthog.hogql_queries.utils.query_date_range import QueryDateRange +from posthog.schema import ( + DateRange, + DataWarehouseNode, + TrendsQuery, +) +from posthog.test.base import BaseTest +from posthog.warehouse.models import DataWarehouseTable, DataWarehouseCredential + +from boto3 import resource +from botocore.config import Config +from posthog.settings import ( + OBJECT_STORAGE_ACCESS_KEY_ID, + OBJECT_STORAGE_BUCKET, + OBJECT_STORAGE_ENDPOINT, + OBJECT_STORAGE_SECRET_ACCESS_KEY, +) +import s3fs +from pyarrow import parquet as pq +import pyarrow as pa + +from posthog.test.base import ( + ClickhouseTestMixin, + snapshot_clickhouse_queries, +) + +TEST_BUCKET = "test_storage_bucket-posthog.hogql.datawarehouse.trendquery" + + +class TestDataWarehouseQueryBuilder(ClickhouseTestMixin, BaseTest): + def teardown_method(self, method) -> None: + s3 = resource( + "s3", + endpoint_url=OBJECT_STORAGE_ENDPOINT, + aws_access_key_id=OBJECT_STORAGE_ACCESS_KEY_ID, + aws_secret_access_key=OBJECT_STORAGE_SECRET_ACCESS_KEY, + config=Config(signature_version="s3v4"), + region_name="us-east-1", + ) + bucket = s3.Bucket(OBJECT_STORAGE_BUCKET) + bucket.objects.filter(Prefix=TEST_BUCKET).delete() + + def get_response(self, trends_query: TrendsQuery): + query_date_range = QueryDateRange( + date_range=trends_query.dateRange, + team=self.team, + interval=trends_query.interval, + now=datetime.now(), + ) + + timings = HogQLTimings() + modifiers = create_default_modifiers_for_team(self.team) + + query_builder = DataWarehouseTrendsQueryBuilder( + trends_query=trends_query, + team=self.team, + query_date_range=query_date_range, + series=trends_query.series[0], + timings=timings, + modifiers=modifiers, + ) + + query = query_builder.build_query() + + return execute_hogql_query( + query_type="TrendsQuery", + query=query, + team=self.team, + timings=timings, + ) + + def create_parquet_file(self): + fs = s3fs.S3FileSystem( + anon=False, + use_ssl=False, + client_kwargs={ + "region_name": "us-east-1", + "endpoint_url": OBJECT_STORAGE_ENDPOINT, + "aws_access_key_id": OBJECT_STORAGE_ACCESS_KEY_ID, + "aws_secret_access_key": OBJECT_STORAGE_SECRET_ACCESS_KEY, + "verify": False, + }, + ) + + path_to_s3_object = "s3://" + OBJECT_STORAGE_BUCKET + f"/{TEST_BUCKET}" + + id = pa.array(["1", "2", "3", "4"]) + created = pa.array([datetime(2023, 1, 1), datetime(2023, 1, 2), datetime(2023, 1, 3), datetime(2023, 1, 4)]) + names = ["id", "created"] + + pq.write_to_dataset( + pa.Table.from_arrays([id, created], names=names), + path_to_s3_object, + filesystem=fs, + use_dictionary=True, + compression="snappy", + version="2.0", + ) + + @snapshot_clickhouse_queries + def test_trends_data_warehouse(self): + self.create_parquet_file() + credential = DataWarehouseCredential.objects.create( + access_key=OBJECT_STORAGE_ACCESS_KEY_ID, access_secret=OBJECT_STORAGE_SECRET_ACCESS_KEY, team=self.team + ) + + DataWarehouseTable.objects.create( + name="stripe_charges", + url_pattern=f"http://host.docker.internal:19000/{OBJECT_STORAGE_BUCKET}/{TEST_BUCKET}/*.parquet", + format=DataWarehouseTable.TableFormat.Parquet, + team=self.team, + columns={ + "id": "String", + "created": "DateTime64(3, 'UTC')", + }, + credential=credential, + ) + + trends_query = TrendsQuery( + kind="TrendsQuery", + dateRange=DateRange(date_from="2023-01-01"), + series=[DataWarehouseNode(table_name="stripe_charges", id_field="id", timestamp_field="created")], + ) + + with freeze_time("2023-01-07"): + response = self.get_response(trends_query=trends_query) + + assert response.columns is not None + assert set(response.columns).issubset({"date", "total"}) + assert response.results[0][1] == [1, 1, 1, 1, 0, 0, 0] diff --git a/posthog/hogql_queries/insights/trends/test/test_trends.py b/posthog/hogql_queries/insights/trends/test/test_trends.py index fd19df6433fda..d6348fb4ecc75 100644 --- a/posthog/hogql_queries/insights/trends/test/test_trends.py +++ b/posthog/hogql_queries/insights/trends/test/test_trends.py @@ -35,8 +35,6 @@ Person, ) -from posthog.warehouse.models import DataWarehouseTable, DataWarehouseCredential - from posthog.models.group.util import create_group from posthog.models.instance_setting import ( get_instance_setting, @@ -48,7 +46,6 @@ ActionsNode, BreakdownFilter, DateRange, - DataWarehouseNode, EventsNode, PropertyGroupFilter, SeriesType, @@ -129,7 +126,6 @@ def convert_filter_to_trends_query(filter: Filter) -> TrendsQuery: events: List[EventsNode] = [] actions: List[ActionsNode] = [] - data_warehouse_entities: List[DataWarehouseNode] = [] for event in filter.events: if isinstance(event._data.get("properties", None), List): @@ -175,30 +171,7 @@ def convert_filter_to_trends_query(filter: Filter) -> TrendsQuery: ) ) - for entity in filter.data_warehouse_entities: - if isinstance(entity._data.get("properties", None), List): - properties = clean_entity_properties(entity._data.get("properties", None)) - elif entity._data.get("properties", None) is not None: - values = entity._data.get("properties", None).get("values", None) - properties = clean_entity_properties(values) - else: - properties = None - - data_warehouse_entities.append( - DataWarehouseNode( - table_name=entity.id, - id_field=entity.id_field, - timestamp_field=entity.timestamp_field, - custom_name=entity.custom_name, - math=entity.math, - math_property=entity.math_property, - math_hogql=entity.math_hogql, - math_group_type_index=entity.math_group_type_index, - properties=properties, - ) - ) - - series: List[SeriesType] = [*events, *actions, *data_warehouse_entities] + series: List[SeriesType] = [*events, *actions] tq = TrendsQuery( series=series, @@ -513,45 +486,6 @@ def test_trends_per_day(self): self.assertEqual(response[0]["labels"][5], "2-Jan-2020") self.assertEqual(response[0]["data"][5], 1.0) - @snapshot_clickhouse_queries - def test_trends_data_warehouse(self): - self._create_events() - - credential = DataWarehouseCredential.objects.create(access_key="test", access_secret="test", team=self.team) - DataWarehouseTable.objects.create( - name="stripe_charges", - url_pattern="https://databeach-hackathon.s3.amazonaws.com/tim_test/test_events6.pqt", - format=DataWarehouseTable.TableFormat.Parquet, - team=self.team, - columns={ - "id": "String", - "created": "DateTime64(3, 'UTC')", - "mrr": "Nullable(Int64)", - "offset": "UInt32", - }, - credential=credential, - ) - - with freeze_time("2020-01-04T13:00:01Z"): - # with self.assertNumQueries(16): - self._run( - Filter( - team=self.team, - data={ - "date_from": "-7d", - "data_warehouse_entities": [ - { - "id": "stripe_charges", - "id_field": "id", - "timestamp_field": "created", - "type": "data_warehouse", - } - ], - }, - ), - self.team, - ) - # just make sure this doesn't error def test_no_props(self): PropertyDefinition.objects.create(