From 29cd5763848f8bfed2627a0b3ed23c42e28b830d Mon Sep 17 00:00:00 2001 From: Elon Gliksberg Date: Tue, 19 Sep 2023 14:53:58 +0300 Subject: [PATCH 01/16] Updated prioritization of timestamp columns. --- .../table_timestamp_columns.sql | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql b/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql index 08b74cb4e..453c4eb74 100644 --- a/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql +++ b/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql @@ -1,35 +1,40 @@ +{# Prioritization: 1. event time, 2. insertion time, 3. update time. #} {% set timestamp_column_names = [ + "event_time", + "event_date", + "event_created_at", + "event_event_time", + + "timestamp", "created_at", "created_at_utc", "inserted_at", "inserted_at_utc", - "updated_at", - "updated_at_utc", - "_fivetran_synced", - "_airbyte_emitted_at", "create_date", "created", "db_insert_time", "create_ts", "created_ts", - "update_ts", - "updated_ts", "load_ts", "loaded_at", "date_created", - "dbt_updated_at", - "update_datetime", - "event_time", - "event_date", - "event_created_at", - "event_updated_at", - "event_event_time", "_etl_loaded_at", "__etl_loaded_at", "_etl_inserted_at", "_ingestion_time", + "_fivetran_synced", + "_airbyte_emitted_at", + + "updated_at", + "updated_at_utc", + "update_ts", + "updated_ts", + "dbt_updated_at", + "update_datetime", + "event_updated_at", "last_modified_datetime", ] %} + {% set joined_timestamp_column_names = "'{}'".format( "', '".join(timestamp_column_names) ) %} From 2e74b4a0e8d887db872bd97468524df5e4ea5203 Mon Sep 17 00:00:00 2001 From: Elon Gliksberg Date: Tue, 19 Sep 2023 17:49:09 +0300 Subject: [PATCH 02/16] Added 'model_provided_timestamp_columns' to 'table_timestamp_columns'. --- .../table_timestamp_columns.sql | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql b/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql index 453c4eb74..b294c41ff 100644 --- a/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql +++ b/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql @@ -81,6 +81,18 @@ with where loaded_at_field is not null ), + -- Users can provide the timestamp columns for their models, + -- if provided, we assign a confidence score of 0 (certain). + model_provided_timestamp_columns as ( + select + lower(database_name) as database_name, + lower(schema_name) as schema_name, + lower(name) as table_name, + lower(partition_by_field) as column_name + from {{ ref("elementary", "dbt_models") }} + where partition_by_field is not null + ), + -- Combining the inferred and source provided timestamp columns. absolute_rated_timestamp_columns as ( select @@ -98,6 +110,14 @@ with column_name, 0 as absolute_confidence from source_provided_timestamp_columns + union all + select + database_name, + schema_name, + table_name, + column_name, + 0 as absolute_confidence + from model_provided_timestamp_columns ), -- Sort the timestamp columns by confidence and assign a rank. From 6d733452830262faa9e89a67e7c923353d3ef099 Mon Sep 17 00:00:00 2001 From: Elon Gliksberg Date: Wed, 20 Sep 2023 14:21:56 +0300 Subject: [PATCH 03/16] Renamed column. --- .../models/tests_recommendation/table_timestamp_columns.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql b/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql index b294c41ff..a41481e0e 100644 --- a/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql +++ b/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql @@ -88,9 +88,9 @@ with lower(database_name) as database_name, lower(schema_name) as schema_name, lower(name) as table_name, - lower(partition_by_field) as column_name + lower(bigquery_partition_by_field) as column_name from {{ ref("elementary", "dbt_models") }} - where partition_by_field is not null + where bigquery_partition_by_field is not null ), -- Combining the inferred and source provided timestamp columns. From 2999ac9abd012af73af5f8248226bbf69167ed12 Mon Sep 17 00:00:00 2001 From: Elon Gliksberg Date: Thu, 21 Sep 2023 10:10:01 +0300 Subject: [PATCH 04/16] Updated packages.yml. --- elementary/monitor/dbt_project/packages.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/elementary/monitor/dbt_project/packages.yml b/elementary/monitor/dbt_project/packages.yml index 6c053a92d..268fa3e5a 100644 --- a/elementary/monitor/dbt_project/packages.yml +++ b/elementary/monitor/dbt_project/packages.yml @@ -1,9 +1,9 @@ packages: - package: dbt-labs/dbt_utils version: [">=0.8.0", "<0.9.0"] - - package: elementary-data/elementary - version: 0.10.3 -# NOTE - for unreleased CLI versions we often need to update the package version to a commit hash (please leave this -# commented, so it will be easy to access) -# - git: https://github.com/elementary-data/dbt-data-reliability.git -# revision: 68b9edb2833d63b5de59e9648f38c8031d853d01 + # - package: elementary-data/elementary + # version: 0.10.3 + # NOTE - for unreleased CLI versions we often need to update the package version to a commit hash (please leave this + # commented, so it will be easy to access) + - git: https://github.com/elementary-data/dbt-data-reliability.git + revision: ef99fb5c0286038b9839a509a937a9bd2ed388d8 From 64daef71b1ea907481b7fcda6bdc5c8d38fe0c55 Mon Sep 17 00:00:00 2001 From: Elon Gliksberg Date: Thu, 21 Sep 2023 12:32:47 +0300 Subject: [PATCH 05/16] Grouping by 'table' in tests. --- .github/workflows/test-warehouse.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test-warehouse.yml b/.github/workflows/test-warehouse.yml index 32395942d..383ea52df 100644 --- a/.github/workflows/test-warehouse.yml +++ b/.github/workflows/test-warehouse.yml @@ -163,6 +163,7 @@ jobs: run: > edr monitor -t "${{ inputs.warehouse-type }}" + --group-by table --project-dir "${{ env.DBT_PKG_INTEG_TESTS_DIR }}" --project-profile-target "${{ inputs.warehouse-type }}" --slack-webhook "$SLACK_WEBHOOK" From a58055e7c2816e32cd99b1c8324e497b585dab48 Mon Sep 17 00:00:00 2001 From: Elon Gliksberg Date: Thu, 21 Sep 2023 14:00:46 +0300 Subject: [PATCH 06/16] Excluding singular test alerts. --- .../internal_tests/validate_alert_statuses_are_updated.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/elementary/monitor/dbt_project/macros/internal_tests/validate_alert_statuses_are_updated.sql b/elementary/monitor/dbt_project/macros/internal_tests/validate_alert_statuses_are_updated.sql index 06afb7162..4c1ece625 100644 --- a/elementary/monitor/dbt_project/macros/internal_tests/validate_alert_statuses_are_updated.sql +++ b/elementary/monitor/dbt_project/macros/internal_tests/validate_alert_statuses_are_updated.sql @@ -29,6 +29,8 @@ select alert_id from all_alerts where suppression_status not in ('sent', 'skipped') + -- When using --group-by table, singular test alerts are not sent. + and sub_type != 'singular' {% endset %} {% set alerts_agate = run_query(alerts_with_no_updated_status_query) %} {% set alerts_with_no_updated_status = elementary.agate_to_dicts(alerts_agate) %} From 58bb3f66ea46852beb04a8206cbd99e4a06265b1 Mon Sep 17 00:00:00 2001 From: Elon Gliksberg Date: Thu, 21 Sep 2023 14:56:26 +0300 Subject: [PATCH 07/16] Updated selected fields. --- .../internal_tests/validate_alert_statuses_are_updated.sql | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/elementary/monitor/dbt_project/macros/internal_tests/validate_alert_statuses_are_updated.sql b/elementary/monitor/dbt_project/macros/internal_tests/validate_alert_statuses_are_updated.sql index 4c1ece625..099b52074 100644 --- a/elementary/monitor/dbt_project/macros/internal_tests/validate_alert_statuses_are_updated.sql +++ b/elementary/monitor/dbt_project/macros/internal_tests/validate_alert_statuses_are_updated.sql @@ -16,13 +16,13 @@ ), all_alerts as ( - select alert_id, suppression_status + select alert_id, suppression_status, sub_type from alerts union all - select alert_id, suppression_status + select alert_id, suppression_status, sub_type from alerts_models union all - select alert_id, suppression_status + select alert_id, suppression_status, sub_type from alerts_source_freshness ) From ff0e6826c7231544c53e857e5b343c45f07387d0 Mon Sep 17 00:00:00 2001 From: Elon Gliksberg Date: Thu, 21 Sep 2023 15:32:39 +0300 Subject: [PATCH 08/16] Excluding singular test alerts. --- .../validate_alert_statuses_are_updated.sql | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/elementary/monitor/dbt_project/macros/internal_tests/validate_alert_statuses_are_updated.sql b/elementary/monitor/dbt_project/macros/internal_tests/validate_alert_statuses_are_updated.sql index 099b52074..9ca3d37f8 100644 --- a/elementary/monitor/dbt_project/macros/internal_tests/validate_alert_statuses_are_updated.sql +++ b/elementary/monitor/dbt_project/macros/internal_tests/validate_alert_statuses_are_updated.sql @@ -3,6 +3,8 @@ with alerts as ( select * from {{ ref('alerts') }} + -- When using --group-by table, singular test alerts are not sent. + where sub_type != 'singular' ), alerts_models as ( @@ -16,21 +18,19 @@ ), all_alerts as ( - select alert_id, suppression_status, sub_type + select alert_id, suppression_status from alerts union all - select alert_id, suppression_status, sub_type + select alert_id, suppression_status from alerts_models union all - select alert_id, suppression_status, sub_type + select alert_id, suppression_status from alerts_source_freshness ) select alert_id from all_alerts where suppression_status not in ('sent', 'skipped') - -- When using --group-by table, singular test alerts are not sent. - and sub_type != 'singular' {% endset %} {% set alerts_agate = run_query(alerts_with_no_updated_status_query) %} {% set alerts_with_no_updated_status = elementary.agate_to_dicts(alerts_agate) %} From e08336c6e8aa9d6d230ffce6aba5075c053f07c4 Mon Sep 17 00:00:00 2001 From: Elon Gliksberg Date: Thu, 21 Sep 2023 16:17:37 +0300 Subject: [PATCH 09/16] Updated timestamp column names. --- .../tests_recommendation/table_timestamp_columns.sql | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql b/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql index a41481e0e..121ffe872 100644 --- a/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql +++ b/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql @@ -1,10 +1,5 @@ -{# Prioritization: 1. event time, 2. insertion time, 3. update time. #} +{# Prioritization: 1. insertion time, 2. update time. #} {% set timestamp_column_names = [ - "event_time", - "event_date", - "event_created_at", - "event_event_time", - "timestamp", "created_at", "created_at_utc", From 59a8bdd33d42412c6deaf955bef30a5e81dca0aa Mon Sep 17 00:00:00 2001 From: Elon Gliksberg Date: Tue, 26 Sep 2023 10:23:43 +0300 Subject: [PATCH 10/16] Removed an event timestamp column. --- .../models/tests_recommendation/table_timestamp_columns.sql | 1 - 1 file changed, 1 deletion(-) diff --git a/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql b/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql index 121ffe872..93df3c611 100644 --- a/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql +++ b/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql @@ -1,6 +1,5 @@ {# Prioritization: 1. insertion time, 2. update time. #} {% set timestamp_column_names = [ - "timestamp", "created_at", "created_at_utc", "inserted_at", From d19d0db07dc2411228ffa77ea8dd743a055eca41 Mon Sep 17 00:00:00 2001 From: Elon Gliksberg Date: Tue, 26 Sep 2023 10:42:01 +0300 Subject: [PATCH 11/16] Validating data type of partition by. --- .../models/tests_recommendation/table_timestamp_columns.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql b/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql index 93df3c611..e09693430 100644 --- a/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql +++ b/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql @@ -82,9 +82,9 @@ with lower(database_name) as database_name, lower(schema_name) as schema_name, lower(name) as table_name, - lower(bigquery_partition_by_field) as column_name + bigquery_partition_by::json ->> 'field' as column_name from {{ ref("elementary", "dbt_models") }} - where bigquery_partition_by_field is not null + where bigquery_partition_by::json ->> 'data_type' != 'int64' ), -- Combining the inferred and source provided timestamp columns. From 7af8abfe77bb4da2a033e2507ebb065265f374ea Mon Sep 17 00:00:00 2001 From: Elon Gliksberg Date: Tue, 26 Sep 2023 10:43:06 +0300 Subject: [PATCH 12/16] Updated dbt package revision. --- elementary/monitor/dbt_project/packages.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/elementary/monitor/dbt_project/packages.yml b/elementary/monitor/dbt_project/packages.yml index 268fa3e5a..78348e84f 100644 --- a/elementary/monitor/dbt_project/packages.yml +++ b/elementary/monitor/dbt_project/packages.yml @@ -6,4 +6,4 @@ packages: # NOTE - for unreleased CLI versions we often need to update the package version to a commit hash (please leave this # commented, so it will be easy to access) - git: https://github.com/elementary-data/dbt-data-reliability.git - revision: ef99fb5c0286038b9839a509a937a9bd2ed388d8 + revision: 2a44dd0bb5e82ddb445d56d8f88ee1db444618b6 From 5295ee1bfc51d8ea36784722b77b7d55a785ffa4 Mon Sep 17 00:00:00 2001 From: Elon Gliksberg Date: Tue, 26 Sep 2023 12:18:45 +0300 Subject: [PATCH 13/16] Deleted a model from the docs. --- docs/guides/modules-overview/dbt-package.mdx | 8 -------- 1 file changed, 8 deletions(-) diff --git a/docs/guides/modules-overview/dbt-package.mdx b/docs/guides/modules-overview/dbt-package.mdx index d1079a588..dd1af7e76 100644 --- a/docs/guides/modules-overview/dbt-package.mdx +++ b/docs/guides/modules-overview/dbt-package.mdx @@ -234,11 +234,3 @@ _Incremental model_ Stores the schema details for tables that are monitored with elementary schema changes test. In order to compare current schema to previous state, we must store the previous state. The data is from a view that queries the data warehouse information schema. - -### filtered_information_schema_columns - -_View_ - -Queries the columns view from the information schema of the schemas in the project. -This view is generated using an adapter specific macro, as information schema is different between platforms. -This is a view to make the work with the information schema more convenient. From 93104de310c14768311bd44b9cdf833e4359ceef Mon Sep 17 00:00:00 2001 From: Elon Gliksberg Date: Wed, 27 Sep 2023 11:17:02 +0300 Subject: [PATCH 14/16] Added 'schema_changes_from_baseline' to the test recommendations model. --- .../test_recommendations.sql | 41 ++++++++++++++----- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/elementary/monitor/dbt_project/models/tests_recommendation/test_recommendations.sql b/elementary/monitor/dbt_project/models/tests_recommendation/test_recommendations.sql index f11af0aa3..f1a202d0d 100644 --- a/elementary/monitor/dbt_project/models/tests_recommendation/test_recommendations.sql +++ b/elementary/monitor/dbt_project/models/tests_recommendation/test_recommendations.sql @@ -1,7 +1,8 @@ -{# Object structure is [test_namespace, test_name, requires_timestamp_column] #} +{# Object structure is [test_namespace, test_name] #} {% set recommended_tests = [ - ("elementary", "volume_anomalies", false), - ("elementary", "freshness_anomalies", true), + ("elementary", "volume_anomalies"), + ("elementary", "freshness_anomalies"), + ("elementary", "schema_changes_from_baseline"), ] %} with @@ -23,15 +24,14 @@ with ), potential_recommended_tests as ( - select id, test_namespace, short_name, requires_timestamp_column + select id, test_namespace, short_name from tables_criticality cross join ( {% for recommended_test in recommended_tests %} select '{{ recommended_test[0] }}' as test_namespace, - '{{ recommended_test[1] }}' as short_name, - {{ recommended_test[2] }} as requires_timestamp_column + '{{ recommended_test[1] }}' as short_name {% if not loop.last %} union all {% endif %} @@ -45,7 +45,7 @@ with ), pending_recommended_tests as ( - select id, test_namespace, short_name, requires_timestamp_column + select id, test_namespace, short_name from potential_recommended_tests where (id, test_namespace, short_name) not in ( @@ -59,6 +59,16 @@ with from {{ ref("table_timestamp_columns") }} ), + table_columns as ( + select + lower(database_name) as database_name, + lower(schema_name) as schema_name, + lower(table_name) as table_name, + json_agg(json_build_object('name', lower(column_name), 'data_type', lower(data_type))) as columns + from {{ ref("elementary", "dbt_columns") }} + group by 1, 2, 3 + ), + pending_tests_with_table_info as ( select resource_name, @@ -72,14 +82,23 @@ with exposure_count, table_type, case - when timestamp_column is not null + when short_name in ('volume_anomalies', 'freshness_anomalies') and timestamp_column is not null then cast('{"timestamp_column": "' || timestamp_column || '"}' as jsonb) - else null - end as test_args + end as test_args, + case + when short_name = 'schema_changes_from_baseline' + then cast(json_build_object('columns', table_columns.columns) as jsonb) + end as table_args from pending_recommended_tests join tables_criticality using (id) left join timestamp_columns using (database_name, schema_name, table_name) - where requires_timestamp_column = false or timestamp_column is not null + left join table_columns using (database_name, schema_name, table_name) + where + short_name = 'volume_anomalies' + or + short_name = 'freshness_anomalies' and timestamp_column is not null + or + short_name = 'schema_changes_from_baseline' and table_columns.columns is not null ) select * From a9f97a8a4ff36e42042d6defc5533de3ae820b39 Mon Sep 17 00:00:00 2001 From: Elon Gliksberg Date: Wed, 27 Sep 2023 11:36:31 +0300 Subject: [PATCH 15/16] Also querying 'table_args' in the 'get_recommended_tests' macro. --- .../macros/tests_recommendation/get_recommended_tests.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/elementary/monitor/dbt_project/macros/tests_recommendation/get_recommended_tests.sql b/elementary/monitor/dbt_project/macros/tests_recommendation/get_recommended_tests.sql index cd8dfc5bf..4ad5922d4 100644 --- a/elementary/monitor/dbt_project/macros/tests_recommendation/get_recommended_tests.sql +++ b/elementary/monitor/dbt_project/macros/tests_recommendation/get_recommended_tests.sql @@ -4,7 +4,7 @@ {% endif %} {% set query %} - select resource_name, source_name, test_namespace, test_name, test_args + select resource_name, source_name, test_namespace, test_name, test_args, table_args from {{ ref("test_recommendations") }} where {{ where_expression }} {% endset %} From c5db04050b8cd58cc39ea82e5ce1261dad7c8dda Mon Sep 17 00:00:00 2001 From: Elon Gliksberg Date: Wed, 27 Sep 2023 12:51:21 +0300 Subject: [PATCH 16/16] Added parentheses to SQL expressions. --- .../models/tests_recommendation/test_recommendations.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/elementary/monitor/dbt_project/models/tests_recommendation/test_recommendations.sql b/elementary/monitor/dbt_project/models/tests_recommendation/test_recommendations.sql index f1a202d0d..33ba8bdfd 100644 --- a/elementary/monitor/dbt_project/models/tests_recommendation/test_recommendations.sql +++ b/elementary/monitor/dbt_project/models/tests_recommendation/test_recommendations.sql @@ -96,9 +96,9 @@ with where short_name = 'volume_anomalies' or - short_name = 'freshness_anomalies' and timestamp_column is not null + (short_name = 'freshness_anomalies' and timestamp_column is not null) or - short_name = 'schema_changes_from_baseline' and table_columns.columns is not null + (short_name = 'schema_changes_from_baseline' and table_columns.columns is not null) ) select *