diff --git a/.github/workflows/test-warehouse.yml b/.github/workflows/test-warehouse.yml
index 32395942d..383ea52df 100644
--- a/.github/workflows/test-warehouse.yml
+++ b/.github/workflows/test-warehouse.yml
@@ -163,6 +163,7 @@ jobs:
         run: >
           edr monitor
           -t "${{ inputs.warehouse-type }}"
+          --group-by table
           --project-dir "${{ env.DBT_PKG_INTEG_TESTS_DIR }}"
           --project-profile-target "${{ inputs.warehouse-type }}"
           --slack-webhook "$SLACK_WEBHOOK"
diff --git a/docs/guides/modules-overview/dbt-package.mdx b/docs/guides/modules-overview/dbt-package.mdx
index d1079a588..dd1af7e76 100644
--- a/docs/guides/modules-overview/dbt-package.mdx
+++ b/docs/guides/modules-overview/dbt-package.mdx
@@ -234,11 +234,3 @@ _Incremental model_
 Stores the schema details for tables that are monitored with elementary schema changes test.
 In order to compare current schema to previous state, we must store the previous state.
 The data is from a view that queries the data warehouse information schema.
-
-### filtered_information_schema_columns
-
-_View_
-
-Queries the columns view from the information schema of the schemas in the project.
-This view is generated using an adapter specific macro, as information schema is different between platforms.
-This is a view to make the work with the information schema more convenient.
diff --git a/elementary/monitor/dbt_project/macros/internal_tests/validate_alert_statuses_are_updated.sql b/elementary/monitor/dbt_project/macros/internal_tests/validate_alert_statuses_are_updated.sql
index 06afb7162..9ca3d37f8 100644
--- a/elementary/monitor/dbt_project/macros/internal_tests/validate_alert_statuses_are_updated.sql
+++ b/elementary/monitor/dbt_project/macros/internal_tests/validate_alert_statuses_are_updated.sql
@@ -3,6 +3,8 @@
     with alerts as (
         select *
         from {{ ref('alerts') }}
+        -- When using --group-by table, singular test alerts are not sent.
+        where sub_type != 'singular'
     ),
 
     alerts_models as (
diff --git a/elementary/monitor/dbt_project/macros/tests_recommendation/get_recommended_tests.sql b/elementary/monitor/dbt_project/macros/tests_recommendation/get_recommended_tests.sql
index cd8dfc5bf..4ad5922d4 100644
--- a/elementary/monitor/dbt_project/macros/tests_recommendation/get_recommended_tests.sql
+++ b/elementary/monitor/dbt_project/macros/tests_recommendation/get_recommended_tests.sql
@@ -4,7 +4,7 @@
     {% endif %}
 
     {% set query %}
-        select resource_name, source_name, test_namespace, test_name, test_args
+        select resource_name, source_name, test_namespace, test_name, test_args, table_args
         from {{ ref("test_recommendations") }}
         where {{ where_expression }}
     {% endset %}
diff --git a/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql b/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql
index 08b74cb4e..e09693430 100644
--- a/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql
+++ b/elementary/monitor/dbt_project/models/tests_recommendation/table_timestamp_columns.sql
@@ -1,35 +1,34 @@
+{# Prioritization: 1. insertion time, 2. update time. #}
 {% set timestamp_column_names = [
     "created_at",
     "created_at_utc",
     "inserted_at",
     "inserted_at_utc",
-    "updated_at",
-    "updated_at_utc",
-    "_fivetran_synced",
-    "_airbyte_emitted_at",
     "create_date",
     "created",
     "db_insert_time",
     "create_ts",
     "created_ts",
-    "update_ts",
-    "updated_ts",
     "load_ts",
     "loaded_at",
     "date_created",
-    "dbt_updated_at",
-    "update_datetime",
-    "event_time",
-    "event_date",
-    "event_created_at",
-    "event_updated_at",
-    "event_event_time",
     "_etl_loaded_at",
     "__etl_loaded_at",
     "_etl_inserted_at",
     "_ingestion_time",
+    "_fivetran_synced",
+    "_airbyte_emitted_at",
+
+    "updated_at",
+    "updated_at_utc",
+    "update_ts",
+    "updated_ts",
+    "dbt_updated_at",
+    "update_datetime",
+    "event_updated_at",
     "last_modified_datetime",
 ] %}
+
 {% set joined_timestamp_column_names = "'{}'".format(
     "', '".join(timestamp_column_names)
 ) %}
@@ -76,6 +75,22 @@ with
         where loaded_at_field is not null
     ),
 
+    -- Models can declare a BigQuery `partition_by` config. Unless it is an
+    -- integer-range partition, its `field` is a date/time column, so we use it
+    -- as the timestamp column with a confidence score of 0 (certain).
+    model_provided_timestamp_columns as (
+        select
+            lower(database_name) as database_name,
+            lower(schema_name) as schema_name,
+            lower(name) as table_name,
+            bigquery_partition_by::json ->> 'field' as column_name
+        from {{ ref("elementary", "dbt_models") }}
+        where
+            bigquery_partition_by::json ->> 'field' is not null
+            -- dbt treats a missing `data_type` as `date`, so only an explicit int64 is excluded.
+            and coalesce(lower(bigquery_partition_by::json ->> 'data_type'), 'date') != 'int64'
+    ),
+
-    -- Combining the inferred and source provided timestamp columns.
+    -- Combining the inferred, source provided and model provided timestamp columns.
     absolute_rated_timestamp_columns as (
         select
@@ -93,6 +108,14 @@ with
             column_name,
             0 as absolute_confidence
         from source_provided_timestamp_columns
+        union all
+        select
+            database_name,
+            schema_name,
+            table_name,
+            column_name,
+            0 as absolute_confidence
+        from model_provided_timestamp_columns
     ),
 
     -- Sort the timestamp columns by confidence and assign a rank.
diff --git a/elementary/monitor/dbt_project/models/tests_recommendation/test_recommendations.sql b/elementary/monitor/dbt_project/models/tests_recommendation/test_recommendations.sql
index f11af0aa3..33ba8bdfd 100644
--- a/elementary/monitor/dbt_project/models/tests_recommendation/test_recommendations.sql
+++ b/elementary/monitor/dbt_project/models/tests_recommendation/test_recommendations.sql
@@ -1,7 +1,8 @@
-{# Object structure is [test_namespace, test_name, requires_timestamp_column] #}
+{# Object structure is [test_namespace, test_name] #}
 {% set recommended_tests = [
-    ("elementary", "volume_anomalies", false),
-    ("elementary", "freshness_anomalies", true),
+    ("elementary", "volume_anomalies"),
+    ("elementary", "freshness_anomalies"),
+    ("elementary", "schema_changes_from_baseline"),
 ] %}
 
 with
@@ -23,15 +24,14 @@ with
     ),
 
     potential_recommended_tests as (
-        select id, test_namespace, short_name, requires_timestamp_column
+        select id, test_namespace, short_name
         from tables_criticality
         cross join
             (
                 {% for recommended_test in recommended_tests %}
                     select
                         '{{ recommended_test[0] }}' as test_namespace,
-                        '{{ recommended_test[1] }}' as short_name,
-                        {{ recommended_test[2] }} as requires_timestamp_column
+                        '{{ recommended_test[1] }}' as short_name
                     {% if not loop.last %}
                         union all
                     {% endif %}
@@ -45,7 +45,7 @@ with
     ),
 
     pending_recommended_tests as (
-        select id, test_namespace, short_name, requires_timestamp_column
+        select id, test_namespace, short_name
         from potential_recommended_tests
         where
             (id, test_namespace, short_name) not in (
@@ -59,6 +59,17 @@ with
         from {{ ref("table_timestamp_columns") }}
     ),
 
+    -- Each table's columns (name and data type) as a JSON array; feeds the schema_changes_from_baseline args.
+    table_columns as (
+        select
+            lower(database_name) as database_name,
+            lower(schema_name) as schema_name,
+            lower(table_name) as table_name,
+            json_agg(json_build_object('name', lower(column_name), 'data_type', lower(data_type)) order by lower(column_name)) as columns
+        from {{ ref("elementary", "dbt_columns") }}
+        group by 1, 2, 3
+    ),
+
     pending_tests_with_table_info as (
         select
             resource_name,
@@ -72,14 +83,24 @@ with
             exposure_count,
             table_type,
             case
-                when timestamp_column is not null
-                then cast('{"timestamp_column": "' || timestamp_column || '"}' as jsonb)
-                else null
-            end as test_args
+                when short_name in ('volume_anomalies', 'freshness_anomalies') and timestamp_column is not null
+                -- Build the JSON with jsonb_build_object so the column name is escaped correctly.
+                then jsonb_build_object('timestamp_column', timestamp_column)
+            end as test_args,
+            case
+                when short_name = 'schema_changes_from_baseline'
+                then cast(json_build_object('columns', table_columns.columns) as jsonb)
+            end as table_args
         from pending_recommended_tests
         join tables_criticality using (id)
         left join timestamp_columns using (database_name, schema_name, table_name)
-        where requires_timestamp_column = false or timestamp_column is not null
+        left join table_columns using (database_name, schema_name, table_name)
+        where
+            short_name = 'volume_anomalies'
+            or
+            (short_name = 'freshness_anomalies' and timestamp_column is not null)
+            or
+            (short_name = 'schema_changes_from_baseline' and table_columns.columns is not null)
     )
 
 select *
diff --git a/elementary/monitor/dbt_project/packages.yml b/elementary/monitor/dbt_project/packages.yml
index 6c053a92d..78348e84f 100644
--- a/elementary/monitor/dbt_project/packages.yml
+++ b/elementary/monitor/dbt_project/packages.yml
@@ -1,9 +1,9 @@
 packages:
   - package: dbt-labs/dbt_utils
     version: [">=0.8.0", "<0.9.0"]
-  - package: elementary-data/elementary
-    version: 0.10.3
-# NOTE - for unreleased CLI versions we often need to update the package version to a commit hash (please leave this
-# commented, so it will be easy to access)
-#  - git: https://github.com/elementary-data/dbt-data-reliability.git
-#    revision: 68b9edb2833d63b5de59e9648f38c8031d853d01
+  # - package: elementary-data/elementary
+  # version: 0.10.3
+  # NOTE - for unreleased CLI versions we often need to update the package version to a commit hash (please leave this
+  # commented, so it will be easy to access)
+  - git: https://github.com/elementary-data/dbt-data-reliability.git
+    revision: 2a44dd0bb5e82ddb445d56d8f88ee1db444618b6