Skip to content

Commit

Permalink
Merge branch 'master' into ele-1706-add-result-description-to-source-freshness
Browse files Browse the repository at this point in the history
  • Loading branch information
NoyaArie authored Sep 27, 2023
2 parents 5650bcc + f30a0fd commit 3a7483d
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 39 deletions.
1 change: 1 addition & 0 deletions .github/workflows/test-warehouse.yml
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ jobs:
run: >
edr monitor
-t "${{ inputs.warehouse-type }}"
--group-by table
--project-dir "${{ env.DBT_PKG_INTEG_TESTS_DIR }}"
--project-profile-target "${{ inputs.warehouse-type }}"
--slack-webhook "$SLACK_WEBHOOK"
Expand Down
8 changes: 0 additions & 8 deletions docs/guides/modules-overview/dbt-package.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -234,11 +234,3 @@ _Incremental model_
Stores the schema details for tables that are monitored with the elementary schema changes test.
In order to compare the current schema to the previous state, we must store the previous state.
The data comes from a view that queries the data warehouse information schema.

### filtered_information_schema_columns

_View_

Queries the columns view from the information schema of the schemas in the project.
This view is generated using an adapter specific macro, as information schema is different between platforms.
This is a view to make the work with the information schema more convenient.
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
with alerts as (
select *
from {{ ref('alerts') }}
-- When using --group-by table, singular test alerts are not sent.
where sub_type != 'singular'
),

alerts_models as (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
{% endif %}

{% set query %}
select resource_name, source_name, test_namespace, test_name, test_args
select resource_name, source_name, test_namespace, test_name, test_args, table_args
from {{ ref("test_recommendations") }}
where {{ where_expression }}
{% endset %}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,35 +1,34 @@
{# Prioritization: 1. insertion time, 2. update time. #}
{% set timestamp_column_names = [
"created_at",
"created_at_utc",
"inserted_at",
"inserted_at_utc",
"updated_at",
"updated_at_utc",
"_fivetran_synced",
"_airbyte_emitted_at",
"create_date",
"created",
"db_insert_time",
"create_ts",
"created_ts",
"update_ts",
"updated_ts",
"load_ts",
"loaded_at",
"date_created",
"dbt_updated_at",
"update_datetime",
"event_time",
"event_date",
"event_created_at",
"event_updated_at",
"event_event_time",
"_etl_loaded_at",
"__etl_loaded_at",
"_etl_inserted_at",
"_ingestion_time",
"_fivetran_synced",
"_airbyte_emitted_at",

"updated_at",
"updated_at_utc",
"update_ts",
"updated_ts",
"dbt_updated_at",
"update_datetime",
"event_updated_at",
"last_modified_datetime",
] %}

{% set joined_timestamp_column_names = "'{}'".format(
"', '".join(timestamp_column_names)
) %}
Expand Down Expand Up @@ -76,6 +75,18 @@ with
where loaded_at_field is not null
),

-- Users can provide the timestamp columns for their models,
-- if provided, we assign a confidence score of 0 (certain).
model_provided_timestamp_columns as (
select
lower(database_name) as database_name,
lower(schema_name) as schema_name,
lower(name) as table_name,
bigquery_partition_by::json ->> 'field' as column_name
from {{ ref("elementary", "dbt_models") }}
where bigquery_partition_by::json ->> 'data_type' != 'int64'
),

-- Combining the inferred and source provided timestamp columns.
absolute_rated_timestamp_columns as (
select
Expand All @@ -93,6 +104,14 @@ with
column_name,
0 as absolute_confidence
from source_provided_timestamp_columns
union all
select
database_name,
schema_name,
table_name,
column_name,
0 as absolute_confidence
from model_provided_timestamp_columns
),

-- Sort the timestamp columns by confidence and assign a rank.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
{# Object structure is [test_namespace, test_name, requires_timestamp_column] #}
{# Object structure is [test_namespace, test_name] #}
{% set recommended_tests = [
("elementary", "volume_anomalies", false),
("elementary", "freshness_anomalies", true),
("elementary", "volume_anomalies"),
("elementary", "freshness_anomalies"),
("elementary", "schema_changes_from_baseline"),
] %}

with
Expand All @@ -23,15 +24,14 @@ with
),

potential_recommended_tests as (
select id, test_namespace, short_name, requires_timestamp_column
select id, test_namespace, short_name
from tables_criticality
cross join
(
{% for recommended_test in recommended_tests %}
select
'{{ recommended_test[0] }}' as test_namespace,
'{{ recommended_test[1] }}' as short_name,
{{ recommended_test[2] }} as requires_timestamp_column
'{{ recommended_test[1] }}' as short_name
{% if not loop.last %}
union all
{% endif %}
Expand All @@ -45,7 +45,7 @@ with
),

pending_recommended_tests as (
select id, test_namespace, short_name, requires_timestamp_column
select id, test_namespace, short_name
from potential_recommended_tests
where
(id, test_namespace, short_name) not in (
Expand All @@ -59,6 +59,16 @@ with
from {{ ref("table_timestamp_columns") }}
),

table_columns as (
select
lower(database_name) as database_name,
lower(schema_name) as schema_name,
lower(table_name) as table_name,
json_agg(json_build_object('name', lower(column_name), 'data_type', lower(data_type))) as columns
from {{ ref("elementary", "dbt_columns") }}
group by 1, 2, 3
),

pending_tests_with_table_info as (
select
resource_name,
Expand All @@ -72,14 +82,23 @@ with
exposure_count,
table_type,
case
when timestamp_column is not null
when short_name in ('volume_anomalies', 'freshness_anomalies') and timestamp_column is not null
then cast('{"timestamp_column": "' || timestamp_column || '"}' as jsonb)
else null
end as test_args
end as test_args,
case
when short_name = 'schema_changes_from_baseline'
then cast(json_build_object('columns', table_columns.columns) as jsonb)
end as table_args
from pending_recommended_tests
join tables_criticality using (id)
left join timestamp_columns using (database_name, schema_name, table_name)
where requires_timestamp_column = false or timestamp_column is not null
left join table_columns using (database_name, schema_name, table_name)
where
short_name = 'volume_anomalies'
or
(short_name = 'freshness_anomalies' and timestamp_column is not null)
or
(short_name = 'schema_changes_from_baseline' and table_columns.columns is not null)
)

select *
Expand Down
12 changes: 6 additions & 6 deletions elementary/monitor/dbt_project/packages.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
packages:
- package: dbt-labs/dbt_utils
version: [">=0.8.0", "<0.9.0"]
- package: elementary-data/elementary
version: 0.10.3
# NOTE - for unreleased CLI versions we often need to update the package version to a commit hash (please leave this
# commented, so it will be easy to access)
# - git: https://github.com/elementary-data/dbt-data-reliability.git
# revision: 68b9edb2833d63b5de59e9648f38c8031d853d01
# - package: elementary-data/elementary
# version: 0.10.3
# NOTE - for unreleased CLI versions we often need to update the package version to a commit hash (please leave this
# commented, so it will be easy to access)
- git: https://github.com/elementary-data/dbt-data-reliability.git
revision: 2a44dd0bb5e82ddb445d56d8f88ee1db444618b6

0 comments on commit 3a7483d

Please sign in to comment.