Skip to content

Commit

Permalink
Merge branch 'master' into ele-1706-add-result-description-to-source-freshness
Browse files Browse the repository at this point in the history
  • Loading branch information
NoyaArie authored Sep 27, 2023
2 parents 5650bcc + f30a0fd commit 3a7483d
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 39 deletions.
1 change: 1 addition & 0 deletions .github/workflows/test-warehouse.yml
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ jobs:
run: >
edr monitor
-t "${{ inputs.warehouse-type }}"
--group-by table
--project-dir "${{ env.DBT_PKG_INTEG_TESTS_DIR }}"
--project-profile-target "${{ inputs.warehouse-type }}"
--slack-webhook "$SLACK_WEBHOOK"
Expand Down
8 changes: 0 additions & 8 deletions docs/guides/modules-overview/dbt-package.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -234,11 +234,3 @@ _Incremental model_
Stores the schema details for tables that are monitored with the elementary schema changes test.
In order to compare the current schema to the previous state, we must store the previous state.
The data comes from a view that queries the data warehouse information schema.

### filtered_information_schema_columns

_View_

Queries the columns view from the information schema of the schemas in the project.
This view is generated using an adapter specific macro, as information schema is different between platforms.
This is a view to make the work with the information schema more convenient.
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
with alerts as (
select *
from {{ ref('alerts') }}
-- When using --group-by table, singular test alerts are not sent.
where sub_type != 'singular'
),

alerts_models as (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
{% endif %}

{% set query %}
select resource_name, source_name, test_namespace, test_name, test_args
select resource_name, source_name, test_namespace, test_name, test_args, table_args
from {{ ref("test_recommendations") }}
where {{ where_expression }}
{% endset %}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,35 +1,34 @@
{# Prioritization: 1. insertion time, 2. update time. #}
{% set timestamp_column_names = [
"created_at",
"created_at_utc",
"inserted_at",
"inserted_at_utc",
"updated_at",
"updated_at_utc",
"_fivetran_synced",
"_airbyte_emitted_at",
"create_date",
"created",
"db_insert_time",
"create_ts",
"created_ts",
"update_ts",
"updated_ts",
"load_ts",
"loaded_at",
"date_created",
"dbt_updated_at",
"update_datetime",
"event_time",
"event_date",
"event_created_at",
"event_updated_at",
"event_event_time",
"_etl_loaded_at",
"__etl_loaded_at",
"_etl_inserted_at",
"_ingestion_time",
"_fivetran_synced",
"_airbyte_emitted_at",

"updated_at",
"updated_at_utc",
"update_ts",
"updated_ts",
"dbt_updated_at",
"update_datetime",
"event_updated_at",
"last_modified_datetime",
] %}

{% set joined_timestamp_column_names = "'{}'".format(
"', '".join(timestamp_column_names)
) %}
Expand Down Expand Up @@ -76,6 +75,18 @@ with
where loaded_at_field is not null
),

-- Users can provide the timestamp columns for their models,
-- if provided, we assign a confidence score of 0 (certain).
model_provided_timestamp_columns as (
select
lower(database_name) as database_name,
lower(schema_name) as schema_name,
lower(name) as table_name,
bigquery_partition_by::json ->> 'field' as column_name
from {{ ref("elementary", "dbt_models") }}
where bigquery_partition_by::json ->> 'data_type' != 'int64'
),

-- Combining the inferred and source provided timestamp columns.
absolute_rated_timestamp_columns as (
select
Expand All @@ -93,6 +104,14 @@ with
column_name,
0 as absolute_confidence
from source_provided_timestamp_columns
union all
select
database_name,
schema_name,
table_name,
column_name,
0 as absolute_confidence
from model_provided_timestamp_columns
),

-- Sort the timestamp columns by confidence and assign a rank.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
{# Object structure is [test_namespace, test_name, requires_timestamp_column] #}
{# Object structure is [test_namespace, test_name] #}
{% set recommended_tests = [
("elementary", "volume_anomalies", false),
("elementary", "freshness_anomalies", true),
("elementary", "volume_anomalies"),
("elementary", "freshness_anomalies"),
("elementary", "schema_changes_from_baseline"),
] %}

with
Expand All @@ -23,15 +24,14 @@ with
),

potential_recommended_tests as (
select id, test_namespace, short_name, requires_timestamp_column
select id, test_namespace, short_name
from tables_criticality
cross join
(
{% for recommended_test in recommended_tests %}
select
'{{ recommended_test[0] }}' as test_namespace,
'{{ recommended_test[1] }}' as short_name,
{{ recommended_test[2] }} as requires_timestamp_column
'{{ recommended_test[1] }}' as short_name
{% if not loop.last %}
union all
{% endif %}
Expand All @@ -45,7 +45,7 @@ with
),

pending_recommended_tests as (
select id, test_namespace, short_name, requires_timestamp_column
select id, test_namespace, short_name
from potential_recommended_tests
where
(id, test_namespace, short_name) not in (
Expand All @@ -59,6 +59,16 @@ with
from {{ ref("table_timestamp_columns") }}
),

table_columns as (
select
lower(database_name) as database_name,
lower(schema_name) as schema_name,
lower(table_name) as table_name,
json_agg(json_build_object('name', lower(column_name), 'data_type', lower(data_type))) as columns
from {{ ref("elementary", "dbt_columns") }}
group by 1, 2, 3
),

pending_tests_with_table_info as (
select
resource_name,
Expand All @@ -72,14 +82,23 @@ with
exposure_count,
table_type,
case
when timestamp_column is not null
when short_name in ('volume_anomalies', 'freshness_anomalies') and timestamp_column is not null
then cast('{"timestamp_column": "' || timestamp_column || '"}' as jsonb)
else null
end as test_args
end as test_args,
case
when short_name = 'schema_changes_from_baseline'
then cast(json_build_object('columns', table_columns.columns) as jsonb)
end as table_args
from pending_recommended_tests
join tables_criticality using (id)
left join timestamp_columns using (database_name, schema_name, table_name)
where requires_timestamp_column = false or timestamp_column is not null
left join table_columns using (database_name, schema_name, table_name)
where
short_name = 'volume_anomalies'
or
(short_name = 'freshness_anomalies' and timestamp_column is not null)
or
(short_name = 'schema_changes_from_baseline' and table_columns.columns is not null)
)

select *
Expand Down
12 changes: 6 additions & 6 deletions elementary/monitor/dbt_project/packages.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
packages:
- package: dbt-labs/dbt_utils
version: [">=0.8.0", "<0.9.0"]
- package: elementary-data/elementary
version: 0.10.3
# NOTE - for unreleased CLI versions we often need to update the package version to a commit hash (please leave this
# commented, so it will be easy to access)
# - git: https://github.com/elementary-data/dbt-data-reliability.git
# revision: 68b9edb2833d63b5de59e9648f38c8031d853d01
# - package: elementary-data/elementary
# version: 0.10.3
# NOTE - for unreleased CLI versions we often need to update the package version to a commit hash (please leave this
# commented, so it will be easy to access)
- git: https://github.com/elementary-data/dbt-data-reliability.git
revision: 2a44dd0bb5e82ddb445d56d8f88ee1db444618b6

0 comments on commit 3a7483d

Please sign in to comment.