From f7c15d8331890a2815c2dddcad83ebb79a37ec52 Mon Sep 17 00:00:00 2001 From: Carl Cervone <42869436+ccerv1@users.noreply.github.com> Date: Thu, 12 Dec 2024 17:43:43 -0500 Subject: [PATCH] feat(dbt): mart registry for dependency graph (#2623) * fix(dbt): point to better source for latest package owner in deps.dev * feat(dbt): staging model for latest package owner * feat(dbt): int model to see if we have a package's owner repo already * fix(dbt): add model to resolve deps.dev and sbom source names * feat(dbt): parse package lineage and determine if historic owner is current owner * fix(dbt): refactor to use int_packages * fix(dbt): use int_packages to join on sbom package sources * fix(dbt): lookup deps.dev package owner artifact_id if it exists * fix(dbt): rename field * feat(dbt): add sbom mart model * feat(dbt): add project id to int model * refactor(dbt): break sboms_v0 into two models --- warehouse/dbt/models/deps_dev_sources.yml | 5 +- .../int_artifacts_in_deps_dev_by_project.sql | 31 ++++++++++++ .../intermediate/directory/int_packages.sql | 47 +++++++++++++++++++ .../directory/int_sbom_artifacts.sql | 26 ++++++++++ .../marts/directory/package_owners_v0.sql | 21 +++++++++ .../dbt/models/marts/directory/sboms_v0.sql | 21 +++++++++ .../deps-dev/stg_deps_dev__packages.sql | 40 ++++++++++++++++ 7 files changed, 188 insertions(+), 3 deletions(-) create mode 100644 warehouse/dbt/models/intermediate/directory/int_artifacts_in_deps_dev_by_project.sql create mode 100644 warehouse/dbt/models/intermediate/directory/int_packages.sql create mode 100644 warehouse/dbt/models/marts/directory/package_owners_v0.sql create mode 100644 warehouse/dbt/models/marts/directory/sboms_v0.sql create mode 100644 warehouse/dbt/models/staging/deps-dev/stg_deps_dev__packages.sql diff --git a/warehouse/dbt/models/deps_dev_sources.yml b/warehouse/dbt/models/deps_dev_sources.yml index e19c6e64a..89d21ab20 100644 --- a/warehouse/dbt/models/deps_dev_sources.yml +++ b/warehouse/dbt/models/deps_dev_sources.yml @@ -5,6 +5,5 @@ sources: tables: - name: dependencies identifier: Dependencies - - name: package_versions_latest - identifier: PackageVersionsLatest - + - name: package_version_to_project_latest + identifier: PackageVersionToProjectLatest diff --git a/warehouse/dbt/models/intermediate/directory/int_artifacts_in_deps_dev_by_project.sql b/warehouse/dbt/models/intermediate/directory/int_artifacts_in_deps_dev_by_project.sql new file mode 100644 index 000000000..5d6d6d795 --- /dev/null +++ b/warehouse/dbt/models/intermediate/directory/int_artifacts_in_deps_dev_by_project.sql @@ -0,0 +1,31 @@ +with latest_package as ( + select distinct + package_artifact_source, + package_artifact_name, + package_github_owner, + package_github_repo + from {{ ref('int_packages') }} + where is_current_owner = true +) + +select + github_repos.project_id, + github_repos.artifact_id, + github_repos.artifact_source_id, + github_repos.artifact_source, + latest_package.package_github_owner as artifact_namespace, + latest_package.package_github_repo as artifact_name, + all_packages.artifact_id as package_artifact_id, + latest_package.package_artifact_source as package_artifact_source, + latest_package.package_artifact_name as package_artifact_name +from latest_package +left outer join {{ ref('int_all_artifacts') }} as github_repos + on + latest_package.package_github_owner = github_repos.artifact_namespace + and latest_package.package_github_repo = github_repos.artifact_name + and github_repos.artifact_source = 'GITHUB' +left outer join {{ ref('int_all_artifacts') }} as all_packages + on + latest_package.package_artifact_name = all_packages.artifact_name + and latest_package.package_artifact_source = all_packages.artifact_source +where github_repos.project_id is not null diff --git a/warehouse/dbt/models/intermediate/directory/int_packages.sql b/warehouse/dbt/models/intermediate/directory/int_packages.sql new file mode 100644 index 000000000..c96fc36dc --- /dev/null +++ b/warehouse/dbt/models/intermediate/directory/int_packages.sql @@ -0,0 +1,47 @@ +with deps_dev as ( + select + `Version` as package_version, + upper(`System`) as package_artifact_source, + lower(`Name`) as package_artifact_name, + lower(split(`ProjectName`, '/')[0]) as package_github_owner, + lower(split(`ProjectName`, '/')[1]) as package_github_repo + from {{ ref('stg_deps_dev__packages') }} +), + +latest_versions as ( + select + package_artifact_source, + package_artifact_name, + package_github_owner as current_owner, + package_github_repo as current_repo + from deps_dev + qualify row_number() over ( + partition by package_artifact_name, package_artifact_source + order by package_version desc + ) = 1 +) + +select + deps_dev.package_artifact_source, + deps_dev.package_artifact_name, + deps_dev.package_version, + deps_dev.package_github_owner, + deps_dev.package_github_repo, + case + when deps_dev.package_artifact_source = 'CARGO' then 'RUST' + when deps_dev.package_artifact_source = 'NPM' then 'NPM' + when deps_dev.package_artifact_source = 'PYPI' then 'PIP' + when deps_dev.package_artifact_source = 'GO' then 'GO' + when deps_dev.package_artifact_source = 'MAVEN' then 'MAVEN' + when deps_dev.package_artifact_source = 'NUGET' then 'NUGET' + else 'UNKNOWN' + end as sbom_artifact_source, + ( + deps_dev.package_github_owner = latest_versions.current_owner + and deps_dev.package_github_repo = latest_versions.current_repo) + as is_current_owner +from deps_dev +left join latest_versions + on + deps_dev.package_artifact_source = latest_versions.package_artifact_source + and deps_dev.package_artifact_name = latest_versions.package_artifact_name diff --git a/warehouse/dbt/models/intermediate/directory/int_sbom_artifacts.sql b/warehouse/dbt/models/intermediate/directory/int_sbom_artifacts.sql index e1b1a646a..0427693d5 100644 --- a/warehouse/dbt/models/intermediate/directory/int_sbom_artifacts.sql +++ b/warehouse/dbt/models/intermediate/directory/int_sbom_artifacts.sql @@ -31,12 +31,23 @@ sbom_artifacts as ( snapshot_at from ranked_snapshots where row_num = 1 +), + +deps_dev_packages as ( + select distinct + sbom_artifact_source, + package_artifact_name, + package_github_owner, + package_github_repo + from {{ ref('int_packages') }} + where is_current_owner = true ) select {# Because we use repo.id as the artifact_source_id for github, we need to lookup the artifact_id for the SBOM repo. If the artifact is not found, this will return null. #} + all_repos.project_id, all_repos.artifact_id, sbom_artifacts.artifact_source, sbom_artifacts.artifact_namespace, @@ -44,10 +55,17 @@ select {# Because we only index packages that are found in OSSD, most of the time this will return a null package_artifact_id. #} + all_packages.project_id as package_project_id, all_packages.artifact_id as package_artifact_id, sbom_artifacts.package_source as package_artifact_source, + '' as package_artifact_namespace, sbom_artifacts.package as package_artifact_name, + 'GITHUB' as package_owner_source, sbom_artifacts.package_version as package_version, + deps_dev_packages.package_github_owner, + deps_dev_packages.package_github_repo, + deps_dev_repos.artifact_id as package_github_artifact_id, + deps_dev_repos.project_id as package_github_project_id, sbom_artifacts.snapshot_at from sbom_artifacts left outer join {{ ref('int_all_artifacts') }} as all_repos @@ -58,3 +76,11 @@ left outer join {{ ref('int_all_artifacts') }} as all_packages on sbom_artifacts.package = all_packages.artifact_name and sbom_artifacts.package_source = all_packages.artifact_source +left outer join deps_dev_packages + on + sbom_artifacts.package_source = deps_dev_packages.sbom_artifact_source + and sbom_artifacts.package = deps_dev_packages.package_artifact_name +left outer join {{ ref('int_all_artifacts') }} as deps_dev_repos + on + deps_dev_packages.package_github_owner = deps_dev_repos.artifact_namespace + and deps_dev_packages.package_github_repo = deps_dev_repos.artifact_name diff --git a/warehouse/dbt/models/marts/directory/package_owners_v0.sql b/warehouse/dbt/models/marts/directory/package_owners_v0.sql new file mode 100644 index 000000000..31128281b --- /dev/null +++ b/warehouse/dbt/models/marts/directory/package_owners_v0.sql @@ -0,0 +1,21 @@ +{{ + config( + materialized='table', + meta = { + 'sync_to_db': True, + } + ) +}} + +select distinct + package_project_id, + package_artifact_id, + package_artifact_source, + package_artifact_namespace, + package_artifact_name, + package_github_project_id as package_owner_project_id, + package_github_artifact_id as package_owner_artifact_id, + package_owner_source, + package_github_owner as package_owner_artifact_namespace, + package_github_repo as package_owner_artifact_name +from {{ ref('int_sbom_artifacts') }} diff --git a/warehouse/dbt/models/marts/directory/sboms_v0.sql b/warehouse/dbt/models/marts/directory/sboms_v0.sql new file mode 100644 index 000000000..e3d13915e --- /dev/null +++ b/warehouse/dbt/models/marts/directory/sboms_v0.sql @@ -0,0 +1,21 @@ +{{ + config( + materialized='table', + meta = { + 'sync_to_db': True, + } + ) +}} + +select distinct + project_id as from_project_id, + artifact_id as from_artifact_id, + artifact_source as from_artifact_source, + artifact_namespace as from_artifact_namespace, + artifact_name as from_artifact_name, + package_project_id as to_package_project_id, + package_artifact_id as to_package_artifact_id, + package_artifact_source as to_package_artifact_source, + package_artifact_namespace as to_package_artifact_namespace, + package_artifact_name as to_package_artifact_name +from {{ ref('int_sbom_artifacts') }} diff --git a/warehouse/dbt/models/staging/deps-dev/stg_deps_dev__packages.sql b/warehouse/dbt/models/staging/deps-dev/stg_deps_dev__packages.sql new file mode 100644 index 000000000..f78ca02a8 --- /dev/null +++ b/warehouse/dbt/models/staging/deps-dev/stg_deps_dev__packages.sql @@ -0,0 +1,40 @@ +{{ config( + materialized='incremental', + partition_by={ + 'field': 'SnapshotAt', + 'data_type': 'timestamp', + 'granularity': 'day' + }, +) }} + +{% set is_production = target.name == 'production' %} + +{% if is_production %} +with base as ( + select + `SnapshotAt`, + `System`, + `ProjectName`, + `Name`, + `Version` + from {{ source("deps_dev", "package_version_to_project_latest") }} + where + `ProjectName` is not null + and `ProjectType` = 'GITHUB' + and `RelationType` = 'SOURCE_REPO_TYPE' +) +{% if is_incremental() %} + select * from base + where `SnapshotAt` > (select max(`SnapshotAt`) from {{ this }}) +{% else %} + select * from base +{% endif %} +{% else %} + select + 'NPM' as `System`, + 'opensource-observer/oso' as `ProjectName`, + '@example/oso' as `Name`, + '0.0.16' as `Version`, + current_timestamp() as `SnapshotAt` + limit 1 +{% endif %}