Skip to content

Commit

Permalink
add: npm ownership intermediate model (#2520)
Browse files Browse the repository at this point in the history
* add: npm `manifests` dlt connector

* add: npm `manifests` and `downloads` staging sources

* add: dbt macro to parse `npm` git URLs for remote information

* add: `artifact_ownership` intermediate model
  • Loading branch information
Jabolol authored Nov 26, 2024
1 parent 3a912db commit dcceca0
Show file tree
Hide file tree
Showing 6 changed files with 270 additions and 11 deletions.
49 changes: 49 additions & 0 deletions warehouse/dbt/macros/models/parse_npm_git_url.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
{% macro parse_npm_git_url(key, source) %}

select
*,

case
when regexp_contains({{ key }}, r'^git\+ssh://') then
regexp_replace({{ key }}, r'^git\+ssh://([^@]+)@', 'https://')
when regexp_contains({{ key }}, r'^git@') then
regexp_replace({{ key }}, r'^git@(.*?):', 'https://\\1/')
when regexp_contains({{ key }}, r'^git\+https://') then
regexp_replace({{ key }}, r'^git\+', '')
when regexp_contains({{ key }}, r'^https?://') then
{{ key }}
when regexp_contains({{ key }}, r'^[^:/]+\.[^:/]+/') then
concat('https://', {{ key }})
else null
end as remote_url,

regexp_extract(
case
when regexp_contains({{ key }}, r'\.git$') then
regexp_replace({{ key }}, r'\.git$', '')
else {{ key }}
end,
r'/([^/]+)$'
) as remote_name,

regexp_extract(
case
when regexp_contains({{ key }}, r'^git@') then
regexp_replace({{ key }}, r'^git@(.*?):', 'https://\\1/')
when regexp_contains({{ key }}, r'^git\+ssh://') then
regexp_replace({{ key }}, r'^git\+ssh://', 'https://')
else {{ key }}
end,
r'https?:\/\/[^\/]+\/([^\/]+)\/[^\/]+$'
) as remote_namespace,

case
when regexp_contains({{ key }}, r'github\.com') then 'GITHUB'
when regexp_contains({{ key }}, r'gitlab\.com') then 'GITLAB'
when regexp_contains({{ key }}, r'bitbucket\.org') then 'BITBUCKET'
else 'OTHER'
end as remote_source_id

from {{ source }}

{% endmacro %}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
with npm_artifacts as (
select artifact_name
from {{ ref('artifacts_v1') }}
where artifact_source = 'NPM'
),

npm_manifests as (
select
`name`,
repository__url,
repository__type,
concat('https://www.npmjs.com/package/', `name`) as artifact_url
from {{ ref('stg_npm__manifests') }}
where
`name` in (select * from npm_artifacts)
and repository__url is not null
),

npm_repository_urls as (
{{ parse_npm_git_url('repository__url', 'npm_manifests') }}
),

npm_artifact_ownership as (
select
{{ oso_id(
"'NPM'",
"artifact_url",
) }} as artifact_id,
artifact_url,
`name` as artifact_name,
'NPM' as artifact_source_id,
remote_url,
remote_name,
remote_namespace,
remote_source_id
from npm_repository_urls
)

select * from npm_artifact_ownership
9 changes: 9 additions & 0 deletions warehouse/dbt/models/npm_sources.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
sources:
- name: npm
database: opensource-observer
schema: npm
tables:
- name: downloads
identifier: downloads
- name: manifests
identifier: manifests
9 changes: 9 additions & 0 deletions warehouse/dbt/models/staging/npm/stg_npm__downloads.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
with source as (
select
`date`,
artifact_name,
downloads
from {{ source('npm', 'downloads') }}
)

select * from source
25 changes: 25 additions & 0 deletions warehouse/dbt/models/staging/npm/stg_npm__manifests.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{% set columns = [
"name", "version", "description", "keywords", "homepage", "bugs",
"license", "author", "contributors", "funding", "files", "exports",
"main", "browser", "bin", "man", "directories", "repository",
"scripts", "config", "dependencies", "dev_dependencies",
"peer_dependencies", "peer_dependencies_meta", "bundle_dependencies",
"optional_dependencies", "overrides", "engines", "os", "cpu",
"dev_engines", "private", "publish_config", "workspaces", "bugs__url",
"repository__url", "repository__type", "author__url", "author__name",
"author__email"
] %}

with source as (
select * from {{ source('npm', 'manifests') }}
),

renamed as (
select
{% for column in columns %}
{{ adapter.quote(column) }}{% if not loop.last %},{% endif %}
{% endfor %}
from source
)

select * from renamed
150 changes: 139 additions & 11 deletions warehouse/oso_dagster/assets/npm.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from datetime import datetime, timedelta
from typing import Generator, List, Optional
from typing import Dict, Generator, List, Optional, Union

import dlt
import requests
Expand All @@ -9,22 +9,61 @@

from ..factories.dlt import dlt_factory, pydantic_to_dlt_nullable_columns

# Host for the NPM API
NPM_API_HOST = "https://api.npmjs.org"

# Host for the NPM registry
NPM_HOST = "https://api.npmjs.org"
NPM_REGISTRY_HOST = "https://registry.npmjs.org"

# NPM was launched on January 12, 2010
NPM_EPOCH = "2010-01-12T00:00:00Z"
# https://github.com/npm/registry/blob/main/docs/download-counts.md#limits
NPM_EPOCH = "2015-01-10T00:00:00Z"


class NPMPackageInfo(BaseModel):
class NPMPackageDownloadInfo(BaseModel):
date: datetime
artifact_name: str
downloads: int


class NPMPackageManifest(BaseModel):
name: Optional[str] = None
version: Optional[str] = None
description: Optional[str] = None
keywords: Optional[List] = None
homepage: Optional[str] = None
bugs: Optional[Union[str, Dict]] = None
license: Optional[str] = None
author: Optional[Union[str, Dict]] = None
contributors: Optional[List] = None
funding: Optional[Union[str, Dict, List]] = None
files: Optional[List] = None
exports: Optional[Dict] = None
main: Optional[str] = None
browser: Optional[bool] = None
man: Optional[Union[str, Dict, List]] = None
directories: Optional[Dict] = None
repository: Optional[Union[str, Dict]] = None
scripts: Optional[Dict] = None
config: Optional[Dict] = None
dependencies: Optional[Dict] = None
devDependencies: Optional[Dict] = None
peerDependencies: Optional[Dict] = None
peerDependenciesMeta: Optional[Dict] = None
bundleDependencies: Optional[List] = None
optionalDependencies: Optional[Dict] = None
overrides: Optional[Dict] = None
engines: Optional[Dict] = None
os: Optional[List] = None
cpu: Optional[List] = None
devEngines: Optional[Dict] = None
private: Optional[bool] = None
publishConfig: Optional[Dict] = None
workspaces: Optional[List] = None


def get_npm_package_downloads(
package_name: str, date_from: datetime, date_to: datetime
) -> Generator[Optional[NPMPackageInfo], None, None]:
) -> Generator[Optional[NPMPackageDownloadInfo], None, None]:
"""
Fetches the download count for an NPM package between two dates.
Expand All @@ -34,13 +73,13 @@ def get_npm_package_downloads(
date_to (datetime): The end date
Yields:
Optional[NPMPackageInfo]: The download count for the package
Optional[NPMPackageDownloadInfo]: The download count for the package
"""

str_from = date_from.strftime("%Y-%m-%d")
str_to = date_to.strftime("%Y-%m-%d")

endpoint = f"{NPM_HOST}/downloads/range/{str_from}:{str_to}/{package_name}"
endpoint = f"{NPM_API_HOST}/downloads/range/{str_from}:{str_to}/{package_name}"
response = requests.get(
endpoint,
timeout=10,
Expand Down Expand Up @@ -89,7 +128,7 @@ def get_npm_package_downloads(
total_downloads = sum(download["downloads"] for download in data["downloads"])

yield (
NPMPackageInfo(
NPMPackageDownloadInfo(
date=date_from,
artifact_name=package_name,
downloads=total_downloads,
Expand All @@ -99,10 +138,44 @@ def get_npm_package_downloads(
)


def get_npm_package_manifest(
package_name: str,
) -> Generator[Optional[NPMPackageManifest], None, None]:
"""
Fetches the manifest for an NPM package.
Args:
context (AssetExecutionContext): The asset execution context
package_name (str): The NPM package name
Yields:
Optional[NPMPackageManifest]: The manifest for the package
"""

endpoint = f"{NPM_REGISTRY_HOST}/{package_name}/latest"
response = requests.get(
endpoint,
timeout=10,
headers={
"X-URL": "https://github.com/opensource-observer/oso",
"X-Contact": "[email protected]",
"X-Purpose": "We are currently indexing NPM packages to provide dependency statistics. "
"If you have any questions or concerns, please contact us",
},
)

data = response.json()

if not response.ok:
raise ValueError(f"Failed to fetch data for {package_name}: {response.text}")

yield NPMPackageManifest(**data)


@dlt.resource(
primary_key="artifact_name",
name="downloads",
columns=pydantic_to_dlt_nullable_columns(NPMPackageInfo),
columns=pydantic_to_dlt_nullable_columns(NPMPackageDownloadInfo),
)
def get_all_downloads(
context: AssetExecutionContext,
Expand All @@ -117,7 +190,7 @@ def get_all_downloads(
package_names (List[str]): List of NPM package names to fetch
Yields:
List[NPMPackageInfo]: The download count for each package
List[NPMPackageDownloadInfo]: The download count for each package
"""

start = datetime.strptime(context.partition_key, "%Y-%m-%d")
Expand All @@ -134,6 +207,33 @@ def get_all_downloads(
)


@dlt.resource(
primary_key="name",
name="manifests",
columns=pydantic_to_dlt_nullable_columns(NPMPackageManifest),
)
def get_all_manifests(
context: AssetExecutionContext,
package_names: List,
):
"""
Fetches the manifest for a list of NPM packages.
Args:
context (AssetExecutionContext): The asset execution
package_names (List): List of NPM package names to fetch
Yields:
List[NPMPackageManifest]: The manifest for each package
"""

context.log.info(f"Processing NPM manifests for {len(package_names)} packages")

yield from (
get_npm_package_manifest(package_name) for package_name in package_names
)


@dlt_factory(
key_prefix="npm",
partitions_def=WeeklyPartitionsDefinition(
Expand Down Expand Up @@ -164,3 +264,31 @@ def downloads(
for row in client.query_with_string(unique_artifacts_query)
],
)


@dlt_factory(
key_prefix="npm",
deps=[AssetKey(["dbt", "production", "artifacts_v1"])],
)
def manifests(
context: AssetExecutionContext,
cbt: CBTResource,
):
unique_artifacts_query = """
SELECT
DISTINCT(artifact_name)
FROM
`oso.artifacts_v1`
WHERE
artifact_source = "NPM"
"""

client = cbt.get(context.log)

yield get_all_manifests(
context,
package_names=[
row["artifact_name"]
for row in client.query_with_string(unique_artifacts_query)
],
)

0 comments on commit dcceca0

Please sign in to comment.