-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add:
npm
ownership intermediate model (#2520)
* add: npm `manifests` dlt connector * add: npm `manifests` and `downloads` staging sources * add: dbt macro to parse `npm` git URLs for remote information * add: `artifact_ownership` intermediate model
- Loading branch information
Showing
6 changed files
with
270 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
{% macro parse_npm_git_url(key, source) %} | ||
|
||
select | ||
*, | ||
|
||
case | ||
when regexp_contains({{ key }}, r'^git\+ssh://') then | ||
regexp_replace({{ key }}, r'^git\+ssh://([^@]+)@', 'https://') | ||
when regexp_contains({{ key }}, r'^git@') then | ||
regexp_replace({{ key }}, r'^git@(.*?):', 'https://\\1/') | ||
when regexp_contains({{ key }}, r'^git\+https://') then | ||
regexp_replace({{ key }}, r'^git\+', '') | ||
when regexp_contains({{ key }}, r'^https?://') then | ||
{{ key }} | ||
when regexp_contains({{ key }}, r'^[^:/]+\.[^:/]+/') then | ||
concat('https://', {{ key }}) | ||
else null | ||
end as remote_url, | ||
|
||
regexp_extract( | ||
case | ||
when regexp_contains({{ key }}, r'\.git$') then | ||
regexp_replace({{ key }}, r'\.git$', '') | ||
else {{ key }} | ||
end, | ||
r'/([^/]+)$' | ||
) as remote_name, | ||
|
||
regexp_extract( | ||
case | ||
when regexp_contains({{ key }}, r'^git@') then | ||
regexp_replace({{ key }}, r'^git@(.*?):', 'https://\\1/') | ||
when regexp_contains({{ key }}, r'^git\+ssh://') then | ||
regexp_replace({{ key }}, r'^git\+ssh://', 'https://') | ||
else {{ key }} | ||
end, | ||
r'https?:\/\/[^\/]+\/([^\/]+)\/[^\/]+$' | ||
) as remote_namespace, | ||
|
||
case | ||
when regexp_contains({{ key }}, r'github\.com') then 'GITHUB' | ||
when regexp_contains({{ key }}, r'gitlab\.com') then 'GITLAB' | ||
when regexp_contains({{ key }}, r'bitbucket\.org') then 'BITBUCKET' | ||
else 'OTHER' | ||
end as remote_source_id | ||
|
||
from {{ source }} | ||
|
||
{% endmacro %} |
39 changes: 39 additions & 0 deletions
39
warehouse/dbt/models/intermediate/directory/int_artifact_ownership.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
with npm_artifacts as ( | ||
select artifact_name | ||
from {{ ref('artifacts_v1') }} | ||
where artifact_source = 'NPM' | ||
), | ||
|
||
npm_manifests as ( | ||
select | ||
`name`, | ||
repository__url, | ||
repository__type, | ||
concat('https://www.npmjs.com/package/', `name`) as artifact_url | ||
from {{ ref('stg_npm__manifests') }} | ||
where | ||
`name` in (select * from npm_artifacts) | ||
and repository__url is not null | ||
), | ||
|
||
npm_repository_urls as ( | ||
{{ parse_npm_git_url('repository__url', 'npm_manifests') }} | ||
), | ||
|
||
npm_artifact_ownership as ( | ||
select | ||
{{ oso_id( | ||
"'NPM'", | ||
"artifact_url", | ||
) }} as artifact_id, | ||
artifact_url, | ||
`name` as artifact_name, | ||
'NPM' as artifact_source_id, | ||
remote_url, | ||
remote_name, | ||
remote_namespace, | ||
remote_source_id | ||
from npm_repository_urls | ||
) | ||
|
||
select * from npm_artifact_ownership |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
sources: | ||
- name: npm | ||
database: opensource-observer | ||
schema: npm | ||
tables: | ||
- name: downloads | ||
identifier: downloads | ||
- name: manifests | ||
identifier: manifests |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
with source as ( | ||
select | ||
`date`, | ||
artifact_name, | ||
downloads | ||
from {{ source('npm', 'downloads') }} | ||
) | ||
|
||
select * from source |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
{% set columns = [ | ||
"name", "version", "description", "keywords", "homepage", "bugs", | ||
"license", "author", "contributors", "funding", "files", "exports", | ||
"main", "browser", "bin", "man", "directories", "repository", | ||
"scripts", "config", "dependencies", "dev_dependencies", | ||
"peer_dependencies", "peer_dependencies_meta", "bundle_dependencies", | ||
"optional_dependencies", "overrides", "engines", "os", "cpu", | ||
"dev_engines", "private", "publish_config", "workspaces", "bugs__url", | ||
"repository__url", "repository__type", "author__url", "author__name", | ||
"author__email" | ||
] %} | ||
|
||
with source as ( | ||
select * from {{ source('npm', 'manifests') }} | ||
), | ||
|
||
renamed as ( | ||
select | ||
{% for column in columns %} | ||
{{ adapter.quote(column) }}{% if not loop.last %},{% endif %} | ||
{% endfor %} | ||
from source | ||
) | ||
|
||
select * from renamed |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
from datetime import datetime, timedelta | ||
from typing import Generator, List, Optional | ||
from typing import Dict, Generator, List, Optional, Union | ||
|
||
import dlt | ||
import requests | ||
|
@@ -9,22 +9,61 @@ | |
|
||
from ..factories.dlt import dlt_factory, pydantic_to_dlt_nullable_columns | ||
|
||
# Host for the NPM API | ||
NPM_API_HOST = "https://api.npmjs.org" | ||
|
||
# Host for the NPM registry | ||
NPM_HOST = "https://api.npmjs.org" | ||
NPM_REGISTRY_HOST = "https://registry.npmjs.org" | ||
|
||
# NPM was launched on January 12, 2010 | ||
NPM_EPOCH = "2010-01-12T00:00:00Z" | ||
# https://github.com/npm/registry/blob/main/docs/download-counts.md#limits | ||
NPM_EPOCH = "2015-01-10T00:00:00Z" | ||
|
||
|
||
class NPMPackageInfo(BaseModel): | ||
class NPMPackageDownloadInfo(BaseModel): | ||
date: datetime | ||
artifact_name: str | ||
downloads: int | ||
|
||
|
||
class NPMPackageManifest(BaseModel): | ||
name: Optional[str] = None | ||
version: Optional[str] = None | ||
description: Optional[str] = None | ||
keywords: Optional[List] = None | ||
homepage: Optional[str] = None | ||
bugs: Optional[Union[str, Dict]] = None | ||
license: Optional[str] = None | ||
author: Optional[Union[str, Dict]] = None | ||
contributors: Optional[List] = None | ||
funding: Optional[Union[str, Dict, List]] = None | ||
files: Optional[List] = None | ||
exports: Optional[Dict] = None | ||
main: Optional[str] = None | ||
browser: Optional[bool] = None | ||
man: Optional[Union[str, Dict, List]] = None | ||
directories: Optional[Dict] = None | ||
repository: Optional[Union[str, Dict]] = None | ||
scripts: Optional[Dict] = None | ||
config: Optional[Dict] = None | ||
dependencies: Optional[Dict] = None | ||
devDependencies: Optional[Dict] = None | ||
peerDependencies: Optional[Dict] = None | ||
peerDependenciesMeta: Optional[Dict] = None | ||
bundleDependencies: Optional[List] = None | ||
optionalDependencies: Optional[Dict] = None | ||
overrides: Optional[Dict] = None | ||
engines: Optional[Dict] = None | ||
os: Optional[List] = None | ||
cpu: Optional[List] = None | ||
devEngines: Optional[Dict] = None | ||
private: Optional[bool] = None | ||
publishConfig: Optional[Dict] = None | ||
workspaces: Optional[List] = None | ||
|
||
|
||
def get_npm_package_downloads( | ||
package_name: str, date_from: datetime, date_to: datetime | ||
) -> Generator[Optional[NPMPackageInfo], None, None]: | ||
) -> Generator[Optional[NPMPackageDownloadInfo], None, None]: | ||
""" | ||
Fetches the download count for an NPM package between two dates. | ||
|
@@ -34,13 +73,13 @@ def get_npm_package_downloads( | |
date_to (datetime): The end date | ||
Yields: | ||
Optional[NPMPackageInfo]: The download count for the package | ||
Optional[NPMPackageDownloadInfo]: The download count for the package | ||
""" | ||
|
||
str_from = date_from.strftime("%Y-%m-%d") | ||
str_to = date_to.strftime("%Y-%m-%d") | ||
|
||
endpoint = f"{NPM_HOST}/downloads/range/{str_from}:{str_to}/{package_name}" | ||
endpoint = f"{NPM_API_HOST}/downloads/range/{str_from}:{str_to}/{package_name}" | ||
response = requests.get( | ||
endpoint, | ||
timeout=10, | ||
|
@@ -89,7 +128,7 @@ def get_npm_package_downloads( | |
total_downloads = sum(download["downloads"] for download in data["downloads"]) | ||
|
||
yield ( | ||
NPMPackageInfo( | ||
NPMPackageDownloadInfo( | ||
date=date_from, | ||
artifact_name=package_name, | ||
downloads=total_downloads, | ||
|
@@ -99,10 +138,44 @@ def get_npm_package_downloads( | |
) | ||
|
||
|
||
def get_npm_package_manifest( | ||
package_name: str, | ||
) -> Generator[Optional[NPMPackageManifest], None, None]: | ||
""" | ||
Fetches the manifest for an NPM package. | ||
Args: | ||
context (AssetExecutionContext): The asset execution context | ||
package_name (str): The NPM package name | ||
Yields: | ||
Optional[NPMPackageManifest]: The manifest for the package | ||
""" | ||
|
||
endpoint = f"{NPM_REGISTRY_HOST}/{package_name}/latest" | ||
response = requests.get( | ||
endpoint, | ||
timeout=10, | ||
headers={ | ||
"X-URL": "https://github.com/opensource-observer/oso", | ||
"X-Contact": "[email protected]", | ||
"X-Purpose": "We are currently indexing NPM packages to provide dependency statistics. " | ||
"If you have any questions or concerns, please contact us", | ||
}, | ||
) | ||
|
||
data = response.json() | ||
|
||
if not response.ok: | ||
raise ValueError(f"Failed to fetch data for {package_name}: {response.text}") | ||
|
||
yield NPMPackageManifest(**data) | ||
|
||
|
||
@dlt.resource( | ||
primary_key="artifact_name", | ||
name="downloads", | ||
columns=pydantic_to_dlt_nullable_columns(NPMPackageInfo), | ||
columns=pydantic_to_dlt_nullable_columns(NPMPackageDownloadInfo), | ||
) | ||
def get_all_downloads( | ||
context: AssetExecutionContext, | ||
|
@@ -117,7 +190,7 @@ def get_all_downloads( | |
package_names (List[str]): List of NPM package names to fetch | ||
Yields: | ||
List[NPMPackageInfo]: The download count for each package | ||
List[NPMPackageDownloadInfo]: The download count for each package | ||
""" | ||
|
||
start = datetime.strptime(context.partition_key, "%Y-%m-%d") | ||
|
@@ -134,6 +207,33 @@ def get_all_downloads( | |
) | ||
|
||
|
||
@dlt.resource( | ||
primary_key="name", | ||
name="manifests", | ||
columns=pydantic_to_dlt_nullable_columns(NPMPackageManifest), | ||
) | ||
def get_all_manifests( | ||
context: AssetExecutionContext, | ||
package_names: List, | ||
): | ||
""" | ||
Fetches the manifest for a list of NPM packages. | ||
Args: | ||
context (AssetExecutionContext): The asset execution | ||
package_names (List): List of NPM package names to fetch | ||
Yields: | ||
List[NPMPackageManifest]: The manifest for each package | ||
""" | ||
|
||
context.log.info(f"Processing NPM manifests for {len(package_names)} packages") | ||
|
||
yield from ( | ||
get_npm_package_manifest(package_name) for package_name in package_names | ||
) | ||
|
||
|
||
@dlt_factory( | ||
key_prefix="npm", | ||
partitions_def=WeeklyPartitionsDefinition( | ||
|
@@ -164,3 +264,31 @@ def downloads( | |
for row in client.query_with_string(unique_artifacts_query) | ||
], | ||
) | ||
|
||
|
||
@dlt_factory( | ||
key_prefix="npm", | ||
deps=[AssetKey(["dbt", "production", "artifacts_v1"])], | ||
) | ||
def manifests( | ||
context: AssetExecutionContext, | ||
cbt: CBTResource, | ||
): | ||
unique_artifacts_query = """ | ||
SELECT | ||
DISTINCT(artifact_name) | ||
FROM | ||
`oso.artifacts_v1` | ||
WHERE | ||
artifact_source = "NPM" | ||
""" | ||
|
||
client = cbt.get(context.log) | ||
|
||
yield get_all_manifests( | ||
context, | ||
package_names=[ | ||
row["artifact_name"] | ||
for row in client.query_with_string(unique_artifacts_query) | ||
], | ||
) |