Skip to content

Commit

Permalink
Add get_catalog_store_urls + get_github_commit_url (#23)
Browse files Browse the repository at this point in the history
* Add get_catalog_store_urls + get_github_commit_url

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix pre-commit stuff

* Bugfix in get_catalog_store_urls

* Make inject_attrs optional

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
jbusecke and pre-commit-ci[bot] authored Apr 22, 2024
1 parent b5762a1 commit 4bf7925
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 3 deletions.
61 changes: 58 additions & 3 deletions leap_data_management_utils/data_management_transforms.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,52 @@
# Note: All of this code was written by Julius Busecke and copied from this feedstock:
# https://github.com/leap-stc/cmip6-leap-feedstock/blob/main/feedstock/recipe.py#L262

import datetime
import subprocess
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Optional

import apache_beam as beam
import zarr
from google.api_core.exceptions import NotFound
from google.cloud import bigquery
from ruamel.yaml import YAML

yaml = YAML(typ='safe')


def get_github_commit_url() -> Optional[str]:
    """Return the GitHub URL of the currently checked-out commit, if available.

    The repository path is derived from ``git config --get remote.origin.url``
    and the commit SHA from ``git rev-parse HEAD``. Both commands are run in the
    current working directory.

    Returns:
        ``https://github.com/<repository path>/commit/<sha>`` on success, or
        ``None`` when a git command exits with a non-zero status or the ``git``
        executable cannot be found.
    """
    github_server_url = 'https://github.com'

    try:
        # Get the repository's remote origin URL
        repo_origin_url = subprocess.check_output(
            ['git', 'config', '--get', 'remote.origin.url'], text=True
        ).strip()

        # Get the current commit SHA
        commit_sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'], text=True).strip()
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        # CalledProcessError: git ran but failed (e.g. no repository, no remote).
        # FileNotFoundError: the git executable itself is missing.
        print('Error executing Git command:', e)
        return None

    # Extract the repository path from the remote URL. HTTPS remotes look like
    # "https://github.com/owner/repo.git", SSH remotes like
    # "git@github.com:owner/repo.git"; anything else is passed through as-is.
    repository_path = repo_origin_url
    for separator in ('github.com/', 'github.com:'):
        if separator in repo_origin_url:
            repository_path = repo_origin_url.split(separator)[-1]
            break

    # Only drop a trailing ".git"; a plain replace would also mangle names that
    # merely contain ".git", such as "owner/site.github.io".
    repository_path = repository_path.removesuffix('.git')

    # Construct the GitHub commit URL
    return f'{github_server_url}/{repository_path}/commit/{commit_sha}'


def get_catalog_store_urls(catalog_yaml_path: str) -> dict[str, str]:
    """Map every store id in a catalog YAML file to that store's URL.

    The file at ``catalog_yaml_path`` is parsed with the module-level ``yaml``
    loader. Its top-level ``stores`` entry is iterated, and each item
    contributes one ``id -> url`` pair; any other keys on an item are ignored.

    Returns:
        A dict keyed by each store's ``id`` with the matching ``url`` as value.
    """
    with open(catalog_yaml_path) as catalog_file:
        parsed_catalog = yaml.load(catalog_file)

    urls_by_id = {}
    for store in parsed_catalog['stores']:
        # Look up the id before the url, as a dict display would.
        store_id = store['id']
        urls_by_id[store_id] = store['url']
    return urls_by_id


@dataclass
Expand Down Expand Up @@ -54,7 +92,7 @@ def _get_table(self) -> bigquery.table.Table:
return self.client.get_table(self.table_id)

def insert(self, fields: dict = {}):
timestamp = datetime.datetime.now().isoformat()
timestamp = datetime.now().isoformat()

rows_to_insert = [
fields | {'timestamp': timestamp} # timestamp is always overridden
Expand Down Expand Up @@ -120,6 +158,8 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:

@dataclass
class Copy(beam.PTransform):
"""Copy a store to a new location. If the target input is False, do nothing."""

target: str

def _copy(self, store: zarr.storage.FSStore) -> zarr.storage.FSStore:
Expand Down Expand Up @@ -147,7 +187,22 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:

@dataclass
class InjectAttrs(beam.PTransform):
inject_attrs: dict
inject_attrs: dict = None
add_provenance: bool = True

# add a post_init method to add the provenance attributes
def __post_init__(self):
    """Normalize ``inject_attrs`` and optionally add provenance attributes.

    ``inject_attrs`` defaults to an empty dict when it was left as ``None``.
    When ``add_provenance`` is true, two keys are added (overwriting any
    existing values under the same names):

    * ``pangeo_forge_build_git_hash``: result of ``get_github_commit_url()``
      (which may be ``None``).
    * ``pangeo_forge_build_timestamp``: current UTC time in ISO 8601 format.
    """
    # Work on a private copy so the provenance keys added below never leak
    # into the dict object the caller passed in.
    self.inject_attrs = {} if self.inject_attrs is None else dict(self.inject_attrs)

    if self.add_provenance:
        self.inject_attrs.update(
            {
                'pangeo_forge_build_git_hash': get_github_commit_url(),
                'pangeo_forge_build_timestamp': datetime.now(timezone.utc).isoformat(),
            }
        )

def _update_zarr_attrs(self, store: zarr.storage.FSStore) -> zarr.storage.FSStore:
# TODO: Can we get a warning here if the store does not exist?
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,36 @@
from ruamel.yaml import YAML

from leap_data_management_utils.data_management_transforms import (
get_catalog_store_urls,
get_github_commit_url,
)

yaml = YAML(typ='safe')


def test_smoke_test():
    """Trivial check whose only value is that this module's imports resolve.

    Code involving BigQuery is very hard to exercise here, so nothing beyond
    the imports is verified.
    """
    assert True


def test_get_github_commit_url():
    """The commit URL must point into this project's GitHub repository."""
    expected_prefix = 'https://github.com/leap-stc/leap-data-management-utils'
    commit_url = get_github_commit_url()
    assert commit_url.startswith(expected_prefix)


def test_get_catalog_store_urls(tmp_path):
    """Round-trip a small catalog through YAML and check the id -> url mapping."""
    # Write a two-store catalog to a temporary YAML file
    catalog_path = tmp_path / 'some-name.yaml'
    catalog = {
        'stores': [{'id': 'a', 'url': 'a-url', 'some_other': 'stuff'}, {'id': 'b', 'url': 'b-url'}]
    }
    with open(catalog_path, 'w') as handle:
        yaml.dump(catalog, handle)

    # Read the file back through the function under test
    store_urls = get_catalog_store_urls(catalog_path)

    # Each store id must map to its url; extra keys on a store are irrelevant
    assert store_urls['a'] == 'a-url'
    assert store_urls['b'] == 'b-url'
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ dependencies = [
"pydantic-core",
"pydantic>=2",
"pyyaml",
"ruamel.yaml",
"universal-pathlib",
"xarray",
"zarr",
Expand Down

0 comments on commit 4bf7925

Please sign in to comment.