From c34cdb258e67224c4b0c5818ac3d6e8570caf7cd Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 17 Apr 2024 11:29:27 -0600 Subject: [PATCH 1/3] added import of Copy and InjectAttrs to data_management_transforms --- feedstock/recipe.py | 45 +------------------------------------- feedstock/requirements.txt | 1 + 2 files changed, 2 insertions(+), 44 deletions(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 0049567..862988a 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -2,12 +2,11 @@ A synthetic prototype recipe """ -import zarr import os -from dataclasses import dataclass from typing import List, Dict import apache_beam as beam from datetime import datetime, timezone +from leap_data_management_utils.data_management_transforms import Copy, InjectAttrs from pangeo_forge_recipes.patterns import pattern_from_file_sequence from pangeo_forge_recipes.transforms import ( OpenURLWithFSSpec, @@ -21,48 +20,6 @@ yaml = YAML(typ="safe") -# copied from cmip feedstock (TODO: move to central repo?) -@dataclass -class Copy(beam.PTransform): - target: str - - def _copy(self, store: zarr.storage.FSStore) -> zarr.storage.FSStore: - import os - import zarr - import gcsfs - - # We do need the gs:// prefix? - # TODO: Determine this dynamically from zarr.storage.FSStore - source = f"gs://{os.path.normpath(store.path)}/" # FIXME more elegant. `.copytree` needs trailing slash - fs = gcsfs.GCSFileSystem() # FIXME: How can we generalize this? - fs.cp(source, self.target, recursive=True) - # return a new store with the new path that behaves exactly like the input - # to this stage (so we can slot this stage right before testing/logging stages) - return zarr.storage.FSStore(self.target) - - def expand(self, pcoll: beam.PCollection) -> beam.PCollection: - return pcoll | "Copying Store" >> beam.Map(self._copy) - - -@dataclass -class InjectAttrs(beam.PTransform): - inject_attrs: dict - - def _update_zarr_attrs(self, store: zarr.storage.FSStore) -> zarr.storage.FSStore: - # TODO: Can we get a warning here if the store does not exist? - attrs = zarr.open(store, mode="a").attrs - attrs.update(self.inject_attrs) - # ? Should we consolidate here? We are explicitly doing that later... - return store - - def expand( - self, pcoll: beam.PCollection[zarr.storage.FSStore] - ) -> beam.PCollection[zarr.storage.FSStore]: - return pcoll | "Injecting Attributes" >> beam.Map(self._update_zarr_attrs) - - -# TODO: Both these stages are generally useful. They should at least be in the utils package, maybe in recipes? - # load the global config values (we will have to decide where these ultimately live) catalog_meta = yaml.load(open("feedstock/catalog.yaml")) diff --git a/feedstock/requirements.txt b/feedstock/requirements.txt index d542980..36db6f8 100644 --- a/feedstock/requirements.txt +++ b/feedstock/requirements.txt @@ -1,3 +1,4 @@ pangeo-forge-recipes==0.10.7 apache-beam[gcp] gcsfs +git+https://github.com/leap-stc/leap-data-management-utils@proto_transform From 6879fc839cefdae137febda6ec810d6f007724f3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 17 Apr 2024 17:31:04 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6642556..29498c0 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Click on the button on the top left to use this repository as a template for you >[!WARNING] > - Make sure to create the repo under the `leap-stc` github organization, not your personal account! If you already did that, you can always transfer the ownership afterwards. -> - Name your feedstock according to your data `_feedstock`. +> - Name your feedstock according to your data `_feedstock`. Now you can locally check out the repository. From e18a2b0d0ae91d017b475293ee194bd7c228ea84 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Wed, 17 Apr 2024 14:30:22 -0400 Subject: [PATCH 3/3] Update requirements.txt --- feedstock/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/requirements.txt b/feedstock/requirements.txt index 36db6f8..aa9503b 100644 --- a/feedstock/requirements.txt +++ b/feedstock/requirements.txt @@ -1,4 +1,4 @@ pangeo-forge-recipes==0.10.7 apache-beam[gcp] gcsfs -git+https://github.com/leap-stc/leap-data-management-utils@proto_transform +leap-data-management-utils==0.0.3