From ffdb901a5cd3d571577990f2be4fd951ebcf42de Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Wed, 26 Apr 2023 13:05:40 -0500 Subject: [PATCH 01/44] PathLike wrapper/cache for ExternalStorage --- gufe/storage/pseudodirectory.py | 156 +++++++++++++++++++++ gufe/tests/storage/test_pseudodirectory.py | 107 ++++++++++++++ 2 files changed, 263 insertions(+) create mode 100644 gufe/storage/pseudodirectory.py create mode 100644 gufe/tests/storage/test_pseudodirectory.py diff --git a/gufe/storage/pseudodirectory.py b/gufe/storage/pseudodirectory.py new file mode 100644 index 00000000..b595dafa --- /dev/null +++ b/gufe/storage/pseudodirectory.py @@ -0,0 +1,156 @@ +from typing import Union, Optional +from pathlib import Path +from os import PathLike +from .externalresource import ExternalStorage + +import logging +_logger = logging.getLogger(__name__) + +class SharedRoot: + """ + Parameters + ---------- + scratch : os.PathLike + the scratch directory shared by all objects on this host + external : :class:`.ExternalStorage` + external storage resource where objects should eventualy go + prefix : str + label for this specific unit + holding : os.PathLike + name of the subdirectory of scratch where shared results are + temporarily stored; default is '.holding'. This must be the same for + all units within a DAG. + delete_holding : bool + whether to delete the contents of the $SCRATCH/$HOLDING/$PREFIX + directory when this object is deleted + read_only : bool + write to prevent NEW files from being written within this shared + directory. NOTE: This will not prevent overwrite of existing files, + in scratch space, but it will prevent changed files from uploading + to the external storage. + """ + def __init__( + self, + scratch: PathLike, + external: ExternalStorage, + prefix: str, + *, + holding: PathLike =".holding", + delete_holding: bool = True, + read_only: bool = False, + ): + self.external = external + self.scratch = Path(scratch) + self.prefix = Path(prefix) + self.read_only = read_only + self.delete_holding = delete_holding + self.holding = holding + + self.registry = set() + # NOTE: the fact that we use $SCRATCH/$HOLDING/$PREFIX instead of + # $SCRATCH/$PREFIX/$HOLDING is important for 2 reasons: + # 1. This doesn't take any of the user's namespace from their + # $SCRATCH/$PREFIX directory. + # 2. This allows us to easily use an external FileStorage where the + # external storage is exactly the same as this local storage, + # meaning that copies to/from the external storage are no-ops. + # Use FileStorage(scratch / holding) for that. + self.shared_dir = self.scratch / holding / prefix + self.shared_dir.mkdir(exist_ok=True, parents=True) + + def get_other_shared_dir(self, prefix, delete_holding=None): + """Get a related unit's shared directory. + """ + if delete_holding is None: + delete_holding = self.delete_holding + + return SharedRoot( + scratch=self.scratch, + external=self.external, + prefix=prefix, + holding=self.holding, + delete_holding=delete_holding, + read_only=True, + ) + + def transfer_holding_to_external(self): + """Transfer all objects in the registry to external storage""" + if self.read_only: + logging.debug("Read-only: Not transfering to external storage") + return # early exit + + for obj in self.registry: + path = Path(obj) + if not path.exists(): + logging.info(f"Found nonexistent path {path}, not " + "transfering to external storage") + elif path.is_dir(): + logging.debug(f"Found directory {path}, not " + "transfering to external storage") + else: + logging.info(f"Transfering {path} to external storage") + self.external.store_path(obj.label, path) + + def __del__(self): + # take everything in self.shared_dir and write to it shared; keeping + # our prefix + self.transfer_holding_to_external() + if self.delete_holding: + shutil.rmtree(self.shared_dir) + + def register_path(self, shared_path): + label_exists = self.external.exists(shared_path.label) + + if self.read_only and not label_exists: + raise IOError(f"Unable to create '{shared_path.label}'. This " + "shared path is read-only.") + + self.registry.add(shared_path) + + # if this is a file that exists, bring it into our subdir + # NB: this happens even if you're intending to overwrite the path, + # which is kind of wasteful + if label_exists: + scratch_path = self.shared_dir / shared_path.path + # TODO: switch this to using `get_filename` and `store_path` + with self.external.load_stream(shared_path.label) as f: + external_bytes = f.read() + if scratch_path.exists(): + ... # TODO: something to check that the bytes are the same? + scratch_path.parent.mkdir(exist_ok=True, parents=True) + with open(scratch_path, mode='wb') as f: + f.write(external_bytes) + + def __truediv__(self, path: PathLike): + return SharedPath(root=self, path=path) + + def __fspath__(self): + return str(self.shared_dir) + + def __repr__(self): + return f"SharedRoot({self.scratch}, {self.external}, {self.prefix})" + + +class SharedPath: + def __init__(self, root: SharedRoot, path: PathLike): + self.root = root + self.path = Path(path) + self.root.register_path(self) + + def __truediv__(self, path): + return SharedPath(self.root, self.path / path) + + def __fspath__(self): + return str(self.root.shared_dir / self.path) + + @property + def label(self): + return str(self.root.prefix / self.path) + + def __repr__(self): + return f"SharedPath({self.__fspath__()})" + + # TODO: how much of the pathlib.Path interface do we want to wrap? + # although edge cases may be a pain, we can get most of it with, e.g.: + # def exists(self): return Path(self).exists() + # but also, can do pathlib.Path(shared_path) and get hte whole thing diff --git a/gufe/tests/storage/test_pseudodirectory.py b/gufe/tests/storage/test_pseudodirectory.py new file mode 100644 index 00000000..4763cdd2 --- /dev/null +++ b/gufe/tests/storage/test_pseudodirectory.py @@ -0,0 +1,107 @@ +import pytest + +import os +import pathlib + +from gufe.storage.externalresource import MemoryStorage +from gufe.storage.pseudodirectory import SharedRoot + + +@pytest.fixture +def root(tmp_path): + external = MemoryStorage() + external.store_bytes("old_unit/data.txt", b"foo") + root = SharedRoot( + scratch=tmp_path, + external=external, + prefix="new_unit", + delete_holding=False + ) + return root + +@pytest.fixture +def root_with_contents(root): + with open(root / "data.txt", mode='wb') as f: + f.write(b"bar") + + return root + +class TestSharedRoot: + @pytest.mark.parametrize('pathlist', [ + ['file.txt'], ['dir', 'file.txt'] + ]) + def test_path(self, root, pathlist): + path = root + for p in pathlist: + path = path / p + + inner_path = os.sep.join(pathlist) + actual_path = root.shared_dir / inner_path + + assert pathlib.Path(path) == actual_path + + def test_read_old(self, root): + # When the file doesn't exist locally, it should be pulled down the + # first time that we register the path. + + # initial conditions, without touching SharedRoot/SharedPath + label = "old_unit/data.txt" + on_filesystem = root.scratch / root.holding / label + assert not on_filesystem.exists() + assert root.external.exists(label) + + # when we create the specific SharedPath, it registers and + # "downloads" the file + old_shared = root.get_other_shared_dir("old_unit") + filepath = old_shared / "data.txt" + assert pathlib.Path(filepath) == on_filesystem + assert on_filesystem.exists() + + # let's just be sure we can read in the data as desired + with open(filepath, mode='rb') as f: + assert f.read() == b"foo" + + def test_write_new(self, root): + label = "new_unit/somefile.txt" + on_filesystem = root.scratch / root.holding / label + assert not on_filesystem.exists() + with open(root / "somefile.txt", mode='wb') as f: + f.write(b"testing") + + # this has been written to disk in scratch, but not yet saved to + # external storage + assert on_filesystem.exists() + assert not root.external.exists(label) + + def test_write_old_fail(self, root): + old_shared = root.get_other_shared_dir("old_unit") + with pytest.raises(IOError, match="read-only"): + old_shared / "foo.txt" + + def test_transfer_to_external(self, root_with_contents): + path = list(root_with_contents.registry)[0] # only 1 + assert not root_with_contents.external.exists(path.label) + + root_with_contents.transfer_holding_to_external() + assert root_with_contents.external.exists(path.label) + + with root_with_contents.external.load_stream(path.label) as f: + assert f.read() == b"bar" + + def test_transfer_to_external_no_file(self, root): + ... + + def test_tranfer_to_external_directory(self, root): + ... + + def test_del(self): + ... + + def test_existing_local_and_external(self, root): + ... + + def test_existing_local_and_external_conflict(self, root): + ... + + def test_no_transfer_for_read_only(self, root): + ... From eb19e0f9ea880648cd285494b5ac379695264f1a Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Wed, 26 Apr 2023 14:07:44 -0500 Subject: [PATCH 02/44] mypy --- gufe/storage/pseudodirectory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gufe/storage/pseudodirectory.py b/gufe/storage/pseudodirectory.py index b595dafa..eb88bfcf 100644 --- a/gufe/storage/pseudodirectory.py +++ b/gufe/storage/pseudodirectory.py @@ -35,7 +35,7 @@ def __init__( external: ExternalStorage, prefix: str, *, - holding: PathLike =".holding", + holding: PathLike = Path(".holding"), delete_holding: bool = True, read_only: bool = False, ): @@ -46,7 +46,7 @@ def __init__( self.delete_holding = delete_holding self.holding = holding - self.registry = set() + self.registry : set[SharedPath] = set() # NOTE: the fact that we use $SCRATCH/$HOLDING/$PREFIX instead of # $SCRATCH/$PREFIX/$HOLDING is important for 2 reasons: # 1. This doesn't take any of the user's namespace from their From 3be13c0c79bb7f63b3828af5ee8e0ab84ea38139 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Thu, 27 Apr 2023 09:24:15 -0500 Subject: [PATCH 03/44] docstrings --- gufe/storage/pseudodirectory.py | 49 +++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/gufe/storage/pseudodirectory.py b/gufe/storage/pseudodirectory.py index eb88bfcf..141a06bd 100644 --- a/gufe/storage/pseudodirectory.py +++ b/gufe/storage/pseudodirectory.py @@ -7,7 +7,29 @@ _logger = logging.getLogger(__name__) class SharedRoot: - """ + """PathLike local representation of an :class:`.ExternalStorage`. + + This connects objects on a local filesystem to the key-value store of a + (possibly remote) :class:`.ExternalStorage`. It presents a FileLike + interface to users, but internally (via the :class:`.SharedPath` objects + it contains in its registry) maps local filenames to the keys (labels) + for the key-value store. + + 1. If a local path is requested that corresponds to an existing label in + the :class:`.ExternalStorage`, this object will "download" the + contents of that key to that local path. + + 2. When requested, or when this object ??? (TODO: __exit__ or __del__), + it transfers any newly created files to the + :class:`.ExternalStorage`. + + 3. Optionally, this can delete the local cache of files when requested + or when this object ??? (TODO: __exit__ or __del__) + + This can be opened in "read-only" mode, which prevents new files from + being created, but does not prevent changes to existing versions of + local files. + Parameters ---------- scratch : os.PathLike @@ -25,7 +47,7 @@ class SharedRoot: directory when this object is deleted read_only : bool write to prevent NEW files from being written within this shared - directory. NOTE: This will not prevent overwrite of existing files, + directory. NOTE: This will not prevent overwrite of existing files in scratch space, but it will prevent changed files from uploading to the external storage. """ @@ -99,6 +121,23 @@ def __del__(self): shutil.rmtree(self.shared_dir) def register_path(self, shared_path): + """Register a :class:`.SharedPath` with this :class:`.SharedRoot`. + + This marks a given path as something for this object to manage, by + loading it into the ``registry``. This way it is tracked such that + its contents can be transfered to the :class:`.ExternalStorage` and + such that the local copy can be deleted when it is no longer needed. + + If this objects's :class:`.ExternalStorage` already has data for the + label associated with the provided :class:`.Sharedpath`, then the + contents of that should copied to the local path so that it can be + read by the user. + + Parameters + ---------- + shared_path: :class:`.SharedPath` + the path to track + """ label_exists = self.external.exists(shared_path.label) if self.read_only and not label_exists: @@ -132,6 +171,11 @@ def __repr__(self): class SharedPath: + """PathLike object linking local path with label for external storage. + + On creation, this registers with a :class:`.SharedRoot` that will manage + the local path and transferring data with its :class:`.ExternalStorage`. + """ def __init__(self, root: SharedRoot, path: PathLike): self.root = root self.path = Path(path) @@ -145,6 +189,7 @@ def __fspath__(self): @property def label(self): + """Label used in :class:`.ExternalStorage` for this path""" return str(self.root.prefix / self.path) def __repr__(self): From 472151eddb3bab87e82b5a26401ae5927a1efd1b Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 30 May 2023 15:13:23 -0500 Subject: [PATCH 04/44] Add StorageManager code --- gufe/storage/pseudodirectory.py | 70 +++++++++++----- gufe/storage/storagemanager.py | 136 ++++++++++++++++++++++++++++++++ 2 files changed, 186 insertions(+), 20 deletions(-) create mode 100644 gufe/storage/storagemanager.py diff --git a/gufe/storage/pseudodirectory.py b/gufe/storage/pseudodirectory.py index 141a06bd..550f3b45 100644 --- a/gufe/storage/pseudodirectory.py +++ b/gufe/storage/pseudodirectory.py @@ -6,6 +6,28 @@ import logging _logger = logging.getLogger(__name__) + +def _delete_empty_dirs(root, delete_root=True): + """Delete all empty directories. + + Repeats so that directories that only contained empty directories also + get deleted. + """ + root = Path(root) + + def find_empty_dirs(directory): + if not (paths := directory.iterdir()): + return [directory] + directories = [p for p in paths if p.is_dir()] + return sum([find_empty_dirs(d) for d in directories], []) + + while empties := find_empty_dirs(root): + if empties == [root] and not delete_root: + return + for directory in empties: + os.rmdir(directory) + + class SharedRoot: """PathLike local representation of an :class:`.ExternalStorage`. @@ -19,12 +41,10 @@ class SharedRoot: the :class:`.ExternalStorage`, this object will "download" the contents of that key to that local path. - 2. When requested, or when this object ??? (TODO: __exit__ or __del__), - it transfers any newly created files to the + 2. When requested, it transfers any newly created files to the :class:`.ExternalStorage`. - 3. Optionally, this can delete the local cache of files when requested - or when this object ??? (TODO: __exit__ or __del__) + 3. It can delete all of the files it manages This can be opened in "read-only" mode, which prevents new files from being created, but does not prevent changes to existing versions of @@ -95,6 +115,24 @@ def get_other_shared_dir(self, prefix, delete_holding=None): read_only=True, ) + def transfer_single_file_to_external(self, held_file): + """Transfer a given file from holding into external storage + """ + if self.read_only: + logging.debug("Read-only: Not transfering to external storage") + return # early exit + + path = Path(held_file) + if not path.exists(): + logging.info(f"Found nonexistent path {path}, not " + "transfering to external storage") + elif path.is_dir(): + logging.debug(f"Found directory {path}, not " + "transfering to external storage") + else: + logging.info(f"Transfering {path} to external storage") + self.external.store_path(held_file.label, path) + def transfer_holding_to_external(self): """Transfer all objects in the registry to external storage""" if self.read_only: @@ -102,23 +140,15 @@ def transfer_holding_to_external(self): return # early exit for obj in self.registry: - path = Path(obj) - if not path.exists(): - logging.info(f"Found nonexistent path {path}, not " - "transfering to external storage") - elif path.is_dir(): - logging.debug(f"Found directory {path}, not " - "transfering to external storage") - else: - logging.info(f"Transfering {path} to external storage") - self.external.store_path(obj.label, path) - - def __del__(self): - # take everything in self.shared_dir and write to it shared; keeping - # our prefix - self.transfer_holding_to_external() + self.transfer_single_file_to_external(obj) + + def cleanup(self): + """Perform end-of-lifecycle cleanup. + """ if self.delete_holding: - shutil.rmtree(self.shared_dir) + for file in self.registry: + os.delete(file) + _delete_empty_dirs(self.shared_dir) def register_path(self, shared_path): """Register a :class:`.SharedPath` with this :class:`.SharedRoot`. diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py new file mode 100644 index 00000000..ac3c6831 --- /dev/null +++ b/gufe/storage/storagemanager.py @@ -0,0 +1,136 @@ +import os +from pathlib import Path +from contextlib import contextmanager + +def _storage_path_conflict(external, path): + # this is a little brittle; would be nice to directly check the + # filenames without needing to load the files? maybe we have a way + # to do that? + if isinstance(pseudodir.external, FileStorage): + root = Path(external.root_dir) + + ... + + +class _DAGStorageManager: + """Context manager to handle details of storage lifecycle. + + Making this a separate class ensures that ``running_unit`` is always + called within the context of a given DAG. This is usually not created + directly; instead, it is created (and used) with its ``running_dag`` + classmethod, typically from within a ``StorageManager``. + """ + def __init__(self, storage_manager, dag_label): + self.manager = storage_manager + self.dag_label = dag_label + self.permanents = [] + + @classmethod # NB: classmethod must be on top + @contextmanager + def running_dag(cls, storage_manager, dag_label): + """DAG level of the storage lifecycle + + When the DAG is completed, transfer everything to the permanent + storage, and delete the holding area for permanent (if we are + supposed to). + + This is not usually called by users; instead it is called from + within the ``StorageManager``. + """ + dag_manager = cls(storage_manager, dag_label) + try: + yield dag_manager + finally: + for permanent in dag_manager.permanents: + permanent.transfer_holding_to_external() + + if not dag_manager.manager.keep_holding: + for d in dag_manager.permanents + dag_manager.shareds: + d.cleanup() + + @contextmanager + def running_unit(self, unit): + """Unit level of the storage lifecycle. + + This provides the holding directories used for scratch, shared, and + permanent. At the end of the unit, it transfers anything from shared + to the real shared external storage, cleans up the scratch + directory and the shared holding directory. + """ + unit_label = unit.key + scratch = self.manager.get_scratch(self.dag_label, unit_label) + shared = self.manager.get_shared(self.dag_label, unit_label) + permanent = self.manager.get_permanent(self.dag_label, unit_label) + try: + yield scratch, shared, permanent + finally: + self.permanents.append(permanent) + shared.transfer_holding_to_external() + + # TODO: check whether shared external is the same as scratch, + # put this in can_delete + can_delete = True + if not self.manager.keep_scratch and can_delete: + shutil.rmtree(scratch) + + if not self.manager.keep_holding: + shared.cleanup() + + +class StorageManager: + """Tool to manage the storage lifecycle during a DAG. + + This object primarily contains the logic for getting the holding + directories. A separate class, in the ``_DAGContextClass`` variable, + handles the logic for the context managers. + """ + _DAGContextClass = _DAGStorageManager + def __init__( + self, + scratch_root: os.Pathlike, + shared_root: ExternalStorage, + permanent_root: ExternalStorage, + *, + keep_scratch: bool = False, + keep_holding: bool = False, + holding: os.PathLike = Path(".holding") + ): + ... + + def get_scratch(self, dag_label: str , unit_label: str) -> Path: + """Get the path for this unit's scratch directory""" + scratch = self.scratch_root / dag_label / "scratch" / unit_label + scratch.mkdir(parents=True, exist_ok=True) + return scratch + + def get_permanent(self, dag_label, unit_label): + """Get the object for this unit's permanent holding directory""" + return SharedRoot( + scratch=self.scratch_root / dag_label, + external=self.permanent_root, + prefix=unit_label, + ) + + def get_shared(self, dag_label, unit_label): + """Get the object for this unit's shared holding directory""" + return SharedRoot( + scratch=self.scratch_root / dag_label, + external=self.shared_root, + prefix=unit_label + ) + + def running_dag(self, dag_label: str): + """Return a context manager that handles storage. + + For simple use cases, this is the only method a user needs to call. + Usage is something like: + + .. code:: + + with manager.running_dag(dag_label) as dag_ctx: + for unit in dag_ordered_units: + with dag_ctx.running_unit(unit) as dirs: + scratch, shared, permanent = dirs + # run the unit + """ + return self._DAGContextClass.running_dag(self, dag_label) From b692c2f7669da2ac22d8075038b4e57a9c7b3962 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 30 May 2023 16:32:07 -0500 Subject: [PATCH 05/44] rename to Stagine --- ...pseudodirectory.py => stagingdirectory.py} | 79 ++++++++++--------- ...odirectory.py => test_stagingdirectory.py} | 0 2 files changed, 43 insertions(+), 36 deletions(-) rename gufe/storage/{pseudodirectory.py => stagingdirectory.py} (76%) rename gufe/tests/storage/{test_pseudodirectory.py => test_stagingdirectory.py} (100%) diff --git a/gufe/storage/pseudodirectory.py b/gufe/storage/stagingdirectory.py similarity index 76% rename from gufe/storage/pseudodirectory.py rename to gufe/storage/stagingdirectory.py index 550f3b45..08e6285e 100644 --- a/gufe/storage/pseudodirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -1,6 +1,6 @@ from typing import Union, Optional from pathlib import Path -from os import PathLike +from os import PathLike, rmdir from .externalresource import ExternalStorage import logging @@ -16,24 +16,26 @@ def _delete_empty_dirs(root, delete_root=True): root = Path(root) def find_empty_dirs(directory): - if not (paths := directory.iterdir()): + if not (paths := list(directory.iterdir())): return [directory] directories = [p for p in paths if p.is_dir()] return sum([find_empty_dirs(d) for d in directories], []) - while empties := find_empty_dirs(root): + + while root.exists() and (empties := find_empty_dirs(root)): if empties == [root] and not delete_root: return for directory in empties: - os.rmdir(directory) + _logger.debug(f"Removing '{directory}'") + rmdir(directory) -class SharedRoot: +class StagingDirectory: """PathLike local representation of an :class:`.ExternalStorage`. This connects objects on a local filesystem to the key-value store of a (possibly remote) :class:`.ExternalStorage`. It presents a FileLike - interface to users, but internally (via the :class:`.SharedPath` objects + interface to users, but internally (via the :class:`.StagingPath` objects it contains in its registry) maps local filenames to the keys (labels) for the key-value store. @@ -59,14 +61,14 @@ class SharedRoot: prefix : str label for this specific unit holding : os.PathLike - name of the subdirectory of scratch where shared results are + name of the subdirectory of scratch where staged results are temporarily stored; default is '.holding'. This must be the same for all units within a DAG. delete_holding : bool whether to delete the contents of the $SCRATCH/$HOLDING/$PREFIX directory when this object is deleted read_only : bool - write to prevent NEW files from being written within this shared + write to prevent NEW files from being written within this staging directory. NOTE: This will not prevent overwrite of existing files in scratch space, but it will prevent changed files from uploading to the external storage. @@ -88,7 +90,7 @@ def __init__( self.delete_holding = delete_holding self.holding = holding - self.registry : set[SharedPath] = set() + self.registry : set[StagingPath] = set() # NOTE: the fact that we use $SCRATCH/$HOLDING/$PREFIX instead of # $SCRATCH/$PREFIX/$HOLDING is important for 2 reasons: # 1. This doesn't take any of the user's namespace from their @@ -97,16 +99,16 @@ def __init__( # external storage is exactly the same as this local storage, # meaning that copies to/from the external storage are no-ops. # Use FileStorage(scratch / holding) for that. - self.shared_dir = self.scratch / holding / prefix - self.shared_dir.mkdir(exist_ok=True, parents=True) + self.staging_dir = self.scratch / holding / prefix + self.staging_dir.mkdir(exist_ok=True, parents=True) - def get_other_shared_dir(self, prefix, delete_holding=None): - """Get a related unit's shared directory. + def get_other_staging_dir(self, prefix, delete_holding=None): + """Get a related unit's staging directory. """ if delete_holding is None: delete_holding = self.delete_holding - return SharedRoot( + return StagingDirectory( scratch=self.scratch, external=self.external, prefix=prefix, @@ -148,10 +150,11 @@ def cleanup(self): if self.delete_holding: for file in self.registry: os.delete(file) - _delete_empty_dirs(self.shared_dir) + _delete_empty_dirs(self.staging_dir) - def register_path(self, shared_path): - """Register a :class:`.SharedPath` with this :class:`.SharedRoot`. + def register_path(self, staging_path): + """ + Register a :class:`.StagingPath` with this :class:`.StagingDirectory`. This marks a given path as something for this object to manage, by loading it into the ``registry``. This way it is tracked such that @@ -159,30 +162,30 @@ def register_path(self, shared_path): such that the local copy can be deleted when it is no longer needed. If this objects's :class:`.ExternalStorage` already has data for the - label associated with the provided :class:`.Sharedpath`, then the + label associated with the provided :class:`.Stagingpath`, then the contents of that should copied to the local path so that it can be read by the user. Parameters ---------- - shared_path: :class:`.SharedPath` + staging_path: :class:`.StagingPath` the path to track """ - label_exists = self.external.exists(shared_path.label) + label_exists = self.external.exists(staging_path.label) if self.read_only and not label_exists: - raise IOError(f"Unable to create '{shared_path.label}'. This " - "shared path is read-only.") + raise IOError(f"Unable to create '{staging_path.label}'. This " + "staging path is read-only.") - self.registry.add(shared_path) + self.registry.add(staging_path) # if this is a file that exists, bring it into our subdir # NB: this happens even if you're intending to overwrite the path, # which is kind of wasteful if label_exists: - scratch_path = self.shared_dir / shared_path.path + scratch_path = self.staging_dir / staging_path.path # TODO: switch this to using `get_filename` and `store_path` - with self.external.load_stream(shared_path.label) as f: + with self.external.load_stream(staging_path.label) as f: external_bytes = f.read() if scratch_path.exists(): ... # TODO: something to check that the bytes are the same? @@ -191,31 +194,35 @@ def register_path(self, shared_path): f.write(external_bytes) def __truediv__(self, path: PathLike): - return SharedPath(root=self, path=path) + return StagingPath(root=self, path=path) def __fspath__(self): - return str(self.shared_dir) + return str(self.staging_dir) def __repr__(self): - return f"SharedRoot({self.scratch}, {self.external}, {self.prefix})" + return ( + f"StagingDirectory({self.scratch}, {self.external}, " + f"{self.prefix})" + ) -class SharedPath: +class StagingPath: """PathLike object linking local path with label for external storage. - On creation, this registers with a :class:`.SharedRoot` that will manage - the local path and transferring data with its :class:`.ExternalStorage`. + On creation, this registers with a :class:`.StagingDirectory` that will + manage the local path and transferring data with its + :class:`.ExternalStorage`. """ - def __init__(self, root: SharedRoot, path: PathLike): + def __init__(self, root: StagingDirectory, path: PathLike): self.root = root self.path = Path(path) self.root.register_path(self) def __truediv__(self, path): - return SharedPath(self.root, self.path / path) + return StagingPath(self.root, self.path / path) def __fspath__(self): - return str(self.root.shared_dir / self.path) + return str(self.root.staging_dir / self.path) @property def label(self): @@ -223,9 +230,9 @@ def label(self): return str(self.root.prefix / self.path) def __repr__(self): - return f"SharedPath({self.__fspath__()})" + return f"StagingPath({self.__fspath__()})" # TODO: how much of the pathlib.Path interface do we want to wrap? # although edge cases may be a pain, we can get most of it with, e.g.: # def exists(self): return Path(self).exists() - # but also, can do pathlib.Path(shared_path) and get hte whole thing + # but also, can do pathlib.Path(staging_path) and get hte whole thing diff --git a/gufe/tests/storage/test_pseudodirectory.py b/gufe/tests/storage/test_stagingdirectory.py similarity index 100% rename from gufe/tests/storage/test_pseudodirectory.py rename to gufe/tests/storage/test_stagingdirectory.py From 7432ff4dc8a14dabf19990abfdb45cc51c82891c Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 30 May 2023 16:32:18 -0500 Subject: [PATCH 06/44] Add tests for _delete_empty_dirs --- gufe/storage/storagemanager.py | 4 ++ gufe/tests/storage/test_stagingdirectory.py | 73 +++++++++++++++++---- 2 files changed, 63 insertions(+), 14 deletions(-) diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index ac3c6831..c3ab34f9 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -67,6 +67,10 @@ def running_unit(self, unit): self.permanents.append(permanent) shared.transfer_holding_to_external() + # everything in permanent must also be available in shared + for file in permanent.registry: + shared.transfer_single_file_to_external(file) + # TODO: check whether shared external is the same as scratch, # put this in can_delete can_delete = True diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 4763cdd2..46c48819 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -4,14 +4,15 @@ import pathlib from gufe.storage.externalresource import MemoryStorage -from gufe.storage.pseudodirectory import SharedRoot - +from gufe.storage.stagingdirectory import ( + StagingDirectory, _delete_empty_dirs +) @pytest.fixture def root(tmp_path): external = MemoryStorage() external.store_bytes("old_unit/data.txt", b"foo") - root = SharedRoot( + root = StagingDirectory( scratch=tmp_path, external=external, prefix="new_unit", @@ -26,7 +27,54 @@ def root_with_contents(root): return root -class TestSharedRoot: +def test_delete_empty_dirs(tmp_path): + base = tmp_path / "tmp" + paths = [ + base / "foo" / "qux" / "qux.txt", + + ] + dirs = [ + base / "foo" / "bar" / "baz", + base / "quux", + ] + for directory in dirs: + directory.mkdir(parents=True, exist_ok=True) + + for path in paths: + path.parent.mkdir(parents=True, exist_ok=True) + path.touch() + + _delete_empty_dirs(base) + for path in paths: + assert path.exists() + + for directory in dirs: + assert not directory.exists() + + assert not (base / "foo" / "bar").exists() + +@pytest.mark.parametrize('delete_root', [True, False]) +def test_delete_empty_dirs_delete_root(tmp_path, delete_root): + base = tmp_path / "tmp" + dirs = [ + base / "foo" / "bar" / "baz", + base / "quux", + ] + for directory in dirs: + directory.mkdir(parents=True, exist_ok=True) + + _delete_empty_dirs(base, delete_root=delete_root) + + for directory in dirs: + assert not directory.exists() + + assert not (base / "foo" / "bar").exists() + assert base.exists() is not delete_root + + + + +class TestStagingDirectory: @pytest.mark.parametrize('pathlist', [ ['file.txt'], ['dir', 'file.txt'] ]) @@ -36,7 +84,7 @@ def test_path(self, root, pathlist): path = path / p inner_path = os.sep.join(pathlist) - actual_path = root.shared_dir / inner_path + actual_path = root.staging_dir / inner_path assert pathlib.Path(path) == actual_path @@ -44,16 +92,16 @@ def test_read_old(self, root): # When the file doesn't exist locally, it should be pulled down the # first time that we register the path. - # initial conditions, without touching SharedRoot/SharedPath + # initial conditions, without touching StagingDirectory/StagingPath label = "old_unit/data.txt" on_filesystem = root.scratch / root.holding / label assert not on_filesystem.exists() assert root.external.exists(label) - # when we create the specific SharedPath, it registers and + # when we create the specific StagingPath, it registers and # "downloads" the file - old_shared = root.get_other_shared_dir("old_unit") - filepath = old_shared / "data.txt" + old_staging = root.get_other_staging_dir("old_unit") + filepath = old_staging / "data.txt" assert pathlib.Path(filepath) == on_filesystem assert on_filesystem.exists() @@ -74,9 +122,9 @@ def test_write_new(self, root): assert not root.external.exists(label) def test_write_old_fail(self, root): - old_shared = root.get_other_shared_dir("old_unit") + old_staging = root.get_other_staging_dir("old_unit") with pytest.raises(IOError, match="read-only"): - old_shared / "foo.txt" + old_staging / "foo.txt" def test_transfer_to_external(self, root_with_contents): path = list(root_with_contents.registry)[0] # only 1 @@ -94,9 +142,6 @@ def test_transfer_to_external_no_file(self, root): def test_tranfer_to_external_directory(self, root): ... - def test_del(self): - ... - def test_existing_local_and_external(self, root): ... From 319d0d000ac1d67896e2ca7f0a0f646569a6d386 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Wed, 31 May 2023 07:33:05 -0500 Subject: [PATCH 07/44] Storage docs --- docs/guide/storage.rst | 64 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/docs/guide/storage.rst b/docs/guide/storage.rst index 39413208..caac6399 100644 --- a/docs/guide/storage.rst +++ b/docs/guide/storage.rst @@ -1,5 +1,67 @@ The GUFE Storage System ======================= +Storage lifetimes +----------------- -Storage docs. +The storage system in GUFE is heavily tied to the GUFE protocol system. The +key concept here is that the different levels of the GUFE protocol system; +campaign, DAG, and unit; inherently imply different lifetimes for the data +that is created. Those different lifetimes define the stages of the GUFE +storage system. + +In an abstract sense, as used by protocol developers, these three levels +correspond to three lifetimes of data: + +* ``scratch``: This is temporary data that is only needed for the lifetime + of a :class:`.ProtocolUnit`. This data is not guaranteed to be available + beyown the single :class:`.ProtocolUnit` where it is created, but may be + reused within that :class:`.ProtocolUnit`. +* ``shared``: This is data that is shared between different units in a + :class:`.ProtocolDAG`. For example, a single equilibration stage might be + shared between multiple production runs. The output snapshot of the + equilibration would be suitable for as something to put in ``shared`` + data. This data is guarateed to be present from when it is created until + the end of the :class:`.ProtocolDAG`, but is not guaranteed to exist after + the :class:`.ProtocolDAG` terminates. +* ``permanent``: This is the data that will be needed beyond the scope of a + single rough estimate of the calculation. This could include anything that + an extension of the simulation would require, or things that require + network-scale analysis. Anything stored here will be usable after the + calculation has completed. + +The ``scratch`` area is always a local directory. However, ``shared`` and +``permanent`` can be external (remote) resources, using the +:class:`.ExternalResource` API. + +As a practical matter, the GUFE storage system can be handled with a +:class:`.StorageManager`. This automates some aspects of the transfer +between stages of the GUFE storage system, and simplifies the API for +protocol authors. In detail, this provides protocol authors with +``PathLike`` objects for ``scratch``, ``shared``, and ``permanent``. All +three of these objects actually point to special subdirectories of the +scratch space for a specific unit, but are managed by context manangers at +the executor level, which handle the process of moving objects from local +directories to the actual ``shared`` and ``permanent`` locations, which can +be external resources. + + +External resource utilities +--------------------------- + +For flexible data storage, GUFE defines the :class:`.ExternalResource` API, +which allows data be stored/loaded in a way that is agnostic to the +underlying data store, as long as the store can be represented as a +key-value store. Withing GUFE, we provide several examples, including +:class:`.FileStorage` and :class:`.MemoryStorage` (primarily useful for +testing.) The specific ``shared`` and ``permanent`` resources, as provided +to the executor, can be instances of an :class:`.ExternalResource`. + +.. note:: + + The ``shared`` space must be a resource where an uploaded object is + instantaneously available, otherwise later protocol units may fail if the + shared result is unavailable. This means that files or approaches based + on ``scp`` or ``sftp`` are fine, but things like cloud storage, where the + existence of a new document may take time to propagate through the + network, are not recommended for ``shared``. From 5650609f9fa525e578ed0ab0fd5325cad2f25972 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Wed, 31 May 2023 07:53:52 -0500 Subject: [PATCH 08/44] outline of storage manager tests --- gufe/storage/storagemanager.py | 47 ++++++++++++-------- gufe/tests/storage/test_storagemanager.py | 54 +++++++++++++++++++++++ 2 files changed, 82 insertions(+), 19 deletions(-) create mode 100644 gufe/tests/storage/test_storagemanager.py diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index c3ab34f9..0aacda35 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -1,15 +1,17 @@ -import os +from os import PathLike from pathlib import Path from contextlib import contextmanager +from .externalstorage import ExternalStorage, FileStorage + def _storage_path_conflict(external, path): - # this is a little brittle; would be nice to directly check the - # filenames without needing to load the files? maybe we have a way - # to do that? - if isinstance(pseudodir.external, FileStorage): + # this is a little brittle; I don't like hard-coding the class here + if isinstance(external, FileStorage): root = Path(external.root_dir) + else: + return False - ... + return False class _DAGStorageManager: @@ -45,7 +47,7 @@ def running_dag(cls, storage_manager, dag_label): permanent.transfer_holding_to_external() if not dag_manager.manager.keep_holding: - for d in dag_manager.permanents + dag_manager.shareds: + for d in dag_manager.permanents: d.cleanup() @contextmanager @@ -64,19 +66,20 @@ def running_unit(self, unit): try: yield scratch, shared, permanent finally: + # TODO: should some of this be in an else clause instead? self.permanents.append(permanent) shared.transfer_holding_to_external() - # everything in permanent must also be available in shared for file in permanent.registry: shared.transfer_single_file_to_external(file) - - # TODO: check whether shared external is the same as scratch, - # put this in can_delete - can_delete = True - if not self.manager.keep_scratch and can_delete: + scratch_conflict = _storage_path_conflict(shared.external, + scratch) + if not self.manager.keep_scratch and not scratch_conflict: shutil.rmtree(scratch) + shared_conflict = _storage_path_conflict(shared.external, + shared) + if not self.manager.keep_holding: shared.cleanup() @@ -85,21 +88,27 @@ class StorageManager: """Tool to manage the storage lifecycle during a DAG. This object primarily contains the logic for getting the holding - directories. A separate class, in the ``_DAGContextClass`` variable, + directories. A separate class, in the ``DAGContextClass`` variable, handles the logic for the context managers. """ - _DAGContextClass = _DAGStorageManager def __init__( self, - scratch_root: os.Pathlike, + scratch_root: PathLike, shared_root: ExternalStorage, permanent_root: ExternalStorage, *, keep_scratch: bool = False, keep_holding: bool = False, - holding: os.PathLike = Path(".holding") + holding: PathLike = Path(".holding"), + DAGContextClass: type = _DAGStorageManager, ): - ... + self.scratch_root = scratch_root + self.shared_root = shared_root + self.permanent_root = permanent_root + self.keep_scratch = keep_scratch + self.keep_holding = keep_holding + self.holding = holding + self.DAGContextClass = DAGContextClass def get_scratch(self, dag_label: str , unit_label: str) -> Path: """Get the path for this unit's scratch directory""" @@ -137,4 +146,4 @@ def running_dag(self, dag_label: str): scratch, shared, permanent = dirs # run the unit """ - return self._DAGContextClass.running_dag(self, dag_label) + return self.DAGContextClass.running_dag(self, dag_label) diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py new file mode 100644 index 00000000..bb9402b3 --- /dev/null +++ b/gufe/tests/storage/test_storagemanager.py @@ -0,0 +1,54 @@ +import pytest +from gufe.storage.storagemanager import ( + StorageManager, _storage_path_conflict +) +from gufe.storage.externalresource import MemoryStorage + +@pytest.fixture +def storage_manager_std(tmp_path): + return StorageManager( + scratch_root=tmp_path / "scratch", + shared_root=MemoryStorage(), + permanent_root=MemoryStorage() + ) + +@pytest.fixture +def storage_manager_holding_overlaps_shared(tmp_path): + ... + +@pytest.fixture +def storage_manager_holding_overlaps_permanent(tmp_path): + ... + +@pytest.mark.parametrize('manager', ['std']) +def test_lifecycle(request, manager, dag_units): + # heavy integration test to ensure that the whole process works + # this is the primary test of _DAGStorageManager + storage_manager = request.getfixture(f"storage_manager_{manager}") + with storage_manager.running_dag("dag_label") as dag_ctx: + for unit in dag_units: + with dag_ctx.running_unit(unit) as (scratch, shared, permanent): + results.append(unit.run(scratch, shareed, permanent)) + # TODO: asserts at this level + # all files exist; are where we expect them + # TODO: asserts at this level + # TODO: asserts at this level + +def test_lifecycle_keep_scratch_and_holding(): + ... + +def test_storage_path_conflict_ok(): + ... + +def test_storage_path_conflict_problem(): + ... + +class TestStorageManager: + def test_get_scratch(): + ... + + def test_get_permanent(): + ... + + def test_get_shared(): + ... From ddbbd19cbba45d94e14140898b222c1dd1fa432f Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Wed, 31 May 2023 12:21:02 -0500 Subject: [PATCH 09/44] minor improvements on staging directory --- gufe/storage/stagingdirectory.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 08e6285e..a1237999 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -1,7 +1,8 @@ from typing import Union, Optional from pathlib import Path -from os import PathLike, rmdir +from os import PathLike, rmdir, remove from .externalresource import ExternalStorage +from contextlib import contextmanager import logging _logger = logging.getLogger(__name__) @@ -54,13 +55,13 @@ class StagingDirectory: Parameters ---------- - scratch : os.PathLike + scratch : PathLike the scratch directory shared by all objects on this host external : :class:`.ExternalStorage` external storage resource where objects should eventualy go prefix : str label for this specific unit - holding : os.PathLike + holding : PathLike name of the subdirectory of scratch where staged results are temporarily stored; default is '.holding'. This must be the same for all units within a DAG. @@ -117,6 +118,13 @@ def get_other_staging_dir(self, prefix, delete_holding=None): read_only=True, ) + @contextmanager + def other_shared(self, prefix, delete_holding=None): + other = self.get_other_staging_dir(prefix, delete_holding) + yield other + other.cleanup() + + def transfer_single_file_to_external(self, held_file): """Transfer a given file from holding into external storage """ @@ -149,7 +157,7 @@ def cleanup(self): """ if self.delete_holding: for file in self.registry: - os.delete(file) + remove(file) _delete_empty_dirs(self.staging_dir) def register_path(self, staging_path): @@ -174,7 +182,8 @@ def register_path(self, staging_path): label_exists = self.external.exists(staging_path.label) if self.read_only and not label_exists: - raise IOError(f"Unable to create '{staging_path.label}'. This " + raise IOError(f"Unable to create '{staging_path.label}'. File " + "does not exist in external storage, and This " "staging path is read-only.") self.registry.add(staging_path) From c5ce48a50f3d746a1599a232ab4dad8943f161b7 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Wed, 31 May 2023 12:44:13 -0500 Subject: [PATCH 10/44] first storage lifecycle test works --- gufe/storage/stagingdirectory.py | 4 +- gufe/storage/storagemanager.py | 20 ++- gufe/tests/storage/test_storagemanager.py | 148 +++++++++++++++++++--- 3 files changed, 148 insertions(+), 24 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index a1237999..d7411e6e 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -92,6 +92,7 @@ def __init__( self.holding = holding self.registry : set[StagingPath] = set() + self.preexisting : set[StagingPath] = set() # NOTE: the fact that we use $SCRATCH/$HOLDING/$PREFIX instead of # $SCRATCH/$PREFIX/$HOLDING is important for 2 reasons: # 1. This doesn't take any of the user's namespace from their @@ -156,7 +157,7 @@ def cleanup(self): """Perform end-of-lifecycle cleanup. """ if self.delete_holding: - for file in self.registry: + for file in self.registry - self.preexisting: remove(file) _delete_empty_dirs(self.staging_dir) @@ -197,6 +198,7 @@ def register_path(self, staging_path): with self.external.load_stream(staging_path.label) as f: external_bytes = f.read() if scratch_path.exists(): + self.preexisting.add(staging_path) ... # TODO: something to check that the bytes are the same? scratch_path.parent.mkdir(exist_ok=True, parents=True) with open(scratch_path, mode='wb') as f: diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 0aacda35..f914d32d 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -1,17 +1,26 @@ from os import PathLike from pathlib import Path from contextlib import contextmanager +import shutil -from .externalstorage import ExternalStorage, FileStorage +from .externalresource import ExternalStorage, FileStorage +from .stagingdirectory import StagingDirectory def _storage_path_conflict(external, path): + """Check if deleting ``path`` could delete externally stored data + """ # this is a little brittle; I don't like hard-coding the class here if isinstance(external, FileStorage): root = Path(external.root_dir) else: return False - return False + try: + _ = root.relative_to(Path(path)) + except ValueError: + return False + else: + return True class _DAGStorageManager: @@ -79,8 +88,7 @@ def running_unit(self, unit): shared_conflict = _storage_path_conflict(shared.external, shared) - - if not self.manager.keep_holding: + if not self.manager.keep_holding and not shared_conflict: shared.cleanup() @@ -118,7 +126,7 @@ def get_scratch(self, dag_label: str , unit_label: str) -> Path: def get_permanent(self, dag_label, unit_label): """Get the object for this unit's permanent holding directory""" - return SharedRoot( + return StagingDirectory( scratch=self.scratch_root / dag_label, external=self.permanent_root, prefix=unit_label, @@ -126,7 +134,7 @@ def get_permanent(self, dag_label, unit_label): def get_shared(self, dag_label, unit_label): """Get the object for this unit's shared holding directory""" - return SharedRoot( + return StagingDirectory( scratch=self.scratch_root / dag_label, external=self.shared_root, prefix=unit_label diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index bb9402b3..95fdf255 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -2,7 +2,9 @@ from gufe.storage.storagemanager import ( StorageManager, _storage_path_conflict ) -from gufe.storage.externalresource import MemoryStorage +from gufe.storage.stagingdirectory import StagingDirectory +from gufe.storage.externalresource import MemoryStorage, FileStorage +from pathlib import Path @pytest.fixture def storage_manager_std(tmp_path): @@ -20,35 +22,147 @@ def storage_manager_holding_overlaps_shared(tmp_path): def storage_manager_holding_overlaps_permanent(tmp_path): ... +@pytest.fixture +def dag_units(): + class Unit1: + @property + def key(self): + return "unit1" + + def run(self, scratch, shared, permanent): + (scratch / "foo.txt").touch() + with open(shared / "bar.txt", mode='w') as f: + f.write("bar was written") + with open(permanent / "baz.txt", mode='w') as f: + f.write("baz was written") + + return "done 1" + + class Unit2: + @property + def key(self): + return "unit2" + + def run(self, scratch, shared, permanent): + (scratch / "foo2.txt").touch() + with shared.other_shared("unit1") as prev_shared: + with open(prev_shared / "bar.txt", mode='r') as f: + bar = f.read() + + # note that you can open a file from permanent as if it was + # from shared -- everything in permanent is in shared + with open(prev_shared / "baz.txt", mode='r') as f: + baz = f.read() + + return {"bar": bar, "baz": baz} + + return [Unit1(), Unit2()] + + @pytest.mark.parametrize('manager', ['std']) def test_lifecycle(request, manager, dag_units): # heavy integration test to ensure that the whole process works # this is the primary test of _DAGStorageManager - storage_manager = request.getfixture(f"storage_manager_{manager}") + storage_manager = request.getfixturevalue(f"storage_manager_{manager}") + permanent_root = storage_manager.permanent_root + shared_root = storage_manager.shared_root + results = [] + unit1_dir = Path(storage_manager.get_shared("dag_label", "unit1")) + scratch1 = Path(storage_manager.get_scratch("dag_label", "unit1")) + scratch2 = Path(storage_manager.get_scratch("dag_label", "unit2")) + barfile = unit1_dir / "bar.txt" + bazfile = unit1_dir / "baz.txt" + foofile = scratch1 / "foo.txt" + foo2file = scratch2 / "foo2.txt" + + all_files = {barfile, bazfile, foofile, foo2file} with storage_manager.running_dag("dag_label") as dag_ctx: for unit in dag_units: with dag_ctx.running_unit(unit) as (scratch, shared, permanent): - results.append(unit.run(scratch, shareed, permanent)) - # TODO: asserts at this level - # all files exist; are where we expect them - # TODO: asserts at this level - # TODO: asserts at this level + results.append(unit.run(scratch, shared, permanent)) + + # check that the expected files are found in staging + exists = { + "unit1": {barfile, bazfile, foofile}, + "unit2": {foo2file, bazfile} + }[unit.key] + + for file in exists: + assert file.exists() + + for file in all_files - exists: + assert not file.exists() + + # check that shared store is as expected + expected_in_shared = { + "unit1": set(), + "unit2": {"unit1/bar.txt", "unit1/baz.txt"} + }[unit.key] + assert set(shared_root.iter_contents()) == expected_in_shared + # check that permanent store is empty + assert list(permanent_root.iter_contents()) == [] + # AFTER THE RUNNING_UNIT CONTEXT + # Same for both units because unit2 doesn't add anything to + # shared/permanent + # Files staged for shared should be transferred to shared and + # removed from the staging directories; files staged for + # permanent should remain + for_permanent = {bazfile} + for file in for_permanent: + assert file.exists() + + for file in all_files - for_permanent: + assert not file.exists() + + # check that we have things in shared + expected_in_shared = {"unit1/bar.txt", "unit1/baz.txt"} + assert set(shared_root.iter_contents()) == expected_in_shared + # ensure that we haven't written to permanent yet + assert list(permanent_root.iter_contents()) == [] + # AFTER THE RUNNING_DAG CONTEXT + # all staged files should be deleted + for file in all_files: + assert not file.exists() + # shared still contains everything it had; but this isn't something we + # guarantee, so we don't actually test for it + # assert set(shared_root.iter_contents()) == {"unit1/bar.txt", + # "unit1/baz.txt"} + assert list(permanent_root.iter_contents()) == ["unit1/baz.txt"] def test_lifecycle_keep_scratch_and_holding(): ... -def test_storage_path_conflict_ok(): - ... +def test_storage_path_conflict_ok(tmp_path): + # if the filestorage root is not in the given path, no conflict + external = FileStorage(tmp_path / "foo" / "bar") + path = tmp_path / "foo" / "baz" + assert _storage_path_conflict(external, path) is False + +def test_storage_path_conflict_not_filestorage(tmp_path): + # if the external resource isn't a FileStorage, no conflict + external = MemoryStorage() + path = tmp_path / "foo" / "baz" + assert _storage_path_conflict(external, path) is False + +def test_storage_path_conflict_problem(tmp_path): + # if the filestorage root is in the given path, we have a conflict + external = FileStorage(tmp_path / "foo" / "bar") + path = tmp_path / "foo" + assert _storage_path_conflict(external, path) is True -def test_storage_path_conflict_problem(): - ... class TestStorageManager: - def test_get_scratch(): - ... + def test_get_scratch(self, storage_manager_std): + scratch = storage_manager_std.get_scratch("dag_label", "unit_label") + assert str(scratch).endswith("dag_label/scratch/unit_label") + assert isinstance(scratch, Path) - def test_get_permanent(): - ... + def test_get_permanent(self, storage_manager_std): + perm = storage_manager_std.get_permanent("dag_label", "unit_label") + assert perm.__fspath__().endswith("dag_label/.holding/unit_label") + assert isinstance(perm, StagingDirectory) - def test_get_shared(): - ... + def test_get_shared(self, storage_manager_std): + shared = storage_manager_std.get_shared("dag_label", "unit_label") + assert shared.__fspath__().endswith("dag_label/.holding/unit_label") + assert isinstance(shared, StagingDirectory) From b805aac64c88eb9052233744e5adc560c05d0e76 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Wed, 31 May 2023 14:11:15 -0500 Subject: [PATCH 11/44] cleanup mypy --- gufe/storage/stagingdirectory.py | 131 ++++++++++++++------ gufe/storage/storagemanager.py | 28 ++++- gufe/tests/storage/test_stagingdirectory.py | 10 +- gufe/tests/storage/test_storagemanager.py | 13 +- 4 files changed, 128 insertions(+), 54 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index d7411e6e..e19b7882 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -82,12 +82,10 @@ def __init__( *, holding: PathLike = Path(".holding"), delete_holding: bool = True, - read_only: bool = False, ): self.external = external self.scratch = Path(scratch) self.prefix = Path(prefix) - self.read_only = read_only self.delete_holding = delete_holding self.holding = holding @@ -104,34 +102,9 @@ def __init__( self.staging_dir = self.scratch / holding / prefix self.staging_dir.mkdir(exist_ok=True, parents=True) - def get_other_staging_dir(self, prefix, delete_holding=None): - """Get a related unit's staging directory. - """ - if delete_holding is None: - delete_holding = self.delete_holding - - return StagingDirectory( - scratch=self.scratch, - external=self.external, - prefix=prefix, - holding=self.holding, - delete_holding=delete_holding, - read_only=True, - ) - - @contextmanager - def other_shared(self, prefix, delete_holding=None): - other = self.get_other_staging_dir(prefix, delete_holding) - yield other - other.cleanup() - - def transfer_single_file_to_external(self, held_file): """Transfer a given file from holding into external storage """ - if self.read_only: - logging.debug("Read-only: Not transfering to external storage") - return # early exit path = Path(held_file) if not path.exists(): @@ -146,10 +119,6 @@ def transfer_single_file_to_external(self, held_file): def transfer_holding_to_external(self): """Transfer all objects in the registry to external storage""" - if self.read_only: - logging.debug("Read-only: Not transfering to external storage") - return # early exit - for obj in self.registry: self.transfer_single_file_to_external(obj) @@ -182,20 +151,18 @@ def register_path(self, staging_path): """ label_exists = self.external.exists(staging_path.label) - if self.read_only and not label_exists: - raise IOError(f"Unable to create '{staging_path.label}'. File " - "does not exist in external storage, and This " - "staging path is read-only.") - self.registry.add(staging_path) # if this is a file that exists, bring it into our subdir # NB: this happens even if you're intending to overwrite the path, # which is kind of wasteful if label_exists: + self._load_file_from_external(self.external, staging_path) + + def _load_file_from_external(self, external, staging_path): scratch_path = self.staging_dir / staging_path.path # TODO: switch this to using `get_filename` and `store_path` - with self.external.load_stream(staging_path.label) as f: + with external.load_stream(staging_path.label) as f: external_bytes = f.read() if scratch_path.exists(): self.preexisting.add(staging_path) @@ -217,6 +184,96 @@ def __repr__(self): ) +class SharedStaging(StagingDirectory): + def __init__( + self, + scratch: PathLike, + external: ExternalStorage, + prefix: str, + *, + holding: PathLike = Path(".holding"), + delete_holding: bool = True, + read_only: bool = False, + ): + super().__init__(scratch, external, prefix, holding=holding, + delete_holding=delete_holding) + self.read_only = read_only + + def get_other_shared(self, prefix, delete_holding=None): + """Get a related unit's staging directory. + """ + if delete_holding is None: + delete_holding = self.delete_holding + + return SharedStaging( + scratch=self.scratch, + external=self.external, + prefix=prefix, + holding=self.holding, + delete_holding=delete_holding, + read_only=True, + ) + + @contextmanager + def other_shared(self, prefix, delete_holding=None): + """Context manager approach for getting a related unit's directory. + + This is usually the recommended way to get a previous unit's shared + data. + """ + other = self.get_other_shared(prefix, delete_holding) + yield other + other.cleanup() + + def transfer_single_file_to_external(self, held_file): + if self.read_only: + logging.debug("Read-only: Not transfering to external storage") + return # early exit + + super().transfer_single_file_to_external(held_file) + + def transfer_holding_to_external(self): + if self.read_only: + logging.debug("Read-only: Not transfering to external storage") + return # early exit + + super().transfer_holding_to_external() + + def register_path(self, staging_path): + label_exists = self.external.exists(staging_path.label) + + if self.read_only and not label_exists: + raise IOError(f"Unable to create '{staging_path.label}'. File " + "does not exist in external storage, and This " + "staging path is read-only.") + + super().register_path(staging_path) + + +class PermanentStaging(StagingDirectory): + def __init__( + self, + scratch: PathLike, + external: ExternalStorage, + shared: ExternalStorage, + prefix: str, + *, + holding: PathLike = Path(".holding"), + delete_holding: bool = True, + ): + super().__init__(scratch, external, prefix, holding=holding, + delete_holding=delete_holding) + self.shared = shared + + def transfer_single_file_to_external(self, held_file): + # for this one, if we can't fin + path = Path(held_file) + if not path.exists(): + self._load_file_from_external(self.shared, held_file) + + super().transfer_single_file_to_external(held_file) + + class StagingPath: """PathLike object linking local path with label for external storage. diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index f914d32d..aa4fc4ef 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -3,8 +3,10 @@ from contextlib import contextmanager import shutil +from typing import Type + from .externalresource import ExternalStorage, FileStorage -from .stagingdirectory import StagingDirectory +from .stagingdirectory import SharedStaging, PermanentStaging def _storage_path_conflict(external, path): """Check if deleting ``path`` could delete externally stored data @@ -22,8 +24,20 @@ def _storage_path_conflict(external, path): else: return True +class _AbstractDAGContextManager: + @classmethod + @contextmanager + def running_dag(cls, storage_manager, dag_label): + raise NotImplementedError() + + @contextmanager + def running_unit(cls, unit): + raise NotImplementedError() + +DAGContextManager = Type[_DAGStorageManager] -class _DAGStorageManager: + +class _DAGStorageManager(_AbstractDAGContextManager): """Context manager to handle details of storage lifecycle. Making this a separate class ensures that ``running_unit`` is always @@ -108,9 +122,9 @@ def __init__( keep_scratch: bool = False, keep_holding: bool = False, holding: PathLike = Path(".holding"), - DAGContextClass: type = _DAGStorageManager, + DAGContextClass: DAGContextManager = _DAGStorageManager, ): - self.scratch_root = scratch_root + self.scratch_root = Path(scratch_root) self.shared_root = shared_root self.permanent_root = permanent_root self.keep_scratch = keep_scratch @@ -120,21 +134,23 @@ def __init__( def get_scratch(self, dag_label: str , unit_label: str) -> Path: """Get the path for this unit's scratch directory""" + scratch = self.scratch_root / dag_label / "scratch" / unit_label scratch.mkdir(parents=True, exist_ok=True) return scratch def get_permanent(self, dag_label, unit_label): """Get the object for this unit's permanent holding directory""" - return StagingDirectory( + return PermanentStaging( scratch=self.scratch_root / dag_label, external=self.permanent_root, + shared=self.shared_root, prefix=unit_label, ) def get_shared(self, dag_label, unit_label): """Get the object for this unit's shared holding directory""" - return StagingDirectory( + return SharedStaging( scratch=self.scratch_root / dag_label, external=self.shared_root, prefix=unit_label diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 46c48819..09f4fc2d 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -5,14 +5,14 @@ from gufe.storage.externalresource import MemoryStorage from gufe.storage.stagingdirectory import ( - StagingDirectory, _delete_empty_dirs + SharedStaging, PermanentStaging, _delete_empty_dirs ) @pytest.fixture def root(tmp_path): external = MemoryStorage() external.store_bytes("old_unit/data.txt", b"foo") - root = StagingDirectory( + root = SharedStaging( scratch=tmp_path, external=external, prefix="new_unit", @@ -74,7 +74,7 @@ def test_delete_empty_dirs_delete_root(tmp_path, delete_root): -class TestStagingDirectory: +class TestSharedStaging: @pytest.mark.parametrize('pathlist', [ ['file.txt'], ['dir', 'file.txt'] ]) @@ -100,7 +100,7 @@ def test_read_old(self, root): # when we create the specific StagingPath, it registers and # "downloads" the file - old_staging = root.get_other_staging_dir("old_unit") + old_staging = root.get_other_shared("old_unit") filepath = old_staging / "data.txt" assert pathlib.Path(filepath) == on_filesystem assert on_filesystem.exists() @@ -122,7 +122,7 @@ def test_write_new(self, root): assert not root.external.exists(label) def test_write_old_fail(self, root): - old_staging = root.get_other_staging_dir("old_unit") + old_staging = root.get_other_shared("old_unit") with pytest.raises(IOError, match="read-only"): old_staging / "foo.txt" diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index 95fdf255..c4d60e56 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -25,9 +25,7 @@ def storage_manager_holding_overlaps_permanent(tmp_path): @pytest.fixture def dag_units(): class Unit1: - @property - def key(self): - return "unit1" + key = "unit1" def run(self, scratch, shared, permanent): (scratch / "foo.txt").touch() @@ -39,9 +37,7 @@ def run(self, scratch, shared, permanent): return "done 1" class Unit2: - @property - def key(self): - return "unit2" + key = "unit2" def run(self, scratch, shared, permanent): (scratch / "foo2.txt").touch() @@ -128,6 +124,11 @@ def test_lifecycle(request, manager, dag_units): # assert set(shared_root.iter_contents()) == {"unit1/bar.txt", # "unit1/baz.txt"} assert list(permanent_root.iter_contents()) == ["unit1/baz.txt"] + # check the results + assert results == [ + "done 1", + {"bar": "bar was written", "baz": "baz was written"} + ] def test_lifecycle_keep_scratch_and_holding(): ... From ed5e83c23e3ee03534f8cbe5af753cc4ac9baeae Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Wed, 31 May 2023 14:18:29 -0500 Subject: [PATCH 12/44] change to unit taking in the label --- gufe/storage/storagemanager.py | 9 ++++----- gufe/tests/storage/test_storagemanager.py | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index aa4fc4ef..6a7a46a9 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -27,14 +27,14 @@ def _storage_path_conflict(external, path): class _AbstractDAGContextManager: @classmethod @contextmanager - def running_dag(cls, storage_manager, dag_label): + def running_dag(cls, storage_manager, dag_label: str): raise NotImplementedError() @contextmanager - def running_unit(cls, unit): + def running_unit(cls, unit_label: str): raise NotImplementedError() -DAGContextManager = Type[_DAGStorageManager] +DAGContextManager = Type[_AbstractDAGContextManager] class _DAGStorageManager(_AbstractDAGContextManager): @@ -74,7 +74,7 @@ def running_dag(cls, storage_manager, dag_label): d.cleanup() @contextmanager - def running_unit(self, unit): + def running_unit(self, unit_label: str): """Unit level of the storage lifecycle. This provides the holding directories used for scratch, shared, and @@ -82,7 +82,6 @@ def running_unit(self, unit): to the real shared external storage, cleans up the scratch directory and the shared holding directory. """ - unit_label = unit.key scratch = self.manager.get_scratch(self.dag_label, unit_label) shared = self.manager.get_shared(self.dag_label, unit_label) permanent = self.manager.get_permanent(self.dag_label, unit_label) diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index c4d60e56..c2497170 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -74,7 +74,7 @@ def test_lifecycle(request, manager, dag_units): all_files = {barfile, bazfile, foofile, foo2file} with storage_manager.running_dag("dag_label") as dag_ctx: for unit in dag_units: - with dag_ctx.running_unit(unit) as (scratch, shared, permanent): + with dag_ctx.running_unit(unit.key) as (scratch, shared, permanent): results.append(unit.run(scratch, shared, permanent)) # check that the expected files are found in staging From 61810398c60ec9038f2fb67494770ebfad79db3f Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Mon, 5 Jun 2023 17:59:30 -0500 Subject: [PATCH 13/44] lots of updates; switched to harness for tests --- docs/guide/storage.rst | 22 ++ gufe/storage/stagingdirectory.py | 10 +- gufe/storage/storagemanager.py | 29 ++- gufe/tests/storage/test_stagingdirectory.py | 5 +- gufe/tests/storage/test_storagemanager.py | 218 ++++++++++++-------- 5 files changed, 179 insertions(+), 105 deletions(-) diff --git a/docs/guide/storage.rst b/docs/guide/storage.rst index caac6399..76ae59c7 100644 --- a/docs/guide/storage.rst +++ b/docs/guide/storage.rst @@ -65,3 +65,25 @@ to the executor, can be instances of an :class:`.ExternalResource`. on ``scp`` or ``sftp`` are fine, but things like cloud storage, where the existence of a new document may take time to propagate through the network, are not recommended for ``shared``. + + +Details: Manangement of the Storage Lifetime +-------------------------------------------- + +The concepts of the storage lifetimes are important for protocol authors, +but details of implementation are left to the specific executor. In order to +facilitate ??? + +* :class:`.StorageManager`: This is the overall façade interface for + interacting with the rest of the storage lifecycle tools. +* ``DAGContextManager``: +* :class:`.StagingDirectory`: +* :class:`.StagingPath`: + +In practice, the executor uses the :class:`.StorageManager` to create a +:class:`.DAGContextManager` at the level of a DAG, and then uses the +:class:`.DAGContextManager` to create a context to run a unit. That context +creates a :class:`.SharedStaging` and a :class:`.PermanentStaging` +associated with the specific unit. Those staging directories, with the +scratch directory, are provided to the :class:`.ProtocolDAGUnit`, so that +these are the only objects protocol authors need to interact with. diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index e19b7882..b70a1858 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -60,7 +60,11 @@ class StagingDirectory: external : :class:`.ExternalStorage` external storage resource where objects should eventualy go prefix : str - label for this specific unit + label for this specific unit; this should be a slash-separated + description of where this unit fits in the hierarchy. For example, + it might be ``$DAG_LABEL/$UNIT_LABEL`` or + ``$DAG_LABEL/$UNIT_LABEL/$UNIT_REPEAT``. It must be a unique + identifier for this unit within the permanent storage. holding : PathLike name of the subdirectory of scratch where staged results are temporarily stored; default is '.holding'. This must be the same for @@ -99,7 +103,7 @@ def __init__( # external storage is exactly the same as this local storage, # meaning that copies to/from the external storage are no-ops. # Use FileStorage(scratch / holding) for that. - self.staging_dir = self.scratch / holding / prefix + self.staging_dir = self.scratch / prefix / holding self.staging_dir.mkdir(exist_ok=True, parents=True) def transfer_single_file_to_external(self, held_file): @@ -266,7 +270,7 @@ def __init__( self.shared = shared def transfer_single_file_to_external(self, held_file): - # for this one, if we can't fin + # if we can't find it locally, we load it from shared storage path = Path(held_file) if not path.exists(): self._load_file_from_external(self.shared, held_file) diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 6a7a46a9..d88b9e86 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -28,12 +28,21 @@ class _AbstractDAGContextManager: @classmethod @contextmanager def running_dag(cls, storage_manager, dag_label: str): + """Return a context manager for when a DAG is started. + + This context manager handles the DAG scale of the lifecycle. + """ raise NotImplementedError() @contextmanager - def running_unit(cls, unit_label: str): + def running_unit(self, unit_label: str): + """Return a context manager for when unit is started. + + This context manager handles the unit scale of the lifecycle. + """ raise NotImplementedError() + DAGContextManager = Type[_AbstractDAGContextManager] @@ -82,9 +91,9 @@ def running_unit(self, unit_label: str): to the real shared external storage, cleans up the scratch directory and the shared holding directory. """ - scratch = self.manager.get_scratch(self.dag_label, unit_label) - shared = self.manager.get_shared(self.dag_label, unit_label) - permanent = self.manager.get_permanent(self.dag_label, unit_label) + scratch = self.manager.get_scratch(unit_label) + shared = self.manager.get_shared(unit_label) + permanent = self.manager.get_permanent(unit_label) try: yield scratch, shared, permanent finally: @@ -131,26 +140,26 @@ def __init__( self.holding = holding self.DAGContextClass = DAGContextClass - def get_scratch(self, dag_label: str , unit_label: str) -> Path: + def get_scratch(self, unit_label: str) -> Path: """Get the path for this unit's scratch directory""" - scratch = self.scratch_root / dag_label / "scratch" / unit_label + scratch = self.scratch_root / unit_label / "scratch" scratch.mkdir(parents=True, exist_ok=True) return scratch - def get_permanent(self, dag_label, unit_label): + def get_permanent(self, unit_label): """Get the object for this unit's permanent holding directory""" return PermanentStaging( - scratch=self.scratch_root / dag_label, + scratch=self.scratch_root, external=self.permanent_root, shared=self.shared_root, prefix=unit_label, ) - def get_shared(self, dag_label, unit_label): + def get_shared(self, unit_label): """Get the object for this unit's shared holding directory""" return SharedStaging( - scratch=self.scratch_root / dag_label, + scratch=self.scratch_root, external=self.shared_root, prefix=unit_label ) diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 09f4fc2d..e825d28b 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -94,7 +94,7 @@ def test_read_old(self, root): # initial conditions, without touching StagingDirectory/StagingPath label = "old_unit/data.txt" - on_filesystem = root.scratch / root.holding / label + on_filesystem = root.scratch / "old_unit" / root.holding / "data.txt" assert not on_filesystem.exists() assert root.external.exists(label) @@ -111,7 +111,8 @@ def test_read_old(self, root): def test_write_new(self, root): label = "new_unit/somefile.txt" - on_filesystem = root.scratch / root.holding / label + on_filesystem = (root.scratch / "new_unit" / root.holding + / "somefile.txt") assert not on_filesystem.exists() with open(root / "somefile.txt", mode='wb') as f: f.write(b"testing") diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index c2497170..9c36e391 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -9,19 +9,11 @@ @pytest.fixture def storage_manager_std(tmp_path): return StorageManager( - scratch_root=tmp_path / "scratch", + scratch_root=tmp_path / "working", shared_root=MemoryStorage(), permanent_root=MemoryStorage() ) -@pytest.fixture -def storage_manager_holding_overlaps_shared(tmp_path): - ... - -@pytest.fixture -def storage_manager_holding_overlaps_permanent(tmp_path): - ... - @pytest.fixture def dag_units(): class Unit1: @@ -41,7 +33,9 @@ class Unit2: def run(self, scratch, shared, permanent): (scratch / "foo2.txt").touch() - with shared.other_shared("unit1") as prev_shared: + # TODO: this will change; the inputs should include a way to get + # the previous shared unit label + with shared.other_shared("dag/unit1") as prev_shared: with open(prev_shared / "bar.txt", mode='r') as f: bar = f.read() @@ -54,85 +48,129 @@ def run(self, scratch, shared, permanent): return [Unit1(), Unit2()] +class LifecycleHarness: + @pytest.fixture + def storage_manager(self, tmp_path): + raise NotImplementedError() + + @staticmethod + def get_files_dict(storage_manager): + root = storage_manager.scratch_root + holding = storage_manager.holding + return { + "foo": root / "dag/unit1/scratch/foo.txt", + "foo2": root / "dag/unit2/scratch/foo2.txt", + "bar": root / "dag/unit1" / holding / "bar.txt", + "baz": root / "dag/unit1" / holding / "baz.txt", + } + + def test_lifecycle(self, storage_manager, dag_units, tmp_path): + results = [] + dag_label = "dag" + with storage_manager.running_dag(dag_label) as dag_ctx: + for unit in dag_units: + label = f"{dag_ctx.dag_label}/{unit.key}" + with dag_ctx.running_unit(label) as (scratch, shared, perm): + results.append(unit.run(scratch, shared, perm)) + self.in_unit_asserts(storage_manager, label) + self.after_unit_asserts(storage_manager, label) + self.after_dag_asserts(storage_manager) + assert results == [ + "done 1", + {"bar": "bar was written", "baz": "baz was written"} + ] + + def _in_unit_existing_files(self, unit_label): + raise NotImplementedError() + + def _after_unit_existing_files(self, unit_label): + raise NotImplementedError() + + def _after_dag_existing_files(self): + raise NotImplementedError() + + @staticmethod + def assert_existing_files(files_dict, existing): + for file in existing: + assert files_dict[file].exists() + + for file in set(files_dict) - existing: + assert not files_dict[file].exists() + + def in_unit_asserts(self, storage_manager, unit_label): + # check that shared and permanent are correct + shared_root = storage_manager.shared_root + permanent_root = storage_manager.permanent_root + expected_in_shared = { + "dag/unit1": set(), + "dag/unit2": {"dag/unit1/bar.txt", "dag/unit1/baz.txt"} + }[unit_label] + assert set(shared_root.iter_contents()) == expected_in_shared + + assert list(permanent_root.iter_contents()) == [] + + # manager-specific check for files + files_dict = self.get_files_dict(storage_manager) + existing = self._in_unit_existing_files(unit_label) + self.assert_existing_files(files_dict, existing) + + def after_unit_asserts(self, storage_manager, unit_label): + shared_root = storage_manager.shared_root + permanent_root = storage_manager.permanent_root + # these are independent of unit label + expected_in_shared = {"dag/unit1/bar.txt", "dag/unit1/baz.txt"} + assert set(shared_root.iter_contents()) == expected_in_shared + assert list(permanent_root.iter_contents()) == [] + + files_dict = self.get_files_dict(storage_manager) + existing = self._after_unit_existing_files(unit_label) + self.assert_existing_files(files_dict, existing) + + def after_dag_asserts(self, storage_manager): + permanent_root = storage_manager.permanent_root + # shared still contains everything it had; but this isn't something + # we guarantee, so we don't actually test for it: + # shared_root = storage_manager.shared_root + # assert set(shared_root.iter_contents()) == {"unit1/bar.txt", + # "unit1/baz.txt"} + assert list(permanent_root.iter_contents()) == ["dag/unit1/baz.txt"] + + files_dict = self.get_files_dict(storage_manager) + existing = self._after_dag_existing_files() + self.assert_existing_files(files_dict, existing) + + +class TestStandardStorageManager(LifecycleHarness): + @pytest.fixture + def storage_manager(self, storage_manager_std): + return storage_manager_std + + def _in_unit_existing_files(self, unit_label): + return { + "dag/unit1": {'bar', 'baz', 'foo'}, + "dag/unit2": {'foo2', 'baz'} + }[unit_label] + + def _after_unit_existing_files(self, unit_label): + # Same for both units because unit2 doesn't add anything to + # shared/permanent; in this one, only files staged for permanent + # should remain + return {'baz'} + + def _after_dag_existing_files(self): + return set() -@pytest.mark.parametrize('manager', ['std']) -def test_lifecycle(request, manager, dag_units): - # heavy integration test to ensure that the whole process works - # this is the primary test of _DAGStorageManager - storage_manager = request.getfixturevalue(f"storage_manager_{manager}") - permanent_root = storage_manager.permanent_root - shared_root = storage_manager.shared_root - results = [] - unit1_dir = Path(storage_manager.get_shared("dag_label", "unit1")) - scratch1 = Path(storage_manager.get_scratch("dag_label", "unit1")) - scratch2 = Path(storage_manager.get_scratch("dag_label", "unit2")) - barfile = unit1_dir / "bar.txt" - bazfile = unit1_dir / "baz.txt" - foofile = scratch1 / "foo.txt" - foo2file = scratch2 / "foo2.txt" - - all_files = {barfile, bazfile, foofile, foo2file} - with storage_manager.running_dag("dag_label") as dag_ctx: - for unit in dag_units: - with dag_ctx.running_unit(unit.key) as (scratch, shared, permanent): - results.append(unit.run(scratch, shared, permanent)) - - # check that the expected files are found in staging - exists = { - "unit1": {barfile, bazfile, foofile}, - "unit2": {foo2file, bazfile} - }[unit.key] - - for file in exists: - assert file.exists() - - for file in all_files - exists: - assert not file.exists() - - # check that shared store is as expected - expected_in_shared = { - "unit1": set(), - "unit2": {"unit1/bar.txt", "unit1/baz.txt"} - }[unit.key] - assert set(shared_root.iter_contents()) == expected_in_shared - # check that permanent store is empty - assert list(permanent_root.iter_contents()) == [] - # AFTER THE RUNNING_UNIT CONTEXT - # Same for both units because unit2 doesn't add anything to - # shared/permanent - # Files staged for shared should be transferred to shared and - # removed from the staging directories; files staged for - # permanent should remain - for_permanent = {bazfile} - for file in for_permanent: - assert file.exists() - - for file in all_files - for_permanent: - assert not file.exists() - - # check that we have things in shared - expected_in_shared = {"unit1/bar.txt", "unit1/baz.txt"} - assert set(shared_root.iter_contents()) == expected_in_shared - # ensure that we haven't written to permanent yet - assert list(permanent_root.iter_contents()) == [] - # AFTER THE RUNNING_DAG CONTEXT - # all staged files should be deleted - for file in all_files: - assert not file.exists() - # shared still contains everything it had; but this isn't something we - # guarantee, so we don't actually test for it - # assert set(shared_root.iter_contents()) == {"unit1/bar.txt", - # "unit1/baz.txt"} - assert list(permanent_root.iter_contents()) == ["unit1/baz.txt"] - # check the results - assert results == [ - "done 1", - {"bar": "bar was written", "baz": "baz was written"} - ] def test_lifecycle_keep_scratch_and_holding(): ... +def test_lifecycle_holding_overlaps_shared(tmp_path): + ... + +def test_lifecycle_holding_overlaps_permanent(tmp_path): + ... + + def test_storage_path_conflict_ok(tmp_path): # if the filestorage root is not in the given path, no conflict external = FileStorage(tmp_path / "foo" / "bar") @@ -154,16 +192,16 @@ def test_storage_path_conflict_problem(tmp_path): class TestStorageManager: def test_get_scratch(self, storage_manager_std): - scratch = storage_manager_std.get_scratch("dag_label", "unit_label") - assert str(scratch).endswith("dag_label/scratch/unit_label") + scratch = storage_manager_std.get_scratch("dag_label/unit_label") + assert str(scratch).endswith("dag_label/unit_label/scratch") assert isinstance(scratch, Path) def test_get_permanent(self, storage_manager_std): - perm = storage_manager_std.get_permanent("dag_label", "unit_label") - assert perm.__fspath__().endswith("dag_label/.holding/unit_label") + perm = storage_manager_std.get_permanent("dag_label/unit_label") + assert perm.__fspath__().endswith("dag_label/unit_label/.holding") assert isinstance(perm, StagingDirectory) def test_get_shared(self, storage_manager_std): - shared = storage_manager_std.get_shared("dag_label", "unit_label") - assert shared.__fspath__().endswith("dag_label/.holding/unit_label") + shared = storage_manager_std.get_shared("dag_label/unit_label") + assert shared.__fspath__().endswith("dag_label/unit_label/.holding") assert isinstance(shared, StagingDirectory) From 1880f730219e2171b59baec261ef19e32f3055f9 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 6 Jun 2023 11:03:07 -0500 Subject: [PATCH 14/44] Big reorg for shared overlapping staging --- docs/guide/storage.rst | 12 +- gufe/storage/stagingdirectory.py | 53 +++++-- gufe/storage/storagemanager.py | 27 +++- gufe/tests/storage/test_stagingdirectory.py | 5 +- gufe/tests/storage/test_storagemanager.py | 161 +++++++++++++++----- 5 files changed, 198 insertions(+), 60 deletions(-) diff --git a/docs/guide/storage.rst b/docs/guide/storage.rst index 76ae59c7..671c0b4a 100644 --- a/docs/guide/storage.rst +++ b/docs/guide/storage.rst @@ -72,7 +72,9 @@ Details: Manangement of the Storage Lifetime The concepts of the storage lifetimes are important for protocol authors, but details of implementation are left to the specific executor. In order to -facilitate ??? +facilitate correct treatment of the storage lifecycle, GUFE provides a few +helpers. The information in this section is mostly of interest to authors of +executors. The helpers are: * :class:`.StorageManager`: This is the overall façade interface for interacting with the rest of the storage lifecycle tools. @@ -87,3 +89,11 @@ creates a :class:`.SharedStaging` and a :class:`.PermanentStaging` associated with the specific unit. Those staging directories, with the scratch directory, are provided to the :class:`.ProtocolDAGUnit`, so that these are the only objects protocol authors need to interact with. + +In using these, we assume that the label for data from a given unit is of +the form ``$DAG/$UNIT_INFO/$FILEPATH``. The details of ``$DAG`` and +``$UNIT_INFO`` are up the executor; in particular, there may be a more +deeply nested hierarchy for the ``$UNIT_INFO`` (e.g., to account for retries +of a given unit). An executor that wants to use a data label that doesnt' +match this format should not use :class:`.StorageManager` and the related +tools. diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index b70a1858..94dc3c24 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -1,12 +1,36 @@ from typing import Union, Optional from pathlib import Path from os import PathLike, rmdir, remove -from .externalresource import ExternalStorage +from .externalresource import ExternalStorage, FileStorage from contextlib import contextmanager import logging _logger = logging.getLogger(__name__) +def _safe_to_delete_holding(external, path, prefix): + """Check if deleting ``path`` could delete externally stored data. + + If external storage is a FileStorage, then it will storage files for + this unit or dag in the directory ``external.root_dir / prefix``, where + ``prefix`` is either the unit label or the dag label. If ``path`` is + inside that directory, then deleting it may delete information from the + external storage. In that case, this returns False, indicating a + conflict. Otherwise, this returns True. + """ + # this is a little brittle; I don't like hard-coding the class here + if isinstance(external, FileStorage): + root = Path(external.root_dir) / prefix + else: + return True + + p = Path(path) + try: + _ = p.relative_to(root) + except ValueError: + return True + else: + return False + def _delete_empty_dirs(root, delete_root=True): """Delete all empty directories. @@ -95,17 +119,16 @@ def __init__( self.registry : set[StagingPath] = set() self.preexisting : set[StagingPath] = set() - # NOTE: the fact that we use $SCRATCH/$HOLDING/$PREFIX instead of - # $SCRATCH/$PREFIX/$HOLDING is important for 2 reasons: - # 1. This doesn't take any of the user's namespace from their - # $SCRATCH/$PREFIX directory. - # 2. This allows us to easily use an external FileStorage where the - # external storage is exactly the same as this local storage, - # meaning that copies to/from the external storage are no-ops. - # Use FileStorage(scratch / holding) for that. - self.staging_dir = self.scratch / prefix / holding + self.staging_dir = self.scratch / holding / prefix self.staging_dir.mkdir(exist_ok=True, parents=True) + def _delete_holding_safe(self): + return _safe_to_delete_holding( + external=self.external, + path=self.staging_dir, + prefix=self.prefix, + ) + def transfer_single_file_to_external(self, held_file): """Transfer a given file from holding into external storage """ @@ -129,7 +152,7 @@ def transfer_holding_to_external(self): def cleanup(self): """Perform end-of-lifecycle cleanup. """ - if self.delete_holding: + if self.delete_holding and self._delete_holding_safe(): for file in self.registry - self.preexisting: remove(file) _delete_empty_dirs(self.staging_dir) @@ -269,6 +292,14 @@ def __init__( delete_holding=delete_holding) self.shared = shared + def _delete_holding_safe(self): + shared_safe = _safe_to_delete_holding( + external=self.shared, + path=self.staging_dir, + prefix=self.prefix + ) + return shared_safe and super()._delete_holding_safe() + def transfer_single_file_to_external(self, held_file): # if we can't find it locally, we load it from shared storage path = Path(held_file) diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index d88b9e86..6a1f0838 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -8,17 +8,25 @@ from .externalresource import ExternalStorage, FileStorage from .stagingdirectory import SharedStaging, PermanentStaging -def _storage_path_conflict(external, path): - """Check if deleting ``path`` could delete externally stored data +def _storage_path_conflict(external, path, label): + """Check if deleting ``path`` could delete externally stored data. + + If external storage is a FileStorage, then it will storage files for + this unit or dag in the directory ``external.root_dir / label``, where + ``label`` is either the unit label or the dag label. If ``path`` is + inside that directory, then deleting it may delete information from the + external storage. In that case, this returns True, indicating a + conflict. Otherwise, this returns False. """ # this is a little brittle; I don't like hard-coding the class here if isinstance(external, FileStorage): - root = Path(external.root_dir) + root = Path(external.root_dir) / label else: return False + p = Path(path) try: - _ = root.relative_to(Path(path)) + _ = p.relative_to(root) except ValueError: return False else: @@ -80,6 +88,7 @@ def running_dag(cls, storage_manager, dag_label): if not dag_manager.manager.keep_holding: for d in dag_manager.permanents: + # import pdb; pdb.set_trace() d.cleanup() @contextmanager @@ -104,12 +113,12 @@ def running_unit(self, unit_label: str): for file in permanent.registry: shared.transfer_single_file_to_external(file) scratch_conflict = _storage_path_conflict(shared.external, - scratch) + scratch, unit_label) if not self.manager.keep_scratch and not scratch_conflict: shutil.rmtree(scratch) shared_conflict = _storage_path_conflict(shared.external, - shared) + shared, unit_label) if not self.manager.keep_holding and not shared_conflict: shared.cleanup() @@ -143,7 +152,7 @@ def __init__( def get_scratch(self, unit_label: str) -> Path: """Get the path for this unit's scratch directory""" - scratch = self.scratch_root / unit_label / "scratch" + scratch = self.scratch_root / "scratch" / unit_label scratch.mkdir(parents=True, exist_ok=True) return scratch @@ -154,6 +163,7 @@ def get_permanent(self, unit_label): external=self.permanent_root, shared=self.shared_root, prefix=unit_label, + holding=self.holding, ) def get_shared(self, unit_label): @@ -161,7 +171,8 @@ def get_shared(self, unit_label): return SharedStaging( scratch=self.scratch_root, external=self.shared_root, - prefix=unit_label + prefix=unit_label, + holding=self.holding, ) def running_dag(self, dag_label: str): diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index e825d28b..42893358 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -94,7 +94,7 @@ def test_read_old(self, root): # initial conditions, without touching StagingDirectory/StagingPath label = "old_unit/data.txt" - on_filesystem = root.scratch / "old_unit" / root.holding / "data.txt" + on_filesystem = root.scratch / root.holding / "old_unit/data.txt" assert not on_filesystem.exists() assert root.external.exists(label) @@ -111,8 +111,7 @@ def test_read_old(self, root): def test_write_new(self, root): label = "new_unit/somefile.txt" - on_filesystem = (root.scratch / "new_unit" / root.holding - / "somefile.txt") + on_filesystem = root.scratch / root.holding / "new_unit/somefile.txt" assert not on_filesystem.exists() with open(root / "somefile.txt", mode='wb') as f: f.write(b"testing") diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index 9c36e391..41092fc5 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -11,7 +11,7 @@ def storage_manager_std(tmp_path): return StorageManager( scratch_root=tmp_path / "working", shared_root=MemoryStorage(), - permanent_root=MemoryStorage() + permanent_root=MemoryStorage(), ) @pytest.fixture @@ -58,10 +58,10 @@ def get_files_dict(storage_manager): root = storage_manager.scratch_root holding = storage_manager.holding return { - "foo": root / "dag/unit1/scratch/foo.txt", - "foo2": root / "dag/unit2/scratch/foo2.txt", - "bar": root / "dag/unit1" / holding / "bar.txt", - "baz": root / "dag/unit1" / holding / "baz.txt", + "foo": root / "scratch/dag/unit1/foo.txt", + "foo2": root / "scratch/dag/unit2/foo2.txt", + "bar": root / holding / "dag/unit1/bar.txt", + "baz": root / holding / "dag/unit1/baz.txt", } def test_lifecycle(self, storage_manager, dag_units, tmp_path): @@ -97,6 +97,20 @@ def assert_existing_files(files_dict, existing): for file in set(files_dict) - existing: assert not files_dict[file].exists() + def _in_staging_shared(self, unit_label, in_after): + """ + This is to include things when a shared staging directory reports + that files exist in it. + """ + return set() + + def _in_staging_permanent(self, unit_label, in_after): + """ + This is to include things when a permanent staging directory reports + that files exist in it. + """ + return set() + def in_unit_asserts(self, storage_manager, unit_label): # check that shared and permanent are correct shared_root = storage_manager.shared_root @@ -104,10 +118,11 @@ def in_unit_asserts(self, storage_manager, unit_label): expected_in_shared = { "dag/unit1": set(), "dag/unit2": {"dag/unit1/bar.txt", "dag/unit1/baz.txt"} - }[unit_label] + }[unit_label] | self._in_staging_shared(unit_label, "in") assert set(shared_root.iter_contents()) == expected_in_shared - assert list(permanent_root.iter_contents()) == [] + expected_in_permanent = self._in_staging_permanent(unit_label, "in") + assert set(permanent_root.iter_contents()) == expected_in_permanent # manager-specific check for files files_dict = self.get_files_dict(storage_manager) @@ -117,24 +132,33 @@ def in_unit_asserts(self, storage_manager, unit_label): def after_unit_asserts(self, storage_manager, unit_label): shared_root = storage_manager.shared_root permanent_root = storage_manager.permanent_root - # these are independent of unit label + shared_extras = self._in_staging_shared(unit_label, "after") + permanent_extras = self._in_staging_permanent(unit_label, "after") expected_in_shared = {"dag/unit1/bar.txt", "dag/unit1/baz.txt"} + expected_in_shared |= shared_extras assert set(shared_root.iter_contents()) == expected_in_shared - assert list(permanent_root.iter_contents()) == [] + assert set(permanent_root.iter_contents()) == permanent_extras + # manager-specific check for files files_dict = self.get_files_dict(storage_manager) existing = self._after_unit_existing_files(unit_label) self.assert_existing_files(files_dict, existing) def after_dag_asserts(self, storage_manager): permanent_root = storage_manager.permanent_root + permanent_extras = self._in_staging_permanent('dag/unit2', "after") # shared still contains everything it had; but this isn't something - # we guarantee, so we don't actually test for it: + # we guarantee, so we don't actually test for it, but we could with + # this: # shared_root = storage_manager.shared_root - # assert set(shared_root.iter_contents()) == {"unit1/bar.txt", - # "unit1/baz.txt"} - assert list(permanent_root.iter_contents()) == ["dag/unit1/baz.txt"] + # shared_extras = self._in_staging_shared('dag/unit2', "after") + # expected_in_shared = {"dag/unit1/bar.txt", "dag/unit1/baz.txt"} + # expected_in_shared |= shared_extras + # assert set(shared_root.iter_contents()) == expected_in_shared + expected_in_permanent = {"dag/unit1/baz.txt"} | permanent_extras + assert set(permanent_root.iter_contents()) == expected_in_permanent + # manager-specific check for files files_dict = self.get_files_dict(storage_manager) existing = self._after_dag_existing_files() self.assert_existing_files(files_dict, existing) @@ -161,47 +185,110 @@ def _after_dag_existing_files(self): return set() -def test_lifecycle_keep_scratch_and_holding(): - ... +class TestKeepScratchAndHoldingStorageManager(LifecycleHarness): + @pytest.fixture + def storage_manager(self, tmp_path): + return StorageManager( + scratch_root=tmp_path / "working", + shared_root=MemoryStorage(), + permanent_root=MemoryStorage(), + keep_scratch=True, + keep_holding=True + ) + + @staticmethod + def files_after_unit(unit_label): + unit1 = {'bar', 'baz', 'foo'} + unit2 = {'foo2', 'baz'} + return { + 'dag/unit1': unit1, + 'dag/unit2': unit1 | unit2 + }[unit_label] + + def _in_unit_existing_files(self, unit_label): + return self.files_after_unit(unit_label) + + def _after_unit_existing_files(self, unit_label): + return self.files_after_unit(unit_label) + + def _after_dag_existing_files(self): + return self.files_after_unit('dag/unit2') + + +class TestHoldingOverlapsSharedStorageManager(LifecycleHarness): + @pytest.fixture + def storage_manager(self, tmp_path): + root = tmp_path / "working" + return StorageManager( + scratch_root=root, + shared_root=FileStorage(root), + permanent_root=MemoryStorage(), + holding="", + ) + + def _in_staging_shared(self, unit_label, in_after): + bar = "dag/unit1/bar.txt" + baz = "dag/unit1/baz.txt" + foo = "scratch/dag/unit1/foo.txt" + foo2 = "scratch/dag/unit2/foo2.txt" + return { + ("dag/unit1", "in"): {bar, baz, foo}, + ("dag/unit1", "after"): {bar, baz}, + ("dag/unit2", "in"): {bar, baz, foo2}, + ("dag/unit2", "after"): {baz} + }[unit_label, in_after] -def test_lifecycle_holding_overlaps_shared(tmp_path): - ... + def _in_unit_existing_files(self, unit_label): + return { + "dag/unit1": {'foo', 'bar', 'baz'}, + "dag/unit2": {'foo2', 'bar', 'baz'}, + }[unit_label] + + def _after_unit_existing_files(self, unit_label): + # same for both; all files come from unit 1 + return {"bar", "baz"} + + def _after_dag_existing_files(self): + # NOTE: currently we don't delete bar at the end of a cycle, but we + # don't guarantee that we would not. So it exists, but changing that + # isn't API-breaking. + return {"bar", "baz"} -def test_lifecycle_holding_overlaps_permanent(tmp_path): - ... +# class TestHoldingOverlapsPermanentStorageManager(LifecycleHarness): +# ... -def test_storage_path_conflict_ok(tmp_path): - # if the filestorage root is not in the given path, no conflict - external = FileStorage(tmp_path / "foo" / "bar") - path = tmp_path / "foo" / "baz" - assert _storage_path_conflict(external, path) is False +# def test_storage_path_conflict_ok(tmp_path): +# # if the filestorage root is not in the given path, no conflict +# external = FileStorage(tmp_path / "foo" / "bar") +# path = tmp_path / "foo" / "baz" +# assert _storage_path_conflict(external, path) is False -def test_storage_path_conflict_not_filestorage(tmp_path): - # if the external resource isn't a FileStorage, no conflict - external = MemoryStorage() - path = tmp_path / "foo" / "baz" - assert _storage_path_conflict(external, path) is False +# def test_storage_path_conflict_not_filestorage(tmp_path): +# # if the external resource isn't a FileStorage, no conflict +# external = MemoryStorage() +# path = tmp_path / "foo" / "baz" +# assert _storage_path_conflict(external, path) is False -def test_storage_path_conflict_problem(tmp_path): - # if the filestorage root is in the given path, we have a conflict - external = FileStorage(tmp_path / "foo" / "bar") - path = tmp_path / "foo" - assert _storage_path_conflict(external, path) is True +# def test_storage_path_conflict_problem(tmp_path): +# # if the filestorage root is in the given path, we have a conflict +# external = FileStorage(tmp_path / "foo" / "bar") +# path = tmp_path / "foo" +# assert _storage_path_conflict(external, path) is True class TestStorageManager: def test_get_scratch(self, storage_manager_std): scratch = storage_manager_std.get_scratch("dag_label/unit_label") - assert str(scratch).endswith("dag_label/unit_label/scratch") + assert str(scratch).endswith("scratch/dag_label/unit_label") assert isinstance(scratch, Path) def test_get_permanent(self, storage_manager_std): perm = storage_manager_std.get_permanent("dag_label/unit_label") - assert perm.__fspath__().endswith("dag_label/unit_label/.holding") + assert perm.__fspath__().endswith(".holding/dag_label/unit_label") assert isinstance(perm, StagingDirectory) def test_get_shared(self, storage_manager_std): shared = storage_manager_std.get_shared("dag_label/unit_label") - assert shared.__fspath__().endswith("dag_label/unit_label/.holding") + assert shared.__fspath__().endswith(".holding/dag_label/unit_label") assert isinstance(shared, StagingDirectory) From 1e4ca2c217ca79d7e5409bcfa4f6d5c64c29f05a Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 6 Jun 2023 12:08:09 -0500 Subject: [PATCH 15/44] remove _storage_path_conflict Makes much more sense as _safe_to_delete_holding --- gufe/storage/stagingdirectory.py | 5 ++++ gufe/storage/storagemanager.py | 32 ++------------------- gufe/tests/storage/test_stagingdirectory.py | 23 +++++++++++++-- gufe/tests/storage/test_storagemanager.py | 21 +------------- 4 files changed, 30 insertions(+), 51 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 94dc3c24..9a06a3ca 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -278,6 +278,11 @@ def register_path(self, staging_path): class PermanentStaging(StagingDirectory): + """Staging directory for the permanent storage. + + This allows files to be downloaded from a shared + :class:`.ExternalStorage`. + """ def __init__( self, scratch: PathLike, diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 6a1f0838..4d3cefd2 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -8,29 +8,6 @@ from .externalresource import ExternalStorage, FileStorage from .stagingdirectory import SharedStaging, PermanentStaging -def _storage_path_conflict(external, path, label): - """Check if deleting ``path`` could delete externally stored data. - - If external storage is a FileStorage, then it will storage files for - this unit or dag in the directory ``external.root_dir / label``, where - ``label`` is either the unit label or the dag label. If ``path`` is - inside that directory, then deleting it may delete information from the - external storage. In that case, this returns True, indicating a - conflict. Otherwise, this returns False. - """ - # this is a little brittle; I don't like hard-coding the class here - if isinstance(external, FileStorage): - root = Path(external.root_dir) / label - else: - return False - - p = Path(path) - try: - _ = p.relative_to(root) - except ValueError: - return False - else: - return True class _AbstractDAGContextManager: @classmethod @@ -112,14 +89,11 @@ def running_unit(self, unit_label: str): # everything in permanent must also be available in shared for file in permanent.registry: shared.transfer_single_file_to_external(file) - scratch_conflict = _storage_path_conflict(shared.external, - scratch, unit_label) - if not self.manager.keep_scratch and not scratch_conflict: + + if not self.manager.keep_scratch: shutil.rmtree(scratch) - shared_conflict = _storage_path_conflict(shared.external, - shared, unit_label) - if not self.manager.keep_holding and not shared_conflict: + if not self.manager.keep_holding: shared.cleanup() diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 42893358..2def33af 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -3,9 +3,10 @@ import os import pathlib -from gufe.storage.externalresource import MemoryStorage +from gufe.storage.externalresource import MemoryStorage, FileStorage from gufe.storage.stagingdirectory import ( - SharedStaging, PermanentStaging, _delete_empty_dirs + SharedStaging, PermanentStaging, _delete_empty_dirs, + _safe_to_delete_holding ) @pytest.fixture @@ -27,6 +28,24 @@ def root_with_contents(root): return root +def test_safe_to_delete_holding_ok(tmp_path): + external = FileStorage(tmp_path / "foo") + prefix = "bar" + holding = tmp_path / "foo" / "baz" + assert _safe_to_delete_holding(external, holding, prefix) + +def test_safe_to_delete_holding_danger(tmp_path): + external = FileStorage(tmp_path / "foo") + prefix = "bar" + holding = tmp_path / "foo" / "bar" / "baz" + assert not _safe_to_delete_holding(external, holding, prefix) + +def test_safe_to_delete_holding_not_filestorage(tmp_path): + external = MemoryStorage() + prefix = "bar" + holding = tmp_path / "bar" + assert _safe_to_delete_holding(external, holding, prefix) + def test_delete_empty_dirs(tmp_path): base = tmp_path / "tmp" paths = [ diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index 41092fc5..2d7fb292 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -1,6 +1,6 @@ import pytest from gufe.storage.storagemanager import ( - StorageManager, _storage_path_conflict + StorageManager ) from gufe.storage.stagingdirectory import StagingDirectory from gufe.storage.externalresource import MemoryStorage, FileStorage @@ -258,25 +258,6 @@ def _after_dag_existing_files(self): # ... -# def test_storage_path_conflict_ok(tmp_path): -# # if the filestorage root is not in the given path, no conflict -# external = FileStorage(tmp_path / "foo" / "bar") -# path = tmp_path / "foo" / "baz" -# assert _storage_path_conflict(external, path) is False - -# def test_storage_path_conflict_not_filestorage(tmp_path): -# # if the external resource isn't a FileStorage, no conflict -# external = MemoryStorage() -# path = tmp_path / "foo" / "baz" -# assert _storage_path_conflict(external, path) is False - -# def test_storage_path_conflict_problem(tmp_path): -# # if the filestorage root is in the given path, we have a conflict -# external = FileStorage(tmp_path / "foo" / "bar") -# path = tmp_path / "foo" -# assert _storage_path_conflict(external, path) is True - - class TestStorageManager: def test_get_scratch(self, storage_manager_std): scratch = storage_manager_std.get_scratch("dag_label/unit_label") From a6d26f3022ba7f2deb936d946702ce80c41d1251 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 6 Jun 2023 12:45:47 -0500 Subject: [PATCH 16/44] docs & types --- docs/guide/storage.rst | 8 ----- gufe/storage/stagingdirectory.py | 51 ++++++++++++++++++++++---------- gufe/storage/storagemanager.py | 2 +- 3 files changed, 36 insertions(+), 25 deletions(-) diff --git a/docs/guide/storage.rst b/docs/guide/storage.rst index 671c0b4a..0153bf89 100644 --- a/docs/guide/storage.rst +++ b/docs/guide/storage.rst @@ -89,11 +89,3 @@ creates a :class:`.SharedStaging` and a :class:`.PermanentStaging` associated with the specific unit. Those staging directories, with the scratch directory, are provided to the :class:`.ProtocolDAGUnit`, so that these are the only objects protocol authors need to interact with. - -In using these, we assume that the label for data from a given unit is of -the form ``$DAG/$UNIT_INFO/$FILEPATH``. The details of ``$DAG`` and -``$UNIT_INFO`` are up the executor; in particular, there may be a more -deeply nested hierarchy for the ``$UNIT_INFO`` (e.g., to account for retries -of a given unit). An executor that wants to use a data label that doesnt' -match this format should not use :class:`.StorageManager` and the related -tools. diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 9a06a3ca..2f7ee313 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Union, Optional from pathlib import Path from os import PathLike, rmdir, remove @@ -7,7 +9,10 @@ import logging _logger = logging.getLogger(__name__) -def _safe_to_delete_holding(external, path, prefix): +# TODO: holding -> staging + +def _safe_to_delete_holding(external: ExternalStorage, path: PathLike, + prefix: Union[PathLike, str]) -> bool: """Check if deleting ``path`` could delete externally stored data. If external storage is a FileStorage, then it will storage files for @@ -32,7 +37,7 @@ def _safe_to_delete_holding(external, path, prefix): return False -def _delete_empty_dirs(root, delete_root=True): +def _delete_empty_dirs(root: PathLike, delete_root: bool = True): """Delete all empty directories. Repeats so that directories that only contained empty directories also @@ -46,7 +51,6 @@ def find_empty_dirs(directory): directories = [p for p in paths if p.is_dir()] return sum([find_empty_dirs(d) for d in directories], []) - while root.exists() and (empties := find_empty_dirs(root)): if empties == [root] and not delete_root: return @@ -123,16 +127,17 @@ def __init__( self.staging_dir.mkdir(exist_ok=True, parents=True) def _delete_holding_safe(self): + """Check if deleting staging will remove data from external. + """ return _safe_to_delete_holding( external=self.external, path=self.staging_dir, prefix=self.prefix, ) - def transfer_single_file_to_external(self, held_file): + def transfer_single_file_to_external(self, held_file: StagingPath): """Transfer a given file from holding into external storage """ - path = Path(held_file) if not path.exists(): logging.info(f"Found nonexistent path {path}, not " @@ -157,7 +162,7 @@ def cleanup(self): remove(file) _delete_empty_dirs(self.staging_dir) - def register_path(self, staging_path): + def register_path(self, staging_path: StagingPath): """ Register a :class:`.StagingPath` with this :class:`.StagingDirectory`. @@ -186,7 +191,8 @@ def register_path(self, staging_path): if label_exists: self._load_file_from_external(self.external, staging_path) - def _load_file_from_external(self, external, staging_path): + def _load_file_from_external(self, external: ExternalStorage, + staging_path: StagingPath): scratch_path = self.staging_dir / staging_path.path # TODO: switch this to using `get_filename` and `store_path` with external.load_stream(staging_path.label) as f: @@ -198,7 +204,7 @@ def _load_file_from_external(self, external, staging_path): with open(scratch_path, mode='wb') as f: f.write(external_bytes) - def __truediv__(self, path: PathLike): + def __truediv__(self, path: Union[PathLike, str, bytes]): return StagingPath(root=self, path=path) def __fspath__(self): @@ -210,8 +216,18 @@ def __repr__(self): f"{self.prefix})" ) + def __del__(self): # -no-cov- + # in case someone doesn't use this within a context manager + if self.staging_dir.exists(): + self.cleanup() + + class SharedStaging(StagingDirectory): + """Staging for shared external storage. + + This enables read-only versions to be loaded from other units. + """ def __init__( self, scratch: PathLike, @@ -226,7 +242,8 @@ def __init__( delete_holding=delete_holding) self.read_only = read_only - def get_other_shared(self, prefix, delete_holding=None): + def get_other_shared(self, prefix: Union[str, PathLike], + delete_holding: Optional[bool] = None): """Get a related unit's staging directory. """ if delete_holding is None: @@ -242,7 +259,8 @@ def get_other_shared(self, prefix, delete_holding=None): ) @contextmanager - def other_shared(self, prefix, delete_holding=None): + def other_shared(self, prefix: Union[str, PathLike], + delete_holding: Optional[bool] = None): """Context manager approach for getting a related unit's directory. This is usually the recommended way to get a previous unit's shared @@ -252,7 +270,7 @@ def other_shared(self, prefix, delete_holding=None): yield other other.cleanup() - def transfer_single_file_to_external(self, held_file): + def transfer_single_file_to_external(self, held_file: StagingPath): if self.read_only: logging.debug("Read-only: Not transfering to external storage") return # early exit @@ -266,7 +284,7 @@ def transfer_holding_to_external(self): super().transfer_holding_to_external() - def register_path(self, staging_path): + def register_path(self, staging_path: StagingPath): label_exists = self.external.exists(staging_path.label) if self.read_only and not label_exists: @@ -305,7 +323,7 @@ def _delete_holding_safe(self): ) return shared_safe and super()._delete_holding_safe() - def transfer_single_file_to_external(self, held_file): + def transfer_single_file_to_external(self, held_file: StagingPath): # if we can't find it locally, we load it from shared storage path = Path(held_file) if not path.exists(): @@ -321,19 +339,20 @@ class StagingPath: manage the local path and transferring data with its :class:`.ExternalStorage`. """ - def __init__(self, root: StagingDirectory, path: PathLike): + def __init__(self, root: StagingDirectory, + path: Union[PathLike, str, bytes]): self.root = root self.path = Path(path) self.root.register_path(self) - def __truediv__(self, path): + def __truediv__(self, path: Union[PathLike, str, bytes]): return StagingPath(self.root, self.path / path) def __fspath__(self): return str(self.root.staging_dir / self.path) @property - def label(self): + def label(self) -> str: """Label used in :class:`.ExternalStorage` for this path""" return str(self.root.prefix / self.path) diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 4d3cefd2..51fe5cd8 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -30,7 +30,7 @@ def running_unit(self, unit_label: str): DAGContextManager = Type[_AbstractDAGContextManager] - +# TODO: rename class _DAGStorageManager(_AbstractDAGContextManager): """Context manager to handle details of storage lifecycle. From aabbc33054ddf1818a75f2944f83c08d7bae5e52 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 6 Jun 2023 13:09:42 -0500 Subject: [PATCH 17/44] docs, types, logging --- gufe/storage/stagingdirectory.py | 26 +++++++++++--------------- gufe/storage/storagemanager.py | 17 +++++++++++------ 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 2f7ee313..5d43769c 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -77,10 +77,6 @@ class StagingDirectory: 3. It can delete all of the files it manages - This can be opened in "read-only" mode, which prevents new files from - being created, but does not prevent changes to existing versions of - local files. - Parameters ---------- scratch : PathLike @@ -100,11 +96,6 @@ class StagingDirectory: delete_holding : bool whether to delete the contents of the $SCRATCH/$HOLDING/$PREFIX directory when this object is deleted - read_only : bool - write to prevent NEW files from being written within this staging - directory. NOTE: This will not prevent overwrite of existing files - in scratch space, but it will prevent changed files from uploading - to the external storage. """ def __init__( self, @@ -140,13 +131,13 @@ def transfer_single_file_to_external(self, held_file: StagingPath): """ path = Path(held_file) if not path.exists(): - logging.info(f"Found nonexistent path {path}, not " + _logger.info(f"Found nonexistent path {path}, not " "transfering to external storage") elif path.is_dir(): - logging.debug(f"Found directory {path}, not " + _logger.debug(f"Found directory {path}, not " "transfering to external storage") else: - logging.info(f"Transfering {path} to external storage") + _logger.info(f"Transfering {path} to external storage") self.external.store_path(held_file.label, path) def transfer_holding_to_external(self): @@ -159,7 +150,12 @@ def cleanup(self): """ if self.delete_holding and self._delete_holding_safe(): for file in self.registry - self.preexisting: - remove(file) + if Path(file).exists(): + _logger.debug(f"Removing file {file}") + remove(file) + else: + _logger.warning("Request to remove missing file " + f"{file}") _delete_empty_dirs(self.staging_dir) def register_path(self, staging_path: StagingPath): @@ -272,14 +268,14 @@ def other_shared(self, prefix: Union[str, PathLike], def transfer_single_file_to_external(self, held_file: StagingPath): if self.read_only: - logging.debug("Read-only: Not transfering to external storage") + _logger.debug("Read-only: Not transfering to external storage") return # early exit super().transfer_single_file_to_external(held_file) def transfer_holding_to_external(self): if self.read_only: - logging.debug("Read-only: Not transfering to external storage") + _logger.debug("Read-only: Not transfering to external storage") return # early exit super().transfer_holding_to_external() diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 51fe5cd8..11dbae0b 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -1,3 +1,4 @@ +from __future__ import annotations from os import PathLike from pathlib import Path from contextlib import contextmanager @@ -12,7 +13,7 @@ class _AbstractDAGContextManager: @classmethod @contextmanager - def running_dag(cls, storage_manager, dag_label: str): + def running_dag(cls, storage_manager: StorageManager, dag_label: str): """Return a context manager for when a DAG is started. This context manager handles the DAG scale of the lifecycle. @@ -39,14 +40,14 @@ class _DAGStorageManager(_AbstractDAGContextManager): directly; instead, it is created (and used) with its ``running_dag`` classmethod, typically from within a ``StorageManager``. """ - def __init__(self, storage_manager, dag_label): + def __init__(self, storage_manager: StorageLabel, dag_label: str): self.manager = storage_manager self.dag_label = dag_label self.permanents = [] @classmethod # NB: classmethod must be on top @contextmanager - def running_dag(cls, storage_manager, dag_label): + def running_dag(cls, storage_manager: StorageManager, dag_label: str): """DAG level of the storage lifecycle When the DAG is completed, transfer everything to the permanent @@ -76,6 +77,9 @@ def running_unit(self, unit_label: str): permanent. At the end of the unit, it transfers anything from shared to the real shared external storage, cleans up the scratch directory and the shared holding directory. + + Note that the unit label here is the *entire* label; that is, it + would also include information identifying the DAG. """ scratch = self.manager.get_scratch(unit_label) shared = self.manager.get_shared(unit_label) @@ -130,7 +134,7 @@ def get_scratch(self, unit_label: str) -> Path: scratch.mkdir(parents=True, exist_ok=True) return scratch - def get_permanent(self, unit_label): + def get_permanent(self, unit_label) -> PermanentStaging: """Get the object for this unit's permanent holding directory""" return PermanentStaging( scratch=self.scratch_root, @@ -140,7 +144,7 @@ def get_permanent(self, unit_label): holding=self.holding, ) - def get_shared(self, unit_label): + def get_shared(self, unit_label) -> SharedStaging: """Get the object for this unit's shared holding directory""" return SharedStaging( scratch=self.scratch_root, @@ -159,7 +163,8 @@ def running_dag(self, dag_label: str): with manager.running_dag(dag_label) as dag_ctx: for unit in dag_ordered_units: - with dag_ctx.running_unit(unit) as dirs: + label = f"{dag_ctx.dag_label}/{unit.key}" + with dag_ctx.running_unit(label) as dirs: scratch, shared, permanent = dirs # run the unit """ From b4d73b39d62ef814aea5718269aa5eae904c3375 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 6 Jun 2023 17:17:20 -0500 Subject: [PATCH 18/44] finish TestHoldingOverlapsPermanentStorageManager --- gufe/storage/stagingdirectory.py | 21 ++++---- gufe/tests/storage/test_storagemanager.py | 58 ++++++++++++++++++----- 2 files changed, 57 insertions(+), 22 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 5d43769c..49f16a89 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -189,16 +189,16 @@ def register_path(self, staging_path: StagingPath): def _load_file_from_external(self, external: ExternalStorage, staging_path: StagingPath): - scratch_path = self.staging_dir / staging_path.path - # TODO: switch this to using `get_filename` and `store_path` - with external.load_stream(staging_path.label) as f: - external_bytes = f.read() - if scratch_path.exists(): - self.preexisting.add(staging_path) - ... # TODO: something to check that the bytes are the same? - scratch_path.parent.mkdir(exist_ok=True, parents=True) - with open(scratch_path, mode='wb') as f: - f.write(external_bytes) + scratch_path = self.staging_dir / staging_path.path + # TODO: switch this to using `get_filename` and `store_path` + with external.load_stream(staging_path.label) as f: + external_bytes = f.read() + if scratch_path.exists(): + self.preexisting.add(staging_path) + ... # TODO: something to check that the bytes are the same? + scratch_path.parent.mkdir(exist_ok=True, parents=True) + with open(scratch_path, mode='wb') as f: + f.write(external_bytes) def __truediv__(self, path: Union[PathLike, str, bytes]): return StagingPath(root=self, path=path) @@ -218,7 +218,6 @@ def __del__(self): # -no-cov- self.cleanup() - class SharedStaging(StagingDirectory): """Staging for shared external storage. diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index 2d7fb292..c37451ad 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -215,6 +215,7 @@ def _after_dag_existing_files(self): return self.files_after_unit('dag/unit2') + class TestHoldingOverlapsSharedStorageManager(LifecycleHarness): @pytest.fixture def storage_manager(self, tmp_path): @@ -226,6 +227,23 @@ def storage_manager(self, tmp_path): holding="", ) + def _in_unit_existing_files(self, unit_label): + return { + "dag/unit1": {'foo', 'bar', 'baz'}, + "dag/unit2": {'foo2', 'bar', 'baz'}, + }[unit_label] + + + def _after_unit_existing_files(self, unit_label): + # same for both; all files come from unit 1 + return {"bar", "baz"} + + def _after_dag_existing_files(self): + # NOTE: currently we don't delete bar at the end of a cycle, but we + # don't guarantee that we would not. So it exists, but changing that + # isn't API-breaking. + return {"bar", "baz"} + def _in_staging_shared(self, unit_label, in_after): bar = "dag/unit1/bar.txt" baz = "dag/unit1/baz.txt" @@ -238,24 +256,42 @@ def _in_staging_shared(self, unit_label, in_after): ("dag/unit2", "after"): {baz} }[unit_label, in_after] + +class TestHoldingOverlapsPermanentStorageManager(LifecycleHarness): + @pytest.fixture + def storage_manager(self, tmp_path): + root = tmp_path / "working" + return StorageManager( + scratch_root=root, + permanent_root=FileStorage(root), + shared_root=MemoryStorage(), + holding="", + ) + def _in_unit_existing_files(self, unit_label): return { "dag/unit1": {'foo', 'bar', 'baz'}, - "dag/unit2": {'foo2', 'bar', 'baz'}, + "dag/unit2": {"foo2", "baz"}, # no bar because it was temporary }[unit_label] - def _after_unit_existing_files(self, unit_label): - # same for both; all files come from unit 1 - return {"bar", "baz"} - def _after_dag_existing_files(self): - # NOTE: currently we don't delete bar at the end of a cycle, but we - # don't guarantee that we would not. So it exists, but changing that - # isn't API-breaking. - return {"bar", "baz"} + return {"baz"} -# class TestHoldingOverlapsPermanentStorageManager(LifecycleHarness): -# ... + def _in_staging_permanent(self, unit_label, in_after): + bar = "dag/unit1/bar.txt" + baz = "dag/unit1/baz.txt" + foo = "scratch/dag/unit1/foo.txt" + foo2 = "scratch/dag/unit2/foo2.txt" + return { + ("dag/unit1", "in"): {bar, baz, foo}, + ("dag/unit1", "after"): {baz}, + ("dag/unit2", "in"): {baz, foo2}, + ("dag/unit2", "after"): {baz} + }[unit_label, in_after] + + def _after_unit_existing_files(self, unit_label): + # same for both; all files come from unit 1 + return {"baz"} class TestStorageManager: From 7af006e29dee3f349256cb06cff7264600103487 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 6 Jun 2023 18:00:21 -0500 Subject: [PATCH 19/44] mypy --- gufe/storage/stagingdirectory.py | 10 +++++----- gufe/storage/storagemanager.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 49f16a89..7c44f4aa 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -200,7 +200,7 @@ def _load_file_from_external(self, external: ExternalStorage, with open(scratch_path, mode='wb') as f: f.write(external_bytes) - def __truediv__(self, path: Union[PathLike, str, bytes]): + def __truediv__(self, path: Union[PathLike, str]): return StagingPath(root=self, path=path) def __fspath__(self): @@ -237,7 +237,7 @@ def __init__( delete_holding=delete_holding) self.read_only = read_only - def get_other_shared(self, prefix: Union[str, PathLike], + def get_other_shared(self, prefix: str, delete_holding: Optional[bool] = None): """Get a related unit's staging directory. """ @@ -254,7 +254,7 @@ def get_other_shared(self, prefix: Union[str, PathLike], ) @contextmanager - def other_shared(self, prefix: Union[str, PathLike], + def other_shared(self, prefix: str, delete_holding: Optional[bool] = None): """Context manager approach for getting a related unit's directory. @@ -335,12 +335,12 @@ class StagingPath: :class:`.ExternalStorage`. """ def __init__(self, root: StagingDirectory, - path: Union[PathLike, str, bytes]): + path: Union[PathLike, str]): self.root = root self.path = Path(path) self.root.register_path(self) - def __truediv__(self, path: Union[PathLike, str, bytes]): + def __truediv__(self, path: Union[PathLike, str]): return StagingPath(self.root, self.path / path) def __fspath__(self): diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 11dbae0b..bfb3bfc3 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -40,10 +40,10 @@ class _DAGStorageManager(_AbstractDAGContextManager): directly; instead, it is created (and used) with its ``running_dag`` classmethod, typically from within a ``StorageManager``. """ - def __init__(self, storage_manager: StorageLabel, dag_label: str): + def __init__(self, storage_manager: StorageManager, dag_label: str): self.manager = storage_manager self.dag_label = dag_label - self.permanents = [] + self.permanents: list[PermanentStaging] = [] @classmethod # NB: classmethod must be on top @contextmanager From 58a58bc94d13e8dafa4e1ea58df827f72cec5c2d Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 6 Jun 2023 18:48:09 -0500 Subject: [PATCH 20/44] test_repr --- gufe/tests/storage/test_stagingdirectory.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 2def33af..e9f4287f 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -94,6 +94,12 @@ def test_delete_empty_dirs_delete_root(tmp_path, delete_root): class TestSharedStaging: + def test_repr(self, root): + r = repr(root) + assert r.startswith("StagingDirectory") + assert "MemoryStorage" in r + assert r.endswith(", new_unit)") + @pytest.mark.parametrize('pathlist', [ ['file.txt'], ['dir', 'file.txt'] ]) From 8e429f5d9248131dadf82c1cb033a67078463723 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 6 Jun 2023 18:58:54 -0500 Subject: [PATCH 21/44] renaming around DAGContextManager --- gufe/storage/storagemanager.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index bfb3bfc3..4a08dc75 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -10,7 +10,7 @@ from .stagingdirectory import SharedStaging, PermanentStaging -class _AbstractDAGContextManager: +class DAGContextManager: @classmethod @contextmanager def running_dag(cls, storage_manager: StorageManager, dag_label: str): @@ -28,11 +28,9 @@ def running_unit(self, unit_label: str): """ raise NotImplementedError() +_DCMType = Type[DAGContextManager] # to shorten some lines -DAGContextManager = Type[_AbstractDAGContextManager] - -# TODO: rename -class _DAGStorageManager(_AbstractDAGContextManager): +class SingleProcDAGContextManager(DAGContextManager): """Context manager to handle details of storage lifecycle. Making this a separate class ensures that ``running_unit`` is always @@ -117,7 +115,7 @@ def __init__( keep_scratch: bool = False, keep_holding: bool = False, holding: PathLike = Path(".holding"), - DAGContextClass: DAGContextManager = _DAGStorageManager, + DAGContextClass: _DCMType = SingleProcDAGContextManager, ): self.scratch_root = Path(scratch_root) self.shared_root = shared_root From b70df482c0cbebe9fd0a5d34be15bb13c723eedc Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 6 Jun 2023 19:09:17 -0500 Subject: [PATCH 22/44] holding => staging --- gufe/storage/stagingdirectory.py | 68 ++++++++++----------- gufe/storage/storagemanager.py | 32 +++++----- gufe/tests/storage/test_stagingdirectory.py | 28 ++++----- gufe/tests/storage/test_storagemanager.py | 22 +++---- 4 files changed, 75 insertions(+), 75 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 7c44f4aa..02626365 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -11,7 +11,7 @@ # TODO: holding -> staging -def _safe_to_delete_holding(external: ExternalStorage, path: PathLike, +def _safe_to_delete_staging(external: ExternalStorage, path: PathLike, prefix: Union[PathLike, str]) -> bool: """Check if deleting ``path`` could delete externally stored data. @@ -89,11 +89,11 @@ class StagingDirectory: it might be ``$DAG_LABEL/$UNIT_LABEL`` or ``$DAG_LABEL/$UNIT_LABEL/$UNIT_REPEAT``. It must be a unique identifier for this unit within the permanent storage. - holding : PathLike + staging : PathLike name of the subdirectory of scratch where staged results are - temporarily stored; default is '.holding'. This must be the same for + temporarily stored; default is '.staging'. This must be the same for all units within a DAG. - delete_holding : bool + delete_staging : bool whether to delete the contents of the $SCRATCH/$HOLDING/$PREFIX directory when this object is deleted """ @@ -103,31 +103,31 @@ def __init__( external: ExternalStorage, prefix: str, *, - holding: PathLike = Path(".holding"), - delete_holding: bool = True, + staging: PathLike = Path(".staging"), + delete_staging: bool = True, ): self.external = external self.scratch = Path(scratch) self.prefix = Path(prefix) - self.delete_holding = delete_holding - self.holding = holding + self.delete_staging = delete_staging + self.staging = staging self.registry : set[StagingPath] = set() self.preexisting : set[StagingPath] = set() - self.staging_dir = self.scratch / holding / prefix + self.staging_dir = self.scratch / staging / prefix self.staging_dir.mkdir(exist_ok=True, parents=True) - def _delete_holding_safe(self): + def _delete_staging_safe(self): """Check if deleting staging will remove data from external. """ - return _safe_to_delete_holding( + return _safe_to_delete_staging( external=self.external, path=self.staging_dir, prefix=self.prefix, ) def transfer_single_file_to_external(self, held_file: StagingPath): - """Transfer a given file from holding into external storage + """Transfer a given file from staging into external storage """ path = Path(held_file) if not path.exists(): @@ -140,7 +140,7 @@ def transfer_single_file_to_external(self, held_file: StagingPath): _logger.info(f"Transfering {path} to external storage") self.external.store_path(held_file.label, path) - def transfer_holding_to_external(self): + def transfer_staging_to_external(self): """Transfer all objects in the registry to external storage""" for obj in self.registry: self.transfer_single_file_to_external(obj) @@ -148,7 +148,7 @@ def transfer_holding_to_external(self): def cleanup(self): """Perform end-of-lifecycle cleanup. """ - if self.delete_holding and self._delete_holding_safe(): + if self.delete_staging and self._delete_staging_safe(): for file in self.registry - self.preexisting: if Path(file).exists(): _logger.debug(f"Removing file {file}") @@ -229,39 +229,39 @@ def __init__( external: ExternalStorage, prefix: str, *, - holding: PathLike = Path(".holding"), - delete_holding: bool = True, + staging: PathLike = Path(".staging"), + delete_staging: bool = True, read_only: bool = False, ): - super().__init__(scratch, external, prefix, holding=holding, - delete_holding=delete_holding) + super().__init__(scratch, external, prefix, staging=staging, + delete_staging=delete_staging) self.read_only = read_only def get_other_shared(self, prefix: str, - delete_holding: Optional[bool] = None): + delete_staging: Optional[bool] = None): """Get a related unit's staging directory. """ - if delete_holding is None: - delete_holding = self.delete_holding + if delete_staging is None: + delete_staging = self.delete_staging return SharedStaging( scratch=self.scratch, external=self.external, prefix=prefix, - holding=self.holding, - delete_holding=delete_holding, + staging=self.staging, + delete_staging=delete_staging, read_only=True, ) @contextmanager def other_shared(self, prefix: str, - delete_holding: Optional[bool] = None): + delete_staging: Optional[bool] = None): """Context manager approach for getting a related unit's directory. This is usually the recommended way to get a previous unit's shared data. """ - other = self.get_other_shared(prefix, delete_holding) + other = self.get_other_shared(prefix, delete_staging) yield other other.cleanup() @@ -272,12 +272,12 @@ def transfer_single_file_to_external(self, held_file: StagingPath): super().transfer_single_file_to_external(held_file) - def transfer_holding_to_external(self): + def transfer_staging_to_external(self): if self.read_only: _logger.debug("Read-only: Not transfering to external storage") return # early exit - super().transfer_holding_to_external() + super().transfer_staging_to_external() def register_path(self, staging_path: StagingPath): label_exists = self.external.exists(staging_path.label) @@ -303,20 +303,20 @@ def __init__( shared: ExternalStorage, prefix: str, *, - holding: PathLike = Path(".holding"), - delete_holding: bool = True, + staging: PathLike = Path(".staging"), + delete_staging: bool = True, ): - super().__init__(scratch, external, prefix, holding=holding, - delete_holding=delete_holding) + super().__init__(scratch, external, prefix, staging=staging, + delete_staging=delete_staging) self.shared = shared - def _delete_holding_safe(self): - shared_safe = _safe_to_delete_holding( + def _delete_staging_safe(self): + shared_safe = _safe_to_delete_staging( external=self.shared, path=self.staging_dir, prefix=self.prefix ) - return shared_safe and super()._delete_holding_safe() + return shared_safe and super()._delete_staging_safe() def transfer_single_file_to_external(self, held_file: StagingPath): # if we can't find it locally, we load it from shared storage diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 4a08dc75..dbba86ea 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -49,7 +49,7 @@ def running_dag(cls, storage_manager: StorageManager, dag_label: str): """DAG level of the storage lifecycle When the DAG is completed, transfer everything to the permanent - storage, and delete the holding area for permanent (if we are + storage, and delete the staging area for permanent (if we are supposed to). This is not usually called by users; instead it is called from @@ -60,9 +60,9 @@ def running_dag(cls, storage_manager: StorageManager, dag_label: str): yield dag_manager finally: for permanent in dag_manager.permanents: - permanent.transfer_holding_to_external() + permanent.transfer_staging_to_external() - if not dag_manager.manager.keep_holding: + if not dag_manager.manager.keep_staging: for d in dag_manager.permanents: # import pdb; pdb.set_trace() d.cleanup() @@ -71,10 +71,10 @@ def running_dag(cls, storage_manager: StorageManager, dag_label: str): def running_unit(self, unit_label: str): """Unit level of the storage lifecycle. - This provides the holding directories used for scratch, shared, and + This provides the staging directories used for scratch, shared, and permanent. At the end of the unit, it transfers anything from shared to the real shared external storage, cleans up the scratch - directory and the shared holding directory. + directory and the shared staging directory. Note that the unit label here is the *entire* label; that is, it would also include information identifying the DAG. @@ -87,7 +87,7 @@ def running_unit(self, unit_label: str): finally: # TODO: should some of this be in an else clause instead? self.permanents.append(permanent) - shared.transfer_holding_to_external() + shared.transfer_staging_to_external() # everything in permanent must also be available in shared for file in permanent.registry: shared.transfer_single_file_to_external(file) @@ -95,14 +95,14 @@ def running_unit(self, unit_label: str): if not self.manager.keep_scratch: shutil.rmtree(scratch) - if not self.manager.keep_holding: + if not self.manager.keep_staging: shared.cleanup() class StorageManager: """Tool to manage the storage lifecycle during a DAG. - This object primarily contains the logic for getting the holding + This object primarily contains the logic for getting the staging directories. A separate class, in the ``DAGContextClass`` variable, handles the logic for the context managers. """ @@ -113,16 +113,16 @@ def __init__( permanent_root: ExternalStorage, *, keep_scratch: bool = False, - keep_holding: bool = False, - holding: PathLike = Path(".holding"), + keep_staging: bool = False, + staging: PathLike = Path(".staging"), DAGContextClass: _DCMType = SingleProcDAGContextManager, ): self.scratch_root = Path(scratch_root) self.shared_root = shared_root self.permanent_root = permanent_root self.keep_scratch = keep_scratch - self.keep_holding = keep_holding - self.holding = holding + self.keep_staging = keep_staging + self.staging = staging self.DAGContextClass = DAGContextClass def get_scratch(self, unit_label: str) -> Path: @@ -133,22 +133,22 @@ def get_scratch(self, unit_label: str) -> Path: return scratch def get_permanent(self, unit_label) -> PermanentStaging: - """Get the object for this unit's permanent holding directory""" + """Get the object for this unit's permanent staging directory""" return PermanentStaging( scratch=self.scratch_root, external=self.permanent_root, shared=self.shared_root, prefix=unit_label, - holding=self.holding, + staging=self.staging, ) def get_shared(self, unit_label) -> SharedStaging: - """Get the object for this unit's shared holding directory""" + """Get the object for this unit's shared staging directory""" return SharedStaging( scratch=self.scratch_root, external=self.shared_root, prefix=unit_label, - holding=self.holding, + staging=self.staging, ) def running_dag(self, dag_label: str): diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index e9f4287f..69c3bea0 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -6,7 +6,7 @@ from gufe.storage.externalresource import MemoryStorage, FileStorage from gufe.storage.stagingdirectory import ( SharedStaging, PermanentStaging, _delete_empty_dirs, - _safe_to_delete_holding + _safe_to_delete_staging ) @pytest.fixture @@ -17,7 +17,7 @@ def root(tmp_path): scratch=tmp_path, external=external, prefix="new_unit", - delete_holding=False + delete_staging=False ) return root @@ -28,23 +28,23 @@ def root_with_contents(root): return root -def test_safe_to_delete_holding_ok(tmp_path): +def test_safe_to_delete_staging_ok(tmp_path): external = FileStorage(tmp_path / "foo") prefix = "bar" - holding = tmp_path / "foo" / "baz" - assert _safe_to_delete_holding(external, holding, prefix) + staging = tmp_path / "foo" / "baz" + assert _safe_to_delete_staging(external, staging, prefix) -def test_safe_to_delete_holding_danger(tmp_path): +def test_safe_to_delete_staging_danger(tmp_path): external = FileStorage(tmp_path / "foo") prefix = "bar" - holding = tmp_path / "foo" / "bar" / "baz" - assert not _safe_to_delete_holding(external, holding, prefix) + staging = tmp_path / "foo" / "bar" / "baz" + assert not _safe_to_delete_staging(external, staging, prefix) -def test_safe_to_delete_holding_not_filestorage(tmp_path): +def test_safe_to_delete_staging_not_filestorage(tmp_path): external = MemoryStorage() prefix = "bar" - holding = tmp_path / "bar" - assert _safe_to_delete_holding(external, holding, prefix) + staging = tmp_path / "bar" + assert _safe_to_delete_staging(external, staging, prefix) def test_delete_empty_dirs(tmp_path): base = tmp_path / "tmp" @@ -119,7 +119,7 @@ def test_read_old(self, root): # initial conditions, without touching StagingDirectory/StagingPath label = "old_unit/data.txt" - on_filesystem = root.scratch / root.holding / "old_unit/data.txt" + on_filesystem = root.scratch / root.staging / "old_unit/data.txt" assert not on_filesystem.exists() assert root.external.exists(label) @@ -136,7 +136,7 @@ def test_read_old(self, root): def test_write_new(self, root): label = "new_unit/somefile.txt" - on_filesystem = root.scratch / root.holding / "new_unit/somefile.txt" + on_filesystem = root.scratch / root.staging / "new_unit/somefile.txt" assert not on_filesystem.exists() with open(root / "somefile.txt", mode='wb') as f: f.write(b"testing") @@ -155,7 +155,7 @@ def test_transfer_to_external(self, root_with_contents): path = list(root_with_contents.registry)[0] # only 1 assert not root_with_contents.external.exists(path.label) - root_with_contents.transfer_holding_to_external() + root_with_contents.transfer_staging_to_external() assert root_with_contents.external.exists(path.label) with root_with_contents.external.load_stream(path.label) as f: diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index c37451ad..fb584be1 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -56,12 +56,12 @@ def storage_manager(self, tmp_path): @staticmethod def get_files_dict(storage_manager): root = storage_manager.scratch_root - holding = storage_manager.holding + staging = storage_manager.staging return { "foo": root / "scratch/dag/unit1/foo.txt", "foo2": root / "scratch/dag/unit2/foo2.txt", - "bar": root / holding / "dag/unit1/bar.txt", - "baz": root / holding / "dag/unit1/baz.txt", + "bar": root / staging / "dag/unit1/bar.txt", + "baz": root / staging / "dag/unit1/baz.txt", } def test_lifecycle(self, storage_manager, dag_units, tmp_path): @@ -185,7 +185,7 @@ def _after_dag_existing_files(self): return set() -class TestKeepScratchAndHoldingStorageManager(LifecycleHarness): +class TestKeepScratchAndStagingStorageManager(LifecycleHarness): @pytest.fixture def storage_manager(self, tmp_path): return StorageManager( @@ -193,7 +193,7 @@ def storage_manager(self, tmp_path): shared_root=MemoryStorage(), permanent_root=MemoryStorage(), keep_scratch=True, - keep_holding=True + keep_staging=True ) @staticmethod @@ -216,7 +216,7 @@ def _after_dag_existing_files(self): -class TestHoldingOverlapsSharedStorageManager(LifecycleHarness): +class TestStagingOverlapsSharedStorageManager(LifecycleHarness): @pytest.fixture def storage_manager(self, tmp_path): root = tmp_path / "working" @@ -224,7 +224,7 @@ def storage_manager(self, tmp_path): scratch_root=root, shared_root=FileStorage(root), permanent_root=MemoryStorage(), - holding="", + staging="", ) def _in_unit_existing_files(self, unit_label): @@ -257,7 +257,7 @@ def _in_staging_shared(self, unit_label, in_after): }[unit_label, in_after] -class TestHoldingOverlapsPermanentStorageManager(LifecycleHarness): +class TestStagingOverlapsPermanentStorageManager(LifecycleHarness): @pytest.fixture def storage_manager(self, tmp_path): root = tmp_path / "working" @@ -265,7 +265,7 @@ def storage_manager(self, tmp_path): scratch_root=root, permanent_root=FileStorage(root), shared_root=MemoryStorage(), - holding="", + staging="", ) def _in_unit_existing_files(self, unit_label): @@ -302,10 +302,10 @@ def test_get_scratch(self, storage_manager_std): def test_get_permanent(self, storage_manager_std): perm = storage_manager_std.get_permanent("dag_label/unit_label") - assert perm.__fspath__().endswith(".holding/dag_label/unit_label") + assert perm.__fspath__().endswith(".staging/dag_label/unit_label") assert isinstance(perm, StagingDirectory) def test_get_shared(self, storage_manager_std): shared = storage_manager_std.get_shared("dag_label/unit_label") - assert shared.__fspath__().endswith(".holding/dag_label/unit_label") + assert shared.__fspath__().endswith(".staging/dag_label/unit_label") assert isinstance(shared, StagingDirectory) From 08e3ac2cd7702a0c01ec08e33af563c72887a7ab Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 6 Jun 2023 19:29:12 -0500 Subject: [PATCH 23/44] finish docs (I think?) --- docs/guide/storage.rst | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/docs/guide/storage.rst b/docs/guide/storage.rst index 0153bf89..d740525e 100644 --- a/docs/guide/storage.rst +++ b/docs/guide/storage.rst @@ -42,8 +42,8 @@ protocol authors. In detail, this provides protocol authors with three of these objects actually point to special subdirectories of the scratch space for a specific unit, but are managed by context manangers at the executor level, which handle the process of moving objects from local -directories to the actual ``shared`` and ``permanent`` locations, which can -be external resources. +staging directories to the actual ``shared`` and ``permanent`` locations, +which can be external resources. External resource utilities @@ -78,9 +78,24 @@ executors. The helpers are: * :class:`.StorageManager`: This is the overall façade interface for interacting with the rest of the storage lifecycle tools. -* ``DAGContextManager``: -* :class:`.StagingDirectory`: -* :class:`.StagingPath`: +* :class:`.DAGContextManager`: This provides context managers at the DAG and + unit level to handle the transfer of storage. GUFE provides a + :class:`.SingleProcDAGContextManager` to handle the simple case that an + entire DAG is run within a single process. If individual units are run on + different remote resources, a more complicated :class:`.DAGContextManager` + would be needed. +* :class:`.StagingDirectory`: This represents the root directory for staging + the results of a given :class:`.ProtocolUnit`. This is an abstract + representation of a local directory. Paths within it register with it, and + it handles deletion of the temporary local files when not needed, as well + as the download of remote files when necessary for reading. There are two + important subclasses of this: :class:`.SharedStaging` for a ``shared`` + resource, and :class:`.PermanentStaging` for a ``permanent`` resource. +* :class:`.StagingPath`: This represents a file within the + :class:`.StagingDirectory`. It contains both the key (label) used in the + key-value store, as well as the actual local path to the file. On + creation, it registers itself with its :class:`.StagingDirectory`, which + handles managing it over its lifecycle. In practice, the executor uses the :class:`.StorageManager` to create a :class:`.DAGContextManager` at the level of a DAG, and then uses the From ca7871b00edd7fd7bf8b614437b4b9549c39c773 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Tue, 6 Jun 2023 19:39:20 -0500 Subject: [PATCH 24/44] remove completed TODO --- gufe/storage/stagingdirectory.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 02626365..f529b327 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -9,7 +9,6 @@ import logging _logger = logging.getLogger(__name__) -# TODO: holding -> staging def _safe_to_delete_staging(external: ExternalStorage, path: PathLike, prefix: Union[PathLike, str]) -> bool: From 2aa06169e6a629897828180edc1a32ebac251e05 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Fri, 9 Jun 2023 10:01:32 -0500 Subject: [PATCH 25/44] start to testing edge case logging right now, we seem to not be getting anything; see if this is because someone has added a handler on the root logger or because we can't caplog from a parent logger (may need specific __name__) --- gufe/tests/storage/test_stagingdirectory.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 69c3bea0..36ad1228 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -1,4 +1,6 @@ import pytest +from unittest import mock +import logging import os import pathlib @@ -161,7 +163,18 @@ def test_transfer_to_external(self, root_with_contents): with root_with_contents.external.load_stream(path.label) as f: assert f.read() == b"bar" - def test_transfer_to_external_no_file(self, root): + @mock.patch.object(SharedStaging, 'register_path') + def test_transfer_to_external_no_file(self, root, caplog): + nonfile = root / "does_not_exist.txt" + # ensure that we've set this up correctly + assert nonfile not in root.registry + caplog.set_level(logging.INFO, logger="gufe.storage") + root.transfer_single_file_to_external(nonfile) + assert len(caplog.records) == 1 + + + + ... def test_tranfer_to_external_directory(self, root): From 7c03dcd76e10b88f30443ea279028694705e2a0d Mon Sep 17 00:00:00 2001 From: Richard Gowers Date: Mon, 12 Jun 2023 08:51:50 +0100 Subject: [PATCH 26/44] Update stagingdirectory.py --- gufe/storage/stagingdirectory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index f529b327..1f55bfc9 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -14,7 +14,7 @@ def _safe_to_delete_staging(external: ExternalStorage, path: PathLike, prefix: Union[PathLike, str]) -> bool: """Check if deleting ``path`` could delete externally stored data. - If external storage is a FileStorage, then it will storage files for + If external storage is a FileStorage, then it will store files for this unit or dag in the directory ``external.root_dir / prefix``, where ``prefix`` is either the unit label or the dag label. If ``path`` is inside that directory, then deleting it may delete information from the From 383075e8288614c42a3eb7459ef6271bd8922339 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Mon, 12 Jun 2023 15:52:29 -0500 Subject: [PATCH 27/44] tests for single_file_transfer logging --- gufe/storage/stagingdirectory.py | 3 +++ gufe/tests/storage/test_stagingdirectory.py | 26 ++++++++++++++------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index f529b327..8dc86b47 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -177,6 +177,9 @@ def register_path(self, staging_path: StagingPath): the path to track """ label_exists = self.external.exists(staging_path.label) + fspath = Path(staging_path.__fspath__()) + if not fspath.parent.exists(): + fspath.parent.mkdir(parents=True, exist_ok=True) self.registry.add(staging_path) diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 36ad1228..2f6301f8 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -163,22 +163,30 @@ def test_transfer_to_external(self, root_with_contents): with root_with_contents.external.load_stream(path.label) as f: assert f.read() == b"bar" - @mock.patch.object(SharedStaging, 'register_path') def test_transfer_to_external_no_file(self, root, caplog): - nonfile = root / "does_not_exist.txt" + with mock.patch.object(root, 'register_path'): + nonfile = root / "does_not_exist.txt" # ensure that we've set this up correctly assert nonfile not in root.registry - caplog.set_level(logging.INFO, logger="gufe.storage") + logger_name = "gufe.storage.stagingdirectory" + caplog.set_level(logging.INFO, logger=logger_name) root.transfer_single_file_to_external(nonfile) assert len(caplog.records) == 1 + record = caplog.records[0] + assert "nonexistent" in record.msg + def test_transfer_to_external_directory(self, root, caplog): + directory = root / "directory" + with open(directory / "file.txt", mode='w') as f: + f.write("foo") - - - ... - - def test_tranfer_to_external_directory(self, root): - ... + logger_name = "gufe.storage.stagingdirectory" + caplog.set_level(logging.DEBUG, logger=logger_name) + root.transfer_single_file_to_external(directory) + assert len(caplog.records) == 1 + record = caplog.records[0] + assert "Found directory" in record.msg + assert "not transfering" in record.msg def test_existing_local_and_external(self, root): ... From 63653983e1cd723d7182087d725b91fc79909110 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Fri, 16 Jun 2023 14:57:03 -0500 Subject: [PATCH 28/44] tests for read-only transfers --- gufe/storage/stagingdirectory.py | 4 +- gufe/tests/storage/test_stagingdirectory.py | 81 +++++++++++++++++++-- 2 files changed, 78 insertions(+), 7 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 8dc86b47..e6cc34e0 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -210,7 +210,7 @@ def __fspath__(self): def __repr__(self): return ( - f"StagingDirectory({self.scratch}, {self.external}, " + f"{self.__class__.__name__}({self.scratch}, {self.external}, " f"{self.prefix})" ) @@ -286,7 +286,7 @@ def register_path(self, staging_path: StagingPath): if self.read_only and not label_exists: raise IOError(f"Unable to create '{staging_path.label}'. File " - "does not exist in external storage, and This " + "does not exist in external storage, and this " "staging path is read-only.") super().register_path(staging_path) diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 2f6301f8..c4281fd8 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -30,6 +30,26 @@ def root_with_contents(root): return root +@pytest.fixture +def read_only_with_overwritten(root_with_contents): + read_only = SharedStaging( + scratch=root_with_contents.scratch, + external=root_with_contents.external, + prefix="old_unit", + staging=root_with_contents.staging, + delete_staging=root_with_contents.delete_staging, + read_only=True + ) + filename = pathlib.Path(read_only) / "data.txt" + assert not filename.exists() + staged = read_only / "data.txt" + assert filename.exists() + with open(staged, mode='w') as f: + f.write("changed") + + return read_only, staged + + def test_safe_to_delete_staging_ok(tmp_path): external = FileStorage(tmp_path / "foo") prefix = "bar" @@ -188,11 +208,62 @@ def test_transfer_to_external_directory(self, root, caplog): assert "Found directory" in record.msg assert "not transfering" in record.msg - def test_existing_local_and_external(self, root): - ... + def test_single_file_transfer_read_only(self, + read_only_with_overwritten, + caplog): + read_only, staged = read_only_with_overwritten + with read_only.external.load_stream("old_unit/data.txt") as f: + old_contents = f.read() - def test_existing_local_and_external_conflict(self, root): - ... + assert old_contents == b"foo" + logger_name = "gufe.storage.stagingdirectory" + caplog.set_level(logging.DEBUG, logger=logger_name) + read_only.transfer_single_file_to_external(staged) + assert len(caplog.records) == 1 + record = caplog.records[0] + assert "Read-only:" in record.msg + with read_only.external.load_stream("old_unit/data.txt") as f: + new_contents = f.read() + assert old_contents == new_contents + + def test_transfer_read_only(self, read_only_with_overwritten, caplog): + read_only, staged = read_only_with_overwritten + with read_only.external.load_stream("old_unit/data.txt") as f: + old_contents = f.read() - def test_no_transfer_for_read_only(self, root): + assert old_contents == b"foo" + logger_name = "gufe.storage.stagingdirectory" + caplog.set_level(logging.DEBUG, logger=logger_name) + read_only.transfer_staging_to_external() + assert len(caplog.records) == 1 + record = caplog.records[0] + assert "Read-only:" in record.msg + with read_only.external.load_stream("old_unit/data.txt") as f: + new_contents = f.read() + assert old_contents == new_contents + + def test_cleanup(self, root_with_contents): + path = pathlib.Path(root_with_contents.__fspath__()) / "data.txt" + assert path.exists() + root_with_contents.cleanup() + assert not path.exists() + + def test_register_cleanup_preexisting_file(self, root): + filename = pathlib.Path(root.__fspath__()) / "foo.txt" + filename.touch() + root.external.store_bytes("new_unit/foo.txt", b"") + assert len(root.registry) == 0 + assert len(root.preexisting) == 0 + staging = root / "foo.txt" + assert staging.label == "new_unit/foo.txt" + assert len(root.registry) == 1 + assert len(root.preexisting) == 1 + + assert filename.exists() + root.cleanup() + assert filename.exists() + + +class TestPermanentStage: + def test_delete_staging_safe(self): ... From d35bd60150cb81a27ff3781ebdbeeab75cbde01e Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Fri, 16 Jun 2023 15:21:26 -0500 Subject: [PATCH 29/44] fix repr and cleanup tests --- gufe/tests/storage/test_stagingdirectory.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index c4281fd8..3a7e3ef4 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -118,7 +118,7 @@ def test_delete_empty_dirs_delete_root(tmp_path, delete_root): class TestSharedStaging: def test_repr(self, root): r = repr(root) - assert r.startswith("StagingDirectory") + assert r.startswith("SharedStaging") assert "MemoryStorage" in r assert r.endswith(", new_unit)") @@ -243,6 +243,7 @@ def test_transfer_read_only(self, read_only_with_overwritten, caplog): assert old_contents == new_contents def test_cleanup(self, root_with_contents): + root_with_contents.delete_staging = True # slightly naughty path = pathlib.Path(root_with_contents.__fspath__()) / "data.txt" assert path.exists() root_with_contents.cleanup() From 7cc10f9b692503e6eeff24769c43d498bd2ee284 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Fri, 16 Jun 2023 16:26:55 -0500 Subject: [PATCH 30/44] test for permanent transfer to external --- gufe/tests/storage/test_stagingdirectory.py | 25 ++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 3a7e3ef4..2dfc1ffd 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -49,6 +49,18 @@ def read_only_with_overwritten(root_with_contents): return read_only, staged +@pytest.fixture +def permanent(tmp_path): + shared = MemoryStorage() + shared.store_bytes("final/old_unit/data.txt", b"foo") + perm = PermanentStaging( + scratch=tmp_path, + external=MemoryStorage(), + shared=shared, + prefix="final", + delete_staging=True + ) + return perm def test_safe_to_delete_staging_ok(tmp_path): external = FileStorage(tmp_path / "foo") @@ -266,5 +278,16 @@ def test_register_cleanup_preexisting_file(self, root): class TestPermanentStage: - def test_delete_staging_safe(self): + def test_delete_staging_safe(self, permanent): + ... + + def test_load_missing_for_transfer(self, permanent): + fname = pathlib.Path(permanent) / "old_unit/data.txt" + assert not fname.exists() + staging = permanent / "old_unit/data.txt" + assert not fname.exists() + assert permanent.external._data == {} + permanent.transfer_staging_to_external() + assert fname.exists() + assert permanent.external._data == {"final/old_unit/data.txt": b"foo"} ... From ab025f13e29064065f31dc38bf7675adbc35b7b5 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Sat, 17 Jun 2023 11:44:07 -0500 Subject: [PATCH 31/44] test for Permanent delete staging --- gufe/tests/storage/test_stagingdirectory.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 2dfc1ffd..98e4befd 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -125,8 +125,6 @@ def test_delete_empty_dirs_delete_root(tmp_path, delete_root): assert base.exists() is not delete_root - - class TestSharedStaging: def test_repr(self, root): r = repr(root) @@ -278,8 +276,18 @@ def test_register_cleanup_preexisting_file(self, root): class TestPermanentStage: - def test_delete_staging_safe(self, permanent): - ... + @pytest.mark.parametrize('is_safe', [True, False]) + def test_delete_staging_safe(self, tmp_path, is_safe): + staging = ".staging" if is_safe else "" + permanent = PermanentStaging( + scratch=tmp_path, + external=MemoryStorage(), + shared=FileStorage(tmp_path), + prefix="final", + staging=staging, + delete_staging=True + ) + assert permanent._delete_staging_safe() is is_safe def test_load_missing_for_transfer(self, permanent): fname = pathlib.Path(permanent) / "old_unit/data.txt" @@ -290,4 +298,3 @@ def test_load_missing_for_transfer(self, permanent): permanent.transfer_staging_to_external() assert fname.exists() assert permanent.external._data == {"final/old_unit/data.txt": b"foo"} - ... From cd70ab22ae9714c5f34fbc1b211e8b1d6b906523 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Sat, 17 Jun 2023 12:29:47 -0500 Subject: [PATCH 32/44] Add test for missing file on cleanup --- gufe/storage/stagingdirectory.py | 5 +++-- gufe/tests/storage/test_stagingdirectory.py | 12 ++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index e6cc34e0..fbe43cd0 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -153,8 +153,9 @@ def cleanup(self): _logger.debug(f"Removing file {file}") remove(file) else: - _logger.warning("Request to remove missing file " - f"{file}") + _logger.warning("During staging cleanup, file " + f"{file} was marked for deletion, but " + "can not be found on disk.") _delete_empty_dirs(self.staging_dir) def register_path(self, staging_path: StagingPath): diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 98e4befd..0c1879f5 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -259,6 +259,18 @@ def test_cleanup(self, root_with_contents): root_with_contents.cleanup() assert not path.exists() + def test_cleanup_missing(self, root, caplog): + root.delete_staging = True + file = root / "foo.txt" + assert file in root.registry + assert not pathlib.Path(file).exists() + logger_name = "gufe.storage.stagingdirectory" + caplog.set_level(logging.WARNING, logger=logger_name) + root.cleanup() + assert len(caplog.records) == 1 + record = caplog.records[0] + assert "can not be found on disk" in record.msg + def test_register_cleanup_preexisting_file(self, root): filename = pathlib.Path(root.__fspath__()) / "foo.txt" filename.touch() From 90f2597d1af1cc4e5b2b8c70a57618d1feb889fb Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Thu, 22 Jun 2023 11:56:53 -0500 Subject: [PATCH 33/44] get_other_shared to private --- gufe/storage/stagingdirectory.py | 4 ++-- gufe/tests/storage/test_stagingdirectory.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index e9fd86f5..25ac49f1 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -240,7 +240,7 @@ def __init__( delete_staging=delete_staging) self.read_only = read_only - def get_other_shared(self, prefix: str, + def _get_other_shared(self, prefix: str, delete_staging: Optional[bool] = None): """Get a related unit's staging directory. """ @@ -264,7 +264,7 @@ def other_shared(self, prefix: str, This is usually the recommended way to get a previous unit's shared data. """ - other = self.get_other_shared(prefix, delete_staging) + other = self._get_other_shared(prefix, delete_staging) yield other other.cleanup() diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index 0c1879f5..bed95378 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -157,7 +157,7 @@ def test_read_old(self, root): # when we create the specific StagingPath, it registers and # "downloads" the file - old_staging = root.get_other_shared("old_unit") + old_staging = root._get_other_shared("old_unit") filepath = old_staging / "data.txt" assert pathlib.Path(filepath) == on_filesystem assert on_filesystem.exists() @@ -179,7 +179,7 @@ def test_write_new(self, root): assert not root.external.exists(label) def test_write_old_fail(self, root): - old_staging = root.get_other_shared("old_unit") + old_staging = root._get_other_shared("old_unit") with pytest.raises(IOError, match="read-only"): old_staging / "foo.txt" From dd1b6dc19819c2822acd8d159dc36d6431b7c84d Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Fri, 1 Dec 2023 10:41:17 -0600 Subject: [PATCH 34/44] updates from other branch --- gufe/storage/stagingdirectory.py | 120 ++++++++---- gufe/storage/storagemanager.py | 206 ++++++++------------ gufe/tests/storage/test_stagingdirectory.py | 18 +- gufe/tests/storage/test_storagemanager.py | 70 +++---- gufe/utils.py | 28 +++ 5 files changed, 230 insertions(+), 212 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 25ac49f1..356a5d34 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -6,6 +6,8 @@ from .externalresource import ExternalStorage, FileStorage from contextlib import contextmanager +from gufe.utils import delete_empty_dirs + import logging _logger = logging.getLogger(__name__) @@ -36,27 +38,6 @@ def _safe_to_delete_staging(external: ExternalStorage, path: PathLike, return False -def _delete_empty_dirs(root: PathLike, delete_root: bool = True): - """Delete all empty directories. - - Repeats so that directories that only contained empty directories also - get deleted. - """ - root = Path(root) - - def find_empty_dirs(directory): - if not (paths := list(directory.iterdir())): - return [directory] - directories = [p for p in paths if p.is_dir()] - return sum([find_empty_dirs(d) for d in directories], []) - - while root.exists() and (empties := find_empty_dirs(root)): - if empties == [root] and not delete_root: - return - for directory in empties: - _logger.debug(f"Removing '{directory}'") - rmdir(directory) - class StagingDirectory: """PathLike local representation of an :class:`.ExternalStorage`. @@ -74,7 +55,7 @@ class StagingDirectory: 2. When requested, it transfers any newly created files to the :class:`.ExternalStorage`. - 3. It can delete all of the files it manages + 3. It can delete all of the files it manages. Parameters ---------- @@ -104,11 +85,13 @@ def __init__( *, staging: PathLike = Path(".staging"), delete_staging: bool = True, + delete_empty_dirs: bool = True, ): self.external = external self.scratch = Path(scratch) self.prefix = Path(prefix) self.delete_staging = delete_staging + self.delete_empty_dirs = delete_empty_dirs self.staging = staging self.registry : set[StagingPath] = set() @@ -128,7 +111,7 @@ def _delete_staging_safe(self): def transfer_single_file_to_external(self, held_file: StagingPath): """Transfer a given file from staging into external storage """ - path = Path(held_file) + path = Path(held_file.fspath) if not path.exists(): _logger.info(f"Found nonexistent path {path}, not " "transfering to external storage") @@ -138,25 +121,39 @@ def transfer_single_file_to_external(self, held_file: StagingPath): else: _logger.info(f"Transfering {path} to external storage") self.external.store_path(held_file.label, path) + return held_file + + return None # no transfer + def transfer_staging_to_external(self): - """Transfer all objects in the registry to external storage""" - for obj in self.registry: - self.transfer_single_file_to_external(obj) + """Transfer all objects in the registry to external storage + + """ + return [ + transferred + for file in self.registry + if (transferred := self.transfer_single_file_to_external(file)) + ] def cleanup(self): """Perform end-of-lifecycle cleanup. """ if self.delete_staging and self._delete_staging_safe(): for file in self.registry - self.preexisting: - if Path(file).exists(): + path = Path(file.fspath) + if path.exists(): _logger.debug(f"Removing file {file}") - remove(file) + # TODO: handle special case of directory? + path.unlink() + self.registry.remove(file) else: _logger.warning("During staging cleanup, file " f"{file} was marked for deletion, but " "can not be found on disk.") - _delete_empty_dirs(self.staging_dir) + + if self.delete_empty_dirs: + delete_empty_dirs(self.staging_dir) def register_path(self, staging_path: StagingPath): """ @@ -178,7 +175,11 @@ def register_path(self, staging_path: StagingPath): the path to track """ label_exists = self.external.exists(staging_path.label) - fspath = Path(staging_path.__fspath__()) + fspath = Path(staging_path.fspath) + + # TODO: what if the staging path is a directory? not sure that we + # have a way to know that; but not sure that adding it to the + # registry is right either if not fspath.parent.exists(): fspath.parent.mkdir(parents=True, exist_ok=True) @@ -192,13 +193,16 @@ def register_path(self, staging_path: StagingPath): def _load_file_from_external(self, external: ExternalStorage, staging_path: StagingPath): + # import pdb; pdb.set_trace() scratch_path = self.staging_dir / staging_path.path # TODO: switch this to using `get_filename` and `store_path` - with external.load_stream(staging_path.label) as f: - external_bytes = f.read() if scratch_path.exists(): self.preexisting.add(staging_path) - ... # TODO: something to check that the bytes are the same? + + with external.load_stream(staging_path.label) as f: + external_bytes = f.read() + ... # TODO: check that the bytes are the same if preexisting? + scratch_path.parent.mkdir(exist_ok=True, parents=True) with open(scratch_path, mode='wb') as f: f.write(external_bytes) @@ -211,7 +215,7 @@ def __fspath__(self): def __repr__(self): return ( - f"{self.__class__.__name__}({self.scratch}, {self.external}, " + f"{self.__class__.__name__}('{self.scratch}', {self.external}, " f"{self.prefix})" ) @@ -234,10 +238,12 @@ def __init__( *, staging: PathLike = Path(".staging"), delete_staging: bool = True, + delete_empty_dirs: bool = True, read_only: bool = False, ): super().__init__(scratch, external, prefix, staging=staging, - delete_staging=delete_staging) + delete_staging=delete_staging, + delete_empty_dirs=delete_empty_dirs) self.read_only = read_only def _get_other_shared(self, prefix: str, @@ -273,14 +279,14 @@ def transfer_single_file_to_external(self, held_file: StagingPath): _logger.debug("Read-only: Not transfering to external storage") return # early exit - super().transfer_single_file_to_external(held_file) + return super().transfer_single_file_to_external(held_file) def transfer_staging_to_external(self): if self.read_only: _logger.debug("Read-only: Not transfering to external storage") return # early exit - super().transfer_staging_to_external() + return super().transfer_staging_to_external() def register_path(self, staging_path: StagingPath): label_exists = self.external.exists(staging_path.label) @@ -308,9 +314,11 @@ def __init__( *, staging: PathLike = Path(".staging"), delete_staging: bool = True, + delete_empty_dirs: bool = True, ): super().__init__(scratch, external, prefix, staging=staging, - delete_staging=delete_staging) + delete_staging=delete_staging, + delete_empty_dirs=delete_empty_dirs) self.shared = shared def _delete_staging_safe(self): @@ -323,7 +331,7 @@ def _delete_staging_safe(self): def transfer_single_file_to_external(self, held_file: StagingPath): # if we can't find it locally, we load it from shared storage - path = Path(held_file) + path = Path(held_file.fspath) if not path.exists(): self._load_file_from_external(self.shared, held_file) @@ -336,28 +344,58 @@ class StagingPath: On creation, this registers with a :class:`.StagingDirectory` that will manage the local path and transferring data with its :class:`.ExternalStorage`. + + This object can always be used as a FileLike (using, e.g., the standard + ``open`` builtin). This requires that a staged path that exists on an + external resource be downloaded into a local file when it is referenced. + + For a representation of a file that does not require the download (for + example, when deserializing results that point to files) instead use + :class:`.ExternalFile`. """ def __init__(self, root: StagingDirectory, path: Union[PathLike, str]): self.root = root self.path = Path(path) + + def register(self): + """Register this path with its StagingDirectory. + + If a file associated with this path exists in an external storage, + it will be downloaded to the staging area as part of registration. + """ self.root.register_path(self) def __truediv__(self, path: Union[PathLike, str]): return StagingPath(self.root, self.path / path) - def __fspath__(self): + def __eq__(self, other): + return (isinstance(other, StagingPath) + and self.root == other.root + and self.path == other.path) + + def __hash__(self): + return hash((self.root, self.path)) + + @property + def fspath(self): return str(self.root.staging_dir / self.path) + def __fspath__(self): + self.register() + return self.fspath + @property def label(self) -> str: """Label used in :class:`.ExternalStorage` for this path""" return str(self.root.prefix / self.path) def __repr__(self): - return f"StagingPath({self.__fspath__()})" + return f"StagingPath('{self.fspath}')" # TODO: how much of the pathlib.Path interface do we want to wrap? # although edge cases may be a pain, we can get most of it with, e.g.: # def exists(self): return Path(self).exists() # but also, can do pathlib.Path(staging_path) and get hte whole thing + + diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index dbba86ea..9061a874 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -4,108 +4,15 @@ from contextlib import contextmanager import shutil +from gufe.utils import delete_empty_dirs + from typing import Type from .externalresource import ExternalStorage, FileStorage from .stagingdirectory import SharedStaging, PermanentStaging -class DAGContextManager: - @classmethod - @contextmanager - def running_dag(cls, storage_manager: StorageManager, dag_label: str): - """Return a context manager for when a DAG is started. - - This context manager handles the DAG scale of the lifecycle. - """ - raise NotImplementedError() - - @contextmanager - def running_unit(self, unit_label: str): - """Return a context manager for when unit is started. - - This context manager handles the unit scale of the lifecycle. - """ - raise NotImplementedError() - -_DCMType = Type[DAGContextManager] # to shorten some lines - -class SingleProcDAGContextManager(DAGContextManager): - """Context manager to handle details of storage lifecycle. - - Making this a separate class ensures that ``running_unit`` is always - called within the context of a given DAG. This is usually not created - directly; instead, it is created (and used) with its ``running_dag`` - classmethod, typically from within a ``StorageManager``. - """ - def __init__(self, storage_manager: StorageManager, dag_label: str): - self.manager = storage_manager - self.dag_label = dag_label - self.permanents: list[PermanentStaging] = [] - - @classmethod # NB: classmethod must be on top - @contextmanager - def running_dag(cls, storage_manager: StorageManager, dag_label: str): - """DAG level of the storage lifecycle - - When the DAG is completed, transfer everything to the permanent - storage, and delete the staging area for permanent (if we are - supposed to). - - This is not usually called by users; instead it is called from - within the ``StorageManager``. - """ - dag_manager = cls(storage_manager, dag_label) - try: - yield dag_manager - finally: - for permanent in dag_manager.permanents: - permanent.transfer_staging_to_external() - - if not dag_manager.manager.keep_staging: - for d in dag_manager.permanents: - # import pdb; pdb.set_trace() - d.cleanup() - - @contextmanager - def running_unit(self, unit_label: str): - """Unit level of the storage lifecycle. - - This provides the staging directories used for scratch, shared, and - permanent. At the end of the unit, it transfers anything from shared - to the real shared external storage, cleans up the scratch - directory and the shared staging directory. - - Note that the unit label here is the *entire* label; that is, it - would also include information identifying the DAG. - """ - scratch = self.manager.get_scratch(unit_label) - shared = self.manager.get_shared(unit_label) - permanent = self.manager.get_permanent(unit_label) - try: - yield scratch, shared, permanent - finally: - # TODO: should some of this be in an else clause instead? - self.permanents.append(permanent) - shared.transfer_staging_to_external() - # everything in permanent must also be available in shared - for file in permanent.registry: - shared.transfer_single_file_to_external(file) - - if not self.manager.keep_scratch: - shutil.rmtree(scratch) - - if not self.manager.keep_staging: - shared.cleanup() - - class StorageManager: - """Tool to manage the storage lifecycle during a DAG. - - This object primarily contains the logic for getting the staging - directories. A separate class, in the ``DAGContextClass`` variable, - handles the logic for the context managers. - """ def __init__( self, scratch_root: PathLike, @@ -114,56 +21,107 @@ def __init__( *, keep_scratch: bool = False, keep_staging: bool = False, + keep_shared: bool = False, staging: PathLike = Path(".staging"), - DAGContextClass: _DCMType = SingleProcDAGContextManager, + delete_empty_dirs: bool = True, ): self.scratch_root = Path(scratch_root) self.shared_root = shared_root self.permanent_root = permanent_root self.keep_scratch = keep_scratch self.keep_staging = keep_staging + self.keep_shared = keep_shared self.staging = staging - self.DAGContextClass = DAGContextClass + self.delete_empty_dirs = delete_empty_dirs - def get_scratch(self, unit_label: str) -> Path: - """Get the path for this unit's scratch directory""" + # these are used to track what files can be deleted from shared if + # keep_shared is False + self.shared_xfer = set() + self.permanent_xfer = set() - scratch = self.scratch_root / "scratch" / unit_label - scratch.mkdir(parents=True, exist_ok=True) - return scratch - - def get_permanent(self, unit_label) -> PermanentStaging: - """Get the object for this unit's permanent staging directory""" - return PermanentStaging( + self.permanent_staging = PermanentStaging( scratch=self.scratch_root, external=self.permanent_root, shared=self.shared_root, - prefix=unit_label, staging=self.staging, + delete_empty_dirs=delete_empty_dirs, + prefix="" ) - def get_shared(self, unit_label) -> SharedStaging: - """Get the object for this unit's shared staging directory""" - return SharedStaging( + self.shared_staging = SharedStaging( scratch=self.scratch_root, external=self.shared_root, - prefix=unit_label, staging=self.staging, + delete_empty_dirs=delete_empty_dirs, + prefix="" # TODO: remove prefix ) - def running_dag(self, dag_label: str): - """Return a context manager that handles storage. + def make_label(self, dag_label, unit_label, attempt, **kwargs): + """ + + The specific executor may change this by making a very simple + adapter subclass and overriding this method, which can take + arbitrary additional kwargs that may tie it to a specific executor. + """ + return f"{dag_label}/{unit_label}_attempt_{attempt}" - For simple use cases, this is the only method a user needs to call. - Usage is something like: + @property + def _scratch_base(self): + return self.scratch_root / "scratch" - .. code:: + def _scratch_loc(self, dag_label, unit_label, attempt, **kwargs): + label = self.make_label(dag_label, unit_label, attempt) + return self._scratch_base / label - with manager.running_dag(dag_label) as dag_ctx: - for unit in dag_ordered_units: - label = f"{dag_ctx.dag_label}/{unit.key}" - with dag_ctx.running_unit(label) as dirs: - scratch, shared, permanent = dirs - # run the unit - """ - return self.DAGContextClass.running_dag(self, dag_label) + @contextmanager + def running_dag(self, dag_label): + # TODO: remove (or use) dag_label + try: + yield self + finally: + # import pdb; pdb.set_trace() + # clean up after DAG completes + self.permanent_staging.transfer_staging_to_external() + + if not self.keep_staging: + self.permanent_staging.cleanup() + + if not self.keep_shared: + for file in self.shared_xfer: + self.shared_root.delete(file.label) + + for file in self.permanent_xfer: + if self.shared_root != self.permanent_root: + self.shared_root.delete(file.label) + + if self.delete_empty_dirs: + delete_empty_dirs(self._scratch_base, delete_root=False) + + @contextmanager + def running_unit(self, dag_label, unit_label, **kwargs): + scratch = self._scratch_loc(dag_label, unit_label, **kwargs) + label = self.make_label(dag_label, unit_label, **kwargs) + scratch.mkdir(parents=True, exist_ok=True) + shared = self.shared_staging / label + permanent = self.permanent_staging / label + try: + yield scratch, shared, permanent + finally: + # import pdb; pdb.set_trace() + # clean up after unit + + # track the files that were in shared so that we can delete them + # at the end of the DAG if requires + shared_xfers = self.shared_staging.transfer_staging_to_external() + self.shared_xfer.update(set(shared_xfers)) + + # everything in permanent should also be in shared + for file in self.permanent_staging.registry: + self.shared_staging.transfer_single_file_to_external(file) + self.permanent_xfer.add(file) + + if not self.keep_scratch: + shutil.rmtree(scratch) + + if not self.keep_staging: + self.shared_staging.cleanup() diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index bed95378..f94a1888 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -7,8 +7,8 @@ from gufe.storage.externalresource import MemoryStorage, FileStorage from gufe.storage.stagingdirectory import ( - SharedStaging, PermanentStaging, _delete_empty_dirs, - _safe_to_delete_staging + SharedStaging, PermanentStaging, _safe_to_delete_staging, + delete_empty_dirs, # TODO: move to appropriate place ) @pytest.fixture @@ -43,6 +43,8 @@ def read_only_with_overwritten(root_with_contents): filename = pathlib.Path(read_only) / "data.txt" assert not filename.exists() staged = read_only / "data.txt" + assert not filename.exists() + staged.__fspath__() assert filename.exists() with open(staged, mode='w') as f: f.write("changed") @@ -97,7 +99,7 @@ def test_delete_empty_dirs(tmp_path): path.parent.mkdir(parents=True, exist_ok=True) path.touch() - _delete_empty_dirs(base) + delete_empty_dirs(base) for path in paths: assert path.exists() @@ -116,7 +118,7 @@ def test_delete_empty_dirs_delete_root(tmp_path, delete_root): for directory in dirs: directory.mkdir(parents=True, exist_ok=True) - _delete_empty_dirs(base, delete_root=delete_root) + delete_empty_dirs(base, delete_root=delete_root) for directory in dirs: assert not directory.exists() @@ -180,8 +182,9 @@ def test_write_new(self, root): def test_write_old_fail(self, root): old_staging = root._get_other_shared("old_unit") + staged = old_staging / "foo.txt" with pytest.raises(IOError, match="read-only"): - old_staging / "foo.txt" + staged.__fspath__() def test_transfer_to_external(self, root_with_contents): path = list(root_with_contents.registry)[0] # only 1 @@ -262,6 +265,7 @@ def test_cleanup(self, root_with_contents): def test_cleanup_missing(self, root, caplog): root.delete_staging = True file = root / "foo.txt" + file.__fspath__() assert file in root.registry assert not pathlib.Path(file).exists() logger_name = "gufe.storage.stagingdirectory" @@ -279,6 +283,9 @@ def test_register_cleanup_preexisting_file(self, root): assert len(root.preexisting) == 0 staging = root / "foo.txt" assert staging.label == "new_unit/foo.txt" + assert len(root.registry) == 0 + assert len(root.preexisting) == 0 + staging.__fspath__() assert len(root.registry) == 1 assert len(root.preexisting) == 1 @@ -305,6 +312,7 @@ def test_load_missing_for_transfer(self, permanent): fname = pathlib.Path(permanent) / "old_unit/data.txt" assert not fname.exists() staging = permanent / "old_unit/data.txt" + staging.__fspath__() assert not fname.exists() assert permanent.external._data == {} permanent.transfer_staging_to_external() diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index fb584be1..80af74c1 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -1,7 +1,5 @@ import pytest -from gufe.storage.storagemanager import ( - StorageManager -) +from gufe.storage.storagemanager import StorageManager from gufe.storage.stagingdirectory import StagingDirectory from gufe.storage.externalresource import MemoryStorage, FileStorage from pathlib import Path @@ -35,7 +33,7 @@ def run(self, scratch, shared, permanent): (scratch / "foo2.txt").touch() # TODO: this will change; the inputs should include a way to get # the previous shared unit label - with shared.other_shared("dag/unit1") as prev_shared: + with shared.root.other_shared("dag/unit1_attempt_0") as prev_shared: with open(prev_shared / "bar.txt", mode='r') as f: bar = f.read() @@ -48,6 +46,7 @@ def run(self, scratch, shared, permanent): return [Unit1(), Unit2()] + class LifecycleHarness: @pytest.fixture def storage_manager(self, tmp_path): @@ -58,10 +57,10 @@ def get_files_dict(storage_manager): root = storage_manager.scratch_root staging = storage_manager.staging return { - "foo": root / "scratch/dag/unit1/foo.txt", - "foo2": root / "scratch/dag/unit2/foo2.txt", - "bar": root / staging / "dag/unit1/bar.txt", - "baz": root / staging / "dag/unit1/baz.txt", + "foo": root / "scratch/dag/unit1_attempt_0/foo.txt", + "foo2": root / "scratch/dag/unit2_attempt_0/foo2.txt", + "bar": root / staging / "dag/unit1_attempt_0/bar.txt", + "baz": root / staging / "dag/unit1_attempt_0/baz.txt", } def test_lifecycle(self, storage_manager, dag_units, tmp_path): @@ -69,9 +68,12 @@ def test_lifecycle(self, storage_manager, dag_units, tmp_path): dag_label = "dag" with storage_manager.running_dag(dag_label) as dag_ctx: for unit in dag_units: - label = f"{dag_ctx.dag_label}/{unit.key}" - with dag_ctx.running_unit(label) as (scratch, shared, perm): + label = f"{dag_label}/{unit.key}" + with dag_ctx.running_unit(dag_label, unit.key, attempt=0) as ( + scratch, shared, perm + ): results.append(unit.run(scratch, shared, perm)) + # import pdb; pdb.set_trace() self.in_unit_asserts(storage_manager, label) self.after_unit_asserts(storage_manager, label) self.after_dag_asserts(storage_manager) @@ -117,7 +119,8 @@ def in_unit_asserts(self, storage_manager, unit_label): permanent_root = storage_manager.permanent_root expected_in_shared = { "dag/unit1": set(), - "dag/unit2": {"dag/unit1/bar.txt", "dag/unit1/baz.txt"} + "dag/unit2": {"dag/unit1_attempt_0/bar.txt", + "dag/unit1_attempt_0/baz.txt"} }[unit_label] | self._in_staging_shared(unit_label, "in") assert set(shared_root.iter_contents()) == expected_in_shared @@ -134,7 +137,8 @@ def after_unit_asserts(self, storage_manager, unit_label): permanent_root = storage_manager.permanent_root shared_extras = self._in_staging_shared(unit_label, "after") permanent_extras = self._in_staging_permanent(unit_label, "after") - expected_in_shared = {"dag/unit1/bar.txt", "dag/unit1/baz.txt"} + expected_in_shared = {"dag/unit1_attempt_0/bar.txt", + "dag/unit1_attempt_0/baz.txt"} expected_in_shared |= shared_extras assert set(shared_root.iter_contents()) == expected_in_shared assert set(permanent_root.iter_contents()) == permanent_extras @@ -155,7 +159,8 @@ def after_dag_asserts(self, storage_manager): # expected_in_shared = {"dag/unit1/bar.txt", "dag/unit1/baz.txt"} # expected_in_shared |= shared_extras # assert set(shared_root.iter_contents()) == expected_in_shared - expected_in_permanent = {"dag/unit1/baz.txt"} | permanent_extras + expected_in_permanent = ({"dag/unit1_attempt_0/baz.txt"} + | permanent_extras) assert set(permanent_root.iter_contents()) == expected_in_permanent # manager-specific check for files @@ -239,16 +244,14 @@ def _after_unit_existing_files(self, unit_label): return {"bar", "baz"} def _after_dag_existing_files(self): - # NOTE: currently we don't delete bar at the end of a cycle, but we - # don't guarantee that we would not. So it exists, but changing that - # isn't API-breaking. - return {"bar", "baz"} + # these get deleted because we don't keep shared here + return set() def _in_staging_shared(self, unit_label, in_after): - bar = "dag/unit1/bar.txt" - baz = "dag/unit1/baz.txt" - foo = "scratch/dag/unit1/foo.txt" - foo2 = "scratch/dag/unit2/foo2.txt" + bar = "dag/unit1_attempt_0/bar.txt" + baz = "dag/unit1_attempt_0/baz.txt" + foo = "scratch/dag/unit1_attempt_0/foo.txt" + foo2 = "scratch/dag/unit2_attempt_0/foo2.txt" return { ("dag/unit1", "in"): {bar, baz, foo}, ("dag/unit1", "after"): {bar, baz}, @@ -278,10 +281,10 @@ def _after_dag_existing_files(self): return {"baz"} def _in_staging_permanent(self, unit_label, in_after): - bar = "dag/unit1/bar.txt" - baz = "dag/unit1/baz.txt" - foo = "scratch/dag/unit1/foo.txt" - foo2 = "scratch/dag/unit2/foo2.txt" + bar = "dag/unit1_attempt_0/bar.txt" + baz = "dag/unit1_attempt_0/baz.txt" + foo = "scratch/dag/unit1_attempt_0/foo.txt" + foo2 = "scratch/dag/unit2_attempt_0/foo2.txt" return { ("dag/unit1", "in"): {bar, baz, foo}, ("dag/unit1", "after"): {baz}, @@ -292,20 +295,3 @@ def _in_staging_permanent(self, unit_label, in_after): def _after_unit_existing_files(self, unit_label): # same for both; all files come from unit 1 return {"baz"} - - -class TestStorageManager: - def test_get_scratch(self, storage_manager_std): - scratch = storage_manager_std.get_scratch("dag_label/unit_label") - assert str(scratch).endswith("scratch/dag_label/unit_label") - assert isinstance(scratch, Path) - - def test_get_permanent(self, storage_manager_std): - perm = storage_manager_std.get_permanent("dag_label/unit_label") - assert perm.__fspath__().endswith(".staging/dag_label/unit_label") - assert isinstance(perm, StagingDirectory) - - def test_get_shared(self, storage_manager_std): - shared = storage_manager_std.get_shared("dag_label/unit_label") - assert shared.__fspath__().endswith(".staging/dag_label/unit_label") - assert isinstance(shared, StagingDirectory) diff --git a/gufe/utils.py b/gufe/utils.py index f9d3b0ff..4382a801 100644 --- a/gufe/utils.py +++ b/gufe/utils.py @@ -4,6 +4,12 @@ import io import warnings +from os import PathLike, rmdir +import pathlib + +import logging +_logger = logging.getLogger(__name__) + class ensure_filelike: """Context manager to convert pathlike or filelike to filelike. @@ -52,3 +58,25 @@ def __exit__(self, type, value, traceback): if self.do_close: self.context.close() + + +def delete_empty_dirs(root: PathLike, delete_root: bool = True): + """Delete all empty directories. + + Repeats so that directories that only contained empty directories also + get deleted. + """ + root = pathlib.Path(root) + + def find_empty_dirs(directory): + if not (paths := list(directory.iterdir())): + return [directory] + directories = [p for p in paths if p.is_dir()] + return sum([find_empty_dirs(d) for d in directories], []) + + while root.exists() and (empties := find_empty_dirs(root)): + if empties == [root] and not delete_root: + return + for directory in empties: + _logger.debug(f"Removing '{directory}'") + rmdir(directory) From a575dd3db002b65cc0744534accf1e7d5b722af0 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Fri, 1 Dec 2023 11:08:28 -0600 Subject: [PATCH 35/44] make mypy happy --- gufe/storage/storagemanager.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 9061a874..61665b7f 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -10,6 +10,7 @@ from .externalresource import ExternalStorage, FileStorage from .stagingdirectory import SharedStaging, PermanentStaging +from .stagingdirectory import StagingPath # typing class StorageManager: @@ -36,8 +37,8 @@ def __init__( # these are used to track what files can be deleted from shared if # keep_shared is False - self.shared_xfer = set() - self.permanent_xfer = set() + self.shared_xfer: set[StagingPath] = set() + self.permanent_xfer: set[StagingPath] = set() self.permanent_staging = PermanentStaging( scratch=self.scratch_root, From 5d0df5ff28fce4381c103357662aebe13d5a2a7f Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Fri, 1 Dec 2023 14:12:49 -0600 Subject: [PATCH 36/44] pep8 --- gufe/storage/stagingdirectory.py | 12 ++++-------- gufe/tests/storage/test_stagingdirectory.py | 9 +++++++++ gufe/tests/storage/test_storagemanager.py | 8 +++++--- gufe/utils.py | 1 - 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingdirectory.py index 356a5d34..8c27b416 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingdirectory.py @@ -38,7 +38,6 @@ def _safe_to_delete_staging(external: ExternalStorage, path: PathLike, return False - class StagingDirectory: """PathLike local representation of an :class:`.ExternalStorage`. @@ -94,8 +93,8 @@ def __init__( self.delete_empty_dirs = delete_empty_dirs self.staging = staging - self.registry : set[StagingPath] = set() - self.preexisting : set[StagingPath] = set() + self.registry: set[StagingPath] = set() + self.preexisting: set[StagingPath] = set() self.staging_dir = self.scratch / staging / prefix self.staging_dir.mkdir(exist_ok=True, parents=True) @@ -125,7 +124,6 @@ def transfer_single_file_to_external(self, held_file: StagingPath): return None # no transfer - def transfer_staging_to_external(self): """Transfer all objects in the registry to external storage @@ -201,7 +199,7 @@ def _load_file_from_external(self, external: ExternalStorage, with external.load_stream(staging_path.label) as f: external_bytes = f.read() - ... # TODO: check that the bytes are the same if preexisting? + ... # TODO: check that the bytes are the same if preexisting? scratch_path.parent.mkdir(exist_ok=True, parents=True) with open(scratch_path, mode='wb') as f: @@ -247,7 +245,7 @@ def __init__( self.read_only = read_only def _get_other_shared(self, prefix: str, - delete_staging: Optional[bool] = None): + delete_staging: Optional[bool] = None): """Get a related unit's staging directory. """ if delete_staging is None: @@ -397,5 +395,3 @@ def __repr__(self): # although edge cases may be a pain, we can get most of it with, e.g.: # def exists(self): return Path(self).exists() # but also, can do pathlib.Path(staging_path) and get hte whole thing - - diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingdirectory.py index f94a1888..08bed6e6 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingdirectory.py @@ -11,6 +11,7 @@ delete_empty_dirs, # TODO: move to appropriate place ) + @pytest.fixture def root(tmp_path): external = MemoryStorage() @@ -23,6 +24,7 @@ def root(tmp_path): ) return root + @pytest.fixture def root_with_contents(root): with open(root / "data.txt", mode='wb') as f: @@ -30,6 +32,7 @@ def root_with_contents(root): return root + @pytest.fixture def read_only_with_overwritten(root_with_contents): read_only = SharedStaging( @@ -51,6 +54,7 @@ def read_only_with_overwritten(root_with_contents): return read_only, staged + @pytest.fixture def permanent(tmp_path): shared = MemoryStorage() @@ -64,24 +68,28 @@ def permanent(tmp_path): ) return perm + def test_safe_to_delete_staging_ok(tmp_path): external = FileStorage(tmp_path / "foo") prefix = "bar" staging = tmp_path / "foo" / "baz" assert _safe_to_delete_staging(external, staging, prefix) + def test_safe_to_delete_staging_danger(tmp_path): external = FileStorage(tmp_path / "foo") prefix = "bar" staging = tmp_path / "foo" / "bar" / "baz" assert not _safe_to_delete_staging(external, staging, prefix) + def test_safe_to_delete_staging_not_filestorage(tmp_path): external = MemoryStorage() prefix = "bar" staging = tmp_path / "bar" assert _safe_to_delete_staging(external, staging, prefix) + def test_delete_empty_dirs(tmp_path): base = tmp_path / "tmp" paths = [ @@ -108,6 +116,7 @@ def test_delete_empty_dirs(tmp_path): assert not (base / "foo" / "bar").exists() + @pytest.mark.parametrize('delete_root', [True, False]) def test_delete_empty_dirs_delete_root(tmp_path, delete_root): base = tmp_path / "tmp" diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index 80af74c1..c98a52d9 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -4,6 +4,7 @@ from gufe.storage.externalresource import MemoryStorage, FileStorage from pathlib import Path + @pytest.fixture def storage_manager_std(tmp_path): return StorageManager( @@ -12,6 +13,7 @@ def storage_manager_std(tmp_path): permanent_root=MemoryStorage(), ) + @pytest.fixture def dag_units(): class Unit1: @@ -33,7 +35,9 @@ def run(self, scratch, shared, permanent): (scratch / "foo2.txt").touch() # TODO: this will change; the inputs should include a way to get # the previous shared unit label - with shared.root.other_shared("dag/unit1_attempt_0") as prev_shared: + with ( + shared.root.other_shared("dag/unit1_attempt_0") as prev_shared + ): with open(prev_shared / "bar.txt", mode='r') as f: bar = f.read() @@ -220,7 +224,6 @@ def _after_dag_existing_files(self): return self.files_after_unit('dag/unit2') - class TestStagingOverlapsSharedStorageManager(LifecycleHarness): @pytest.fixture def storage_manager(self, tmp_path): @@ -238,7 +241,6 @@ def _in_unit_existing_files(self, unit_label): "dag/unit2": {'foo2', 'bar', 'baz'}, }[unit_label] - def _after_unit_existing_files(self, unit_label): # same for both; all files come from unit 1 return {"bar", "baz"} diff --git a/gufe/utils.py b/gufe/utils.py index 4382a801..519835d5 100644 --- a/gufe/utils.py +++ b/gufe/utils.py @@ -59,7 +59,6 @@ def __exit__(self, type, value, traceback): self.context.close() - def delete_empty_dirs(root: PathLike, delete_root: bool = True): """Delete all empty directories. From e0573323108e1708ddc54ee2a758675c1c764db2 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Fri, 1 Dec 2023 14:13:51 -0600 Subject: [PATCH 37/44] pep8 --- gufe/tests/storage/test_storagemanager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index c98a52d9..70852bf0 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -254,7 +254,7 @@ def _in_staging_shared(self, unit_label, in_after): baz = "dag/unit1_attempt_0/baz.txt" foo = "scratch/dag/unit1_attempt_0/foo.txt" foo2 = "scratch/dag/unit2_attempt_0/foo2.txt" - return { + return { ("dag/unit1", "in"): {bar, baz, foo}, ("dag/unit1", "after"): {bar, baz}, ("dag/unit2", "in"): {bar, baz, foo2}, @@ -287,7 +287,7 @@ def _in_staging_permanent(self, unit_label, in_after): baz = "dag/unit1_attempt_0/baz.txt" foo = "scratch/dag/unit1_attempt_0/foo.txt" foo2 = "scratch/dag/unit2_attempt_0/foo2.txt" - return { + return { ("dag/unit1", "in"): {bar, baz, foo}, ("dag/unit1", "after"): {baz}, ("dag/unit2", "in"): {baz, foo2}, From 1cfe9108ea43542036daf79bec86072515e61815 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Mon, 4 Dec 2023 12:28:39 -0600 Subject: [PATCH 38/44] StagingDirectory -> StagingRegistry also, some docs cleanup/updates --- docs/guide/storage.rst | 40 +++++++++---------- ...stagingdirectory.py => stagingregistry.py} | 16 ++++---- gufe/storage/storagemanager.py | 4 +- ...ngdirectory.py => test_stagingregistry.py} | 14 +++---- gufe/tests/storage/test_storagemanager.py | 1 - 5 files changed, 37 insertions(+), 38 deletions(-) rename gufe/storage/{stagingdirectory.py => stagingregistry.py} (97%) rename gufe/tests/storage/{test_stagingdirectory.py => test_stagingregistry.py} (96%) diff --git a/docs/guide/storage.rst b/docs/guide/storage.rst index d740525e..165b9640 100644 --- a/docs/guide/storage.rst +++ b/docs/guide/storage.rst @@ -15,13 +15,13 @@ correspond to three lifetimes of data: * ``scratch``: This is temporary data that is only needed for the lifetime of a :class:`.ProtocolUnit`. This data is not guaranteed to be available - beyown the single :class:`.ProtocolUnit` where it is created, but may be + beyond the single :class:`.ProtocolUnit` where it is created, but may be reused within that :class:`.ProtocolUnit`. * ``shared``: This is data that is shared between different units in a :class:`.ProtocolDAG`. For example, a single equilibration stage might be shared between multiple production runs. The output snapshot of the equilibration would be suitable for as something to put in ``shared`` - data. This data is guarateed to be present from when it is created until + data. This data is guaranteed to be present from when it is created until the end of the :class:`.ProtocolDAG`, but is not guaranteed to exist after the :class:`.ProtocolDAG` terminates. * ``permanent``: This is the data that will be needed beyond the scope of a @@ -40,10 +40,10 @@ between stages of the GUFE storage system, and simplifies the API for protocol authors. In detail, this provides protocol authors with ``PathLike`` objects for ``scratch``, ``shared``, and ``permanent``. All three of these objects actually point to special subdirectories of the -scratch space for a specific unit, but are managed by context manangers at -the executor level, which handle the process of moving objects from local -staging directories to the actual ``shared`` and ``permanent`` locations, -which can be external resources. +local scratch space for a specific unit, but are managed by context +managers at the executor level, which handle the process of moving objects +from local staging directories to the actual ``shared`` and ``permanent`` +locations, which can be external resources. External resource utilities @@ -77,30 +77,30 @@ helpers. The information in this section is mostly of interest to authors of executors. The helpers are: * :class:`.StorageManager`: This is the overall façade interface for - interacting with the rest of the storage lifecycle tools. -* :class:`.DAGContextManager`: This provides context managers at the DAG and - unit level to handle the transfer of storage. GUFE provides a - :class:`.SingleProcDAGContextManager` to handle the simple case that an - entire DAG is run within a single process. If individual units are run on - different remote resources, a more complicated :class:`.DAGContextManager` - would be needed. -* :class:`.StagingDirectory`: This represents the root directory for staging - the results of a given :class:`.ProtocolUnit`. This is an abstract + interacting with the rest of the storage lifecycle tools. It provides two + methods to generate context managers; one for the :class:`.ProtocolDAG` + level of the lifecycle, and one for the :class:`.ProtocoUnit` level of the + lifecycle. This class is designed for the use case that the entire DAG is + run in serial within a single process. Subclasses of this can be created + for other execution architectures, where the main logic changes would be + in the methods that return those context managers. +* :class:`.StagingRegistry`: This handles the logic around staging paths + within a :class:`.ProtocolUnit`. Think of this as an abstract representation of a local directory. Paths within it register with it, and it handles deletion of the temporary local files when not needed, as well as the download of remote files when necessary for reading. There are two important subclasses of this: :class:`.SharedStaging` for a ``shared`` resource, and :class:`.PermanentStaging` for a ``permanent`` resource. * :class:`.StagingPath`: This represents a file within the - :class:`.StagingDirectory`. It contains both the key (label) used in the - key-value store, as well as the actual local path to the file. On - creation, it registers itself with its :class:`.StagingDirectory`, which - handles managing it over its lifecycle. + :class:`.StagingRegistry`. It contains both the key (label) used in the + key-value store, as well as the actual local path to the file. When its + ``__fspath__`` method is called, it registers itself with its + :class:`.StagingRegistry`, which handles managing it over its lifecycle. In practice, the executor uses the :class:`.StorageManager` to create a :class:`.DAGContextManager` at the level of a DAG, and then uses the :class:`.DAGContextManager` to create a context to run a unit. That context creates a :class:`.SharedStaging` and a :class:`.PermanentStaging` associated with the specific unit. Those staging directories, with the -scratch directory, are provided to the :class:`.ProtocolDAGUnit`, so that +scratch directory, are provided to the :class:`.ProtocolUnit`, so that these are the only objects protocol authors need to interact with. diff --git a/gufe/storage/stagingdirectory.py b/gufe/storage/stagingregistry.py similarity index 97% rename from gufe/storage/stagingdirectory.py rename to gufe/storage/stagingregistry.py index 8c27b416..74278830 100644 --- a/gufe/storage/stagingdirectory.py +++ b/gufe/storage/stagingregistry.py @@ -38,7 +38,7 @@ def _safe_to_delete_staging(external: ExternalStorage, path: PathLike, return False -class StagingDirectory: +class StagingRegistry: """PathLike local representation of an :class:`.ExternalStorage`. This connects objects on a local filesystem to the key-value store of a @@ -73,7 +73,7 @@ class StagingDirectory: temporarily stored; default is '.staging'. This must be the same for all units within a DAG. delete_staging : bool - whether to delete the contents of the $SCRATCH/$HOLDING/$PREFIX + whether to delete the contents of the $SCRATCH/$STAGING/$PREFIX directory when this object is deleted """ def __init__( @@ -155,7 +155,7 @@ def cleanup(self): def register_path(self, staging_path: StagingPath): """ - Register a :class:`.StagingPath` with this :class:`.StagingDirectory`. + Register a :class:`.StagingPath` with this :class:`.StagingRegistry`. This marks a given path as something for this object to manage, by loading it into the ``registry``. This way it is tracked such that @@ -223,7 +223,7 @@ def __del__(self): # -no-cov- self.cleanup() -class SharedStaging(StagingDirectory): +class SharedStaging(StagingRegistry): """Staging for shared external storage. This enables read-only versions to be loaded from other units. @@ -297,7 +297,7 @@ def register_path(self, staging_path: StagingPath): super().register_path(staging_path) -class PermanentStaging(StagingDirectory): +class PermanentStaging(StagingRegistry): """Staging directory for the permanent storage. This allows files to be downloaded from a shared @@ -339,7 +339,7 @@ def transfer_single_file_to_external(self, held_file: StagingPath): class StagingPath: """PathLike object linking local path with label for external storage. - On creation, this registers with a :class:`.StagingDirectory` that will + On creation, this registers with a :class:`.StagingRegistry` that will manage the local path and transferring data with its :class:`.ExternalStorage`. @@ -351,13 +351,13 @@ class StagingPath: example, when deserializing results that point to files) instead use :class:`.ExternalFile`. """ - def __init__(self, root: StagingDirectory, + def __init__(self, root: StagingRegistry, path: Union[PathLike, str]): self.root = root self.path = Path(path) def register(self): - """Register this path with its StagingDirectory. + """Register this path with its StagingRegistry. If a file associated with this path exists in an external storage, it will be downloaded to the staging area as part of registration. diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 61665b7f..d5a80d92 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -9,8 +9,8 @@ from typing import Type from .externalresource import ExternalStorage, FileStorage -from .stagingdirectory import SharedStaging, PermanentStaging -from .stagingdirectory import StagingPath # typing +from .stagingregistry import SharedStaging, PermanentStaging +from .stagingregistry import StagingPath # typing class StorageManager: diff --git a/gufe/tests/storage/test_stagingdirectory.py b/gufe/tests/storage/test_stagingregistry.py similarity index 96% rename from gufe/tests/storage/test_stagingdirectory.py rename to gufe/tests/storage/test_stagingregistry.py index 08bed6e6..65806f67 100644 --- a/gufe/tests/storage/test_stagingdirectory.py +++ b/gufe/tests/storage/test_stagingregistry.py @@ -6,7 +6,7 @@ import pathlib from gufe.storage.externalresource import MemoryStorage, FileStorage -from gufe.storage.stagingdirectory import ( +from gufe.storage.stagingregistry import ( SharedStaging, PermanentStaging, _safe_to_delete_staging, delete_empty_dirs, # TODO: move to appropriate place ) @@ -160,7 +160,7 @@ def test_read_old(self, root): # When the file doesn't exist locally, it should be pulled down the # first time that we register the path. - # initial conditions, without touching StagingDirectory/StagingPath + # initial conditions, without touching StagingRegistry/StagingPath label = "old_unit/data.txt" on_filesystem = root.scratch / root.staging / "old_unit/data.txt" assert not on_filesystem.exists() @@ -210,7 +210,7 @@ def test_transfer_to_external_no_file(self, root, caplog): nonfile = root / "does_not_exist.txt" # ensure that we've set this up correctly assert nonfile not in root.registry - logger_name = "gufe.storage.stagingdirectory" + logger_name = "gufe.storage.stagingregistry" caplog.set_level(logging.INFO, logger=logger_name) root.transfer_single_file_to_external(nonfile) assert len(caplog.records) == 1 @@ -222,7 +222,7 @@ def test_transfer_to_external_directory(self, root, caplog): with open(directory / "file.txt", mode='w') as f: f.write("foo") - logger_name = "gufe.storage.stagingdirectory" + logger_name = "gufe.storage.stagingregistry" caplog.set_level(logging.DEBUG, logger=logger_name) root.transfer_single_file_to_external(directory) assert len(caplog.records) == 1 @@ -238,7 +238,7 @@ def test_single_file_transfer_read_only(self, old_contents = f.read() assert old_contents == b"foo" - logger_name = "gufe.storage.stagingdirectory" + logger_name = "gufe.storage.stagingregistry" caplog.set_level(logging.DEBUG, logger=logger_name) read_only.transfer_single_file_to_external(staged) assert len(caplog.records) == 1 @@ -254,7 +254,7 @@ def test_transfer_read_only(self, read_only_with_overwritten, caplog): old_contents = f.read() assert old_contents == b"foo" - logger_name = "gufe.storage.stagingdirectory" + logger_name = "gufe.storage.stagingregistry" caplog.set_level(logging.DEBUG, logger=logger_name) read_only.transfer_staging_to_external() assert len(caplog.records) == 1 @@ -277,7 +277,7 @@ def test_cleanup_missing(self, root, caplog): file.__fspath__() assert file in root.registry assert not pathlib.Path(file).exists() - logger_name = "gufe.storage.stagingdirectory" + logger_name = "gufe.storage.stagingregistry" caplog.set_level(logging.WARNING, logger=logger_name) root.cleanup() assert len(caplog.records) == 1 diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index 70852bf0..69b1a6dc 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -1,6 +1,5 @@ import pytest from gufe.storage.storagemanager import StorageManager -from gufe.storage.stagingdirectory import StagingDirectory from gufe.storage.externalresource import MemoryStorage, FileStorage from pathlib import Path From 78e003b602ba179db5c1b65e6b0fd0d61fbea111 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Thu, 7 Dec 2023 17:07:44 -0600 Subject: [PATCH 39/44] remove prefix; remove get_other_shared This required a pretty significant rewrite of the code. --- gufe/storage/stagingregistry.py | 123 +++++++-------------- gufe/storage/storagemanager.py | 13 ++- gufe/tests/storage/test_stagingregistry.py | 99 ++++++++++------- gufe/tests/storage/test_storagemanager.py | 25 ++--- 4 files changed, 118 insertions(+), 142 deletions(-) diff --git a/gufe/storage/stagingregistry.py b/gufe/storage/stagingregistry.py index 74278830..2d6f5c99 100644 --- a/gufe/storage/stagingregistry.py +++ b/gufe/storage/stagingregistry.py @@ -11,31 +11,23 @@ import logging _logger = logging.getLogger(__name__) - -def _safe_to_delete_staging(external: ExternalStorage, path: PathLike, - prefix: Union[PathLike, str]) -> bool: - """Check if deleting ``path`` could delete externally stored data. - - If external storage is a FileStorage, then it will store files for - this unit or dag in the directory ``external.root_dir / prefix``, where - ``prefix`` is either the unit label or the dag label. If ``path`` is - inside that directory, then deleting it may delete information from the - external storage. In that case, this returns False, indicating a - conflict. Otherwise, this returns True. - """ - # this is a little brittle; I don't like hard-coding the class here +def _safe_to_delete_file( + external: ExternalStorage, + path: PathLike +) -> bool: + """Check that deleting this file will not remove it from external""" + # kind of brittle: deals with internals of FileStorage if isinstance(external, FileStorage): - root = Path(external.root_dir) / prefix + root = external.root_dir else: return True p = Path(path) try: - _ = p.relative_to(root) + label = str(p.relative_to(root)) except ValueError: return True - else: - return False + return not external.exists(label) class StagingRegistry: @@ -62,25 +54,18 @@ class StagingRegistry: the scratch directory shared by all objects on this host external : :class:`.ExternalStorage` external storage resource where objects should eventualy go - prefix : str - label for this specific unit; this should be a slash-separated - description of where this unit fits in the hierarchy. For example, - it might be ``$DAG_LABEL/$UNIT_LABEL`` or - ``$DAG_LABEL/$UNIT_LABEL/$UNIT_REPEAT``. It must be a unique - identifier for this unit within the permanent storage. staging : PathLike name of the subdirectory of scratch where staged results are temporarily stored; default is '.staging'. This must be the same for all units within a DAG. delete_staging : bool - whether to delete the contents of the $SCRATCH/$STAGING/$PREFIX + whether to delete the contents of the $SCRATCH/$STAGING directory when this object is deleted """ def __init__( self, scratch: PathLike, external: ExternalStorage, - prefix: str, *, staging: PathLike = Path(".staging"), delete_staging: bool = True, @@ -88,23 +73,20 @@ def __init__( ): self.external = external self.scratch = Path(scratch) - self.prefix = Path(prefix) self.delete_staging = delete_staging self.delete_empty_dirs = delete_empty_dirs self.staging = staging self.registry: set[StagingPath] = set() self.preexisting: set[StagingPath] = set() - self.staging_dir = self.scratch / staging / prefix + self.staging_dir = self.scratch / staging self.staging_dir.mkdir(exist_ok=True, parents=True) - def _delete_staging_safe(self): - """Check if deleting staging will remove data from external. - """ - return _safe_to_delete_staging( + def _delete_file_safe(self, file): + """Check if deleting this file will remove it from external.""" + return _safe_to_delete_file( external=self.external, - path=self.staging_dir, - prefix=self.prefix, + path=file ) def transfer_single_file_to_external(self, held_file: StagingPath): @@ -134,21 +116,26 @@ def transfer_staging_to_external(self): if (transferred := self.transfer_single_file_to_external(file)) ] + def _delete_file(self, file: StagingPath): + path = Path(file.fspath) + if path.exists(): + _logger.debug(f"Removing file '{file}'") + # TODO: handle special case of directory? + path.unlink() + self.registry.remove(file) + else: + _logger.warning( + f"During staging cleanup, file '{file}' was marked for " + "deletion, but can not be found on disk." + ) + def cleanup(self): """Perform end-of-lifecycle cleanup. """ - if self.delete_staging and self._delete_staging_safe(): + if self.delete_staging: for file in self.registry - self.preexisting: - path = Path(file.fspath) - if path.exists(): - _logger.debug(f"Removing file {file}") - # TODO: handle special case of directory? - path.unlink() - self.registry.remove(file) - else: - _logger.warning("During staging cleanup, file " - f"{file} was marked for deletion, but " - "can not be found on disk.") + if self._delete_file_safe(file): + self._delete_file(file) if self.delete_empty_dirs: delete_empty_dirs(self.staging_dir) @@ -213,8 +200,7 @@ def __fspath__(self): def __repr__(self): return ( - f"{self.__class__.__name__}('{self.scratch}', {self.external}, " - f"{self.prefix})" + f"{self.__class__.__name__}('{self.scratch}', {self.external})" ) def __del__(self): # -no-cov- @@ -232,46 +218,17 @@ def __init__( self, scratch: PathLike, external: ExternalStorage, - prefix: str, *, staging: PathLike = Path(".staging"), delete_staging: bool = True, delete_empty_dirs: bool = True, read_only: bool = False, ): - super().__init__(scratch, external, prefix, staging=staging, + super().__init__(scratch, external, staging=staging, delete_staging=delete_staging, delete_empty_dirs=delete_empty_dirs) self.read_only = read_only - def _get_other_shared(self, prefix: str, - delete_staging: Optional[bool] = None): - """Get a related unit's staging directory. - """ - if delete_staging is None: - delete_staging = self.delete_staging - - return SharedStaging( - scratch=self.scratch, - external=self.external, - prefix=prefix, - staging=self.staging, - delete_staging=delete_staging, - read_only=True, - ) - - @contextmanager - def other_shared(self, prefix: str, - delete_staging: Optional[bool] = None): - """Context manager approach for getting a related unit's directory. - - This is usually the recommended way to get a previous unit's shared - data. - """ - other = self._get_other_shared(prefix, delete_staging) - yield other - other.cleanup() - def transfer_single_file_to_external(self, held_file: StagingPath): if self.read_only: _logger.debug("Read-only: Not transfering to external storage") @@ -308,24 +265,22 @@ def __init__( scratch: PathLike, external: ExternalStorage, shared: ExternalStorage, - prefix: str, *, staging: PathLike = Path(".staging"), delete_staging: bool = True, delete_empty_dirs: bool = True, ): - super().__init__(scratch, external, prefix, staging=staging, + super().__init__(scratch, external, staging=staging, delete_staging=delete_staging, delete_empty_dirs=delete_empty_dirs) self.shared = shared - def _delete_staging_safe(self): - shared_safe = _safe_to_delete_staging( + def _delete_file_safe(self, file): + shared_safe = _safe_to_delete_file( external=self.shared, - path=self.staging_dir, - prefix=self.prefix + path=file ) - return shared_safe and super()._delete_staging_safe() + return shared_safe and super()._delete_file_safe(file) def transfer_single_file_to_external(self, held_file: StagingPath): # if we can't find it locally, we load it from shared storage @@ -386,7 +341,7 @@ def __fspath__(self): @property def label(self) -> str: """Label used in :class:`.ExternalStorage` for this path""" - return str(self.root.prefix / self.path) + return str(self.path) def __repr__(self): return f"StagingPath('{self.fspath}')" diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index d5a80d92..21cc5f9f 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -46,7 +46,6 @@ def __init__( shared=self.shared_root, staging=self.staging, delete_empty_dirs=delete_empty_dirs, - prefix="" ) self.shared_staging = SharedStaging( @@ -54,7 +53,6 @@ def __init__( external=self.shared_root, staging=self.staging, delete_empty_dirs=delete_empty_dirs, - prefix="" # TODO: remove prefix ) def make_label(self, dag_label, unit_label, attempt, **kwargs): @@ -87,8 +85,17 @@ def running_dag(self, dag_label): if not self.keep_staging: self.permanent_staging.cleanup() + if not self.keep_shared: - for file in self.shared_xfer: + # we'd like to do something like loop over + # self.shared_xfer - self.permanent_xfer; however, + # StagedPaths have different staging registries. This gives + # the set of paths we do want to delete + perm_xfer_paths = {p.fspath for p in self.permanent_xfer} + shared_xfer_to_delete = {p for p in self.shared_xfer + if p.fspath not in perm_xfer_paths} + + for file in shared_xfer_to_delete: self.shared_root.delete(file.label) for file in self.permanent_xfer: diff --git a/gufe/tests/storage/test_stagingregistry.py b/gufe/tests/storage/test_stagingregistry.py index 65806f67..c0f21fc5 100644 --- a/gufe/tests/storage/test_stagingregistry.py +++ b/gufe/tests/storage/test_stagingregistry.py @@ -7,7 +7,7 @@ from gufe.storage.externalresource import MemoryStorage, FileStorage from gufe.storage.stagingregistry import ( - SharedStaging, PermanentStaging, _safe_to_delete_staging, + SharedStaging, PermanentStaging, _safe_to_delete_file, delete_empty_dirs, # TODO: move to appropriate place ) @@ -19,7 +19,6 @@ def root(tmp_path): root = SharedStaging( scratch=tmp_path, external=external, - prefix="new_unit", delete_staging=False ) return root @@ -27,7 +26,8 @@ def root(tmp_path): @pytest.fixture def root_with_contents(root): - with open(root / "data.txt", mode='wb') as f: + # file staged but not yet shipped to external + with open(root / "new_unit/data.txt", mode='wb') as f: f.write(b"bar") return root @@ -38,14 +38,13 @@ def read_only_with_overwritten(root_with_contents): read_only = SharedStaging( scratch=root_with_contents.scratch, external=root_with_contents.external, - prefix="old_unit", staging=root_with_contents.staging, delete_staging=root_with_contents.delete_staging, read_only=True ) - filename = pathlib.Path(read_only) / "data.txt" + filename = pathlib.Path(read_only) / "old_unit/data.txt" assert not filename.exists() - staged = read_only / "data.txt" + staged = read_only / "old_unit/data.txt" assert not filename.exists() staged.__fspath__() assert filename.exists() @@ -58,36 +57,35 @@ def read_only_with_overwritten(root_with_contents): @pytest.fixture def permanent(tmp_path): shared = MemoryStorage() - shared.store_bytes("final/old_unit/data.txt", b"foo") + shared.store_bytes("old_unit/data.txt", b"foo") perm = PermanentStaging( - scratch=tmp_path, + scratch=tmp_path / "final", external=MemoryStorage(), shared=shared, - prefix="final", delete_staging=True ) return perm -def test_safe_to_delete_staging_ok(tmp_path): +@pytest.mark.parametrize('rel_path', [ + ("bar"), ("baz"), ("../bar") +]) +def test_safe_to_delete_file(tmp_path, rel_path): external = FileStorage(tmp_path / "foo") - prefix = "bar" - staging = tmp_path / "foo" / "baz" - assert _safe_to_delete_staging(external, staging, prefix) - + external.store_bytes("bar", b"") + ext_loc = tmp_path / "foo" / "bar" + assert ext_loc.exists() -def test_safe_to_delete_staging_danger(tmp_path): - external = FileStorage(tmp_path / "foo") - prefix = "bar" - staging = tmp_path / "foo" / "bar" / "baz" - assert not _safe_to_delete_staging(external, staging, prefix) + staged = external.root_dir / rel_path + is_safe = (rel_path != "bar") + assert _safe_to_delete_file(external, staged) is is_safe -def test_safe_to_delete_staging_not_filestorage(tmp_path): +def test_safe_to_delete_file_not_filestorage(tmp_path): external = MemoryStorage() - prefix = "bar" + external.store_bytes("bar", b"") staging = tmp_path / "bar" - assert _safe_to_delete_staging(external, staging, prefix) + assert _safe_to_delete_file(external, staging) def test_delete_empty_dirs(tmp_path): @@ -141,7 +139,6 @@ def test_repr(self, root): r = repr(root) assert r.startswith("SharedStaging") assert "MemoryStorage" in r - assert r.endswith(", new_unit)") @pytest.mark.parametrize('pathlist', [ ['file.txt'], ['dir', 'file.txt'] @@ -168,9 +165,11 @@ def test_read_old(self, root): # when we create the specific StagingPath, it registers and # "downloads" the file - old_staging = root._get_other_shared("old_unit") - filepath = old_staging / "data.txt" - assert pathlib.Path(filepath) == on_filesystem + filepath = root / "old_unit/data.txt" + assert pathlib.Path(filepath.fspath) == on_filesystem + + assert not on_filesystem.exists() + filepath.register() assert on_filesystem.exists() # let's just be sure we can read in the data as desired @@ -181,7 +180,7 @@ def test_write_new(self, root): label = "new_unit/somefile.txt" on_filesystem = root.scratch / root.staging / "new_unit/somefile.txt" assert not on_filesystem.exists() - with open(root / "somefile.txt", mode='wb') as f: + with open(root / "new_unit/somefile.txt", mode='wb') as f: f.write(b"testing") # this has been written to disk in scratch, but not yet saved to @@ -189,9 +188,10 @@ def test_write_new(self, root): assert on_filesystem.exists() assert not root.external.exists(label) + @pytest.mark.xfail # Need test that read-only errors on new files def test_write_old_fail(self, root): old_staging = root._get_other_shared("old_unit") - staged = old_staging / "foo.txt" + staged = old_,tstaging / "foo.txt" with pytest.raises(IOError, match="read-only"): staged.__fspath__() @@ -207,7 +207,7 @@ def test_transfer_to_external(self, root_with_contents): def test_transfer_to_external_no_file(self, root, caplog): with mock.patch.object(root, 'register_path'): - nonfile = root / "does_not_exist.txt" + nonfile = root / "old_unit/does_not_exist.txt" # ensure that we've set this up correctly assert nonfile not in root.registry logger_name = "gufe.storage.stagingregistry" @@ -218,7 +218,7 @@ def test_transfer_to_external_no_file(self, root, caplog): assert "nonexistent" in record.msg def test_transfer_to_external_directory(self, root, caplog): - directory = root / "directory" + directory = root / "old_unit/directory" with open(directory / "file.txt", mode='w') as f: f.write("foo") @@ -266,15 +266,16 @@ def test_transfer_read_only(self, read_only_with_overwritten, caplog): def test_cleanup(self, root_with_contents): root_with_contents.delete_staging = True # slightly naughty - path = pathlib.Path(root_with_contents.__fspath__()) / "data.txt" + root_path = pathlib.Path(root_with_contents.__fspath__()) + path = root_path / "new_unit/data.txt" assert path.exists() root_with_contents.cleanup() assert not path.exists() def test_cleanup_missing(self, root, caplog): root.delete_staging = True - file = root / "foo.txt" - file.__fspath__() + file = root / "old_unit/foo.txt" + file.register() assert file in root.registry assert not pathlib.Path(file).exists() logger_name = "gufe.storage.stagingregistry" @@ -285,12 +286,13 @@ def test_cleanup_missing(self, root, caplog): assert "can not be found on disk" in record.msg def test_register_cleanup_preexisting_file(self, root): - filename = pathlib.Path(root.__fspath__()) / "foo.txt" + filename = pathlib.Path(root.__fspath__()) / "new_unit/foo.txt" + filename.parent.mkdir(parents=True, exist_ok=True) filename.touch() root.external.store_bytes("new_unit/foo.txt", b"") assert len(root.registry) == 0 assert len(root.preexisting) == 0 - staging = root / "foo.txt" + staging = root / "new_unit/foo.txt" assert staging.label == "new_unit/foo.txt" assert len(root.registry) == 0 assert len(root.preexisting) == 0 @@ -303,19 +305,32 @@ def test_register_cleanup_preexisting_file(self, root): assert filename.exists() -class TestPermanentStage: +class TestPermanentStaging: @pytest.mark.parametrize('is_safe', [True, False]) - def test_delete_staging_safe(self, tmp_path, is_safe): + def test_delete_file_safe(self, tmp_path, is_safe): staging = ".staging" if is_safe else "" + scratch_root_dir = tmp_path / "final" + + # create a file in the external storage + external = FileStorage(scratch_root_dir) + external.store_bytes("foo.txt", b"foo") + external_file_loc = external.root_dir / "foo.txt" + assert external_file_loc.exists() + permanent = PermanentStaging( - scratch=tmp_path, + scratch=scratch_root_dir, external=MemoryStorage(), - shared=FileStorage(tmp_path), - prefix="final", + shared=external, staging=staging, delete_staging=True ) - assert permanent._delete_staging_safe() is is_safe + my_file = permanent / "foo.txt" + + # double check that we set things up correctly + assert (str(external_file_loc) != my_file.fspath) is is_safe + + # test the code + assert permanent._delete_file_safe(my_file) is is_safe def test_load_missing_for_transfer(self, permanent): fname = pathlib.Path(permanent) / "old_unit/data.txt" @@ -326,4 +341,4 @@ def test_load_missing_for_transfer(self, permanent): assert permanent.external._data == {} permanent.transfer_staging_to_external() assert fname.exists() - assert permanent.external._data == {"final/old_unit/data.txt": b"foo"} + assert permanent.external._data == {"old_unit/data.txt": b"foo"} diff --git a/gufe/tests/storage/test_storagemanager.py b/gufe/tests/storage/test_storagemanager.py index 69b1a6dc..779d5552 100644 --- a/gufe/tests/storage/test_storagemanager.py +++ b/gufe/tests/storage/test_storagemanager.py @@ -34,16 +34,14 @@ def run(self, scratch, shared, permanent): (scratch / "foo2.txt").touch() # TODO: this will change; the inputs should include a way to get # the previous shared unit label - with ( - shared.root.other_shared("dag/unit1_attempt_0") as prev_shared - ): - with open(prev_shared / "bar.txt", mode='r') as f: - bar = f.read() + prev_shared = shared.root / "dag/unit1_attempt_0" + with open(prev_shared / "bar.txt", mode='r') as f: + bar = f.read() - # note that you can open a file from permanent as if it was - # from shared -- everything in permanent is in shared - with open(prev_shared / "baz.txt", mode='r') as f: - baz = f.read() + # note that you can open a file from permanent as if it was + # from shared -- everything in permanent is in shared + with open(prev_shared / "baz.txt", mode='r') as f: + baz = f.read() return {"bar": bar, "baz": baz} @@ -75,8 +73,8 @@ def test_lifecycle(self, storage_manager, dag_units, tmp_path): with dag_ctx.running_unit(dag_label, unit.key, attempt=0) as ( scratch, shared, perm ): - results.append(unit.run(scratch, shared, perm)) # import pdb; pdb.set_trace() + results.append(unit.run(scratch, shared, perm)) self.in_unit_asserts(storage_manager, label) self.after_unit_asserts(storage_manager, label) self.after_dag_asserts(storage_manager) @@ -180,7 +178,8 @@ def storage_manager(self, storage_manager_std): def _in_unit_existing_files(self, unit_label): return { "dag/unit1": {'bar', 'baz', 'foo'}, - "dag/unit2": {'foo2', 'baz'} + "dag/unit2": {'foo2', 'baz', 'bar'}, + # bar was deleted, but gets brought back in unit2 }[unit_label] def _after_unit_existing_files(self, unit_label): @@ -275,7 +274,7 @@ def storage_manager(self, tmp_path): def _in_unit_existing_files(self, unit_label): return { "dag/unit1": {'foo', 'bar', 'baz'}, - "dag/unit2": {"foo2", "baz"}, # no bar because it was temporary + "dag/unit2": {"foo2", "baz", "bar"}, # bar is resurrected }[unit_label] def _after_dag_existing_files(self): @@ -289,7 +288,7 @@ def _in_staging_permanent(self, unit_label, in_after): return { ("dag/unit1", "in"): {bar, baz, foo}, ("dag/unit1", "after"): {baz}, - ("dag/unit2", "in"): {baz, foo2}, + ("dag/unit2", "in"): {baz, foo2, bar}, # bar is resurrected ("dag/unit2", "after"): {baz} }[unit_label, in_after] From 265e786bb0f9fe6866cd57d07fd93541e4d64304 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Thu, 7 Dec 2023 17:22:59 -0600 Subject: [PATCH 40/44] delete_empty_dirs => keep_empty_dirs Mainly matters for the StorageManager --- gufe/storage/stagingregistry.py | 14 +++++++------- gufe/storage/storagemanager.py | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/gufe/storage/stagingregistry.py b/gufe/storage/stagingregistry.py index 2d6f5c99..626e6b8f 100644 --- a/gufe/storage/stagingregistry.py +++ b/gufe/storage/stagingregistry.py @@ -69,12 +69,12 @@ def __init__( *, staging: PathLike = Path(".staging"), delete_staging: bool = True, - delete_empty_dirs: bool = True, + keep_empty_dirs: bool = False, ): self.external = external self.scratch = Path(scratch) self.delete_staging = delete_staging - self.delete_empty_dirs = delete_empty_dirs + self.keep_empty_dirs = keep_empty_dirs self.staging = staging self.registry: set[StagingPath] = set() @@ -137,7 +137,7 @@ def cleanup(self): if self._delete_file_safe(file): self._delete_file(file) - if self.delete_empty_dirs: + if not self.keep_empty_dirs: delete_empty_dirs(self.staging_dir) def register_path(self, staging_path: StagingPath): @@ -221,12 +221,12 @@ def __init__( *, staging: PathLike = Path(".staging"), delete_staging: bool = True, - delete_empty_dirs: bool = True, + keep_empty_dirs: bool = False, read_only: bool = False, ): super().__init__(scratch, external, staging=staging, delete_staging=delete_staging, - delete_empty_dirs=delete_empty_dirs) + keep_empty_dirs=keep_empty_dirs) self.read_only = read_only def transfer_single_file_to_external(self, held_file: StagingPath): @@ -268,11 +268,11 @@ def __init__( *, staging: PathLike = Path(".staging"), delete_staging: bool = True, - delete_empty_dirs: bool = True, + keep_empty_dirs: bool = False, ): super().__init__(scratch, external, staging=staging, delete_staging=delete_staging, - delete_empty_dirs=delete_empty_dirs) + keep_empty_dirs=keep_empty_dirs) self.shared = shared def _delete_file_safe(self, file): diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 21cc5f9f..4370b5b5 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -23,8 +23,8 @@ def __init__( keep_scratch: bool = False, keep_staging: bool = False, keep_shared: bool = False, + keep_empty_dirs: bool = False, staging: PathLike = Path(".staging"), - delete_empty_dirs: bool = True, ): self.scratch_root = Path(scratch_root) self.shared_root = shared_root @@ -33,7 +33,7 @@ def __init__( self.keep_staging = keep_staging self.keep_shared = keep_shared self.staging = staging - self.delete_empty_dirs = delete_empty_dirs + self.keep_empty_dirs = keep_empty_dirs # these are used to track what files can be deleted from shared if # keep_shared is False @@ -45,14 +45,14 @@ def __init__( external=self.permanent_root, shared=self.shared_root, staging=self.staging, - delete_empty_dirs=delete_empty_dirs, + keep_empty_dirs=keep_empty_dirs, ) self.shared_staging = SharedStaging( scratch=self.scratch_root, external=self.shared_root, staging=self.staging, - delete_empty_dirs=delete_empty_dirs, + keep_empty_dirs=keep_empty_dirs, ) def make_label(self, dag_label, unit_label, attempt, **kwargs): @@ -102,7 +102,7 @@ def running_dag(self, dag_label): if self.shared_root != self.permanent_root: self.shared_root.delete(file.label) - if self.delete_empty_dirs: + if not self.keep_empty_dirs: delete_empty_dirs(self._scratch_base, delete_root=False) @contextmanager From ce12326a2d3d35f13f471f742faa27261a63a7e2 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Mon, 11 Dec 2023 12:46:25 -0600 Subject: [PATCH 41/44] Add logging to not clean up registered directory --- gufe/storage/stagingregistry.py | 12 +++++++++--- gufe/tests/storage/test_stagingregistry.py | 17 +++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/gufe/storage/stagingregistry.py b/gufe/storage/stagingregistry.py index 626e6b8f..6da50d23 100644 --- a/gufe/storage/stagingregistry.py +++ b/gufe/storage/stagingregistry.py @@ -119,9 +119,15 @@ def transfer_staging_to_external(self): def _delete_file(self, file: StagingPath): path = Path(file.fspath) if path.exists(): - _logger.debug(f"Removing file '{file}'") - # TODO: handle special case of directory? - path.unlink() + if not path.is_dir(): + _logger.debug(f"Removing file '{file}'") + path.unlink() + else: + _logger.debug( + f"During staging cleanup, the directory '{file}' was " + "found as a staged path. This will be deleted only if " + "`keep_empty` is False." + ) self.registry.remove(file) else: _logger.warning( diff --git a/gufe/tests/storage/test_stagingregistry.py b/gufe/tests/storage/test_stagingregistry.py index c0f21fc5..e62ab697 100644 --- a/gufe/tests/storage/test_stagingregistry.py +++ b/gufe/tests/storage/test_stagingregistry.py @@ -285,6 +285,23 @@ def test_cleanup_missing(self, root, caplog): record = caplog.records[0] assert "can not be found on disk" in record.msg + def test_cleanup_directory(self, root, caplog): + root.delete_staging = True + dirname = root / "old_unit" + assert dirname not in root.registry + dirname.register() + assert dirname in root.registry + + assert not pathlib.Path(dirname).exists() + file = dirname / "foo.txt" + file.register() + # directory is created when something in the directory registered + assert pathlib.Path(dirname).exists() + logger_name = "gufe.storage.stagingregistry" + caplog.set_level(logging.DEBUG, logger=logger_name) + root.cleanup() + assert "During staging cleanup, the directory" in caplog.text + def test_register_cleanup_preexisting_file(self, root): filename = pathlib.Path(root.__fspath__()) / "new_unit/foo.txt" filename.parent.mkdir(parents=True, exist_ok=True) From aaa2aab0a77549f84dc745a596ab152c06d3997e Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Wed, 13 Dec 2023 16:28:48 -0600 Subject: [PATCH 42/44] StagingPath.fspath => StagingPath.as_path() Make a function that returns a pathlib.Path be the main thing, instead of the property that returns a string. Almost all usage of `.fspath` was to then wrap it in a pathlib.Path. This is more convenient for users. --- gufe/storage/stagingregistry.py | 18 +++++++++++------- gufe/storage/storagemanager.py | 8 +++++--- gufe/tests/storage/test_stagingregistry.py | 4 ++-- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/gufe/storage/stagingregistry.py b/gufe/storage/stagingregistry.py index 6da50d23..34b623c7 100644 --- a/gufe/storage/stagingregistry.py +++ b/gufe/storage/stagingregistry.py @@ -92,7 +92,7 @@ def _delete_file_safe(self, file): def transfer_single_file_to_external(self, held_file: StagingPath): """Transfer a given file from staging into external storage """ - path = Path(held_file.fspath) + path = held_file.as_path() if not path.exists(): _logger.info(f"Found nonexistent path {path}, not " "transfering to external storage") @@ -117,7 +117,7 @@ def transfer_staging_to_external(self): ] def _delete_file(self, file: StagingPath): - path = Path(file.fspath) + path = file.as_path() if path.exists(): if not path.is_dir(): _logger.debug(f"Removing file '{file}'") @@ -166,7 +166,7 @@ def register_path(self, staging_path: StagingPath): the path to track """ label_exists = self.external.exists(staging_path.label) - fspath = Path(staging_path.fspath) + fspath = staging_path.as_path() # TODO: what if the staging path is a directory? not sure that we # have a way to know that; but not sure that adding it to the @@ -290,7 +290,7 @@ def _delete_file_safe(self, file): def transfer_single_file_to_external(self, held_file: StagingPath): # if we can't find it locally, we load it from shared storage - path = Path(held_file.fspath) + path = held_file.as_path() if not path.exists(): self._load_file_from_external(self.shared, held_file) @@ -336,13 +336,17 @@ def __eq__(self, other): def __hash__(self): return hash((self.root, self.path)) + def as_path(self): + """Return the pathlib.Path where this is staged""" + return Path(self._fspath) + @property - def fspath(self): + def _fspath(self): return str(self.root.staging_dir / self.path) def __fspath__(self): self.register() - return self.fspath + return self._fspath @property def label(self) -> str: @@ -350,7 +354,7 @@ def label(self) -> str: return str(self.path) def __repr__(self): - return f"StagingPath('{self.fspath}')" + return f"StagingPath('{self._fspath}')" # TODO: how much of the pathlib.Path interface do we want to wrap? # although edge cases may be a pain, we can get most of it with, e.g.: diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 4370b5b5..588f2852 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -91,9 +91,11 @@ def running_dag(self, dag_label): # self.shared_xfer - self.permanent_xfer; however, # StagedPaths have different staging registries. This gives # the set of paths we do want to delete - perm_xfer_paths = {p.fspath for p in self.permanent_xfer} - shared_xfer_to_delete = {p for p in self.shared_xfer - if p.fspath not in perm_xfer_paths} + perm_xfer_paths = {p.as_path() for p in self.permanent_xfer} + shared_xfer_to_delete = { + p for p in self.shared_xfer + if p.as_path() not in perm_xfer_paths + } for file in shared_xfer_to_delete: self.shared_root.delete(file.label) diff --git a/gufe/tests/storage/test_stagingregistry.py b/gufe/tests/storage/test_stagingregistry.py index e62ab697..de5508af 100644 --- a/gufe/tests/storage/test_stagingregistry.py +++ b/gufe/tests/storage/test_stagingregistry.py @@ -166,7 +166,7 @@ def test_read_old(self, root): # when we create the specific StagingPath, it registers and # "downloads" the file filepath = root / "old_unit/data.txt" - assert pathlib.Path(filepath.fspath) == on_filesystem + assert filepath.as_path() == on_filesystem assert not on_filesystem.exists() filepath.register() @@ -344,7 +344,7 @@ def test_delete_file_safe(self, tmp_path, is_safe): my_file = permanent / "foo.txt" # double check that we set things up correctly - assert (str(external_file_loc) != my_file.fspath) is is_safe + assert (str(external_file_loc) != my_file._fspath) is is_safe # test the code assert permanent._delete_file_safe(my_file) is is_safe From cf60d1b17876567f2c7d06193eba757896114e17 Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Wed, 13 Dec 2023 16:41:44 -0600 Subject: [PATCH 43/44] pep8 --- gufe/storage/stagingregistry.py | 1 + gufe/storage/storagemanager.py | 1 - gufe/tests/storage/test_stagingregistry.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gufe/storage/stagingregistry.py b/gufe/storage/stagingregistry.py index 34b623c7..51d2696b 100644 --- a/gufe/storage/stagingregistry.py +++ b/gufe/storage/stagingregistry.py @@ -11,6 +11,7 @@ import logging _logger = logging.getLogger(__name__) + def _safe_to_delete_file( external: ExternalStorage, path: PathLike diff --git a/gufe/storage/storagemanager.py b/gufe/storage/storagemanager.py index 588f2852..ac030cb8 100644 --- a/gufe/storage/storagemanager.py +++ b/gufe/storage/storagemanager.py @@ -85,7 +85,6 @@ def running_dag(self, dag_label): if not self.keep_staging: self.permanent_staging.cleanup() - if not self.keep_shared: # we'd like to do something like loop over # self.shared_xfer - self.permanent_xfer; however, diff --git a/gufe/tests/storage/test_stagingregistry.py b/gufe/tests/storage/test_stagingregistry.py index de5508af..392d34d9 100644 --- a/gufe/tests/storage/test_stagingregistry.py +++ b/gufe/tests/storage/test_stagingregistry.py @@ -191,7 +191,7 @@ def test_write_new(self, root): @pytest.mark.xfail # Need test that read-only errors on new files def test_write_old_fail(self, root): old_staging = root._get_other_shared("old_unit") - staged = old_,tstaging / "foo.txt" + staged = old_staging / "foo.txt" with pytest.raises(IOError, match="read-only"): staged.__fspath__() From da9955e713efaa8a61a67962d080ea839ffc72ef Mon Sep 17 00:00:00 2001 From: "David W.H. Swenson" Date: Fri, 22 Dec 2023 10:43:54 -0500 Subject: [PATCH 44/44] remove fspath from StagingRegistry This is to avoid likely footguns related to using os.path.join. Includes test of error on os.path.join. --- gufe/storage/stagingregistry.py | 5 +---- gufe/tests/storage/test_stagingregistry.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/gufe/storage/stagingregistry.py b/gufe/storage/stagingregistry.py index 51d2696b..40b311a9 100644 --- a/gufe/storage/stagingregistry.py +++ b/gufe/storage/stagingregistry.py @@ -32,7 +32,7 @@ def _safe_to_delete_file( class StagingRegistry: - """PathLike local representation of an :class:`.ExternalStorage`. + """Local representation of an :class:`.ExternalStorage`. This connects objects on a local filesystem to the key-value store of a (possibly remote) :class:`.ExternalStorage`. It presents a FileLike @@ -202,9 +202,6 @@ def _load_file_from_external(self, external: ExternalStorage, def __truediv__(self, path: Union[PathLike, str]): return StagingPath(root=self, path=path) - def __fspath__(self): - return str(self.staging_dir) - def __repr__(self): return ( f"{self.__class__.__name__}('{self.scratch}', {self.external})" diff --git a/gufe/tests/storage/test_stagingregistry.py b/gufe/tests/storage/test_stagingregistry.py index 392d34d9..d5e5d71e 100644 --- a/gufe/tests/storage/test_stagingregistry.py +++ b/gufe/tests/storage/test_stagingregistry.py @@ -42,7 +42,7 @@ def read_only_with_overwritten(root_with_contents): delete_staging=root_with_contents.delete_staging, read_only=True ) - filename = pathlib.Path(read_only) / "old_unit/data.txt" + filename = (read_only / "old_unit/data.txt").as_path() assert not filename.exists() staged = read_only / "old_unit/data.txt" assert not filename.exists() @@ -140,6 +140,12 @@ def test_repr(self, root): assert r.startswith("SharedStaging") assert "MemoryStorage" in r + def test_fspath_fail(self, root): + # ensure that we get an error on os.path.join (or really, anything + # that hits os.fspath) + with pytest.raises(TypeError): + os.path.join(root, "filename.txt") + @pytest.mark.parametrize('pathlist', [ ['file.txt'], ['dir', 'file.txt'] ]) @@ -266,8 +272,7 @@ def test_transfer_read_only(self, read_only_with_overwritten, caplog): def test_cleanup(self, root_with_contents): root_with_contents.delete_staging = True # slightly naughty - root_path = pathlib.Path(root_with_contents.__fspath__()) - path = root_path / "new_unit/data.txt" + path = (root_with_contents / "new_unit/data.txt").as_path() assert path.exists() root_with_contents.cleanup() assert not path.exists() @@ -303,7 +308,7 @@ def test_cleanup_directory(self, root, caplog): assert "During staging cleanup, the directory" in caplog.text def test_register_cleanup_preexisting_file(self, root): - filename = pathlib.Path(root.__fspath__()) / "new_unit/foo.txt" + filename = (root / "new_unit/foo.txt").as_path() filename.parent.mkdir(parents=True, exist_ok=True) filename.touch() root.external.store_bytes("new_unit/foo.txt", b"") @@ -350,7 +355,7 @@ def test_delete_file_safe(self, tmp_path, is_safe): assert permanent._delete_file_safe(my_file) is is_safe def test_load_missing_for_transfer(self, permanent): - fname = pathlib.Path(permanent) / "old_unit/data.txt" + fname = (permanent / "old_unit/data.txt").as_path() assert not fname.exists() staging = permanent / "old_unit/data.txt" staging.__fspath__()