From 01c4b3f5e38a3605fb71b856aebb34462a22ff29 Mon Sep 17 00:00:00 2001 From: Michael Panchenko Date: Wed, 6 Dec 2023 15:57:54 +0100 Subject: [PATCH 1/4] =?UTF-8?q?Bump=20version:=200.4.6=20=E2=86=92=200.4.7?= =?UTF-8?q?-dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- latest_release_notes.md | 2 +- setup.py | 2 +- src/accsr/__init__.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 2d41754..145f4a4 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.6 +current_version = 0.4.7-dev0 commit = False tag = False allow_dirty = False diff --git a/latest_release_notes.md b/latest_release_notes.md index 9036c7b..b302892 100644 --- a/latest_release_notes.md +++ b/latest_release_notes.md @@ -1,4 +1,4 @@ -# Release Notes: 0.4.6 +# Release Notes: 0.4.7-dev0 ## Bugfix release - Fixed bugs in RemoteStorage related to name collisions and serialization. diff --git a/setup.py b/setup.py index da3843b..642a054 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ package_dir={"": "src"}, packages=find_packages(where="src"), include_package_data=True, - version="0.4.6", + version="0.4.7-dev0", description="Utils for accessing data from anywhere", install_requires=open("requirements.txt").readlines(), setup_requires=["wheel"], diff --git a/src/accsr/__init__.py b/src/accsr/__init__.py index 3dd3d2d..6536291 100644 --- a/src/accsr/__init__.py +++ b/src/accsr/__init__.py @@ -1 +1 @@ -__version__ = "0.4.6" +__version__ = "0.4.7-dev0" From 423f4a8e7272b5b417f8828ae15e7892a5df7932 Mon Sep 17 00:00:00 2001 From: Michael Panchenko Date: Fri, 24 May 2024 13:12:57 +0200 Subject: [PATCH 2/4] Improved typing issues in RemoteStorage --- requirements-dev.txt | 2 +- src/accsr/remote_storage.py | 127 ++++++++++++++++++++---------------- 2 files changed, 71 insertions(+), 58 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 85af9b4..55378f1 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,6 +1,6 @@ tox bump2version black -isort==5.6.4 +isort~=5.12.0 nbstripout==0.5.0 pre-commit diff --git a/src/accsr/remote_storage.py b/src/accsr/remote_storage.py index cd7861e..bfc929d 100644 --- a/src/accsr/remote_storage.py +++ b/src/accsr/remote_storage.py @@ -11,17 +11,20 @@ from pathlib import Path from typing import ( Dict, + Generator, List, + Literal, Optional, Pattern, Protocol, Sequence, Union, + cast, runtime_checkable, ) -import libcloud from libcloud.storage.base import Container, StorageDriver +from libcloud.storage.providers import get_driver from libcloud.storage.types import ( ContainerAlreadyExistsError, InvalidContainerNameError, @@ -63,7 +66,7 @@ def _replace_driver_by_name(obj): # Since sometimes we want to be able to deepcopy these things around, # we replace the driver by its name. This is needed for `asdict` to work. if isinstance(obj, RemoteObjectProtocol) and hasattr(obj, "driver"): - obj.driver = obj.driver.name + obj.driver = obj.driver.name # type: ignore if isinstance(obj, list) or isinstance(obj, tuple): for item in obj: _replace_driver_by_name(item) @@ -81,7 +84,7 @@ def __repr__(self): @contextmanager -def _switch_to_dir(path: str = None) -> bool: +def _switch_to_dir(path: Optional[str] = None) -> Generator[None, None, None]: if path: cur_dir = os.getcwd() try: @@ -128,9 +131,9 @@ class SyncObject(_JsonReprMixin): def __init__( self, - local_path: str = None, - remote_obj: RemoteObjectProtocol = None, - remote_path: str = None, + local_path: Optional[str] = None, + remote_obj: Optional[RemoteObjectProtocol] = None, + remote_path: Optional[str] = None, ): if remote_path is not None: remote_path = remote_path.lstrip("/") @@ -161,8 +164,13 @@ def __init__( raise ValueError(f"Either remote_path or remote_obj should be not None") self.remote_path = remote_obj.name - self.local_size = os.path.getsize(local_path) if self.exists_locally else 0 - self.local_hash = md5sum(local_path) if self.exists_locally else None + if self.exists_locally: + assert self.local_path is not None + self.local_size = os.path.getsize(self.local_path) + self.local_hash = md5sum(self.local_path) + else: + self.local_size = 0 + self.local_hash = None @property def name(self): @@ -261,7 +269,7 @@ class TransactionSummary(_JsonReprMixin): skipped_source_files: List[SyncObject] = field(default_factory=list) synced_files: List[SyncObject] = field(default_factory=list) - sync_direction: Optional[str] = None + sync_direction: Optional[Literal["push", "pull"]] = None def __post_init__(self): if self.sync_direction not in ["pull", "push", None]: @@ -377,9 +385,9 @@ class RemoteStorageConfig: key: str bucket: str secret: str = field(repr=False) - region: str = None - host: str = None - port: int = None + region: Optional[str] = None + host: Optional[str] = None + port: Optional[int] = None base_path: str = "" secure: bool = True @@ -455,13 +463,11 @@ def bucket(self) -> Container: @cached_property def driver(self) -> StorageDriver: - storage_driver_factory = libcloud.get_driver( - libcloud.DriverType.STORAGE, self.provider - ) + storage_driver_factory = get_driver(self.provider) return storage_driver_factory(**self.driver_kwargs) def _execute_sync( - self, sync_object: SyncObject, direction: str, force=False + self, sync_object: SyncObject, direction: Literal["push", "pull"], force=False ) -> SyncObject: """ Synchronizes the local and the remote file in the given direction. Will raise an error if a file from the source @@ -475,10 +481,6 @@ def _execute_sync( will be overwritten. :return: a SyncObject that represents the status of remote and target after the synchronization """ - if direction not in ["push", "pull"]: - raise ValueError( - f"Unknown direction {direction}, has to be either 'push' or 'pull'." - ) if sync_object.equal_md5_hash_sum: log.debug( f"Skipping {direction} of {sync_object.name} because of coinciding hash sums" @@ -495,11 +497,14 @@ def _execute_sync( raise FileNotFoundError( f"Cannot push non-existing file: {sync_object.local_path}" ) - # noinspection PyTypeChecker - remote_obj: RemoteObjectProtocol = self.bucket.upload_object( - sync_object.local_path, - sync_object.remote_path, - verify_hash=False, + assert sync_object.local_path is not None + remote_obj = cast( + RemoteObjectProtocol, + self.bucket.upload_object( + sync_object.local_path, + sync_object.remote_path, + verify_hash=False, + ), ) return SyncObject(sync_object.local_path, remote_obj) @@ -508,6 +513,7 @@ def _execute_sync( raise RuntimeError( f"Cannot pull without remote object and local path. Affects: {sync_object.name}" ) + assert sync_object.local_path is not None if os.path.isdir(sync_object.local_path): raise FileExistsError( f"Cannot pull file to a path which is an existing directory: {sync_object.local_path}" @@ -519,6 +525,10 @@ def _execute_sync( sync_object.local_path, overwrite_existing=force ) return SyncObject(sync_object.local_path, sync_object.remote_obj) + else: + raise ValueError( + f"Unknown direction {direction}, has to be either 'push' or 'pull'." + ) @staticmethod def _get_remote_path(remote_obj: RemoteObjectProtocol) -> str: @@ -620,6 +630,7 @@ def _execute_sync_from_summary( desc = "force " + desc with tqdm(total=summary.size_files_to_sync(), desc=desc) as pbar: for sync_obj in summary.files_to_sync: + assert summary.sync_direction is not None synced_obj = self._execute_sync( sync_obj, direction=summary.sync_direction, force=force ) @@ -630,13 +641,13 @@ def _execute_sync_from_summary( def pull( self, remote_path: str, - local_base_dir="", - force=False, - include_regex: Union[Pattern, str] = None, - exclude_regex: Union[Pattern, str] = None, - convert_to_linux_path=True, - dryrun=False, - path_regex: Union[Pattern, str] = None, + local_base_dir: str = "", + force: bool = False, + include_regex: Optional[Union[Pattern, str]] = None, + exclude_regex: Optional[Union[Pattern, str]] = None, + convert_to_linux_path: bool = True, + dryrun: bool = False, + path_regex: Optional[Union[Pattern, str]] = None, ) -> TransactionSummary: r""" Pull either a file or a directory under the given path relative to local_base_dir. @@ -684,11 +695,11 @@ def _get_destination_path( def _get_pull_summary( self, remote_path: str, - local_base_dir="", - include_regex: Union[Pattern, str] = None, - exclude_regex: Union[Pattern, str] = None, - convert_to_linux_path=True, - path_regex: Union[Pattern, str] = None, + local_base_dir: str = "", + include_regex: Optional[Union[Pattern, str]] = None, + exclude_regex: Optional[Union[Pattern, str]] = None, + convert_to_linux_path: bool = True, + path_regex: Optional[Union[Pattern, str]] = None, ) -> TransactionSummary: r""" Creates TransactionSummary of the specified pull operation. @@ -721,8 +732,8 @@ def _get_pull_summary( summary = TransactionSummary(sync_direction="pull") full_remote_path = self._full_remote_path(remote_path) # noinspection PyTypeChecker - remote_objects: List[RemoteObjectProtocol] = list( - self.bucket.list_objects(full_remote_path) + remote_objects = cast( + List[RemoteObjectProtocol], list(self.bucket.list_objects(full_remote_path)) ) for obj in tqdm( @@ -837,10 +848,13 @@ def _get_push_summary( skip = self._should_skip(file, include_regex, exclude_regex) remote_path = self.get_push_remote_path(file) - # noinspection PyTypeChecker + + all_matched_remote_obj = cast( + List[RemoteObjectProtocol], self.bucket.list_objects(remote_path) + ) matched_remote_obj = [ obj - for obj in self.bucket.list_objects(remote_path) + for obj in all_matched_remote_obj if not self._listed_due_to_name_collision(remote_path, obj) ] @@ -850,7 +864,6 @@ def _get_push_summary( elif matched_remote_obj: remote_obj = matched_remote_obj[0] - synced_obj = SyncObject(file, remote_obj, remote_path=remote_path) summary.add_entry( synced_obj, @@ -861,7 +874,9 @@ def _get_push_summary( return summary @staticmethod - def _should_skip(file: str, include_regex: Pattern, exclude_regex: Pattern): + def _should_skip( + file: str, include_regex: Optional[Pattern], exclude_regex: Optional[Pattern] + ): if include_regex is not None and not include_regex.match(file): log.debug( f"Skipping {file} since it does not match regular expression '{include_regex}'." @@ -895,12 +910,12 @@ def _handle_deprecated_path_regex( def push( self, path: str, - local_path_prefix: str = None, + local_path_prefix: Optional[str] = None, force: bool = False, - include_regex: Union[Pattern, str] = None, - exclude_regex: Union[Pattern, str] = None, + include_regex: Optional[Union[Pattern, str]] = None, + exclude_regex: Optional[Union[Pattern, str]] = None, dryrun: bool = False, - path_regex: Union[Pattern, str] = None, + path_regex: Optional[Union[Pattern, str]] = None, ) -> TransactionSummary: """ Upload files into the remote storage. @@ -948,9 +963,9 @@ def push( def delete( self, remote_path: str, - include_regex: Union[Pattern, str] = None, - exclude_regex: Union[Pattern, str] = None, - path_regex: Union[Pattern, str] = None, + include_regex: Optional[Union[Pattern, str]] = None, + exclude_regex: Optional[Union[Pattern, str]] = None, + path_regex: Optional[Union[Pattern, str]] = None, ) -> List[RemoteObjectProtocol]: """ Deletes a file or a directory under the given path relative to local_base_dir. Use with caution! @@ -968,7 +983,9 @@ def delete( full_remote_path = self._full_remote_path(remote_path) - remote_objects = self.bucket.list_objects(full_remote_path) + remote_objects = cast( + List[RemoteObjectProtocol], self.bucket.list_objects(full_remote_path) + ) if len(remote_objects) == 0: log.warning( f"No such remote file or directory: {full_remote_path}. Not deleting anything" @@ -976,8 +993,6 @@ def delete( return [] deleted_objects = [] for remote_obj in remote_objects: - remote_obj: RemoteObjectProtocol - if self._listed_due_to_name_collision(full_remote_path, remote_obj): log.debug( f"Skipping deletion of {remote_obj.name} as it was listed due to name collision" @@ -992,8 +1007,7 @@ def delete( log.info(f"Skipping {relative_obj_path} due to regex {exclude_regex}") continue log.debug(f"Deleting {remote_obj.name}") - # noinspection PyTypeChecker - self.bucket.delete_object(remote_obj) + self.bucket.delete_object(remote_obj) # type: ignore deleted_objects.append(remote_obj) return deleted_objects @@ -1003,5 +1017,4 @@ def list_objects(self, remote_path: str) -> List[RemoteObjectProtocol]: :return: list of remote objects under the remote path (multiple entries if the remote path is a directory) """ full_remote_path = self._full_remote_path(remote_path) - # noinspection PyTypeChecker - return self.bucket.list_objects(full_remote_path) + return self.bucket.list_objects(full_remote_path) # type: ignore From 89ee3ba8dca038f8051fb2906f1a54357547ee41 Mon Sep 17 00:00:00 2001 From: Michael Panchenko Date: Fri, 24 May 2024 13:44:14 +0200 Subject: [PATCH 3/4] RemoteStorage: allow passing absolute paths when pulling This further increases convenience by introducing a useful default behavior when local_base_dir as passed to pull is an absolute path --- src/accsr/remote_storage.py | 38 ++++++++++++++++++++++++++++-- tests/accsr/test_remote_storage.py | 15 ++++++++++++ 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/src/accsr/remote_storage.py b/src/accsr/remote_storage.py index bfc929d..a5853db 100644 --- a/src/accsr/remote_storage.py +++ b/src/accsr/remote_storage.py @@ -648,12 +648,15 @@ def pull( convert_to_linux_path: bool = True, dryrun: bool = False, path_regex: Optional[Union[Pattern, str]] = None, + strip_abspath_prefix: Optional[str] = None, + strip_abs_local_base_dir: bool = True, ) -> TransactionSummary: r""" Pull either a file or a directory under the given path relative to local_base_dir. :param remote_path: remote path on storage bucket relative to the configured remote base path. - e.g. 'data/ground_truth/some_file.json' + e.g. 'data/ground_truth/some_file.json'. Can also be an absolute local path if ``strip_abspath_prefix`` + is specified. :param local_base_dir: Local base directory for constructing local path e.g. passing 'local_base_dir' will download to the path 'local_base_dir/data/ground_truth/some_file.json' in the above example @@ -669,8 +672,39 @@ def pull( (which is discouraged). :param dryrun: If True, simulates the pull operation and returns the remote objects that would have been pulled. :param path_regex: DEPRECATED! Use ``include_regex`` instead. + :param strip_abspath_prefix: Will only have an effect if the `remote_path` is absolute. + Then the given prefix is removed from it before pulling. This is useful for pulling files from a remote storage + by directly specifying absolute local paths instead of first converting them to actual remote paths. + Similar in logic to `local_path_prefix` in `push`. + A common use case is to always set `local_base_dir` to the same value and to always pass absolute paths + as `remote_path` to `pull`. + :param strip_abs_local_base_dir: If True, and `local_base_dir` is an absolute path, then + the `local_base_dir` will be treated as `strip_abspath_prefix`. See explanation of `strip_abspath_prefix`. :return: An object describing the summary of the operation. """ + + if strip_abs_local_base_dir and os.path.isabs(local_base_dir): + if strip_abspath_prefix is not None: + raise ValueError( + f"Cannot specify both `strip_abs_local_base_dir`={strip_abs_local_base_dir} " + f"and `strip_abspath_prefix`={strip_abspath_prefix}" + f"when `local_base_dir`={local_base_dir} is an absolute path." + ) + strip_abspath_prefix = local_base_dir + + remote_path_is_abs = remote_path.startswith("/") or os.path.isabs(remote_path) + + if strip_abspath_prefix is not None and remote_path_is_abs: + remote_path = remote_path.replace("\\", "/") + strip_abspath_prefix = strip_abspath_prefix.replace("\\", "/").rstrip("/") + if not remote_path.startswith(strip_abspath_prefix): + raise ValueError( + f"Remote path {remote_path} is absolute but does not start " + f"with the given prefix {strip_abspath_prefix}" + ) + # +1 for removing the leading '/' + remote_path = remote_path[len(strip_abspath_prefix) + 1 :] + include_regex = self._handle_deprecated_path_regex(include_regex, path_regex) summary = self._get_pull_summary( remote_path, @@ -921,7 +955,7 @@ def push( Upload files into the remote storage. Does not upload files for which the md5sum matches existing remote files. The remote path for uploading will be constructed from the remote_base_path and the provided path. - The local_path_prefix serves for finding the directory on the local system or for stripping off + The `local_path_prefix` serves for finding the directory on the local system or for stripping off parts of absolute paths if path is absolute, see examples below. Examples: diff --git a/tests/accsr/test_remote_storage.py b/tests/accsr/test_remote_storage.py index a7befd4..60304b4 100644 --- a/tests/accsr/test_remote_storage.py +++ b/tests/accsr/test_remote_storage.py @@ -184,6 +184,21 @@ def test_pull_file(self, storage, file_on_storage, tmpdir): pull_summary = storage.pull(file_on_storage, force=False) assert len(pull_summary.synced_files) == 0 + @pytest.mark.parametrize( + "file_on_storage", + ["sample.txt"], + indirect=["file_on_storage"], + ) + def test_pull_file_local_absolute_path(self, storage, file_on_storage, tmpdir): + local_base_dir = os.path.abspath(tmpdir.mkdir("remote_storage")) + pulled_file_abspath = os.path.join(local_base_dir, file_on_storage) + assert not os.path.exists(pulled_file_abspath) + storage.pull( + pulled_file_abspath, + local_base_dir=local_base_dir, + ) + assert os.path.isfile(pulled_file_abspath) + @pytest.mark.parametrize( "file_on_storage", ["sample.txt"], From 447059063704458e8d87380c0f4474658b94994c Mon Sep 17 00:00:00 2001 From: Michael Panchenko Date: Fri, 24 May 2024 13:44:31 +0200 Subject: [PATCH 4/4] =?UTF-8?q?Bump=20version:=200.4.7-dev0=20=E2=86=92=20?= =?UTF-8?q?0.4.7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- latest_release_notes.md | 2 +- setup.py | 2 +- src/accsr/__init__.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 145f4a4..2c1f05a 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.7-dev0 +current_version = 0.4.7 commit = False tag = False allow_dirty = False diff --git a/latest_release_notes.md b/latest_release_notes.md index b302892..c4ef6bf 100644 --- a/latest_release_notes.md +++ b/latest_release_notes.md @@ -1,4 +1,4 @@ -# Release Notes: 0.4.7-dev0 +# Release Notes: 0.4.7 ## Bugfix release - Fixed bugs in RemoteStorage related to name collisions and serialization. diff --git a/setup.py b/setup.py index 642a054..8ae8b32 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ package_dir={"": "src"}, packages=find_packages(where="src"), include_package_data=True, - version="0.4.7-dev0", + version="0.4.7", description="Utils for accessing data from anywhere", install_requires=open("requirements.txt").readlines(), setup_requires=["wheel"], diff --git a/src/accsr/__init__.py b/src/accsr/__init__.py index 6536291..a34b2f6 100644 --- a/src/accsr/__init__.py +++ b/src/accsr/__init__.py @@ -1 +1 @@ -__version__ = "0.4.7-dev0" +__version__ = "0.4.7"