-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
2/4: Add DistroFinder sub-classes for using .zip files
These classes allow hab to resolve distros stored as .zip files both locally and on cloud services (aws s3 buckets). - Support sidecar .hab.json files next to the distro .zip files for remote services that don't support reading ranges of a file. - For services that support reading ranges of files like s3 it can download the .hab.json file from inside of the remote .zip file without downloading the entire .zip file first. - Add `hab.utils.loads_json` that raises the same errors as `load_json_file` but works on strings instead of files.
- Loading branch information
1 parent
217d4f5
commit 561b5cf
Showing
8 changed files
with
762 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
import logging | ||
import pathlib | ||
import time | ||
import zipfile | ||
from abc import ABCMeta, abstractmethod | ||
|
||
import remotezip | ||
from cloudpathlib import CloudPath | ||
|
||
from .df_zip import DistroFinderZip | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class HabRemoteZip(remotezip.RemoteZip):
    """A `remotezip.RemoteZip` that stays open when its `with` block exits.

    Creating a RemoteZip instance is slow, and how slow depends on the size of
    the .zip file, so instances are intended to be reused. A cloud based
    workflow doesn't need its file pointer closed the way a local file does,
    so leaving the context does not call `close()`. Call `close()` explicitly
    once the archive is no longer needed.
    """

    def __exit__(self, type, value, traceback):
        # Deliberately a no-op so the archive can be reused after the with block.
        return None
|
||
|
||
class DistroFinderCloudZip(DistroFinderZip, metaclass=ABCMeta):
    """Abstract base for finding zipped distros stored on a cloud service.

    The `hab_filename` information is extracted from inside the .zip file,
    which is useful when you have direct access to the .zip file. Sub-classes
    provide the service specific `client` and `credentials`.

    For `path`, this class uses a .zip `member path`. A member path is the
    absolute path to the .zip joined with the member path of a file contained
    inside the .zip file. So if the archive file path is
    `c:/temp/dist_a_v0.1.zip` and the member is `hab_filename`, then the
    member_path would be `c:/temp/dist_a_v0.1.zip/.hab.json`.

    Note:
        This class should only be used to install distros in the hab download
        system.

    This expects one file to exist with a specific naming convention:
        - `{distro}_v{version}.zip` contains the entire contents of the distro.
          This should also contain the top level file `hab_filename`. When the
          distro is installed and using hab normally this file will be used.
          The `hab_filename` file's contents are extracted from the zip file
          and used to initialize the `DistroVersion` returned by `self.distro`
          without being written to disk.
    """

    def __init__(self, root, site=None, safe=False, client=None):
        # Leave `client` undefined unless one was passed in so that it can be
        # created lazily the first time it is needed.
        if client:
            self.client = client
        super().__init__(root, site=site, safe=safe)
        # Open remote archives, keyed by the path they were opened from.
        self._archives = {}

    def as_posix(self):
        """Returns the root path as a posix style string."""
        if not isinstance(self.root, CloudPath):
            return super().as_posix()
        # CloudPath doesn't need as_posix, its string form is used directly.
        return str(self.root)

    def cast_path(self, path):
        """Return path cast to the `pathlib.Path` like class preferred by this class.

        For this class that is a `CloudPath` bound to `self.client`.
        """
        return CloudPath(path, client=self.client)

    @property
    @abstractmethod
    def client(self):
        """A `cloudpathlib.client.Client` used to create `CloudPath` instances."""

    @client.setter
    @abstractmethod
    def client(self, client):
        """Store the client used to create `CloudPath` instances."""

    @abstractmethod
    def credentials(self):
        """Returns the credentials needed for requests to connect to the cloud resource.

        Generates these credentials using the client object. `archive` unpacks
        the return value as an `(auth, headers)` pair.
        """

    def archive(self, zip_path, partial=True):
        """Returns a `zipfile.Zipfile` like instance for zip_path.

        Args:
            zip_path (cloudpathlib.CloudPath): The path to the zip file to open.
            partial (bool, optional): If True then you only need access to a
                small part of the archive, and `HabRemoteZip` will be used to
                only download specific files from the remote archive without
                caching them to disk. If False then remote archives will be
                fully downloaded to disk(using caching) before returning the
                open archive.

        Returns:
            The open archive with its `filename` attribute set to `zip_path`.
            Remote archives opened with `partial` are kept and reused on later
            calls for the same `zip_path`.
        """
        # Anything that is a `pathlib.PurePath` is opened with `zipfile` even
        # when `partial` is requested.
        use_remote = partial and not isinstance(zip_path, pathlib.PurePath)
        if not use_remote:
            logger.debug(f"Using CloudPath to open(downloading if needed) {zip_path}.")
            opened = zipfile.ZipFile(zip_path)
            opened.filename = zip_path
            return opened

        # Creating a RemoteZip instance is very slow compared to local file
        # access, so hand back the existing object if one was already created.
        try:
            reused = self._archives[zip_path]
        except KeyError:
            pass
        else:
            logger.debug(f"Reusing cloud .zip resource: {zip_path}")
            return reused

        logger.debug(f"Connecting to cloud .zip resource: {zip_path}")
        started = time.time()
        auth, headers = self.credentials()

        remote = HabRemoteZip(zip_path.as_url(), auth=auth, headers=headers)
        remote.filename = zip_path
        elapsed = time.time() - started
        logger.info(f"Connected to cloud .zip resource: {zip_path}, took: {elapsed}")
        self._archives[zip_path] = remote
        return remote

    def clear_cache(self, persistent=False):
        """Clear cached data in memory. If `persistent` is True then also remove
        cache data from disk if it exists.
        """
        if persistent:
            self.remove_download_cache()
        super().clear_cache(persistent=persistent)

        # Close every cached archive before forgetting about them.
        for cached_archive in self._archives.values():
            cached_archive.close()
        self._archives = {}
        if persistent:
            # Clear downloaded temp files
            self.client.clear_cache()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
import logging | ||
|
||
from .. import utils | ||
from .zip_sidecar import DistroFinderZipSidecar | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class DistroFinderZip(DistroFinderZipSidecar):
    """Working with zipped distros extracting the `hab_filename` information from
    inside the .zip file. This is useful when you have direct access to the .zip
    file.

    For `path`, this class uses a .zip `member path`. A member path is the absolute
    path to the .zip joined with the member path of files contained inside the .zip
    file. So if the archive file path is `c:/temp/dist_a_v0.1.zip` and the member is
    `hab_filename`, then the member_path would be `c:/temp/dist_a_v0.1.zip/.hab.json`.

    Note:
        This class should only be used to install distros in the hab download system.

    This expects one file to exist with a specific naming convention:
        - `{distro}_v{version}.zip` contains the entire contents of the distro.
          This should also contain the top level file `hab_filename`. When the distro
          is installed and using hab normally this file will be used. The
          `hab_filename` file's contents are extracted from the zip file and used
          to initialize the `DistroVersion` returned by `self.distro` without
          being written to disk.
    """

    def __init__(self, root, site=None, safe=True):
        """Initialize the finder.

        Args:
            root: The location searched for .zip files.
            site (optional): Passed through to the parent class.
            safe (bool, optional): If True, `distro_path_info` opens each archive
                and skips the ones that don't contain a `hab_filename` file.
        """
        super().__init__(root, site=site)
        self.glob_str = "*.zip"
        # Maps a member path to the bytes read from the archive, or to None if
        # the archive doesn't contain that member.
        self._cache = {}
        self.safe = safe

    def clear_cache(self, persistent=False):
        """Clear cached data in memory. If `persistent` is True then also remove
        cache data from disk if it exists.
        """
        self._cache = {}

    def content(self, path):
        """Returns the distro container for a given path as `pathlib.Path`.

        For this class it returns the path to the .zip file. This .zip file
        contains the contents of the distro and the actual `hab_filename` used
        to create the distro.

        Args:
            path (pathlib.Path): The member path to the `hab_filename` file defining
                the distro.

        Returns:
            The right most part of `path` ending in `.zip`. If no part of
            `path` has a .zip extension then `path` is returned unchanged.
        """
        # If path is already a .zip file return it.
        # Note: We can't concatenate this with `pathlib.Path.parents` so this has
        # to be done separately from the for loop later
        if path.suffix.lower() == ".zip":
            return path

        # Search for the right most .zip file extension and return that path if found
        for parent in path.parents:
            if parent.suffix.lower() == ".zip":
                return parent

        # Otherwise fall back to returning the path
        return path

    def content_member(self, path):
        """Splits a member path into content and member.

        Args:
            path (os.PathLike): The member path to split.

        Returns:
            content(os.PathLike): Path to the .zip file.
            member (str): Any remaining member path after the .zip file. If path
                doesn't specify a member, then a empty string is returned.
        """
        content = self.content(path)
        member = str(path.relative_to(content))
        # Return a empty string instead of the relative dot
        if member == ".":
            member = ""
        return content, member

    def distro_path_info(self):
        """Generator yielding distro info for each distro found by this distro finder.

        Note:
            This class doesn't use habcache features so cached will always be `False`.

        Yields:
            dirname: Will always be `None`. This class deals with only compressed
                .zip files so there is not a parent directory to work with.
            path: The member path to a given resource.
            cached: Will always be `False`. The path is not stored in a .habcache
                file so this data is not cached across processes.
        """
        for path in self.root.glob(self.glob_str):
            member_path = path / self.hab_filename
            if self.safe:
                # Opening archives on cloud based systems is slow, this allows us
                # to disable checking that the archive actually has a
                # `hab_filename` file.
                data = self.get_file_data(member_path)
                # This should only return None if the archive doesn't contain member
                if data is None:
                    continue

            yield None, member_path, False

    def get_file_data(self, path):
        """Return the data stored inside a member of a .zip file as bytes.

        This is cached and will only open the .zip file to read the contents the
        first time path is used for this instance.

        Args:
            path: The member path to a given resource. If the path points directly
                to a .zip file then member is assumed to be `self.hab_filename`.

        Returns:
            bytes: The contents of the member, or `None` if the archive doesn't
                contain the member.
        """
        content, member = self.content_member(path)
        if not member:
            member = self.hab_filename
            path = path / member
            logger.debug(f'Implicitly added member "{member}" to path "{path}".')

        if path in self._cache:
            return self._cache[path]

        with self.archive(content) as archive:
            if member in archive.namelist():
                data = archive.read(member)
            else:
                # Remember the miss so the archive isn't opened again for it.
                data = None
            self._cache[path] = data

        return self._cache[path]

    def load_path(self, path):
        """Returns a raw dictionary use to create a `DistroVersion` with version set.

        Returns the actual contents of the .zip file's top level file `hab_filename`
        without writing that data to disk. The return is passed to `DistroVersion.load`
        as the data argument. This allows the `DistroFinder` class to directly use
        the data contained inside the .zip archive.

        The version property will always be set in the return. If not defined
        in the `hab_filename` file's contents, its set to the return of
        `version_for_path`.

        Args:
            path (pathlib.Path): The member path to the `hab_filename` file inside
                of the .zip file.

        Raises:
            KeyError: The requested `path` is not defined in the distro, meaning
                the .zip archive doesn't contain that member.
        """
        logger.debug(f'Loading json: "{path}"')
        data = self.get_file_data(path)
        if data is None:
            # `get_file_data` returns None for a missing member. Raise the
            # documented error instead of failing on `None.decode` below.
            raise KeyError(f'"{path}" is not defined in the distro.')
        data = data.decode("utf-8")
        data = utils.loads_json(data, source=path)
        # Pull the version from the sidecar filename if its not explicitly set
        if "version" not in data:
            _, data["version"] = self.version_for_path(path)
        return data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
import logging | ||
from hashlib import sha256 | ||
|
||
from cloudpathlib import S3Client | ||
from requests_aws4auth import AWS4Auth | ||
|
||
from .. import utils | ||
from .cloud_zip import DistroFinderCloudZip | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class DistroFinderS3Zip(DistroFinderCloudZip):
    """Works with zipped distros stored remotely in Amazon S3 buckets.

    Working with zipped distros extracting the `hab_filename` information from
    inside the .zip file. This is useful when you have direct access to the .zip
    file.

    For `path`, this class uses a .zip `member path`. A member path is the absolute
    path to the .zip joined with the member path of files contained inside the .zip
    file. So if the archive file path is `c:/temp/dist_a_v0.1.zip` and the member is
    `hab_filename`, then the member_path would be `c:/temp/dist_a_v0.1.zip/.hab.json`.

    Note:
        This class should only be used to install distros in the hab download system.

    This expects one file to exist with a specific naming convention:
        - `{distro}_v{version}.zip` contains the entire contents of the distro.
          This should also contain the top level file `hab_filename`. When the distro
          is installed and using hab normally this file will be used. The
          `hab_filename` file's contents are extracted from the zip file and used
          to initialize the `DistroVersion` returned by `self.distro` without
          being written to disk.
    """

    def __init__(self, root, site=None, safe=False, client=None, profile_name=None):
        """Initialize the finder.

        Args:
            root: The S3 location searched for .zip files.
            site (optional): Passed through to the parent class. If set, its
                `downloads["cache_root"]` is used as the client's local cache.
            safe (bool, optional): Passed through to the parent class.
            client (optional): An existing client to use instead of lazily
                creating a `S3Client`.
            profile_name (str, optional): Passed to `S3Client` as `profile_name`
                when the client is created lazily.
        """
        # Set before calling super so the lazily created client can use it.
        self.profile_name = profile_name
        super().__init__(root, site=site, safe=safe, client=client)

    @property
    def client(self):
        """A `cloudpathlib.S3Client` used to create `CloudPath` instances.

        If no client has been assigned, one is created on first access and
        then reused.
        """
        try:
            return self._client
        except AttributeError:
            kwargs = {}
            if self.profile_name:
                kwargs["profile_name"] = self.profile_name
            if self.site:
                kwargs["local_cache_dir"] = self.site.downloads["cache_root"]
            else:
                kwargs["local_cache_dir"] = utils.Platform.default_download_cache()
            self._client = S3Client(**kwargs)
        return self._client

    @client.setter
    def client(self, client):
        self._client = client

    def credentials(self):
        """Returns the credentials needed for requests to connect to aws s3 bucket.

        Generates these credentials using the client object. The result is
        created once and reused on later calls.

        Returns:
            tuple: `(auth, headers)` where auth is an `AWS4Auth` instance and
                headers is a dict of extra request headers.
        """
        try:
            return self._credentials
        except AttributeError:
            pass

        # The `x-amz-content-sha256` header is required for all AWS Signature
        # Version 4 requests. It provides a hash of the request payload. If
        # there is no payload, you must provide the hash of an empty string.
        headers = {"x-amz-content-sha256": sha256(b"").hexdigest()}

        response = self.client.client.get_bucket_location(Bucket=self.root.bucket)
        # S3 reports a null LocationConstraint for buckets in us-east-1, and
        # the request signature needs an actual region name.
        location = response.get("LocationConstraint") or "us-east-1"
        auth = AWS4Auth(
            refreshable_credentials=self.client.sess.get_credentials(),
            region=location,
            service="s3",
        )

        self._credentials = (auth, headers)
        return self._credentials
Oops, something went wrong.