Skip to content

Commit

Permalink
2/4: Add DistroFinder sub-classes for using .zip files
Browse files Browse the repository at this point in the history
These classes allow hab to resolve distros stored as .zip files, both locally
and on cloud services (AWS S3 buckets).
- Support sidecar .hab.json files next to the distro .zip files for remote
services that don't support reading ranges of a file.
- For services that support reading ranges of a file (like S3), hab can download
the .hab.json file from inside the remote .zip file without downloading
the entire .zip file first.
- Add `hab.utils.loads_json` that raises the same errors as `load_json_file`
but works on strings instead of files.
  • Loading branch information
MHendricks committed Dec 13, 2024
1 parent 217d4f5 commit 561b5cf
Show file tree
Hide file tree
Showing 8 changed files with 762 additions and 21 deletions.
133 changes: 133 additions & 0 deletions hab/distro_finders/cloud_zip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import logging
import pathlib
import time
import zipfile
from abc import ABCMeta, abstractmethod

import remotezip
from cloudpathlib import CloudPath

from .df_zip import DistroFinderZip

logger = logging.getLogger(__name__)


class HabRemoteZip(remotezip.RemoteZip):
    """`remotezip.RemoteZip` that stays open when leaving a `with` context.

    Opening a new RemoteZip instance is slow and the cost changes depending on
    the size of the .zip file, so instances are kept open for reuse. A cloud
    based workflow doesn't need to close the file pointer the way you need to
    when working on a local file. Call `close()` explicitly once the archive
    is no longer needed.
    """

    def __exit__(self, type, value, traceback):
        # Deliberately skip `close()`. Returning None also means any exception
        # raised inside the with block propagates as usual.
        return None


class DistroFinderCloudZip(DistroFinderZip, metaclass=ABCMeta):
    """Abstract base class for zipped distros stored remotely on a cloud service.

    Sub-classes implement the `client` property used to create `CloudPath`
    instances and the `credentials` method used to connect to the remote .zip.

    Working with zipped distros extracting the `hab_filename` information from
    inside the .zip file. This is useful when you have direct access to the .zip
    file.

    For `path`, this class uses a .zip `member path`. A member path is the absolute
    path to the .zip joined with the member path of files contained inside the .zip
    file. So if the archive file path is `c:/temp/dist_a_v0.1.zip` and the member is
    `hab_filename`, then the member_path would be `c:/temp/dist_a_v0.1.zip/.hab.json`.

    Note:
        This class should only be used to install distros in the hab download system.

    This expects one file to exist with a specific naming convention:
        - `{distro}_v{version}.zip` contains the entire contents of the distro.
          This should also contain the top level file `hab_filename`. When the distro
          is installed and using hab normally this file will be used. The `hab_filename`
          file's contents are extracted from the zip file and used to initialize the
          `DistroVersion` returned by `self.distro` without being written to disk.
    """

    def __init__(self, root, site=None, safe=False, client=None):
        # Only define client if it was passed, otherwise create it lazily.
        if client:
            self.client = client
        super().__init__(root, site=site, safe=safe)
        # Open `HabRemoteZip` instances keyed by their zip path so they can be
        # reused by `archive` and closed by `clear_cache`.
        self._archives = {}

    def as_posix(self):
        """Returns the root path as a posix style string."""
        if isinstance(self.root, CloudPath):
            # CloudPath doesn't need as_posix
            return str(self.root)
        return super().as_posix()

    def cast_path(self, path):
        """Return path cast to the `pathlib.Path` like class preferred by this class."""
        return CloudPath(path, client=self.client)

    @property
    @abstractmethod
    def client(self):
        """A `cloudpathlib.client.Client` used to create `CloudPath` instances."""

    @client.setter
    @abstractmethod
    def client(self, client):
        pass

    @abstractmethod
    def credentials(self):
        """Returns the credentials needed for requests to connect to the cloud resource.

        Generates these credentials using the client object. `archive` unpacks
        the return value as `(auth, headers)`.
        """

    def archive(self, zip_path, partial=True):
        """Returns a `zipfile.Zipfile` like instance for zip_path.

        Args:
            zip_path (cloudpathlib.CloudPath): The path to the zip file to open.
            partial (bool, optional): If True then you only need access to a small
                part of the archive. If True then `HabRemoteZip` will be used
                to only download specific files from the remote archive without
                caching them to disk. If False then remote archives will be fully
                downloaded to disk(using caching) before returning the open archive.
                A `pathlib.PurePath` zip_path is always opened with
                `zipfile.ZipFile` regardless of this value.

        Returns:
            The open archive with its `filename` attribute set to `zip_path`.
            `HabRemoteZip` instances are cached and reused for later calls
            with the same `zip_path`.
        """
        if not partial or isinstance(zip_path, pathlib.PurePath):
            logger.debug(f"Using CloudPath to open(downloading if needed) {zip_path}.")
            archive = zipfile.ZipFile(zip_path)
            archive.filename = zip_path
            return archive

        # Creating a RemoteZip instance is very slow compared to local file access.
        # Reuse existing objects if already created.
        if zip_path in self._archives:
            logger.debug(f"Reusing cloud .zip resource: {zip_path}")
            return self._archives[zip_path]

        logger.debug(f"Connecting to cloud .zip resource: {zip_path}")
        # Use a monotonic clock so the reported duration can't be skewed by
        # system clock adjustments made while connecting.
        start = time.perf_counter()
        auth, headers = self.credentials()

        archive = HabRemoteZip(zip_path.as_url(), auth=auth, headers=headers)
        archive.filename = zip_path
        elapsed = time.perf_counter() - start
        logger.info(f"Connected to cloud .zip resource: {zip_path}, took: {elapsed}")
        self._archives[zip_path] = archive
        return archive

    def clear_cache(self, persistent=False):
        """Clear cached data in memory. If `persistent` is True then also remove
        cache data from disk if it exists.
        """
        if persistent:
            self.remove_download_cache()
        super().clear_cache(persistent=persistent)

        # Ensure all cached archives are closed before clearing the cache.
        # `HabRemoteZip` doesn't close itself when exiting a with context.
        for archive in self._archives.values():
            archive.close()
        self._archives = {}
        if persistent:
            # Clear downloaded temp files
            self.client.clear_cache()
165 changes: 165 additions & 0 deletions hab/distro_finders/df_zip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import logging

from .. import utils
from .zip_sidecar import DistroFinderZipSidecar

logger = logging.getLogger(__name__)


class DistroFinderZip(DistroFinderZipSidecar):
    """Working with zipped distros extracting the `hab_filename` information from
    inside the .zip file. This is useful when you have direct access to the .zip
    file.

    For `path`, this class uses a .zip `member path`. A member path is the absolute
    path to the .zip joined with the member path of files contained inside the .zip
    file. So if the archive file path is `c:/temp/dist_a_v0.1.zip` and the member is
    `hab_filename`, then the member_path would be `c:/temp/dist_a_v0.1.zip/.hab.json`.

    Note:
        This class should only be used to install distros in the hab download system.

    This expects one file to exist with a specific naming convention:
        - `{distro}_v{version}.zip` contains the entire contents of the distro.
          This should also contain the top level file `hab_filename`. When the distro
          is installed and using hab normally this file will be used. The `hab_filename`
          file's contents are extracted from the zip file and used to initialize the
          `DistroVersion` returned by `self.distro` without being written to disk.
    """

    def __init__(self, root, site=None, safe=True):
        super().__init__(root, site=site)
        self.glob_str = "*.zip"
        # Maps member path -> bytes read from the archive (or None if missing).
        self._cache = {}
        # If True, `distro_path_info` verifies each archive contains `hab_filename`.
        self.safe = safe

    def clear_cache(self, persistent=False):
        """Clear cached data in memory. If `persistent` is True then also remove
        cache data from disk if it exists.
        """
        self._cache = {}

    def content(self, path):
        """Returns the distro container for a given path as `pathlib.Path`.

        For this class it returns the path to the .zip file. This .zip file
        contains the contents of the distro and the actual `hab_filename` used
        to create the distro.

        Args:
            path (pathlib.Path): The member path to the `hab_filename` file defining
                the distro.
        """
        # If path is already a .zip file return it.
        # Note: We can't concatenate this with `pathlib.Path.parents` so this has
        # to be done separately from the for loop later
        if path.suffix.lower() == ".zip":
            return path

        # Search for the right most .zip file extension and return that path if found
        for parent in path.parents:
            if parent.suffix.lower() == ".zip":
                return parent

        # Otherwise fall back to returning the path
        return path

    def content_member(self, path):
        """Splits a member path into content and member.

        Args:
            path (os.PathLike): The member path to split.

        Returns:
            content(os.PathLike): Path to the .zip file.
            member (str): Any remaining member path after the .zip file. If path
                doesn't specify a member, then a empty string is returned.
                Always uses forward slashes to match .zip member names.
        """
        content = self.content(path)
        # Member names inside a .zip file always use forward slashes. Using
        # `str` here would produce backslashes for nested members on windows.
        member = path.relative_to(content).as_posix()
        # Return a empty string instead of the relative dot
        if member == ".":
            member = ""
        return content, member

    def distro_path_info(self):
        """Generator yielding distro info for each distro found by this distro finder.

        Note:
            This class doesn't use habcache features so cached will always be `False`.

        Yields:
            dirname: Will always be `None`. This class deals with only compressed
                .zip files so there is not a parent directory to work with.
            path: The member path to a given resource.
            cached: Will always be `False`. The path is not stored in a .habcache
                file so this data is not cached across processes.
        """
        for path in self.root.glob(self.glob_str):
            member_path = path / self.hab_filename
            if self.safe:
                # Opening archives on cloud based systems is slow, this allows us
                # to disable checking that the archive actually has a `hab_filename` file.
                data = self.get_file_data(member_path)
                # This should only return None if the archive doesn't contain member
                if data is None:
                    continue

            yield None, member_path, False

    def get_file_data(self, path):
        """Return the data stored inside a member of a .zip file as bytes.

        This is cached and will only open the .zip file to read the contents the
        first time path is used for this instance.

        Args:
            path: The member path to a given resource. If the path points directly
                to a .zip file then member is assumed to be `self.hab_filename`.

        Returns:
            bytes: The contents of the member, or `None` if the archive doesn't
                contain the member.
        """
        content, member = self.content_member(path)
        if not member:
            member = self.hab_filename
            path = path / member
            logger.debug(f'Implicitly added member "{member}" to path "{path}".')

        if path in self._cache:
            return self._cache[path]

        with self.archive(content) as archive:
            if member in archive.namelist():
                data = archive.read(member)
            else:
                # Cache the miss as well so the archive isn't re-opened for it.
                data = None
            self._cache[path] = data

        return self._cache[path]

    def load_path(self, path):
        """Returns a raw dictionary use to create a `DistroVersion` with version set.

        Returns the actual contents of the .zip file's top level file `hab_filename`
        without writing that data to disk. The return is passed to `DistroVersion.load`
        as the data argument. This allows the `DistroFinder` class to directly use
        the data contained inside the .zip archive.

        The version property will always be set in the return. If not defined
        in the `hab_filename` file's contents, its set to the return of `version_for_path`.

        Args:
            path (pathlib.Path): The member path to the `hab_filename` file inside
                of the .zip file.

        Raises:
            KeyError: Raised if the requested `path` is not defined in the distro,
                i.e. the archive doesn't contain the requested member.
        """
        logger.debug(f'Loading json: "{path}"')
        data = self.get_file_data(path)
        if data is None:
            # `get_file_data` returns None for a missing member. Raise the
            # documented error instead of failing on `None.decode`.
            raise KeyError(f'"{path}" does not exist inside of the archive.')
        data = data.decode("utf-8")
        data = utils.loads_json(data, source=path)
        # Pull the version from the sidecar filename if its not explicitly set
        if "version" not in data:
            _, data["version"] = self.version_for_path(path)
        return data
84 changes: 84 additions & 0 deletions hab/distro_finders/s3_zip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import logging
from hashlib import sha256

from cloudpathlib import S3Client
from requests_aws4auth import AWS4Auth

from .. import utils
from .cloud_zip import DistroFinderCloudZip

logger = logging.getLogger(__name__)


class DistroFinderS3Zip(DistroFinderCloudZip):
    """Works with zipped distros stored remotely in Amazon S3 buckets.

    Working with zipped distros extracting the `hab_filename` information from
    inside the .zip file. This is useful when you have direct access to the .zip
    file.

    For `path`, this class uses a .zip `member path`. A member path is the absolute
    path to the .zip joined with the member path of files contained inside the .zip
    file. So if the archive file path is `c:/temp/dist_a_v0.1.zip` and the member is
    `hab_filename`, then the member_path would be `c:/temp/dist_a_v0.1.zip/.hab.json`.

    Note:
        This class should only be used to install distros in the hab download system.

    This expects one file to exist with a specific naming convention:
        - `{distro}_v{version}.zip` contains the entire contents of the distro.
          This should also contain the top level file `hab_filename`. When the distro
          is installed and using hab normally this file will be used. The `hab_filename`
          file's contents are extracted from the zip file and used to initialize the
          `DistroVersion` returned by `self.distro` without being written to disk.
    """

    def __init__(self, root, site=None, safe=False, client=None, profile_name=None):
        # Set before calling super so it is available if the client ends up
        # being created lazily while the base classes initialize.
        self.profile_name = profile_name
        super().__init__(root, site=site, safe=safe, client=client)

    @property
    def client(self):
        """The `S3Client` used to create `CloudPath` instances.

        If a client was not assigned, one is created on first access. It is
        passed `profile_name` when set and a `local_cache_dir` taken from the
        site's `downloads["cache_root"]` or the platform default download cache
        if there is no site.
        """
        try:
            return self._client
        except AttributeError:
            kwargs = {}
            if self.profile_name:
                kwargs["profile_name"] = self.profile_name
            if self.site:
                kwargs["local_cache_dir"] = self.site.downloads["cache_root"]
            else:
                kwargs["local_cache_dir"] = utils.Platform.default_download_cache()
            self._client = S3Client(**kwargs)
        return self._client

    @client.setter
    def client(self, client):
        self._client = client

    def credentials(self):
        """Returns the credentials needed for requests to connect to aws s3 bucket.

        Generates these credentials using the client object. The result is
        cached on the instance after the first call.

        Returns:
            tuple: `(auth, headers)` where auth is an `AWS4Auth` instance and
                headers is a dict of extra request headers.
        """

        try:
            return self._credentials
        except AttributeError:
            pass
        # The `x-amz-content-sha256` header is required for all AWS Signature
        # Version 4 requests. It provides a hash of the request payload. If
        # there is no payload, you must provide the hash of an empty string.
        headers = {"x-amz-content-sha256": sha256(b"").hexdigest()}

        location = self.client.client.get_bucket_location(Bucket=self.root.bucket)[
            "LocationConstraint"
        ]
        # S3 reports a null LocationConstraint for buckets in us-east-1 and may
        # report the legacy "EU" alias for eu-west-1. Neither is a usable
        # signing region, so map them to the actual region names.
        if not location:
            location = "us-east-1"
        elif location == "EU":
            location = "eu-west-1"

        auth = AWS4Auth(
            refreshable_credentials=self.client.sess.get_credentials(),
            region=location,
            service="s3",
        )

        self._credentials = (auth, headers)
        return self._credentials
Loading

0 comments on commit 561b5cf

Please sign in to comment.