From 1e00790815a5229c7bab47a44249c617d9d26a5c Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Tue, 27 Aug 2024 13:01:45 -0400 Subject: [PATCH 01/36] initial attempt at writing grdata.py --- pyproject.toml | 4 +- .../pygrackle/file_registry/__init__.py | 0 .../pygrackle/file_registry/file_registry.txt | 15 + src/python/pygrackle/utilities/data_path.py | 47 +- src/python/pygrackle/utilities/grdata.py | 972 ++++++++++++++++++ 5 files changed, 1029 insertions(+), 9 deletions(-) create mode 100644 src/python/pygrackle/file_registry/__init__.py create mode 100644 src/python/pygrackle/file_registry/file_registry.txt create mode 100644 src/python/pygrackle/utilities/grdata.py diff --git a/pyproject.toml b/pyproject.toml index a7286790..ddf18508 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,9 @@ dependencies = [ 'h5py', 'numpy', 'matplotlib', - 'yt>=4.0.2' + 'yt>=4.0.2', + 'pooch', + "importlib_resources;python_version<'3.9'" ] [project.license] diff --git a/src/python/pygrackle/file_registry/__init__.py b/src/python/pygrackle/file_registry/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/python/pygrackle/file_registry/file_registry.txt b/src/python/pygrackle/file_registry/file_registry.txt new file mode 100644 index 00000000..c4335aeb --- /dev/null +++ b/src/python/pygrackle/file_registry/file_registry.txt @@ -0,0 +1,15 @@ +// This is a file registry generated by the grackle data management tool +// To overwrite this file with an updated copy (assuming that pygrackle is +// installed), you might invoke: +// python -m pygrackle --hash_name sha1 --output +// in this sample command, you would substitute: +// -> ```` with a path to the output file +// -> ```` with a path to the directory containing all files that are +// to be included in the registry +{"CloudyData_UVB=FG2011.h5", "5b3423fb5cb96d6f8fae65655e204f1f82a276fa"}, +{"CloudyData_UVB=FG2011_shielded.h5", "60d13b4632f074fcb295f7adea85843046c0d4ef"}, 
+{"CloudyData_UVB=HM2012.h5", "3ae95f71926aa9543964fbd41c5e53a42345c19c"}, +{"CloudyData_UVB=HM2012_high_density.h5", "6db93abf8cb818975e8d751776328c5dab44d4ee"}, +{"CloudyData_UVB=HM2012_shielded.h5", "16cab5b5bd0bf5ef87db717dd5e8901be11812c2"}, +{"CloudyData_noUVB.h5", "55fed7c4bfd10e35d60660ca1adc5ceb411befb2"}, +{"cloudy_metals_2008_3D.h5", "ade563216d1102e8befab822cbb60c418b130aa1"} diff --git a/src/python/pygrackle/utilities/data_path.py b/src/python/pygrackle/utilities/data_path.py index 4e5e28b5..6b1d4d62 100644 --- a/src/python/pygrackle/utilities/data_path.py +++ b/src/python/pygrackle/utilities/data_path.py @@ -11,17 +11,48 @@ # software. ######################################################################## +from functools import partial import os +import sys +from pygrackle.grackle_wrapper import get_grackle_version +from pygrackle.utilities.grdata import make_config_objects, get_version_dir from pygrackle.utilities.misc import dirname -grackle_data_dir = os.environ.get("GRACKLE_DATA_DIR") -if grackle_data_dir is None: - # Note, this only works with an editable install of pygrackle. + +# when we shift to scikit-build-core we can do something more robust here +def _is_editable_install(): _install_dir = dirname(os.path.abspath(__file__), level=5) - grackle_data_dir = os.path.join(_install_dir, "input") + return os.path.exists(os.path.join(_install_dir, "grackle_data_files")) + + +is_editable_install = _is_editable_install() + + +def _get_file_registry_contents(editable_install): + if editable_install: + fname = os.path.join( + dirname(os.path.abspath(__file__), 2), "file_registry", "file_registry.txt" + ) + if not os.path.isfile(fname): + raise RuntimeError( + "could not find the file_registry.txt in an editable install." 
+ ) + return fname + + if (sys.version_info.major, sys.version_info.minor) < (3, 9): + import importlib_resources as resources + else: + from importlib import resources + ref = resources.files("pygrackle.file_registry") / "file_registry.txt" + + contents = ref.read_text(encoding="utf-8") + return io.StringIO(contents) + + +_CONFIG_PAIR = make_config_objects( + grackle_version=get_grackle_version()["version"], + file_registry_file=_get_file_registry_contents(is_editable_install), +) -if not os.path.isdir(grackle_data_dir): - raise RuntimeError( - f"grackle_data_dir not set to a valid directory: {grackle_data_dir}. " + \ - "Use the GRACKLE_DATA_DIR environment variable to set path to Grackle data.") +grackle_data_dir = get_version_dir(*_CONFIG_PAIR) diff --git a/src/python/pygrackle/utilities/grdata.py b/src/python/pygrackle/utilities/grdata.py new file mode 100644 index 00000000..7148ad12 --- /dev/null +++ b/src/python/pygrackle/utilities/grdata.py @@ -0,0 +1,972 @@ +#!/usr/bin/env python3 + +# A tool for managing grackle data files. This should be usable as +# -> a standalone command line tool (when pygrackle isn't installed) +# -> as a part of pygrackle. + + +# Notes on the file registry +# -------------------------- +# The file registry refers to a small file that associates a filename with a checksum. +# +# In the long term, we plan to support 3 cases involving hashes checksums: +# +# 1. have the C layer of Grackle provide the option to directly read in files from the +# data-store without being provided a full path +# -> this logic is already mostly implemented +# -> In this case, we will also need to have access to the file checksum so we can +# validate that the correct file is being accessed. (This is mostly to ensure we +# don't break other people's results because we make a mistake). 
The checksum +# validation will be performed with a tool like picohash +# -> When we do this, we will directly embed the information encoded in the file +# registry inside of scikit-build-core (we picked the file format to ensure that +# the information is easy to embed in a C file) +# +# 2. Continue supporting the functionality (and cli) implemented by this file within +# pygrackle +# +# 3. Support running using this script as a standalone command-line program. +# -> See the end of this file for what that entails + + +# With that said, the pooch package seems like a great tool for helping us do +# this (it does a bunch of the things we would probably need to do anyways). +# If/when we want to distribute this file as a standalone application alongside +# the Grackle library, we have 2 options: +# 1. We could remove pygrackle as a dependency. +# 2. We could package this file, pooch and pooch's dependencies as a +# `zipapp `_. +# This is only possible as long as: +# - pooch doesn't add many more required dependencies (right now it only +# has 2 dependencies) +# - pooch and its dependencies remain written in pure python +# - pooch and its dependencies maintain compatability with a wide range of +# python versions + + +_DESCRIPTION = """\ +This is a command line tool for downloading and managing data files to be used +with Grackle. 
+ + + +""" + +import argparse +from contextlib import contextmanager, ExitStack +import filecmp +import hashlib +import io +import os +import re +import shutil +import stat +import sys +import traceback +import warnings + +import pooch # external import + +# pooch will assume that any unlabeled checksum was computed with the following algorithm +_POOCH_DEFAULT_CKSUM_KIND = "sha256" + +if (sys.version_info.major, sys.version_info.minor) < (3, 3): + raise RuntimeError("python 3.3 or newer is required") + + +_UNSPECIFIED = object() +_OBJECT_STORE_SUBDIR = "object-store" + + +class ToolConfig: + """Tracks basic information about this tool""" + + def __init__(self, *, grackle_version, protocol_version="1", checksum_kind="sha1"): + self.grackle_version = grackle_version + self.protocol_version = protocol_version + self.checksum_kind = checksum_kind + + +class DataStoreConfig: + """Track basic configuration information + + In principle, this information is intended to be a little more + flexible and might not be known as early as ToolConfig. 
+ """ + + def __init__( + self, + *, + data_dir, + store_location, + data_repository_url, + contemporaneous_git_hash, + checksum_kind, + file_registry_file, + ): + # where data is actually stored + self.data_dir = data_dir + self.store_location = store_location + + # properties for tracking files + self.data_repository_url = data_repository_url + self.contemporaneous_git_hash = contemporaneous_git_hash + + # specifies the kinds of checksums listed in the registry + self.checksum_kind = checksum_kind + # the following specifies the file containing the file registry + self.file_registry_file = file_registry_file + + +def _get_platform_data_dir(appname="grackle"): + """Returns a string specifying the default data directory + + All of these choices are inspired by the API description of the platformdirs python + package + * we only looked at online documentation: + https://platformdirs.readthedocs.io/en/latest/ + * we have NOT read any source code + """ + if sys.platform.startswith("win32"): + raise RuntimeError() + elif sys.platform.startswith("darwin"): + return os.path.expanduser(f"~/Library/Application Support/{appname}") + else: # assume linux/unix + # https://specifications.freedesktop.org/basedir-spec/latest/ + dflt = "~/.local/share" + env_str = os.getenv("XDG_DATA_HOME", default=dflt) + if env_str[:1] not in ["~", "/"]: + # this is what the specification tells us to do + warnings.warn( + "ignoring XDG_DATA_HOME because it doesn't hold an " "absolute path" + ) + env_str = dflt + + # now actually infer the absolute path + if (env_str[0] == "~") and (not env_str.startswith("~/")): + if env_str[:2] != "~/": # for parity with C-version of this function + raise RuntimeError( + "can't expand can't expand env-variable, XDG_DATA_HOME when " + "it starts with `~user/` or just contains `~`" + ) + return os.path.expanduser(f"{env_str}/{appname}") + else: + return f"{env_str}/{appname}" + + +def _get_data_dir(): + manual_choice = os.getenv("GRACKLE_DATA_DIR", default=None) 
+    if (manual_choice is None) or (len(manual_choice) == 0):
+        return _get_platform_data_dir()
+    elif (manual_choice[0] != "~") and (not os.path.isabs(manual_choice)):
+        raise RuntimeError("GRACKLE_DATA_DIR must specify an absolute path")
+    elif manual_choice[0] == "~":
+        if manual_choice[:2] != "~/":  # for parity with C-version of this function
+            raise RuntimeError(
+                "can't expand env-variable, GRACKLE_DATA_DIR when "
+                "it starts with `~user/` or just contains `~`"
+            )
+        return os.path.expanduser(manual_choice)
+    else:
+        return manual_choice
+
+
+def _parse_file_registry(f):
+    """Read the file registry, as a dict from a text file
+
+    Parameters
+    ----------
+    f : file or str or bytes or ``os.PathLike``
+        Contains the data to be read in
+
+    Notes
+    -----
+    We describe the format below. This format was chosen so that the
+    contents could be injected into a C file to be used as a literal.
+
+    * empty lines and lines that start with ``//`` are ignored
+
+    * all other lines should look like ``{"<fname>", "<cksum>"}``
+      and there is allowed to be a trailing comma
+    """
+
+    if (sys.version_info.major, sys.version_info.minor) < (3, 6):
+        if not isinstance(f, io.IOBase):
+            path = f
+        else:
+            path = None
+    else:
+        try:
+            path = os.fspath(f)
+        except TypeError:
+            path = None
+
+    with ExitStack() as stack:
+        if path is None:
+            file = f
+        else:
+            file = stack.enter_context(open(path, "r"))
+
+        file_registry = {}
+        for i, line in enumerate(file):  # iterate over lines
+            if (len(line) == 0) or line.isspace() or line.startswith("//"):
+                continue
+            m = re.match(
+                r'^\s*{\s*"(?P<fname>[^"]+)"\s*,\s*"(?P<cksum>[^"]+)"\s*},?\s*', line
+            )
+            if m is None:
+                raise RuntimeError(
+                    f"Something went wrong with parsing line {i+1} of {f}:\n  "
+                    f" `{line}`"
+                )
+            file_registry[m["fname"]] = m["cksum"]
+        return file_registry
+
+
+class GenericToolError(RuntimeError):
+    pass
+
+
+class LockFileExistsError(FileExistsError):
+    pass
+
+
+@contextmanager
+def lock_dir(lock_file_path):
+    """
+    Contextmanager that
creates a "lock file." The context-manager will delete + the file when we finish. If the lock already exists, the program will abort + with an explanatory error (this ensures that only 1 copy of the program will + try to run at a time). + """ + try: + f = open(lock_file_path, "x") + f.close() + except FileExistsError as err: + raise LockFileExistsError( + err.errno, + err.strerror, + err.filename, + getattr(err, "winerror", None), + err.filename2, + ) from None + + try: + yield None + finally: + os.remove(lock_file_path) + + +@contextmanager +def standard_lockfile(data_config): + lock_file_path = os.path.join(data_config.data_dir, "lockfile") + with lock_dir(lock_file_path): + yield None + + +def calc_checksum(fname, alg_name, *, chunksize=4096): + """Calculate the checksum for a given fname""" + # construct the object to track intermediate state of the checksum + # calculation as we stream through the data + hash_obj = hashlib.new(alg_name) + with open(fname, "rb") as f: + buffer = bytearray(chunksize) + while True: + nbytes = f.readinto(buffer) + if nbytes == chunksize: + hash_obj.update(buffer) + elif nbytes: # equivalent to: (nbytes is not None) and (nbytes > 0) + hash_obj.update(buffer[:nbytes]) + else: + break + return hash_obj.hexdigest() + + +def matches_checksum(fname, alg_name, checksum): + return checksum == calc_checksum(fname, alg_name) + + +def _create_retriever(destination_path, data_config): + """ + create pooch object responding for fetching data files + + Notes + ----- + If we ever move away from pooch (e.g. to make this functionality easier to use in a + portable standalone script), we need to ensure that the our new approach implements + a similar procedure that they adopt where: + 1. any downloaded file is first put in a temporary location + 2. and then, only after we verify that the checksum is correct, we move the + file to the downloads directory. 
+ """ + repo_url = data_config.data_repository_url + repo_version = data_config.contemporaneous_git_hash + prefix = f"{data_config.checksum_kind}" + + file_registry = _parse_file_registry(data_config.file_registry_file) + + # if we move away from pooch, ( + return pooch.create( + path=destination_path, + base_url=f"{repo_url}/raw/{repo_version}/input/", + registry=dict((k, f"{prefix}:{v}") for k, v in file_registry.items()), + ) + + +def _pretty_log(arg): + """indent messages so it's clear when multiline messages are a single thought""" + lines = arg.splitlines() + if len(lines): + print("\n".join([f"-- {lines[0]}"] + [f" {e}" for e in lines[1:]])) + else: + print("") + + +def _ensure_exists(path, content_description): + if not os.path.isdir(path): + if len(content_description) > 0: + _pretty_log(f"creating directory {content_description}\n-> {path}") + os.mkdir(path) + + +# to be used with os.chmod to set permissions to prevent mutations of files (you can +# always delete it if you own it) +_IMMUTABLE_MODE = stat.S_IREAD | stat.S_IRGRP | stat.S_IROTH + + +class _HardlinkStrat: + """ + Acts as a "namespace" for functions related to our deduplication strategy + that uses Hardlinks + """ + + @staticmethod + def are_linked(fname, fname2): + """return whether ``fname`` & ``fname2`` specify paths that are hardlinks""" + try: + statinfo1 = os.stat(fname, follow_symlinks=False) + statinfo2 = os.stat(fname2, follow_symlinks=False) + except FileNotFoundError: + return False + return statinfo1.st_ino == statinfo2.st_ino + + @staticmethod + def remove_if_norefs(fname): + """ + Removes the specified file if there are no other references to it. 
+ + Parameters + ---------- + fname : str + Path to the file that we are operating on + + Returns + ------- + bool + Indicates if any file was removed + """ + statinfo = os.stat(fname, follow_symlinks=False) + + # statinfo.st_nlink == 1 means that the only hardlink is the hardlink of + # associated with fname + # -> it should be possible for ``os.stat(fname).st_nlink`` to return ``0`` + if statinfo.st_nlink == 1: + os.remove(fname) + return True + return False + + @staticmethod + def deduplicate(full_fname, shared_fname): + """ + Perform logic to ensure that ``full_fname`` and ``shared_fname`` + are both paths that refer to the same hardlink. + + This handles 3 main cases: + + 1. ``full_fname`` and ``shared_fname`` are already hardlinked. + + * Nothing is done. + + 2. ``full_fname`` exists and ``shared_fname`` doesn't. + + * A hardlink will be created at ``shared_fname`` that refers + to ``full_fname``. + + 3. ``full_fname`` and ``shared_fname`` specify existing distinct + copies of the same existing file. + + * in this case, ``full_fname`` is deleted and then replaced + with a hardlink that refers to ``shared_fname``. + + Parameters + ---------- + full_fname : str + Specifies an existing file-path that already exists + shared_fname : str + Specifies a file-path that may or may not exist. If it does + already exist, it will be preserved (in case it is already in + use for deduplicating other existing files) + """ + if not os.path.isfile(full_fname): + raise FileNotFoundError(full_fname) + elif _HardlinkStrat.are_linked(full_fname, shared_fname): + pass # do nothing! 
+ elif os.path.isfile(shared_fname): + if not filecmp.cmp(full_fname, shared_fname, shallow=False): + raise ValueError( + f"`{full_fname}` and `{shared_fname}` specify files that aren't " + "perfect copies" + ) + os.remove(full_fname) + os.link(shared_fname, full_fname) + else: + os.link(full_fname, shared_fname) + + +def _fetch_files(retriever, cksum_kind, object_dir): + """ + Does the heavy lifting of fetching files. + + Parameters + ---------- + retriever : ``pooch.Pooch`` + pooch object that manages downloads of the specified files + cksum_kind : str + The primary checksum algorithm that the tool is globally configured to use. + object_dir : str + Path to the object directory. This is the name where checksum names are used as + filenames. (This is the mechanism used to aid deduplication) + """ + try: + import tqdm + + progressbar = True + except: + progressbar = False + + num_download_attempts = 0 + + # NOTE: the docstring for ``pooch.create`` makes it clear that ``retriever.registry`` + # is a part of the public API + for fname, full_checksum_str in retriever.registry.items(): + # extract the checksum_kind and string that are stored in the registry + # (we are being a little more careful here than necessary, but if this ever + # becomes library-code, it will pay off) + if ":" in full_checksum_str: + cur_cksum_kind, checksum = full_checksum_str.split(":") + else: + cur_cksum_kind, checksum = _POOCH_DEFAULT_CKSUM_KIND, full_checksum_str + + if cur_cksum_kind != cksum_kind: + raise ValueError( + "Currently, we only support downloading from file registries where the " + f"checksum algorithm matches the globally used algorithm, {cksum_kind}. 
" + f"The checksum algorithm associated with {fname} is {cur_cksum_kind}" + ) + + # name associated with current file in the current grackle version + full_fname = os.path.join(retriever.path, fname) + + # if the file already exists we are done + if os.path.exists(full_fname): + if not matches_checksum(full_fname, cksum_kind, checksum): + raise RuntimeError( + f"{full_fname} already exists but has the wrong hash" + ) + continue + + num_download_attempts += 1 + + # download the file (pooch will log a detailed message + retriever.fetch(fname, progressbar=progressbar) + os.chmod(full_fname, _IMMUTABLE_MODE) + + # now deduplicate + checksum_fname = os.path.join(object_dir, checksum) + + try: + _HardlinkStrat.deduplicate(full_fname, checksum_fname) + + # not strictly necessary, but doing this for safety reasons + os.chmod(checksum_fname, _IMMUTABLE_MODE) + + except Exception as err: + # remove full_fname since we don't want users to use it before dealing + # with the larger issue. We also want to make the errors reproducible + os.remove(full_fname) + if not (isinstance(err, ValueError) and os.path.is_file(checksum_fname)): + raise err + + # this should only happens when full_fname and checksum_fname both exist, + # but aren't perfect matches of each other. We try to provide a more + # informative error message + if not matches_checksum( + checksum_fname, data_config.checksum_kind, checksum + ): + raise GenericToolError(f"""\ +A file (used for deduplication) that already existed on disk + `{checksum_fname}` +which is probably a version of `{fname}`, +doesn't have the appropriate {data_config.checksum_kind} checksum. +-> expected: {calc_checksum(checksum_fname, data_config.checksum_kind)} +-> actual: {checksum} +-> This implies that the data was corrupted and it needs to be dealt with. 
+ To avoid confusion we have deleted the newly downloaded version of + `{fname}` +-> The safest bet is probably to delete the data directory""") + else: + raise GenericToolError(f"""\ +Something bizare (& extremely unlikely) happened: +-> a previous invocation of this tool appears to have installed a data file + with the same checksum as {fname}, but has different contents. +-> we adopt a similar system to git and the odds for this to organically + happen for a small collection of files is truly astronomical! +-> this is probably a sign that something went wrong. We deleted the newly + downloaded version of the file""") + + if num_download_attempts == 0: + _pretty_log("no files needed to be loaded") + + +def get_version_dir(tool_config, data_config): + return os.path.join(data_config.store_location, tool_config.grackle_version) + + +def fetch_command(args, tool_config, data_config): + # the data_dir is a directory that contains: + # -> the data-store directory for data managed by the current protocol version + # -> (possibly) data-store directories for data managed by other protocol version + # -> (possibly) a directory called `user-data/` where users can put custom data + _ensure_exists(data_config.data_dir, "that will hold all Grackle data") + + with standard_lockfile(data_config): + # even though it isn't used for anything right now, make the directory that is + # reserved for user content + _ensure_exists( + os.path.join(data_config.data_dir, "user-data"), + "reserved for user-defined data", + ) + + # do a little more setup! + _ensure_exists(data_config.store_location, "that will hold the data-store") + + # ensure version_dir and object_dir both exist. They respectively store + # filenames that are (hard) linked to the data-file. 
+ # -> version_dir uses the names known by the associated grackle-version + # -> object_dir uses the checksum as a filenames + object_dir = os.path.join(data_config.store_location, _OBJECT_STORE_SUBDIR) + + _ensure_exists(object_dir, "") + version_dir = get_version_dir(tool_config, data_config) + _ensure_exists(version_dir, "that holds data for current Grackle version") + + # create the object that is used to actually loads the data + retriever = _create_retriever( + destination_path=version_dir, data_config=data_config + ) + + _fetch_files(retriever, data_config.checksum_kind, object_dir) + + +def direntry_iter(path, *, ftype="file", mismatch="skip", ignore=None): + """ + Iterate over the contents of a single directory with focus on a + particular file type assumption that all. + + Parameters + ---------- + path : str + path to the directory + ftype : {None, 'file', 'dir'} + When not ``None``, the iterator only produces entries for the + specified file-type + mismatch : {'skip', 'lazy_err', 'eager_err'} + Specifies the action to take when this generator encounters an + entry in ``path`` that doesn't have the specified type. 
+        * ``'skip'`` means to simply skip the entry
+        * ``'lazy_err'`` means that we raise an error
+        * ``'eager_err'`` means that we check for any mismatches and
+          raise an error if any mismatch is encountered and afterwards,
+          we start yielding elements
+    ignore : container of str, optional
+        Optional container of strings that are ignored
+
+    Yields
+    ------
+    pair : tuple of two str
+        The first element is the entry's base filename and the second is the full path
+    """
+    if ftype is None:
+        has_ftype = lambda full_path: True
+    elif ftype == "dir":
+        has_ftype = os.path.isdir
+    elif ftype == "file":
+        has_ftype = os.path.isfile
+    else:
+        raise ValueError("ftype must be None, 'file' or 'dir'")
+
+    if ignore is None:
+        ignore = []
+    elif isinstance(ignore, str):
+        raise TypeError("ignore can't be a string")
+
+    it = map(
+        lambda e: (e, os.path.join(path, e)),
+        filter(lambda e: e not in ignore, os.listdir(path)),
+    )
+    if mismatch == "eager_err":
+        for pair in direntry_iter(path, ftype=ftype, mismatch="lazy_err"):
+            pass
+        yield from it
+    elif mismatch in ["lazy_err", "skip"]:
+        for pair in it:
+            if has_ftype(pair[1]):
+                yield pair
+            elif mismatch == "lazy_err":
+                raise RuntimeError(f"{pair[1]} isn't a {ftype}")
+    else:
+        raise ValueError("mismatch must be 'eager_err', 'lazy_err' or 'skip'")
+
+
+def rm_command(args, tool_config, data_config):
+    """Logic for removing files"""
+    if args.vdata is _UNSPECIFIED:
+        # this means that we are removing the whole data store
+        if not args.data_store:
+            raise RuntimeError("SOMETHING WENT HORRIBLY, HORRIBLY WRONG")
+
+        _descr = os.path.basename(data_config.store_location)
+        target_path = data_config.store_location
+        operation_description = (
+            f"deleting ALL files in the data-store associated with this tool, {_descr}"
+        )
+        if not os.path.isdir(target_path):
+            raise GenericToolError(
+                "intended to recursively delete all contents of the associated data-store. "
+                "But no such directory can be found."
+ ) + + fn = shutil.rmtree + + else: + if args.vdata is None: + target = tool_config.grackle_version + _descr = f"associated with this tool (`{tool_config.grackle_version}`)" + else: + target = args.vdata + _descr = f"`{target}`" + target_path = os.path.join(data_config.store_location, target) + operation_description = ( + f"deleting all data file references for the grackle-version {_descr}. " + "Any files for which the reference-count drops to zero will also be removed." + ) + + if not os.path.isdir(target_path): + raise GenericToolError( + "intended to delete all data-file references for the grackle-version " + f"{_descr}, but no such data is tracked in the data-store." + ) + + def fn(path): + object_dir = os.path.join(data_config.store_location, _OBJECT_STORE_SUBDIR) + if not os.path.isdir(object_dir): + raise RuntimeError( + "SOMETHING IS HORRIBLY WRONG!!! THE {object_dir} IS MISSING" + ) + + # we throw an err if this directory contains some unexpected stuff + it = direntry_iter(path, ftype="file", mismatch="eager_err") + for name, full_path in it: + # get path to corresponding hardlinked file in _OBJECT_STORE_SUBDIR + checksum = calc_checksum(full_path, alg_name=tool_config.checksum_kind) + checksum_fname = os.path.join(object_dir, checksum) + checksum_fname_exists = os.path.isfile(checksum_fname) + + if not checksum_fname_exists: + warnings.warn( + "Something weird has happened. 
There is no deduplication file " + f"associated with {full_path}" + ) + os.remove(full_path) + if checksum_fname_exists: + _HardlinkStrat.remove_if_norefs(checksum_fname) + os.rmdir(path) + + target_exists = os.path.isdir(target_path) + + if not args.force: + _pretty_log( + f"{operation_description}\n" + "-> essentially, we are recursively removing\n" + f" `{target_path}`\n" + "-> to actually perform this command, pass the --force flag" + ) + else: + fn(target_path) + + +def lsversions_command(args, tool_config, data_config): + if not os.path.exists(data_config.store_location): + print("there is no data") + with standard_lockfile(data_config): + it = direntry_iter( + data_config.store_location, + ftype="dir", + mismatch="lazy_err", + ignore=[_OBJECT_STORE_SUBDIR], + ) + print(*sorted(pair[0] for pair in it), sep="\n") + + +def getpath_command(args, tool_config, data_config): + print(data_config.data_dir) + + +def calcreg_command(args, tool_config, data_config): + # print the properly file registry information (in the proper format that can be + # used to configure newer versions of Grackle + + # we use listdir since we are targetting 3.3, but we set things up so that we could + # use os.scandir + try: + it = direntry_iter(args.path, ftype="file", mismatch="eager_err") + except FileNotFoundError: + raise ValueError(f"{path!r} doesn't specify a directory or file") + except NotADirectoryError: + it = [(os.path.basename(args.path), args.path)] + + pairs = [(name, calc_checksum(path, args.hash_name)) for name, path in it] + + with ExitStack() as stack: + if args.output is None: + file = sys.stdout + else: + file = stack.enter_context(open(args.output, "w")) + + if file is not None: + file.write(f"""\ +// This is a file registry generated by the grackle data management tool +// To overwrite this file with an updated copy (assuming that pygrackle is +// installed), you might invoke: +// python -m pygrackle --hash_name {args.hash_name} --output +// in this sample command, 
you would substitute: +// -> ```` with a path to the output file +// -> ```` with a path to the directory containing all files that are +// to be included in the registry +""") + print(*[f'{{"{p[0]}", "{p[1]}"}}' for p in sorted(pairs)], sep=",\n", file=file) + + +def _add_version(parser, version_flag, version_name, value): + """add argument to parser to show a version and exit (similar to --help)""" + + class _Action(argparse.Action): + def __call__(self, *args, **kwargs): + print(value) + sys.exit(0) + + parser.add_argument( + version_flag, + metavar="", + action=_Action, + nargs=0, + help=f"show associated {version_name} and exit", + ) + + +def build_parser(tool_config, prog_name): + parser = argparse.ArgumentParser( + prog=prog_name, + description=_DESCRIPTION, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + _add_version( + parser, "--version-grackle", "Grackle version", tool_config.grackle_version + ) + _add_version( + parser, + "--version-protocol", + "data-store protocol version", + tool_config.protocol_version, + ) + + subparsers = parser.add_subparsers(required=True) + + parser_fetch = subparsers.add_parser( + "fetch", + help=( + "fetch data files if we don't already have the data for the " + "associated version of grackle" + ), + ) + parser_fetch.set_defaults(func=fetch_command) + + parser_ls = subparsers.add_parser("ls-versions", help="list the versions") + parser_ls.set_defaults(func=lsversions_command) + + parser_rm = subparsers.add_parser( + "rm", help="remove data associated with a given version" + ) + parser_rm.add_argument( + "-f", + "--force", + action="store_true", + help="This option must be present to actually remove things", + ) + rm_spec_grp = parser_rm.add_argument_group( + title="Target", description="specifies the target that will be removed" + ).add_mutually_exclusive_group(required=True) + rm_spec_grp.add_argument( + "--data-store", action="store_true", help="remove the full data-store" + ) + rm_spec_grp.add_argument( + 
"--vdata", + default=_UNSPECIFIED, + nargs="?", + help="remove all data associated with the contemporaneous grackle version", + ) + parser_rm.set_defaults(func=rm_command) + + parser_getpath = subparsers.add_parser( + "getpath", help="get filesystem location where all the data is stored" + ) + parser_getpath.set_defaults(func=getpath_command) + + parser_calcregistry = subparsers.add_parser( + "calcreg", + help=( + "prints the file registry (file hash pairs) for a given directory. This " + "computed registry can be used to configure future versions of Grackle." + ), + ) + parser_calcregistry.add_argument( + "-o", + "--output", + metavar="FILE", + help=( + "Write the output to a file instead of stdout. The file will include extra " + "metadata (as comments)." + ), + ) + parser_calcregistry.add_argument( + "--hash-name", + required=True, + metavar="HASH", + choices=hashlib.algorithms_guaranteed, + help=( + "the kind of checksum to compute. Must be one of: " + f"{ ', '.join(sorted(hashlib.algorithms_guaranteed))}" + ), + ) + parser_calcregistry.add_argument( + "path", help="path to the directory containing the files in the registry" + ) + parser_calcregistry.set_defaults(func=calcreg_command) + + return parser + + +def main(tool_config, data_config, prog_name): + parser = build_parser(tool_config, prog_name) + args = parser.parse_args() + + try: + args.func(args, tool_config=tool_config, data_config=data_config) + except SystemExit: + pass # this shouldn't come up! + except LockFileExistsError as err: + lock_file_path = err.filename + print( + f"""\ +ERROR: The `{lock_file_path}` lock-file already exists. +-> This probably means that another copy of this tool is currently running. 
+-> If you are absolutely sure that's not the case, that probably means that a copy + of this tool previously crashed""", + file=sys.stderr, + ) + sys.exit(78) # https://www.man7.org/linux/man-pages/man3/sysexits.h.3head.html + except GenericToolError as err: + print(f"ERROR: {err.args[0]}") + sys.exit(70) # https://www.man7.org/linux/man-pages/man3/sysexits.h.3head.html + except: + print(f"Unexpected error:", file=sys.stderr) + traceback.print_exc(file=sys.stderr) + sys.exit(70) # https://www.man7.org/linux/man-pages/man3/sysexits.h.3head.html + else: + sys.exit(0) + + +def _default_data_config(tool_config, file_registry_file): + """Provides default data configuration""" + _REPO_URL = "https://github.com/grackle-project/grackle_data_files/" + + # this is hash that holds the versions of the datafiles from the time when this + # version of the file was shipped + _CONTEMPORANEOUS_COMMIT_HASH = "9a63dbefeb1410483df0071eefcbff666f40816d" + + # FILE_REGISTRY is in a format that could be injected into a C file as a literal + data_dir = _get_data_dir() + protocol_version = tool_config.protocol_version + return DataStoreConfig( + data_dir=data_dir, + store_location=os.path.join(data_dir, f"data-store-v{protocol_version}"), + data_repository_url=_REPO_URL, + contemporaneous_git_hash=_CONTEMPORANEOUS_COMMIT_HASH, + checksum_kind=tool_config.checksum_kind, + file_registry_file=file_registry_file, + ) + + +def make_config_objects(grackle_version, file_registry_file): + """Construct the pair of configuration objects used for running the calculation + + Parameters + ---------- + grackle_version : str + the version of grackle (NOT pygrackle) + file_registry_file : file or str or bytes or ``os.PathLike`` + Contains the file registry + """ + tool_config = ToolConfig(grackle_version=grackle_version) + data_config = _default_data_config(tool_config, file_registry_file) + return tool_config, data_config + + +# to support installing this as a standalone script, we will need to 
introduce the
+# following procedure to the build-system:
+# - treat this file as a template-file and configure it with CMake's
+#   ``configure_file`` command (or invoke ``configure_file.py`` under the classic
+#   build system) in order to substitute the names enclosed by the @ symbols
+# - if we are still using pooch (or some other external package) we'll need to
+#   introduce logic to convert this into a zipapp (this is a special zip file that
+#   contains all dependencies that the python interpreter knows how to execute)
+# - make the resulting file executable (and maybe drop the .py suffix)
+# - install it into the bin directory alongside the grackle libraries
+
+if __name__ == "__main__":
+    _GRACKLE_VERSION = "@GRACKLE_VERSION@"
+    _FILE_REGISTRY_CONTENTS = """\
+@FILE_REGISTRY_CONTENTS@
+"""
+
+    def _check_substitution_problems(var_name, var_value):
+        if (
+            (var_name in var_value)
+            or ("@" in var_value)
+            or (len(var_value) == 0)
+            or (var_value.isspace())
+        ):
+            raise RuntimeError(
+                "something went wrong when the build-system was configuring the "
+                f"{var_name} variable"
+            )
+
+    _check_substitution_problems("GRACKLE_VERSION", _GRACKLE_VERSION)
+    _check_substitution_problems("FILE_REGISTRY_CONTENTS", _FILE_REGISTRY_CONTENTS)
+
+    _CONFIG_PAIR = make_config_objects(
+        grackle_version=_GRACKLE_VERSION,
+        file_registry_file=io.StringIO(_FILE_REGISTRY_CONTENTS),
+    )
+    main(*_CONFIG_PAIR, prog_name="grdata")

From 8106e6fc01d0bff74c46d0a8c0f8b180ec3619e0 Mon Sep 17 00:00:00 2001
From: Matthew Abruzzo
Date: Tue, 27 Aug 2024 16:41:09 -0400
Subject: [PATCH 02/36] Significant refactoring of grdata.py

I also added documentation and integrated the tool into the testing
framework.
---
 doc/source/Python.rst                         |   11 +
 doc/source/Tools.rst                          |   75 ++
 doc/source/index.rst                          |    1 +
 .../pygrackle/file_registry/file_registry.txt |   14 +-
 src/python/pygrackle/utilities/data_path.py   |   22 +-
 src/python/pygrackle/utilities/grdata.py      | 1010 ++++++++++++-----
 src/python/tests/conftest.py                  |   12 +
 7 files changed, 871 insertions(+), 274 deletions(-)
 create mode 100644 doc/source/Tools.rst
 create mode 100644 src/python/tests/conftest.py

diff --git a/doc/source/Python.rst b/doc/source/Python.rst
index 3f1938f2..f9defbfa 100644
--- a/doc/source/Python.rst
+++ b/doc/source/Python.rst
@@ -151,6 +151,17 @@ To make sure everything is installed properly, you can try invoking pygrackle fr
 If this command executes without raising any errors, then you have successfully
 installed Pygrackle.
 
+Installing DataFiles
+++++++++++++++++++++
+
+To install the datafiles in a location usable for automatic usage in the Pygrackle examples (and tests) we recommend invoking the following command (from any directory):
+
+.. code-block:: shell-session
+
+   $ python -m pygrackle fetch
+
+See :ref:`This section ` for more details about customizing the location where data is stored and about managing datafiles in general.
+
 .. _pygrackle-dev:
 
 Installing Pygrackle Development Requirements
diff --git a/doc/source/Tools.rst b/doc/source/Tools.rst
new file mode 100644
index 00000000..1a3d4aac
--- /dev/null
+++ b/doc/source/Tools.rst
@@ -0,0 +1,75 @@
+
+.. _manage-data-files:
+
+Datafile Management
+===================
+
+We provide a command line tool to optionally manage Grackle's datafiles.
+
+At a Quick Glance
+-----------------
+
+Currently, this command line tool is only accessible when :ref:`pygrackle is installed `.
+To execute the tool, run
+
+.. code-block:: shell-session
+
+   $ python -m pygrackle ...
+
+Where ``...`` is replaced with one or more command-line arguments.
+For example, ``fetch`` will invoke a subcommand that downloads all associated files (if they aren't already downloaded).
+You can use the ``--help`` option to get a list of all subcommands.
+You can also pass the ``--help`` option after the name of a subcommand (e.g. you can use ``fetch --help``) to get more details about subcommand-specific options.
+
+.. note::
+
+   At the moment, this functionality is most useful for pygrackle.
+   In the near future [#df1]_\ , it will be possible to install pygrackle without manually downloading the grackle repository.
+   At that time, this will be the most efficient way to retrieve the files.
+   The pygrackle examples and some of the pygrackle tests rely upon this functionality.
+   However, you are free to completely ignore this functionality for your own purposes.
+
+   There is ongoing work to implement functionality for the Grackle C library to directly access the datafiles managed by this tool.
+   When these efforts are finished, we plan to additionally provide this command-line-tool as a standalone program that is always installed alongside Grackle (so that you can access this functionality without installing pygrackle).
+
+Description
+-----------
+
+.. include:: ../../src/python/pygrackle/utilities/grdata.py
+   :start-after: [[[BEGIN-SECTION:DESCRIPTION]]]
+   :end-before: [[[END-SECTION:DESCRIPTION]]]
+
+
+
+Motivation
+----------
+
+.. include:: ../../src/python/pygrackle/utilities/grdata.py
+   :start-after: [[[BEGIN-SECTION:MOTIVATION]]]
+   :end-before: [[[END-SECTION:MOTIVATION]]]
+
+
+How it works
+------------
+
+.. include:: ../../src/python/pygrackle/utilities/grdata.py
+   :start-after: [[[BEGIN-SECTION:INTERNALS-OVERVIEW]]]
+   :end-before: [[[END-SECTION:INTERNALS-OVERVIEW]]]
+
+
+Sample Directory Structure
+++++++++++++++++++++++++++
+
+Down below, we sketch out what the directory-structure might look like:
+
+
+.. literalinclude:: ../../src/python/pygrackle/utilities/grdata.py
+   :language: none
+   :start-after: [[[BEGIN:DIRECTORY-CARTOON]]]
+   :end-before: [[[END:DIRECTORY-CARTOON]]]
+
+
+.. rubric:: Footnotes
+
+..
[#df1] Once `GH-#208 `__ is merged, you will be able to instruct pip to install pygrackle by just specifying the URL of the GitHub repository. + We also have plans to upload pygrackle to pip. diff --git a/doc/source/index.rst b/doc/source/index.rst index a1c007c3..a859235b 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -47,6 +47,7 @@ Contents: Reference.rst Versioning.rst Python.rst + Tools.rst Conduct.rst Contributing.rst Help.rst diff --git a/src/python/pygrackle/file_registry/file_registry.txt b/src/python/pygrackle/file_registry/file_registry.txt index c4335aeb..573741f5 100644 --- a/src/python/pygrackle/file_registry/file_registry.txt +++ b/src/python/pygrackle/file_registry/file_registry.txt @@ -6,10 +6,10 @@ // -> ```` with a path to the output file // -> ```` with a path to the directory containing all files that are // to be included in the registry -{"CloudyData_UVB=FG2011.h5", "5b3423fb5cb96d6f8fae65655e204f1f82a276fa"}, -{"CloudyData_UVB=FG2011_shielded.h5", "60d13b4632f074fcb295f7adea85843046c0d4ef"}, -{"CloudyData_UVB=HM2012.h5", "3ae95f71926aa9543964fbd41c5e53a42345c19c"}, -{"CloudyData_UVB=HM2012_high_density.h5", "6db93abf8cb818975e8d751776328c5dab44d4ee"}, -{"CloudyData_UVB=HM2012_shielded.h5", "16cab5b5bd0bf5ef87db717dd5e8901be11812c2"}, -{"CloudyData_noUVB.h5", "55fed7c4bfd10e35d60660ca1adc5ceb411befb2"}, -{"cloudy_metals_2008_3D.h5", "ade563216d1102e8befab822cbb60c418b130aa1"} +{"CloudyData_UVB=FG2011.h5", "sha1:5b3423fb5cb96d6f8fae65655e204f1f82a276fa"}, +{"CloudyData_UVB=FG2011_shielded.h5", "sha1:60d13b4632f074fcb295f7adea85843046c0d4ef"}, +{"CloudyData_UVB=HM2012.h5", "sha1:3ae95f71926aa9543964fbd41c5e53a42345c19c"}, +{"CloudyData_UVB=HM2012_high_density.h5", "sha1:6db93abf8cb818975e8d751776328c5dab44d4ee"}, +{"CloudyData_UVB=HM2012_shielded.h5", "sha1:16cab5b5bd0bf5ef87db717dd5e8901be11812c2"}, +{"CloudyData_noUVB.h5", "sha1:55fed7c4bfd10e35d60660ca1adc5ceb411befb2"}, +{"cloudy_metals_2008_3D.h5", 
"sha1:ade563216d1102e8befab822cbb60c418b130aa1"} diff --git a/src/python/pygrackle/utilities/data_path.py b/src/python/pygrackle/utilities/data_path.py index 6b1d4d62..c45f0707 100644 --- a/src/python/pygrackle/utilities/data_path.py +++ b/src/python/pygrackle/utilities/data_path.py @@ -11,14 +11,21 @@ # software. ######################################################################## -from functools import partial +import io import os import sys from pygrackle.grackle_wrapper import get_grackle_version -from pygrackle.utilities.grdata import make_config_objects, get_version_dir +from pygrackle.utilities.grdata import ( + make_config_objects, + VersionDataManager, + _parse_file_registry, +) from pygrackle.utilities.misc import dirname +# maybe it would be better to export nothing? +__all__ = ["grackle_data_dir"] + # when we shift to scikit-build-core we can do something more robust here def _is_editable_install(): @@ -55,4 +62,13 @@ def _get_file_registry_contents(editable_install): file_registry_file=_get_file_registry_contents(is_editable_install), ) -grackle_data_dir = get_version_dir(*_CONFIG_PAIR) +_MANAGER = VersionDataManager.create(*_CONFIG_PAIR) + + +def _download_all_datafiles(): + """Download all datafiles if it hasn't been downloaded already.""" + registry = _parse_file_registry(_CONFIG_PAIR[1].file_registry_file) + return _MANAGER.fetch_all(registry) + + +grackle_data_dir = _MANAGER.version_dir diff --git a/src/python/pygrackle/utilities/grdata.py b/src/python/pygrackle/utilities/grdata.py index 7148ad12..e4c5fcf0 100644 --- a/src/python/pygrackle/utilities/grdata.py +++ b/src/python/pygrackle/utilities/grdata.py @@ -3,6 +3,247 @@ # A tool for managing grackle data files. This should be usable as # -> a standalone command line tool (when pygrackle isn't installed) # -> as a part of pygrackle. 
+#
+# There is 1 snag to converting this file to a standalone command-line-tool
+# (that can be used without installing any packages):
+#
+# -> Currently, we are using the external pooch package.
+# -> I originally thought it would be a great tool for helping us manage
+#    datafiles. But our custom deduplication strategy required me to
+#    implement a bunch of functionality that makes a lot of pooch's features
+#    unnecessary.
+# -> We have 2 choices:
+#    1. We could totally remove pooch and use urllib instead
+#    2. We could package this program as a
+#       `zipapp `_.
+
+
+import argparse
+from contextlib import ExitStack
+import filecmp
+import hashlib
+import io
+import os
+import re
+import shutil
+import stat
+import sys
+import traceback
+from typing import IO, NamedTuple, Union
+import warnings
+
+import pooch  # external import
+
+if (sys.version_info.major, sys.version_info.minor) < (3, 6, 1):
+    raise RuntimeError("python 3.6.1 or newer is required")
+
+# Down below, we provide a detailed description that serves 3 purposes
+# 1. to act as a description of this file's contents for developers
+# 2. to serve as documentation on the website
+# 3. to serve as queryable documentation via the `help` subcommand
+#
+# The text enclosed in triple braces serves 2 purposes:
+# -> it is designed to be anchors used by sphinx to include the documentation.
+# -> while executing the `help` subcommand, anchors of the format
+#    `[[[BEGIN-SECTION:]]]` will be replaced with a section title ``,
+#    and all other anchors are removed.
+
+_EXTENDED_DESCRIPTION = """\
+[[[BEGIN-SECTION:DESCRIPTION]]]
+This is a management system for managing Grackle data files. The command line
+interface provides commands to fetch these data files, list all of the
+available data, and delete the data.
+
+The system stores the data files at a single global location. (Grackle,
+itself, will soon be able to access files from this location).
+
+The key feature of this system is its support for versioning:
+
+- it is able to support management of sets of datafiles (associated with
+  different grackle versions) where the datafiles have been renamed,
+  modified, or deleted between Grackle versions.
+
+- additionally, the system implements deduplication for the (very common)
+  scenario when the contents of a file are unchanged between grackle
+  versions.
+
+One minor caveat: a given version of this tool is ONLY able to download
+data for the grackle version specified by the ``--version-grackle`` flag
+(i.e. this is the grackle version that the tool ships with). However, it
+does support listing and deleting data associated with other grackle
+versions.
+
+The location of the data is controlled by the ``GRACKLE_DATA_DIR``
+environment variable. When this variable isn't specified, the tool uses
+the operating-system recommendation for user-site-data. This location can
+be queried with the ``getpath`` subcommand.
+[[[END-SECTION:DESCRIPTION]]]
+
+[[[BEGIN-SECTION:MOTIVATION]]]
+Why does this tool exist? Datafiles are required by **ANY** non-trivial
+program (e.g. a simulation-code or python script) that invokes Grackle.
+
+It is instructive to consider the historic experience of an end-user of one of
+these programs. To build Grackle, they would typically clone the git repository
+for Grackle (including the data files). To invoke their program, they would
+manually specify the path to the downloaded data file. Frankly, this doesn't
+seem so bad; the manual intervention is a minor inconvenience, at worst.
+While it would be nice to eliminate the manual intervention, it doesn't
+seem to warrant development of a special tool.
+
+Indeed, this is all true. Users who like this workflow can continue using it.
+However, this manual management of datafiles becomes problematic in any
+use-case that is marginally more complex. There are 3 considerations worth
+highlighting:
+
+  1.
**Portability:** Currently, there is no out-of-the-box approach for + any program using Grackle configured to run on one computer to run on + another machine without manual intervention. + + - If there are differences in how the machines are set up (e.g. where + the data files are placed), the paths to the Grackle data file(s) need + to be updated. This is relevant if you want to use a Pygrackle script + on a different machine or if you want to use a configuration script to + rerun a simulation (involving Grackle) on a different machine. + + - This is particularly noteworthy when it comes to automated testing! For + example, before this tool existed, Pygrackle, made some assumptions that + it was installed as an editable installation to run some examples. The + test-suite of Enzo-E is another example where extra book-keeping is + required for all test-problems that invoke Grackle. + + 2. **If the Grackle repository isn't present:** This includes the case where + a user deletes the repository after installing Grackle. It is more + important to consider the case where users are installing programs that use + Grackle without downloading the repository (or, even if the repository is + downloaded, it is done so without the user's knowledge). This latter case + will become increasingly common as we make pygrackle easier to install. + This is also plausible for cmake-builds of downstream projects that embed + Grackle compilation as part of their build. + + 3. **Having multiple Grackle Versions Installed:** This is going to be + increasingly common as Pygrackle becomes easier to install. Users have 2 + existing options in this case: (i) they maintain separate repositories of + data files for each version or (ii) they assume that they can just use + the newest version of the data-file repository. The latter option, has + historically been true (and will probably continue to be true). 
But, it could + conceivably lead to cases where people could unintentionally use a data-file + created for a newer version of grackle. (While this likely won't be a + problem, users should probably be explicitly aware that they are doing this + on the off-chance that problems do arise). + +This tool is a first step to addressing these cases. + +Currently the tool just works for Pygrackle. There is an ongoing effort to add +functionality for the Grackle library, itself, to access the files managed by +this tool. +[[[END-SECTION:MOTIVATION]]] + +[[[BEGIN-SECTION:INTERNALS-OVERVIEW]]] +We now turn our attention to describing how the internals of the +management system work. + +Fundamentally, the data management system manages a **data store**. +We will return to that in a moment. + +Protocol Version +++++++++++++++++ + +This internal logic has an associated protocol-version, (you can query +this via the ``--version-protocol`` flag). The logic may change between +protocol versions. The protocol version will change very rarely (if it +ever changes at all) + +Data Directory +++++++++++++++ + +This is simply the data directory that includes all grackle data. This path +is given by the ``GRACKLE_DATA_DIR`` environment variable, if it exists. +Otherwise it defaults to the operating-system's recommendation for +user-site-data. + +This contains several entries including the: + + - a **user-data** directory. This directory currently isn't used yet, but + it is reserved for users to put custom data-files in the future. + + - a **tmp** directory (used by the data-management tool) + + - it sometimes holds a lockfile (used to ensure that multiple instances of + this tool aren't running at once) + + - the **data store** directory(ies). This is named + ``data-store-v`` so that earlier versions of this + tool will continue to function if we ever change the protocol. (Each of + these directories are completely independent of each other). 
+
+Outside of the **user-data** directory, users should not modify/create/delete
+any files within the Data Directory (unless the tool instructs them to).
+
+Data Store
+++++++++++
+
+This is where we track the data files managed by this system. This holds a
+directory called **object-store** and 1 or more "version-directories".
+
+The primary-representation of each file is tracked within the ``object-store``
+subdirectory.
+
+- The name of each item in this directory is a unique key. This key is the
+  file’s SHA-1 checksum.
+
+- Git internally tracks objects in a very similar way (they have historically
+  used SHA-1 checksums as unique keys). The chance of an accidental collision
+  in the checksum in a large Git repository is extremely tiny. It was only 10 or
+  12 years after Git was created that the developers started worrying about
+  collisions (and they are primarily concerned with intentional collisions from
+  malicious actors).
+
+Each version-directory is named after a Grackle version (**NOT** a Pygrackle
+version).
+
+- a given version directory holds data-file references.
+- the references have the contemporaneous name of each of the data-files that
+  was shipped with the Grackle-version that corresponds to the directory's name.
+- each reference is linked to the corresponding file in the ``object-store``.
+
+When a program outside of this tool accesses a data-file, it will **ONLY**
+access the references in the version-directory that shares its name with the
+version of Grackle that the program is linked against.
+
+This tool makes use of references and the ``object-store`` to effectively
+deduplicate data. Whenever this tool deletes a "data-file" reference it will
+also delete the corresponding file from the ``object-store`` if it had no other
+references. We choose to implement references as "hard links" in order to make
+it easy to determine when a file in ``object-store`` has no reference.
+[[[END-SECTION:INTERNALS-OVERVIEW]]] + +Sample Directory Structure +++++++++++++++++++++++++++ + +Down below, we sketch out what the directory-structure might look like: + +[[[BEGIN:DIRECTORY-CARTOON]]] +GRACKLE_DATA_DIR/ + ├── data-store-v1/ # <- the data-store + │ ├── 3.3.1-dev/ # <- a version-dir + │ │ ├── CloudyData_UVB=FG2011.h5 + │ │ ├── ... + │ │ └── cloudy_metals_2008_3D.h5 + │ ├── 3.4.0/ # <- another version-dir + │ │ ├── CloudyData_UVB=FG2011.h5 + │ │ ├── ... + │ │ └── cloudy_metals_2008_3D.h5 + │ └── object-store/ # <- the object-store + │ ├── ... + │ └── ... + ├── tmp/ # <- reserved for scratch-space + ├── user-data/ # <- reserved for user data + │ ├── ... + │ └── ... + └── lockfile # <- temporary file +[[[END:DIRECTORY-CARTOON]]] +""" # Notes on the file registry @@ -29,96 +270,150 @@ # -> See the end of this file for what that entails -# With that said, the pooch package seems like a great tool for helping us do -# this (it does a bunch of the things we would probably need to do anyways). -# If/when we want to distribute this file as a standalone application alongside -# the Grackle library, we have 2 options: -# 1. We could remove pygrackle as a dependency. -# 2. We could package this file, pooch and pooch's dependencies as a -# `zipapp `_. -# This is only possible as long as: -# - pooch doesn't add many more required dependencies (right now it only -# has 2 dependencies) -# - pooch and its dependencies remain written in pure python -# - pooch and its dependencies maintain compatability with a wide range of -# python versions +def _use_progress_bar(): + try: + import tqdm # noqa: F401 + return True + except ImportError: + return False -_DESCRIPTION = """\ -This is a command line tool for downloading and managing data files to be used -with Grackle. 
+_PROGRESSBAR = _use_progress_bar() -""" +_UNSPECIFIED = object() +_OBJECT_STORE_SUBDIR = "object-store" -import argparse -from contextlib import contextmanager, ExitStack -import filecmp -import hashlib -import io -import os -import re -import shutil -import stat -import sys -import traceback -import warnings -import pooch # external import +class GenericToolError(RuntimeError): + pass -# pooch will assume that any unlabeled checksum was computed with the following algorithm -_POOCH_DEFAULT_CKSUM_KIND = "sha256" -if (sys.version_info.major, sys.version_info.minor) < (3, 3): - raise RuntimeError("python 3.3 or newer is required") +class ToolConfig(NamedTuple): + """Tracks basic information about this tool""" + grackle_version: str + protocol_version: str = "1" + checksum_kind: str = "sha1" -_UNSPECIFIED = object() -_OBJECT_STORE_SUBDIR = "object-store" +def _ensure_all_removed(fnames): + for fname in fnames: + try: + os.remove(fname) + except FileNotFoundError: + continue -class ToolConfig: - """Tracks basic information about this tool""" - def __init__(self, *, grackle_version, protocol_version="1", checksum_kind="sha1"): - self.grackle_version = grackle_version - self.protocol_version = protocol_version - self.checksum_kind = checksum_kind +class Fetcher(NamedTuple): + """Encodes information for fetching data files + + Note + ---- + Right now, we always assume that we want to support downloading from + GitHub, but in the future, we can also support fetching from a + directory + """ + + base_path: str + holds_url: bool + + @classmethod + def configure_GitHub_url(cls, data_repository_url, contemporaneous_git_hash): + repo_url = data_repository_url + repo_version = contemporaneous_git_hash + # we could also use the name of a branch (instead of a commit-hash) if we think + # that would be better + return cls(base_path=f"{repo_url}/raw/{repo_version}/input/", holds_url=True) + @classmethod + def configure_src_dir(cls, dir_path): + return cls(base_path=dir_path, 
holds_url=False) -class DataStoreConfig: + def __call__(self, fname, checksum, checksum_kind, dest_dir): + """ + Retrieve the file named ``fname`` to a location dest_dir + + Returns + ------- + full_path: str + Upon success, we return the full path of the newly fetched file + + Notes + ----- + It doesn't look too difficult to swap out the ``pooch.retrieve`` + functionality for custom logic that calls methods from python's + builtin urllib module. This would simplify the process of shipping + this functionality as a portable standalone script. + + If we ever move away from pooch (e.g. to make this functionality + easier to use in a portable standalone script), we need to ensure + that the our new approach implements a similar procedure that + they adopt where: + 1. any downloaded file is first put in a temporary location + 2. and then, only after we verify that the checksum is correct, + we move the file to the downloads directory. + """ + + if self.holds_url: + return pooch.retrieve( + url=f"{self.base_path}/{fname}", + fname=fname, + known_hash=f"{checksum_kind}:{checksum}", + path=dest_dir, + progressbar=_PROGRESSBAR, + ) + else: + tmp_name = os.path.join(dest_dir, "_tempfile") + # tmp_name can safely be removed if it exists (it only exists if this logic + # previously crashed or was interupted by SIGKILL) + _ensure_all_removed([tmp_name]) + try: + src = os.path.join(self.base_path, fname) + dst = os.path.join(dest_dir, fname) + _pretty_log( + f"retrieving {fname}:\n" f"-> from: {src}\n" f"-> to: {dst}" + ) + # copy the file + shutil.copyfile(src, tmp_name) + if not matches_checksum(tmp_name, checksum_kind, checksum): + if matches_checksum(src, checksum_kind, checksum): + raise GenericToolError( + f"while copying from {src}, data may have been corrupted" + ) + raise GenericToolError(f"{src} does't have the expected checksum") + os.rename(tmp_name, dst) + + finally: + _ensure_all_removed([tmp_name]) + + +class DataStoreConfig(NamedTuple): """Track basic 
configuration information In principle, this information is intended to be a little more flexible and might not be known as early as ToolConfig. """ - def __init__( - self, - *, - data_dir, - store_location, - data_repository_url, - contemporaneous_git_hash, - checksum_kind, - file_registry_file, - ): - # where data is actually stored - self.data_dir = data_dir - self.store_location = store_location - - # properties for tracking files - self.data_repository_url = data_repository_url - self.contemporaneous_git_hash = contemporaneous_git_hash - - # specifies the kinds of checksums listed in the registry - self.checksum_kind = checksum_kind - # the following specifies the file containing the file registry - self.file_registry_file = file_registry_file - - -def _get_platform_data_dir(appname="grackle"): + data_dir: str + store_location: str + checksum_kind: str + default_fetcher: Fetcher + file_registry_file: Union[str, bytes, os.PathLike, IO, None] + + @property + def tmp_dir(self): + """Used for hardlink test and scratch-space""" + return os.path.join(self.data_dir, "tmp") + + @property + def user_data_dir(self): + """Reserved for user data""" + return os.path.join(self.data_dir, "user-data") + + +def _get_platform_data_dir(appname="grackle", system_str=None): """Returns a string specifying the default data directory All of these choices are inspired by the API description of the platformdirs python @@ -127,9 +422,11 @@ def _get_platform_data_dir(appname="grackle"): https://platformdirs.readthedocs.io/en/latest/ * we have NOT read any source code """ - if sys.platform.startswith("win32"): + if system_str is None: + system_str=sys.platform + if system_str.startswith("win32"): raise RuntimeError() - elif sys.platform.startswith("darwin"): + elif system_str.startswith("darwin"): return os.path.expanduser(f"~/Library/Application Support/{appname}") else: # assume linux/unix # https://specifications.freedesktop.org/basedir-spec/latest/ @@ -143,7 +440,7 @@ def 
_get_platform_data_dir(appname="grackle"): env_str = dflt # now actually infer the absolute path - if (env_str[0] == "~") and (not env_str.startswith("~/")): + if env_str[0] == "~": if env_str[:2] != "~/": # for parity with C-version of this function raise RuntimeError( "can't expand can't expand env-variable, XDG_DATA_HOME when " @@ -153,7 +450,6 @@ def _get_platform_data_dir(appname="grackle"): else: return f"{env_str}/{appname}" - def _get_data_dir(): manual_choice = os.getenv("GRACKLE_DATA_DIR", default=None) if (manual_choice is None) or (len(manual_choice) == 0): @@ -161,7 +457,7 @@ def _get_data_dir(): elif (manual_choice[0] != "~") and (not os.path.isabs(manual_choice)): raise RuntimeError("GRACKLE_DATA_DIR must specify an absolute path") elif manual_choice[0] == "~": - if not env_str[:2] != "~/": # for parity with C-version of this function + if not manual_choice[:2] != "~/": # for parity with C-version of this function raise RuntimeError( "can't expand can't expand env-variable, GRACKLE_DATA_DIR when " "it starts with `~user/` or just contains `~`" @@ -223,45 +519,81 @@ def _parse_file_registry(f): return file_registry -class GenericToolError(RuntimeError): +class LockFileExistsError(FileExistsError): pass -class LockFileExistsError(FileExistsError): - pass +class LockFileContext: + """Reentrant context manager that creates a "lockfile". + The context-manager will delete the file when we finish. If the lock + already exists, the program will abort with an explanatory error + (this ensures that only 1 copy of the program will try to run at a + time). -@contextmanager -def lock_dir(lock_file_path): - """ - Contextmanager that creates a "lock file." The context-manager will delete - the file when we finish. If the lock already exists, the program will abort - with an explanatory error (this ensures that only 1 copy of the program will - try to run at a time). 
+    Examples
+    --------
+    To use this you might invoke:
+
+    >>> dir_lock = LockFileContext("path/to/lockfile")
+    >>> with dir_lock:
+    ...     # do something critical
+
+    This is reentrant in the sense that you can perform something like the
+    following (the real value here is that you can move the internal
+    with-statement inside of functions)
+
+    >>> dir_lock = LockFileContext("path/to/lockfile")
+    >>> with dir_lock:
+    ...     # do something critical
+    ...     with dir_lock:
+    ...         # do something else critical
     """
-    try:
-        f = open(lock_file_path, "x")
-        f.close()
-    except FileExistsError as err:
-        raise LockFileExistsError(
-            err.errno,
-            err.strerror,
-            err.filename,
-            getattr(err, "winerror", None),
-            err.filename2,
-        ) from None
-    try:
-        yield None
-    finally:
-        os.remove(lock_file_path)
+    def __init__(self, lock_file_path):
+        self.lock_file_path = lock_file_path
+
+        # the following is always non-negative. It can exceed 1 if the same context
+        # manager is used in nested with-statements
+        self._acquisition_count = 0
+
+    def locked(self):
+        return self._acquisition_count > 0
+
+    def __enter__(self):
+        if self._acquisition_count == 0:
+            # try to acquire the lock (by trying to create the file)
+            try:
+                f = open(self.lock_file_path, "x")
+                f.close()
+            except FileExistsError as err:
+                raise LockFileExistsError(
+                    err.errno,
+                    err.strerror,
+                    err.filename,
+                    getattr(err, "winerror", None),
+                    err.filename2,
+                ) from None
+        else:
+            # this is a nested with-statement, in a process that already owns the lock
+            pass
+
+        self._acquisition_count += 1  # only executed if process owns the lock
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        if exc_type is FileExistsError:
+            return False
+        elif self._acquisition_count <= 0:
+            raise RuntimeError("the contextmanager has a totally invalid state!")
+        elif self._acquisition_count == 1:
+            os.remove(self.lock_file_path)
+        self._acquisition_count -= 1
+        return False  # if an exception triggered the exiting of a context manager,
+        # don't
suppress it! -@contextmanager -def standard_lockfile(data_config): - lock_file_path = os.path.join(data_config.data_dir, "lockfile") - with lock_dir(lock_file_path): - yield None +def standard_lockfile(data_store_config): + return LockFileContext(os.path.join(data_store_config.data_dir, "lockfile")) def calc_checksum(fname, alg_name, *, chunksize=4096): @@ -286,33 +618,6 @@ def matches_checksum(fname, alg_name, checksum): return checksum == calc_checksum(fname, alg_name) -def _create_retriever(destination_path, data_config): - """ - create pooch object responding for fetching data files - - Notes - ----- - If we ever move away from pooch (e.g. to make this functionality easier to use in a - portable standalone script), we need to ensure that the our new approach implements - a similar procedure that they adopt where: - 1. any downloaded file is first put in a temporary location - 2. and then, only after we verify that the checksum is correct, we move the - file to the downloads directory. 
- """ - repo_url = data_config.data_repository_url - repo_version = data_config.contemporaneous_git_hash - prefix = f"{data_config.checksum_kind}" - - file_registry = _parse_file_registry(data_config.file_registry_file) - - # if we move away from pooch, ( - return pooch.create( - path=destination_path, - base_url=f"{repo_url}/raw/{repo_version}/input/", - registry=dict((k, f"{prefix}:{v}") for k, v in file_registry.items()), - ) - - def _pretty_log(arg): """indent messages so it's clear when multiline messages are a single thought""" lines = arg.splitlines() @@ -340,6 +645,27 @@ class _HardlinkStrat: that uses Hardlinks """ + @staticmethod + def is_supported(dirname): + """returns whether the OS (and filesystem supports hardlinks)""" + + fnames = [os.path.join(dirname, f"linktest_f{i}.txt") for i in [0, 1]] + _ensure_all_removed(fnames) + + try: + _contents = "THIS IS SOME TEST DATA" + with open(fnames[0], "w") as f: + f.write(_contents) + os.link(fnames[0], fnames[1]) + os.remove(fnames[0]) + with open(fnames[1], "r") as f: + support_hardlinks = f.read() == _contents + except OSError: + support_hardlinks = False + finally: + _ensure_all_removed(fnames) + return support_hardlinks + @staticmethod def are_linked(fname, fname2): """return whether ``fname`` & ``fname2`` specify paths that are hardlinks""" @@ -423,99 +749,207 @@ def deduplicate(full_fname, shared_fname): os.link(full_fname, shared_fname) -def _fetch_files(retriever, cksum_kind, object_dir): +def _ensure_data_dir_exists(data_store_config): + """Creates the data_dir if it doesn't exist + + the data_dir is a directory that contains: + -> the data-store directory for data managed by the current protocol version + -> (possibly) data-store directories for data managed by other protocol version + -> (possibly) a directory called `user-data/` where users can put custom data """ - Does the heavy lifting of fetching files. 
+ _ensure_exists(data_store_config.data_dir, "that will hold all Grackle data") - Parameters - ---------- - retriever : ``pooch.Pooch`` - pooch object that manages downloads of the specified files - cksum_kind : str - The primary checksum algorithm that the tool is globally configured to use. - object_dir : str - Path to the object directory. This is the name where checksum names are used as - filenames. (This is the mechanism used to aid deduplication) + # even though it isn't used for anything right now, make the directory that is + # reserved for user content + _ensure_exists(data_store_config.user_data_dir, "reserved for user-defined data") + + # primarily for testing whether hard-links are supported + _ensure_exists(data_store_config.tmp_dir, "reserved for scratch-space") + + +def get_version_dir(tool_config, data_store_config): + return os.path.join(data_store_config.store_location, tool_config.grackle_version) + + +def get_object_dir(data_store_config): + return os.path.join(data_store_config.store_location, _OBJECT_STORE_SUBDIR) + + +class VersionDataManager(NamedTuple): + """Actually manages downloads of files + + Warnings + -------- + This should not be considered part of a public API. The names and + existence of all attributes and methods are subject to change + + Notes + ----- + A major motivating factor in the design was providing the capacity + to create the necessary directories only when absolutely necessary + (i.e. when we are about to download data) + + Some future methods that might be worth implmenting + + * a method to download a single file + + * a method to check the validity of a single Version file (i.e. 
it + ONLY contains files listed in the specified registry, all files + match the specified checksum, AND they are all properly linked to + a file in the object directory) """ - try: - import tqdm - progressbar = True - except: - progressbar = False + # define attributes holding directory paths where data files are actually stored + + # Path to output directory, where the file-name matches the name given + # in the registry and is known by the associated grackle-version + version_dir: str + # Path to the object directory. This is the name where checksum names are used as + # filenames. (This is the mechanism used to aid deduplication) + object_dir: str + + # data_store_config holds a little more information than we actually need + # -> we may chooise to redefine this in the future + data_store_config: DataStoreConfig + + # encodes the configuration (and logic) for fetching the files + fetcher: Fetcher + + @classmethod + def create(cls, tool_config, data_store_config, *, override_fetcher=None): + """create a new instance""" + + fetcher = override_fetcher + if fetcher is None: + fetcher = data_store_config.default_fetcher + + return cls( + version_dir=get_version_dir(tool_config, data_store_config), + object_dir=get_object_dir(data_store_config), + data_store_config=data_store_config, + fetcher=fetcher, + ) - num_download_attempts = 0 + def _setup_file_system(self): + """ + helper function that ensures that the file system is set up for + fetching new files and returns the configured lockfile context + manager (it isn't locked yet) + """ + _ensure_data_dir_exists(self.data_store_config) + + lockfile_ctx = standard_lockfile(self.data_store_config) + with lockfile_ctx: + # let's validate we can actually use hardlinks + if not hasattr(os, "link"): + raise GenericToolError("The operating system doesn't support hardlinks") + elif not _HardlinkStrat.is_supported(self.data_store_config.tmp_dir): + raise GenericToolError("The file system does not support hardlinks") + + # a 
little more set up + _ensure_exists( + self.data_store_config.store_location, "that will hold the data-store" + ) + _ensure_exists(self.object_dir, "") + _ensure_exists( + self.version_dir, "that holds data for current Grackle version" + ) + + assert not lockfile_ctx.locked() # sanity check! + return lockfile_ctx + + def _fetch_file(self, fname, full_checksum_str, *, lockfile_ctx=None): + """ + Helper method to fetch a single file and provide the full path + + Returns + ------- + any_work : bool + ``True`` indicates that we actually needed to go get the file, + while ``False`` indicates that the file already existed + full_path : str + Full path to the file + """ + + if lockfile_ctx is None: + lockfile_ctx = self._setup_file_system() + + # get the global checksum kind + cksum_kind = self.data_store_config.checksum_kind - # NOTE: the docstring for ``pooch.create`` makes it clear that ``retriever.registry`` - # is a part of the public API - for fname, full_checksum_str in retriever.registry.items(): # extract the checksum_kind and string that are stored in the registry # (we are being a little more careful here than necessary, but if this ever # becomes library-code, it will pay off) if ":" in full_checksum_str: cur_cksum_kind, checksum = full_checksum_str.split(":") else: - cur_cksum_kind, checksum = _POOCH_DEFAULT_CKSUM_KIND, full_checksum_str + raise ValueError( + f"the checksum for {fname} does not specify the checksum kind" + ) if cur_cksum_kind != cksum_kind: raise ValueError( "Currently, we only support downloading from file registries where the " - f"checksum algorithm matches the globally used algorithm, {cksum_kind}. " - f"The checksum algorithm associated with {fname} is {cur_cksum_kind}" + "checksum algorithm matches the globally used algorithm, " + f"{cksum_kind}. The checksum algorithm associated with {fname} is " + f"{cur_cksum_kind}." 
) - # name associated with current file in the current grackle version - full_fname = os.path.join(retriever.path, fname) - - # if the file already exists we are done - if os.path.exists(full_fname): - if not matches_checksum(full_fname, cksum_kind, checksum): - raise RuntimeError( - f"{full_fname} already exists but has the wrong hash" - ) - continue + with lockfile_ctx: + # name associated with current file in the current grackle version + full_fname = os.path.join(self.version_dir, fname) - num_download_attempts += 1 + # if the file already exists we are done + if os.path.exists(full_fname): + if not matches_checksum(full_fname, cksum_kind, checksum): + raise RuntimeError( + f"{full_fname} already exists but has the wrong hash" + ) + return (False, full_fname) + + # download the file (pooch will log a detailed message + fetcher = self.fetcher + fetcher( + fname, + checksum=checksum, + checksum_kind=cksum_kind, + dest_dir=self.version_dir, + ) + os.chmod(full_fname, _IMMUTABLE_MODE) - # download the file (pooch will log a detailed message - retriever.fetch(fname, progressbar=progressbar) - os.chmod(full_fname, _IMMUTABLE_MODE) + # now deduplicate + cksum_fname = os.path.join(self.object_dir, checksum) - # now deduplicate - checksum_fname = os.path.join(object_dir, checksum) + try: + _HardlinkStrat.deduplicate(full_fname, cksum_fname) - try: - _HardlinkStrat.deduplicate(full_fname, checksum_fname) + # not strictly necessary, but doing this for safety reasons + os.chmod(cksum_fname, _IMMUTABLE_MODE) - # not strictly necessary, but doing this for safety reasons - os.chmod(checksum_fname, _IMMUTABLE_MODE) + except Exception as err: + # remove full_fname since we don't want users to use it before dealing + # with the larger issue. 
We also want to make the errors reproducible + os.remove(full_fname) + if (not isinstance(err, ValueError)) and os.path.is_file(cksum_fname): + raise err - except Exception as err: - # remove full_fname since we don't want users to use it before dealing - # with the larger issue. We also want to make the errors reproducible - os.remove(full_fname) - if not (isinstance(err, ValueError) and os.path.is_file(checksum_fname)): - raise err - - # this should only happens when full_fname and checksum_fname both exist, - # but aren't perfect matches of each other. We try to provide a more - # informative error message - if not matches_checksum( - checksum_fname, data_config.checksum_kind, checksum - ): - raise GenericToolError(f"""\ + # this should only happens when full_fname and cksum_fname both exist, + # but aren't perfect matches of each other. We try to provide a more + # informative error message + if not matches_checksum(cksum_fname, cksum_kind, checksum): + raise GenericToolError(f"""\ A file (used for deduplication) that already existed on disk - `{checksum_fname}` + `{cksum_fname}` which is probably a version of `{fname}`, -doesn't have the appropriate {data_config.checksum_kind} checksum. --> expected: {calc_checksum(checksum_fname, data_config.checksum_kind)} +doesn't have the appropriate {self.data_store_config.checksum_kind} checksum. +-> expected: {calc_checksum(cksum_fname, cksum_kind)} -> actual: {checksum} -> This implies that the data was corrupted and it needs to be dealt with. To avoid confusion we have deleted the newly downloaded version of `{fname}` -> The safest bet is probably to delete the data directory""") - else: - raise GenericToolError(f"""\ + else: + raise GenericToolError(f"""\ Something bizare (& extremely unlikely) happened: -> a previous invocation of this tool appears to have installed a data file with the same checksum as {fname}, but has different contents. 
@@ -523,49 +957,44 @@ def _fetch_files(retriever, cksum_kind, object_dir): happen for a small collection of files is truly astronomical! -> this is probably a sign that something went wrong. We deleted the newly downloaded version of the file""") + return (True, full_fname) - if num_download_attempts == 0: - _pretty_log("no files needed to be loaded") - - -def get_version_dir(tool_config, data_config): - return os.path.join(data_config.store_location, tool_config.grackle_version) - + def fetch_all(self, registry): + """ + Ensures that all files in the specified registry are downloaded -def fetch_command(args, tool_config, data_config): - # the data_dir is a directory that contains: - # -> the data-store directory for data managed by the current protocol version - # -> (possibly) data-store directories for data managed by other protocol version - # -> (possibly) a directory called `user-data/` where users can put custom data - _ensure_exists(data_config.data_dir, "that will hold all Grackle data") + Parameters + ---------- + registry : dict + maps file names to associated checksums + """ - with standard_lockfile(data_config): - # even though it isn't used for anything right now, make the directory that is - # reserved for user content - _ensure_exists( - os.path.join(data_config.data_dir, "user-data"), - "reserved for user-defined data", - ) + # ensure all needed directories exist and fetch the lockfile context manager + lockfile_ctx = self._setup_file_system() - # do a little more setup! - _ensure_exists(data_config.store_location, "that will hold the data-store") + with lockfile_ctx: + num_fetched = 0 + for fname, full_checksum_str in registry.items(): + any_work, _ = self._fetch_file( + fname, full_checksum_str, lockfile_ctx=lockfile_ctx + ) + num_fetched += any_work - # ensure version_dir and object_dir both exist. They respectively store - # filenames that are (hard) linked to the data-file. 
- # -> version_dir uses the names known by the associated grackle-version - # -> object_dir uses the checksum as a filenames - object_dir = os.path.join(data_config.store_location, _OBJECT_STORE_SUBDIR) + if num_fetched == 0: + _pretty_log("no files needed to be downloaded") - _ensure_exists(object_dir, "") - version_dir = get_version_dir(tool_config, data_config) - _ensure_exists(version_dir, "that holds data for current Grackle version") - # create the object that is used to actually loads the data - retriever = _create_retriever( - destination_path=version_dir, data_config=data_config - ) - - _fetch_files(retriever, data_config.checksum_kind, object_dir) +def fetch_command(args, tool_config, data_store_config): + override_fetcher = None + if args.from_dir is not None: + override_fetcher = Fetcher.configure_src_dir(args.from_dir) + man = VersionDataManager.create( + tool_config=tool_config, + data_store_config=data_store_config, + override_fetcher=override_fetcher, + ) + registry = _parse_file_registry(data_store_config.file_registry_file) + man.fetch_all(registry) def direntry_iter(path, *, ftype="file", mismatch="skip", ignore=None): @@ -596,9 +1025,13 @@ def direntry_iter(path, *, ftype="file", mismatch="skip", ignore=None): pair : tuple of two str The first element is the entry's base filename and the second is the full path """ + + def always_true(*args): + return True + if ftype is None: - has_ftype = lambda full_path: True - if ftype == "dir": + has_ftype = always_true + elif ftype == "dir": has_ftype = os.path.isdir elif ftype == "file": has_ftype = os.path.isfile @@ -628,15 +1061,15 @@ def direntry_iter(path, *, ftype="file", mismatch="skip", ignore=None): raise ValueError("mismatch must be 'eager_err', 'lazy_err' or 'skip'") -def rm_command(args, tool_config, data_config): +def rm_command(args, tool_config, data_store_config): """Logic for removing files""" if args.vdata is _UNSPECIFIED: # this means that we are removing the whole data store if not 
args.data_store: raise RuntimeError("SOMETHING WENT HORRIBLY, HORRIBLY WRONG") - _descr = os.path.basename(data_config.store_location) - target_path = data_config.store_location + _descr = os.path.basename(data_store_config.store_location) + target_path = data_store_config.store_location operation_description = ( f"deleting ALL files in the data-store associated with this tool, {_descr}" ) @@ -655,7 +1088,7 @@ def rm_command(args, tool_config, data_config): else: target = args.vdata _descr = f"`{target}`" - target_path = os.path.join(data_config.store_location, target) + target_path = os.path.join(data_store_config.store_location, target) operation_description = ( f"deleting all data file references for the grackle-version {_descr}. " "Any files for which the reference-count drops to zero will also be removed." @@ -668,7 +1101,9 @@ def rm_command(args, tool_config, data_config): ) def fn(path): - object_dir = os.path.join(data_config.store_location, _OBJECT_STORE_SUBDIR) + object_dir = os.path.join( + data_store_config.store_location, _OBJECT_STORE_SUBDIR + ) if not os.path.isdir(object_dir): raise RuntimeError( "SOMETHING IS HORRIBLY WRONG!!! THE {object_dir} IS MISSING" @@ -679,21 +1114,19 @@ def fn(path): for name, full_path in it: # get path to corresponding hardlinked file in _OBJECT_STORE_SUBDIR checksum = calc_checksum(full_path, alg_name=tool_config.checksum_kind) - checksum_fname = os.path.join(object_dir, checksum) - checksum_fname_exists = os.path.isfile(checksum_fname) + cksum_fname = os.path.join(object_dir, checksum) + cksum_fname_exists = os.path.isfile(cksum_fname) - if not checksum_fname_exists: + if not cksum_fname_exists: warnings.warn( "Something weird has happened. 
There is no deduplication file " f"associated with {full_path}" ) os.remove(full_path) - if checksum_fname_exists: - _HardlinkStrat.remove_if_norefs(checksum_fname) + if cksum_fname_exists: + _HardlinkStrat.remove_if_norefs(cksum_fname) os.rmdir(path) - target_exists = os.path.isdir(target_path) - if not args.force: _pretty_log( f"{operation_description}\n" @@ -705,12 +1138,12 @@ def fn(path): fn(target_path) -def lsversions_command(args, tool_config, data_config): - if not os.path.exists(data_config.store_location): +def lsversions_command(args, tool_config, data_store_config): + if not os.path.exists(data_store_config.store_location): print("there is no data") - with standard_lockfile(data_config): + with standard_lockfile(data_store_config): it = direntry_iter( - data_config.store_location, + data_store_config.store_location, ftype="dir", mismatch="lazy_err", ignore=[_OBJECT_STORE_SUBDIR], @@ -718,11 +1151,11 @@ def lsversions_command(args, tool_config, data_config): print(*sorted(pair[0] for pair in it), sep="\n") -def getpath_command(args, tool_config, data_config): - print(data_config.data_dir) +def getpath_command(args, tool_config, data_store_config): + print(data_store_config.data_dir) -def calcreg_command(args, tool_config, data_config): +def calcreg_command(args, tool_config, data_store_config): # print the properly file registry information (in the proper format that can be # used to configure newer versions of Grackle @@ -731,7 +1164,7 @@ def calcreg_command(args, tool_config, data_config): try: it = direntry_iter(args.path, ftype="file", mismatch="eager_err") except FileNotFoundError: - raise ValueError(f"{path!r} doesn't specify a directory or file") + raise ValueError(f"{args.path!r} doesn't specify a directory or file") except NotADirectoryError: it = [(os.path.basename(args.path), args.path)] @@ -754,7 +1187,33 @@ def calcreg_command(args, tool_config, data_config): // -> ```` with a path to the directory containing all files that are // to be 
included in the registry """) - print(*[f'{{"{p[0]}", "{p[1]}"}}' for p in sorted(pairs)], sep=",\n", file=file) + print( + *[f'{{"{p[0]}", "{args.hash_name}:{p[1]}"}}' for p in sorted(pairs)], + sep=",\n", + file=file, + ) + + +def help_command(*args, **kwargs): + # it might be nice to pipe to a pager (specified by PAGER env variable or + + # here is some logic to strip anchors + # replace the [[[BEGIN:...]]] & [[[END:...]]] anchors + _open, _close = r"\[\[\[", r"\]\]\]" + section_start_anchor = re.compile( + rf"^{_open}BEGIN-SECTION:([-,_+.! 0-9A-Za-z]+){_close}[ \t]*$" + ) + generic_anchor = re.compile(rf"^{_open}[-:,_+.! 0-9A-Za-z]+{_close}[ \t]*$") + + for line in _EXTENDED_DESCRIPTION.splitlines(): + m = section_start_anchor.match(line) + if m: + section_name = m.group(1) + print(section_name, len(section_name) * "-", sep="\n") + elif generic_anchor.match(line): + continue + else: + print(line) def _add_version(parser, version_flag, version_name, value): @@ -777,8 +1236,11 @@ def __call__(self, *args, **kwargs): def build_parser(tool_config, prog_name): parser = argparse.ArgumentParser( prog=prog_name, - description=_DESCRIPTION, - formatter_class=argparse.RawDescriptionHelpFormatter, + description=( + "This is a management system for Grackle's data files. 
Subcommands are " + "provided to fetch data files, list all available data, and delete data" + ), + epilog=f"Invoke `{prog_name} help` to get a detailed overview of the tool", ) _add_version( @@ -790,9 +1252,9 @@ def build_parser(tool_config, prog_name): "data-store protocol version", tool_config.protocol_version, ) - subparsers = parser.add_subparsers(required=True) + # fetch subcommand parser_fetch = subparsers.add_parser( "fetch", help=( @@ -800,11 +1262,21 @@ def build_parser(tool_config, prog_name): "associated version of grackle" ), ) + parser_fetch.add_argument( + "--from-dir", + default=None, + help=( + "optionally specify a path to a directory where we copy the files from " + "(instead of downloading them)" + ), + ) parser_fetch.set_defaults(func=fetch_command) + # ls-versions subcommand parser_ls = subparsers.add_parser("ls-versions", help="list the versions") parser_ls.set_defaults(func=lsversions_command) + # rm subcommand parser_rm = subparsers.add_parser( "rm", help="remove data associated with a given version" ) @@ -828,11 +1300,13 @@ def build_parser(tool_config, prog_name): ) parser_rm.set_defaults(func=rm_command) + # getpath subcommand parser_getpath = subparsers.add_parser( "getpath", help="get filesystem location where all the data is stored" ) parser_getpath.set_defaults(func=getpath_command) + # calcreg subcommand parser_calcregistry = subparsers.add_parser( "calcreg", help=( @@ -864,15 +1338,21 @@ def build_parser(tool_config, prog_name): ) parser_calcregistry.set_defaults(func=calcreg_command) + # help subcommand + parser_help = subparsers.add_parser( + "help", help="Display detailed help information about this tool" + ) + parser_help.set_defaults(func=help_command) + return parser -def main(tool_config, data_config, prog_name): +def main(tool_config, data_store_config, prog_name): parser = build_parser(tool_config, prog_name) args = parser.parse_args() try: - args.func(args, tool_config=tool_config, data_config=data_config) + 
args.func(args, tool_config=tool_config, data_store_config=data_store_config) except SystemExit: pass # this shouldn't come up! except LockFileExistsError as err: @@ -889,15 +1369,15 @@ def main(tool_config, data_config, prog_name): except GenericToolError as err: print(f"ERROR: {err.args[0]}") sys.exit(70) # https://www.man7.org/linux/man-pages/man3/sysexits.h.3head.html - except: - print(f"Unexpected error:", file=sys.stderr) + except BaseException: + print("Unexpected error:", file=sys.stderr) traceback.print_exc(file=sys.stderr) sys.exit(70) # https://www.man7.org/linux/man-pages/man3/sysexits.h.3head.html else: sys.exit(0) -def _default_data_config(tool_config, file_registry_file): +def _default_data_store_config(tool_config, file_registry_file): """Provides default data configuration""" _REPO_URL = "https://github.com/grackle-project/grackle_data_files/" @@ -911,8 +1391,10 @@ def _default_data_config(tool_config, file_registry_file): return DataStoreConfig( data_dir=data_dir, store_location=os.path.join(data_dir, f"data-store-v{protocol_version}"), - data_repository_url=_REPO_URL, - contemporaneous_git_hash=_CONTEMPORANEOUS_COMMIT_HASH, + default_fetcher=Fetcher.configure_GitHub_url( + data_repository_url=_REPO_URL, + contemporaneous_git_hash=_CONTEMPORANEOUS_COMMIT_HASH, + ), checksum_kind=tool_config.checksum_kind, file_registry_file=file_registry_file, ) @@ -929,11 +1411,11 @@ def make_config_objects(grackle_version, file_registry_file): Contains the file registry """ tool_config = ToolConfig(grackle_version=grackle_version) - data_config = _default_data_config(tool_config, file_registry_file) - return tool_config, data_config + data_store_config = _default_data_store_config(tool_config, file_registry_file) + return tool_config, data_store_config -# to support installing this as a standalone script, we will need to introduce the +# to support installing this file as a standalone program, we will need to introduce the # following procedure to the 
build-system: # - treat this file as a template-file and configure it with CMake's # ``configure_file`` command (or invoke ``configure_file.py`` under the classic diff --git a/src/python/tests/conftest.py b/src/python/tests/conftest.py new file mode 100644 index 00000000..f3051408 --- /dev/null +++ b/src/python/tests/conftest.py @@ -0,0 +1,12 @@ +# define some hook-functions that will customize pytest's behavior + +from pygrackle.utilities.data_path import _download_all_datafiles + + +def pytest_sessionstart(session): + # this is a hook that is called just before collecting tests and entering + # the test loop. + + # All we want to do is make sure that we have all of the data files that we + # need downloaded (This might not be the right place to put this logic) + _download_all_datafiles() From 468e08797144c314460374c3b7e140a601ecfaf7 Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Sun, 1 Sep 2024 16:57:52 -0600 Subject: [PATCH 03/36] some remaining changes to the grdata.py script. 
--- src/python/pygrackle/utilities/data_path.py | 18 +++-- src/python/pygrackle/utilities/grdata.py | 78 +++++++++++++++++---- 2 files changed, 79 insertions(+), 17 deletions(-) diff --git a/src/python/pygrackle/utilities/data_path.py b/src/python/pygrackle/utilities/data_path.py index c45f0707..ccfe130c 100644 --- a/src/python/pygrackle/utilities/data_path.py +++ b/src/python/pygrackle/utilities/data_path.py @@ -57,14 +57,24 @@ def _get_file_registry_contents(editable_install): return io.StringIO(contents) -_CONFIG_PAIR = make_config_objects( - grackle_version=get_grackle_version()["version"], - file_registry_file=_get_file_registry_contents(is_editable_install), -) +def _make_config_pair(grackle_version=None): + if grackle_version is None: + grackle_version = get_grackle_version()["version"] + return make_config_objects( + grackle_version=grackle_version, + file_registry_file=_get_file_registry_contents(is_editable_install), + ) + +_CONFIG_PAIR = _make_config_pair() _MANAGER = VersionDataManager.create(*_CONFIG_PAIR) +def _fnames_in_registry(): + # used for testing/debugging + return tuple(_parse_file_registry(_CONFIG_PAIR[1].file_registry_file).keys()) + + def _download_all_datafiles(): """Download all datafiles if it hasn't been downloaded already.""" registry = _parse_file_registry(_CONFIG_PAIR[1].file_registry_file) diff --git a/src/python/pygrackle/utilities/grdata.py b/src/python/pygrackle/utilities/grdata.py index e4c5fcf0..82974524 100644 --- a/src/python/pygrackle/utilities/grdata.py +++ b/src/python/pygrackle/utilities/grdata.py @@ -423,7 +423,7 @@ def _get_platform_data_dir(appname="grackle", system_str=None): * we have NOT read any source code """ if system_str is None: - system_str=sys.platform + system_str = sys.platform if system_str.startswith("win32"): raise RuntimeError() elif system_str.startswith("darwin"): @@ -450,6 +450,7 @@ def _get_platform_data_dir(appname="grackle", system_str=None): else: return f"{env_str}/{appname}" + def 
_get_data_dir(): manual_choice = os.getenv("GRACKLE_DATA_DIR", default=None) if (manual_choice is None) or (len(manual_choice) == 0): @@ -1152,7 +1153,53 @@ def lsversions_command(args, tool_config, data_store_config): def getpath_command(args, tool_config, data_store_config): - print(data_store_config.data_dir) + if args.data_dir: + print(data_store_config.data_dir) + elif args.data_store: + print(data_store_config.store_location) + else: + assert args.vdata is not _UNSPECIFIED # sanity check! + if args.vdata is None: + version = tool_config.grackle_version + else: + version = args.vdata + print(os.path.join(data_store_config.store_location, version)) + + +def _register_getpath_subcommand(subparsers): + parser_getpath = subparsers.add_parser( + "getpath", + description=( + "Provides the expected filesystem location for data. This command " + "doesn't care about whether the filesystem location actually exists." + ), + help="show expected filesystem location for data.", + ) + getpath_spec_grp = parser_getpath.add_argument_group( + title="Target", + description="specifies the target that we retrieve the path for.", + ).add_mutually_exclusive_group(required=True) + getpath_spec_grp.add_argument( + "--data-dir", action="store_true", help="get path to the data directory" + ) + getpath_spec_grp.add_argument( + "--data-store", + action="store_true", + help="get path to the data-store (for the protocol version used by this tool)", + ) + getpath_spec_grp.add_argument( + "--vdata", + default=_UNSPECIFIED, + nargs="?", + help=( + "get path to the directory of files-references associated with the " + "specified version. This command assumes that the version-data was " + "managed by a version of this tool that uses the same protocol version " + "as the version returned by --version-protocol. If no version is " + "specified, it uses the version associated with the --version-dir flag." 
+ ), + ) + parser_getpath.set_defaults(func=getpath_command) def calcreg_command(args, tool_config, data_store_config): @@ -1301,10 +1348,7 @@ def build_parser(tool_config, prog_name): parser_rm.set_defaults(func=rm_command) # getpath subcommand - parser_getpath = subparsers.add_parser( - "getpath", help="get filesystem location where all the data is stored" - ) - parser_getpath.set_defaults(func=getpath_command) + _register_getpath_subcommand(subparsers) # calcreg subcommand parser_calcregistry = subparsers.add_parser( @@ -1347,9 +1391,17 @@ def build_parser(tool_config, prog_name): return parser -def main(tool_config, data_store_config, prog_name): +def main(tool_config, data_store_config, prog_name, *, args=None): + """ + Launch the command + + Returns + ------- + int + Specified the exit code + """ parser = build_parser(tool_config, prog_name) - args = parser.parse_args() + args = parser.parse_args(args=args) try: args.func(args, tool_config=tool_config, data_store_config=data_store_config) @@ -1365,16 +1417,16 @@ def main(tool_config, data_store_config, prog_name): of this tool previously crashed""", file=sys.stderr, ) - sys.exit(78) # https://www.man7.org/linux/man-pages/man3/sysexits.h.3head.html + return 78 # https://www.man7.org/linux/man-pages/man3/sysexits.h.3head.html except GenericToolError as err: print(f"ERROR: {err.args[0]}") - sys.exit(70) # https://www.man7.org/linux/man-pages/man3/sysexits.h.3head.html + return 70 # https://www.man7.org/linux/man-pages/man3/sysexits.h.3head.html except BaseException: print("Unexpected error:", file=sys.stderr) traceback.print_exc(file=sys.stderr) - sys.exit(70) # https://www.man7.org/linux/man-pages/man3/sysexits.h.3head.html + return 70 # https://www.man7.org/linux/man-pages/man3/sysexits.h.3head.html else: - sys.exit(0) + return 0 def _default_data_store_config(tool_config, file_registry_file): @@ -1451,4 +1503,4 @@ def _check_substitution_problems(var_name, var_value): grackle_version=_GRACKLE_VERSION, 
file_registry_file=io.StringIO(_FILE_REGISTRY_CONTENTS), ) - main(*_CONFIG_PAIR, prog_name="grdata") + sys.exit(main(*_CONFIG_PAIR, prog_name="grdata")) From fb82d3194ec20be9cb20d58d9a0ade10bbbf3deb Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Tue, 17 Sep 2024 19:57:58 -0400 Subject: [PATCH 04/36] I completely forgot to commit __main__.py --- src/python/pygrackle/__main__.py | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 src/python/pygrackle/__main__.py diff --git a/src/python/pygrackle/__main__.py b/src/python/pygrackle/__main__.py new file mode 100644 index 00000000..6f633728 --- /dev/null +++ b/src/python/pygrackle/__main__.py @@ -0,0 +1,7 @@ +import sys + +from .utilities.grdata import main +from .utilities.data_path import _CONFIG_PAIR + +if __name__ == '__main__': + sys.exit(main(*_CONFIG_PAIR, prog_name="python -m pygrackle")) From e48524f200420f791f182d451b6bcf79cab8bd3e Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Thu, 19 Sep 2024 13:10:16 -0400 Subject: [PATCH 05/36] replace all usage of pooch with urllib --- pyproject.toml | 1 - src/python/pygrackle/utilities/grdata.py | 231 +++++++++++++++-------- 2 files changed, 154 insertions(+), 78 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ddf18508..3d9bf3be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,6 @@ dependencies = [ 'numpy', 'matplotlib', 'yt>=4.0.2', - 'pooch', "importlib_resources;python_version<'3.9'" ] diff --git a/src/python/pygrackle/utilities/grdata.py b/src/python/pygrackle/utilities/grdata.py index 82974524..44ac5732 100644 --- a/src/python/pygrackle/utilities/grdata.py +++ b/src/python/pygrackle/utilities/grdata.py @@ -1,28 +1,18 @@ #!/usr/bin/env python3 -# A tool for managing grackle data files. This should be usable as -# -> a standalone command line tool (when pygrackle isn't installed) -# -> as a part of pygrackle. +# A tool for managing grackle data files. 
(More details provided after imports) # -# There is 1 snag to converting this to file to a standalone command-line-tool -# (that can be used without installing any packages): +# This file should be usable as both (i) a part of pygrackle and (ii) a standalone +# command line tool (when pygrackle IS NOT installed) # -# -> Currently, we are using the external pooch package. -# -> I originaly thought it would be a great tool for helping us manage -# datafiles. But our custom deduplication strategy required me to -# implement a bunch of functionality that makes a lot of pooch's features -# unnecessary. -# -> We have 2 choices: -# 1. We could totally remove pooch and use urllib instead -# 2. We could package this program as a -# `zipapp `_. - +# To support scenario 1, this CAN ONLY use python's built in modules. import argparse -from contextlib import ExitStack +from contextlib import contextmanager, ExitStack import filecmp import hashlib import io +from math import log10 import os import re import shutil @@ -30,11 +20,12 @@ import sys import traceback from typing import IO, NamedTuple, Union +import urllib.request +from urllib.error import URLError, HTTPError import warnings -import pooch # external import - if (sys.version_info.major, sys.version_info.minor) < (3, 6, 1): + # 3.6.0 doesn't support all NamedTuple features raise RuntimeError("python 3.6.1 or newer is required") # Down below, we provide a detailed description that serves 3 purposes @@ -270,20 +261,20 @@ # -> See the end of this file for what that entails -def _use_progress_bar(): - try: - import tqdm # noqa: F401 - - return True - except ImportError: - return False +# define some constants +# ===================== +# default chunksize used for file operations +_CHUNKSIZE = 4096 -_PROGRESSBAR = _use_progress_bar() - +# the name of the subdirectory in a data-store where we handle deduplication +_OBJECT_STORE_SUBDIR = "object-store" +# Alternative to `None` for specifying that a value wasn't specified. 
This is primarily +# used as a default value for an optional command line flag that requires an argument. +# In that case, a value of _UNSPECIFIED means that the flag wasn't specified while a +# value of `None` means that the flag doesn't have an associated value. _UNSPECIFIED = object() -_OBJECT_STORE_SUBDIR = "object-store" class GenericToolError(RuntimeError): @@ -306,6 +297,100 @@ def _ensure_all_removed(fnames): continue +_MAX_BAR = 160 * "=" + + +@contextmanager +def _progress_bar(ncols, total_bytes, *, use_dummy=False): + """ + ContextManager that provides a function used for drawing/updating progress bars + + If the program wasn't excuted from a shell, or the caller want to draw + too few columns, the returned function does nothing. + """ + # the main template is '[] / ' + # -> is some fraction of _FULL_BAR and empty space + # -> describes the current/total download size (takes up to 6 characters) + # -> is 1 or 2 characters + # -> thus, we need 19 characters for everything other than + bar_len = min(len(_MAX_BAR), ncols - 19) + + if use_dummy or (total_bytes <= 0) or (bar_len <= 0) or not sys.stdout.isatty(): + use_dummy = True + + def _update(size): + return None + else: + power_div_3 = int(log10(total_bytes)) // 3 + factor, unit = 1000.0**power_div_3, ("B", "KB", "MB", "GB")[power_div_3] + fmt = "\r[{bar:{len}.{nfill}}] {size:.2f}" + f"/{total_bytes/factor:.2f} {unit}" + + def _update(size): + nfill = bar_len * int(size / total_bytes) + val = fmt.format(bar=_MAX_BAR, len=bar_len, nfill=nfill, size=size / factor) + print(val, end="", flush=True) + + try: + yield _update + finally: + # always execute this clause when exiting the context manager. 
If an exception + # caused the exit, it will be re-raised after this clause + if not use_dummy: + print(flush=True) + + +def _retrieve_url(url, dest, fname, *, use_progress_bar=True, chunksize=_CHUNKSIZE): + """ + download the file from url to dest + + Note + ---- + Online discussion about calling `response.read(chunksize)`, where + `response` is the context manager object produced by + `url.request.urlopen`, seems to strongly imply that this limits the + amount of data read from the http request into memory at a given + point in time. However, the documentation seems vague on this point. + + This is unlikely to ever be a problem (the biggest file we need is + currently under 10 Megabytes). However, if it does become a problem, + we have 2 options: + 1. we could conditionally fall back to ``curl`` (cli tool) or + ``Requests`` (python package) if they are present on the system + 2. we could craft custom http requests + """ + ncols = shutil.get_terminal_size()[0] - 1 + req = urllib.request.Request(url) + try: + with ExitStack() as stack: + # enter context managers for http-response, progress-bar, & output-file + response = stack.enter_context(urllib.request.urlopen(req)) + total_bytes = int(response.headers.get("Content-Length", -1)) + update_progress = stack.enter_context( + _progress_bar(ncols, total_bytes, use_dummy=not use_progress_bar) + ) + out_file = stack.enter_context(open(dest, "wb")) + + # write downloaded data to a file + downloaded_bytes = 0 + while True: + update_progress(downloaded_bytes) + block = response.read(chunksize) + if not block: + break + downloaded_bytes += len(block) + out_file.write(block) + except HTTPError as e: + raise GenericToolError( + f"The server couldn't fulfill the request for retrieving {fname} from " + f"{url}.\nError code: {e.code}" + ) + except URLError as e: + raise GenericToolError( + f"The server couldn't be reached while trying to retrieve {fname} from " + f"{url}.\nError code: {e.code}" + ) + + class Fetcher(NamedTuple): 
"""Encodes information for fetching data files @@ -342,51 +427,41 @@ def __call__(self, fname, checksum, checksum_kind, dest_dir): Notes ----- - It doesn't look too difficult to swap out the ``pooch.retrieve`` - functionality for custom logic that calls methods from python's - builtin urllib module. This would simplify the process of shipping - this functionality as a portable standalone script. - - If we ever move away from pooch (e.g. to make this functionality - easier to use in a portable standalone script), we need to ensure - that the our new approach implements a similar procedure that - they adopt where: - 1. any downloaded file is first put in a temporary location - 2. and then, only after we verify that the checksum is correct, - we move the file to the downloads directory. + We follow the following procedure (inspired by pooch): + 1. downloads the file to a temporary location + 2. verifies that the checksum is correct + 3. move the file to the appropriate destination + + This provides robust behavior if the program is interupted. In + principle, we could combine steps 1 and 2. But, there may be some + minor benefits to keeping our procedure like this (theoretically, + we may be more likely to catch corruption from harddrive hardware + errors). 
""" + src = os.path.join(self.base_path, fname) + dst = os.path.join(dest_dir, fname) + _pretty_log(f"-> fetching `{fname}`", indent_all=True) + tmp_name = os.path.join(dest_dir, "_tempfile") + # tmp_name can safely be removed if it exists (it only exists if this logic + # previously crashed or was interupted by SIGKILL) + _ensure_all_removed([tmp_name]) - if self.holds_url: - return pooch.retrieve( - url=f"{self.base_path}/{fname}", - fname=fname, - known_hash=f"{checksum_kind}:{checksum}", - path=dest_dir, - progressbar=_PROGRESSBAR, - ) - else: - tmp_name = os.path.join(dest_dir, "_tempfile") - # tmp_name can safely be removed if it exists (it only exists if this logic - # previously crashed or was interupted by SIGKILL) - _ensure_all_removed([tmp_name]) - try: - src = os.path.join(self.base_path, fname) - dst = os.path.join(dest_dir, fname) - _pretty_log( - f"retrieving {fname}:\n" f"-> from: {src}\n" f"-> to: {dst}" - ) + try: + if self.holds_url: + _retrieve_url(src, tmp_name, fname) + else: # copy the file shutil.copyfile(src, tmp_name) - if not matches_checksum(tmp_name, checksum_kind, checksum): - if matches_checksum(src, checksum_kind, checksum): - raise GenericToolError( - f"while copying from {src}, data may have been corrupted" - ) - raise GenericToolError(f"{src} does't have the expected checksum") - os.rename(tmp_name, dst) + if not matches_checksum(tmp_name, checksum_kind, checksum): + if matches_checksum(src, checksum_kind, checksum): + raise GenericToolError( + f"while copying from {src}, data may have been corrupted" + ) + raise GenericToolError(f"{src} does't have the expected checksum") + os.rename(tmp_name, dst) - finally: - _ensure_all_removed([tmp_name]) + finally: + _ensure_all_removed([tmp_name]) class DataStoreConfig(NamedTuple): @@ -597,7 +672,7 @@ def standard_lockfile(data_store_config): return LockFileContext(os.path.join(data_store_config.data_dir, "lockfile")) -def calc_checksum(fname, alg_name, *, chunksize=4096): +def 
calc_checksum(fname, alg_name, *, chunksize=_CHUNKSIZE): """Calculate the checksum for a given fname""" # construct the object to track intermediate state of the checksum # calculation as we stream through the data @@ -619,13 +694,14 @@ def matches_checksum(fname, alg_name, checksum): return checksum == calc_checksum(fname, alg_name) -def _pretty_log(arg): +def _pretty_log(arg, *, indent_all=False): """indent messages so it's clear when multiline messages are a single thought""" lines = arg.splitlines() - if len(lines): - print("\n".join([f"-- {lines[0]}"] + [f" {e}" for e in lines[1:]])) + if len(lines) and not indent_all: + formatted = [f"-- {lines[0]}"] + [f" {e}" for e in lines[1:]] else: - print("") + formatted = [f" {e}" for e in lines] + print(*formatted, sep="\n") def _ensure_exists(path, content_description): @@ -975,6 +1051,7 @@ def fetch_all(self, registry): with lockfile_ctx: num_fetched = 0 + _pretty_log(f"preparing to fetch files from: {self.fetcher.base_path}") for fname, full_checksum_str in registry.items(): any_work, _ = self._fetch_file( fname, full_checksum_str, lockfile_ctx=lockfile_ctx @@ -982,7 +1059,7 @@ def fetch_all(self, registry): num_fetched += any_work if num_fetched == 0: - _pretty_log("no files needed to be downloaded") + _pretty_log("-> no files needed to be retrieved", indent_all=True) def fetch_command(args, tool_config, data_store_config): @@ -1467,14 +1544,14 @@ def make_config_objects(grackle_version, file_registry_file): return tool_config, data_store_config +# Here, we define machinery employed when used as a standalone program +# ==================================================================== + # to support installing this file as a standalone program, we will need to introduce the # following procedure to the build-system: # - treat this file as a template-file and configure it with CMake's # ``configure_file`` command (or invoke ``configure_file.py`` under the classic # build system) in order to substitute the names 
enclosed by the @ symbols -# - if we are still using pooch (or some other external package) we'll need to -# introduce logic to convert this into a zipapp (this is a special zip file that -# contains all dependencies that the python interpretter knows how to execute) # - make resulting file executable (and maybe drop the .py suffix) # - install it into the bin directory alongside the grackle libraries From 9ed61efbe4eaf534fe422788bcba15f9d026bb71 Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Sun, 22 Sep 2024 16:34:35 -0400 Subject: [PATCH 06/36] Added some basic tests of the grdata command-line tool --- src/python/pygrackle/__main__.py | 12 +- src/python/pygrackle/utilities/grdata.py | 163 ++++-- src/python/tests/test_grdata.py | 711 +++++++++++++++++++++++ 3 files changed, 837 insertions(+), 49 deletions(-) create mode 100644 src/python/tests/test_grdata.py diff --git a/src/python/pygrackle/__main__.py b/src/python/pygrackle/__main__.py index 6f633728..3cc0675c 100644 --- a/src/python/pygrackle/__main__.py +++ b/src/python/pygrackle/__main__.py @@ -1,7 +1,13 @@ import sys -from .utilities.grdata import main -from .utilities.data_path import _CONFIG_PAIR +from .utilities.grdata import main as grdata_main +from .utilities.data_path import _make_config_pair + +def main(args=None): + return grdata_main( + *_make_config_pair(), prog_name="python -m pygrackle", args=args + ) + if __name__ == '__main__': - sys.exit(main(*_CONFIG_PAIR, prog_name="python -m pygrackle")) + sys.exit(main()) diff --git a/src/python/pygrackle/utilities/grdata.py b/src/python/pygrackle/utilities/grdata.py index 44ac5732..e0db8b75 100644 --- a/src/python/pygrackle/utilities/grdata.py +++ b/src/python/pygrackle/utilities/grdata.py @@ -543,6 +543,26 @@ def _get_data_dir(): return manual_choice +@contextmanager +def _file_openner(f, mode, **kwargs): + """Open a file or pass through an already open file""" + if (sys.version_info.major, sys.version_info.minor) < (3, 6): + if not isinstance(f, 
io.IOBase): + path = f + else: + path = None + else: + try: + path = os.fspath(f) + except TypeError: + path = None + if path is None: + yield f + else: + with open(path, mode, **kwargs) as fobj: + yield fobj + + def _parse_file_registry(f): """Read the file registry, as a dict from a text file @@ -562,23 +582,7 @@ def _parse_file_registry(f): and there is allowed to be a trailing comma """ - if (sys.version_info.major, sys.version_info.minor) < (3, 6): - if not isinstance(f, io.IOBase): - path = f - else: - path = None - else: - try: - path = os.fspath(f) - except TypeError: - path = None - - with ExitStack() as stack: - if path is None: - file = f - else: - file = stack.enter_context(open(path, "r")) - + with _file_openner(f, "r") as file: file_registry = {} for i, line in enumerate(file): # iterater over lines if (len(line) == 0) or line.isspace() or line.startswith("//"): @@ -677,7 +681,10 @@ def calc_checksum(fname, alg_name, *, chunksize=_CHUNKSIZE): # construct the object to track intermediate state of the checksum # calculation as we stream through the data hash_obj = hashlib.new(alg_name) - with open(fname, "rb") as f: + with _file_openner(fname, "rb") as f: + if f is fname: + f.seek(0, os.SEEK_SET) + buffer = bytearray(chunksize) while True: nbytes = f.readinto(buffer) @@ -1153,8 +1160,8 @@ def rm_command(args, tool_config, data_store_config): ) if not os.path.isdir(target_path): raise GenericToolError( - "intended to recursively delete all contents of the associated data-store. " - "But no such directory can be found." + "intended to recursively delete all contents of the associated " + "data-store. But no such directory can be found." ) fn = shutil.rmtree @@ -1169,7 +1176,8 @@ def rm_command(args, tool_config, data_store_config): target_path = os.path.join(data_store_config.store_location, target) operation_description = ( f"deleting all data file references for the grackle-version {_descr}. 
" - "Any files for which the reference-count drops to zero will also be removed." + "Any files for which the reference-count drops to zero will also be " + "removed." ) if not os.path.isdir(target_path): @@ -1205,15 +1213,16 @@ def fn(path): _HardlinkStrat.remove_if_norefs(cksum_fname) os.rmdir(path) - if not args.force: - _pretty_log( - f"{operation_description}\n" - "-> essentially, we are recursively removing\n" - f" `{target_path}`\n" - "-> to actually perform this command, pass the --force flag" - ) - else: - fn(target_path) + with standard_lockfile(data_store_config): + if not args.force: + _pretty_log( + f"{operation_description}\n" + "-> essentially, we are recursively removing\n" + f" `{target_path}`\n" + "-> to actually perform this command, pass the --force flag" + ) + else: + fn(target_path) def lsversions_command(args, tool_config, data_store_config): @@ -1279,6 +1288,27 @@ def _register_getpath_subcommand(subparsers): parser_getpath.set_defaults(func=getpath_command) +def showknownreg_command(args, tool_config, data_store_config): + f = data_store_config.file_registry_file + if isinstance(f, io.IOBase): + lines = f.readlines() + else: + with open(data_store_config.file_registry_file, "r") as f: + lines = f.readlines() + contents = [ + line for line in lines if len(line.strip()) > 0 and not line.startswith("//") + ] + print(*contents, sep="", end="") + + +def _fmt_registry_lines(fname_cksum_pairs, hash_alg): + length, suffix = len(fname_cksum_pairs), (",\n", "\n") + return [ + f'{{"{fname}", "{hash_alg}:{cksum}"}}{suffix[(i+1) == length]}' + for i, (fname, cksum) in enumerate(sorted(fname_cksum_pairs)) + ] + + def calcreg_command(args, tool_config, data_store_config): # print the properly file registry information (in the proper format that can be # used to configure newer versions of Grackle @@ -1300,6 +1330,7 @@ def calcreg_command(args, tool_config, data_store_config): else: file = stack.enter_context(open(args.output, "w")) + ''' if file is not 
None: file.write(f"""\ // This is a file registry generated by the grackle data management tool @@ -1311,11 +1342,8 @@ def calcreg_command(args, tool_config, data_store_config): // -> ```` with a path to the directory containing all files that are // to be included in the registry """) - print( - *[f'{{"{p[0]}", "{args.hash_name}:{p[1]}"}}' for p in sorted(pairs)], - sep=",\n", - file=file, - ) + ''' + print(*_fmt_registry_lines(pairs, args.hash_name), sep="", end="", file=file) def help_command(*args, **kwargs): @@ -1340,8 +1368,13 @@ def help_command(*args, **kwargs): print(line) -def _add_version(parser, version_flag, version_name, value): - """add argument to parser to show a version and exit (similar to --help)""" +def _add_program_prop_query(parser, flag, value, short_descr): + """ + add a flag to parser to trigger a control flow that: + 1. shows a fundamental piece of information about the command line program (like a + version number or the ``--help`` option) + 2. then immediately exits the program + """ class _Action(argparse.Action): def __call__(self, *args, **kwargs): @@ -1349,11 +1382,11 @@ def __call__(self, *args, **kwargs): sys.exit(0) parser.add_argument( - version_flag, + flag, metavar="", action=_Action, nargs=0, - help=f"show associated {version_name} and exit", + help=f"show associated {short_descr} and exit", ) @@ -1367,15 +1400,31 @@ def build_parser(tool_config, prog_name): epilog=f"Invoke `{prog_name} help` to get a detailed overview of the tool", ) - _add_version( - parser, "--version-grackle", "Grackle version", tool_config.grackle_version + # This is a hidden argument. 
It is only used for sake of testing (we may remove it + # any time in the future) + parser.add_argument( + "--testing-override-registry-file", + help=argparse.SUPPRESS, # hides the help message + default=argparse.SUPPRESS, # adds no attribute if option wasn't specified ) - _add_version( - parser, - "--version-protocol", - "data-store protocol version", - tool_config.protocol_version, + parser.add_argument( + "--testing-override-version-grackle", + help=argparse.SUPPRESS, # hides the help message + default=argparse.SUPPRESS, # adds no attribute if option wasn't specified ) + + query_l = [ + ("--version-grackle", "Grackle version", tool_config.grackle_version), + ( + "--version-protocol", + "data-store protocol version", + tool_config.protocol_version, + ), + ("--cksum-alg", "name of the checksum algorithm", tool_config.checksum_kind), + ] + for flag, short_descr, val in query_l: + _add_program_prop_query(parser, flag, val, short_descr) + subparsers = parser.add_subparsers(required=True) # fetch subcommand @@ -1427,6 +1476,16 @@ def build_parser(tool_config, prog_name): # getpath subcommand _register_getpath_subcommand(subparsers) + # showknownreg subcommand + parser_showknownreg = subparsers.add_parser( + "showknownreg", + help=( + "prints the pre-registered file registry expected by the current version " + "of Grackle" + ), + ) + parser_showknownreg.set_defaults(func=showknownreg_command) + # calcreg subcommand parser_calcregistry = subparsers.add_parser( "calcreg", @@ -1480,6 +1539,18 @@ def main(tool_config, data_store_config, prog_name, *, args=None): parser = build_parser(tool_config, prog_name) args = parser.parse_args(args=args) + # handle testing overrides + if hasattr(args, "testing_override_registry_file"): + # _replace makes a copy & in the copy any specified attributes are overridden + data_store_config = data_store_config._replace( + file_registry_file=args.testing_override_registry_file + ) + if hasattr(args, "testing_override_version_grackle"): + # 
_replace makes a copy & in the copy any specified attributes are overridden + tool_config = tool_config._replace( + grackle_version=args.testing_override_version_grackle + ) + try: args.func(args, tool_config=tool_config, data_store_config=data_store_config) except SystemExit: diff --git a/src/python/tests/test_grdata.py b/src/python/tests/test_grdata.py new file mode 100644 index 00000000..19075b34 --- /dev/null +++ b/src/python/tests/test_grdata.py @@ -0,0 +1,711 @@ +######################################################################## +# +# Test the command line tool managing test files +# +# +# Copyright (c) 2013, Enzo/Grackle Development Team. +# +# Distributed under the terms of the Enzo Public Licence. +# +# The full license is in the file LICENSE, distributed with this +# software. +######################################################################## + +import contextlib +import hashlib +import io +import operator +import os +import shutil +import subprocess +import sys +from textwrap import indent +from typing import Any, NamedTuple + +import pytest + +# a goal here is to be able to run this test without installing pygrackle! +# -> in the near future, we will install grdata as a standalone command-line +# script and it would really nice to be able to test the command-line script +# without installing pygrackle +# -> when that time comes, we will modify the logic within the cli_app fixture +# -> currently, we need to include the following import. 
But, in the future, we +# could add a new subcommand to grdata to make it unnecessary +from pygrackle.utilities.grdata import _parse_file_registry + + +# _ENV_VAR holds the list of environment variables that could affect the +# location of the data directory +if sys.platform.startswith("darwin"): + _ENV_VARS = ("HOME", "GRACKLE_DATA_DIR") +else: + _ENV_VARS = ("HOME", "GRACKLE_DATA_DIR", "XDG_DATA_HOME") + + +def _ensure_removed(d, key): + try: + del d[key] + except KeyError: + pass + + +@contextlib.contextmanager +def modified_env(new_env_vals, extra_cleared_variables=None): + """ + Temporarily overwrite the environment variables. This is necessary to test C + extensions that rely upon the environment variables + """ + if extra_cleared_variables is None: + extra_cleared_variables = None + + # record the original values for any variable we will overwrite + original_vals = {} + try: + for var in filter(lambda e: e not in new_env_vals, extra_cleared_variables): + original_vals[var] = os.environ.get(var, None) + _ensure_removed(os.environ, var) + + for var, new_val in new_env_vals.items(): + original_vals[var] = os.environ.get(var, None) + if new_val is None: + _ensure_removed(os.environ, var) + else: + os.environ[var] = new_val + + yield + + finally: + # restore to the initial values + for var, val in original_vals.items(): + if val is None: + _ensure_removed(os.environ, var) + else: + os.environ[var] = val + + +@contextlib.contextmanager +def custom_datadir(path): + """ + A contextmanager used to put the data directory at an arbitrary location. + """ + clear_env = [var for var in _ENV_VARS if var != "GRACKLE_DATA_DIR"] + with modified_env({"GRACKLE_DATA_DIR": path}, clear_env): + yield + + +class GRDataExecErr(Exception): + pass + + +class CLIApp: + """exists to wrap the command-line-interface. + + We use this so that we can eventually test the application when it is + configured as a standalone script. 
+ """ + + def __init__(self, common_args, *, use_function_call=None): + self.common_args = common_args + self.use_function_call = use_function_call + + def __call__(self, subcommand_args, *, expect_success=None): + # have pytest hide certain kinds of noisy tracebacks + __tracebackhide__ = operator.methodcaller("errisinstance", GRDataExecErr) + + all_args = self.common_args + subcommand_args + if self.use_function_call is None: + tmp = subprocess.run(all_args, capture_output=True) + returncode = tmp.returncode + stdout = tmp.stdout.decode("ascii") + stderr = tmp.stderr.decode("ascii") + else: + fn = self.use_function_call + with contextlib.ExitStack() as stack: + f_out, f_err = [ + stack.enter_context(contextlib.redirect_stdout(io.StringIO())), + stack.enter_context(contextlib.redirect_stderr(io.StringIO())), + ] + try: + returncode = fn(all_args) + except SystemExit as err: + returncode = err.code + stdout = f_out.getvalue() + stderr = f_err.getvalue() + + expected_result = ( + (expect_success is None) + or (returncode == 0 and expect_success) + or (returncode != 0 and not expect_success) + ) + if not expected_result: + detail_indent = " >" + msg_lines = [ + "Invocation of grdata produced an unexpected result:\n", + f" expected: {('failure', 'success')[expect_success]}\n", + f" args: {all_args}\n", + " env:\n", + ] + for var in _ENV_VARS: + msg_lines.append( + f"{detail_indent}{var!r}: {os.environ.get(var,'')!r}\n" + ) + msg_lines.append(f" returncode: {tmp.returncode}\n") + + for stream, val in [("stdout", stdout), ("stderr", stderr)]: + if val is None or len(val) == 0: + msg_lines.append(f" {stream}: \n") + else: + msg_lines += [ + f" {stream}:\n", + indent(val.decode("ascii"), detail_indent), + ] + raise GRDataExecErr("".join(msg_lines)) + return returncode, stdout.rstrip() + + def fetch(self, src_dir=None, *, expect_success=None): + # a number of tests care about whether the command fails. (In some case we + # actually expect it to fail). 
We return whether or not it was succesful. + + # have pytest hide certain kinds of noisy tracebacks + __tracebackhide__ = operator.methodcaller("errisinstance", GRDataExecErr) + + if src_dir is None: + subcommand_args = ["fetch"] + else: + subcommand_args = ["fetch", "--from-dir", src_dir] + return self(subcommand_args, expect_success=expect_success)[0] == 0 + + def rm_vdata(self, version, *, omit_force=False, expect_success=None): + # remove a whole version-directory. returns whether this was successful + + # have pytest hide certain kinds of noisy tracebacks + __tracebackhide__ = operator.methodcaller("errisinstance", GRDataExecErr) + + if version is None: + # delete the associated version + subcommand_args = ["rm", "--force", "--vdata"] + elif isinstance(version, str): + subcommand_args = ["rm", "--force", "--vdata", version] + else: + # this particular mistake occurs a surprising amount + raise TypeError("version must be None or a str") + if omit_force: + subcommand_args.remove("--force") + + return self(subcommand_args, expect_success=expect_success)[0] == 0 + + def rm_datastore(self, *, omit_force=False, expect_success=None): + # remove a whole data-store. 
returns whether this was successful + + # have pytest hide certain kinds of noisy tracebacks + __tracebackhide__ = operator.methodcaller("errisinstance", GRDataExecErr) + subcommand_args = ["rm", "--force", "--data-store"] + if omit_force: + subcommand_args.remove("--force") + return self(subcommand_args, expect_success=expect_success)[0] == 0 + + def showknownreg(self): + # have pytest hide certain kinds of noisy tracebacks + __tracebackhide__ = operator.methodcaller("errisinstance", GRDataExecErr) + return self(["showknownreg"], expect_success=True)[1] + + def calcreg(self, cksum_alg, dir_path): + # have pytest hide certain kinds of noisy tracebacks + __tracebackhide__ = operator.methodcaller("errisinstance", GRDataExecErr) + return self( + ["calcreg", "--hash-name", cksum_alg, dir_path], expect_success=True + )[1] + + def cksum_alg(self): + # have pytest hide certain kinds of noisy tracebacks + __tracebackhide__ = operator.methodcaller("errisinstance", GRDataExecErr) + return self(["--cksum-alg"], expect_success=True)[1] + + def version_dir_path(self): + __tracebackhide__ = operator.methodcaller("errisinstance", GRDataExecErr) + return self(["getpath", "--vdata"], expect_success=True)[1] + + def data_dir_path(self): + __tracebackhide__ = operator.methodcaller("errisinstance", GRDataExecErr) + return self(["getpath", "--data-dir"], expect_success=True)[1] + + +@pytest.fixture(scope="module") +def cli_app(): + if False: + assert sys.executable is not None + return CLIApp([sys.executable, "-m", "pygrackle"]) + else: + import pygrackle.__main__ + + return CLIApp([], use_function_call=pygrackle.__main__.main) + + +_SHASUM_INSTALLED = shutil.which("shasum") is not None + + +def _calc_ref_cksum(contents, cksum_alg): + if _SHASUM_INSTALLED: + _algs = {"sha1": "1", "sha256": "256"} + args = ["shasum", "--algorithm", _algs[cksum_alg], "-"] + rslt_str = ( + subprocess.run( + args, input=contents.encode("ascii"), check=True, capture_output=True + ) + .stdout.rstrip() + 
.decode("utf8") + ) + if rslt_str.endswith(" -"): + cksum = rslt_str[:-3].lower() + else: + raise RuntimeError(f"the output of shasum was unexpected: '{rslt_str}'") + else: + if isinstance(contents, str): + contents = contents.encode("ascii") + hash_obj = hashlib.new(cksum_alg) + hash_obj.update(contents) + cksum = hash_obj.hexdigest() + return f"{cksum_alg}:{cksum}" + + +class DummyFileSpec(NamedTuple): + contents_str: str + sha1: str + sha256: str + + +def _dummy_file_contents(variant=1, trailing_content=None): + assert variant >= 0 and int(variant) == variant + newline_str = "\n" * (variant + 1) + contents_str = f"I am a test-file.{newline_str}Variant number {variant}\n" + if trailing_content is not None: + contents_str = contents_str + trailing_content + return DummyFileSpec( + contents_str, + _calc_ref_cksum(contents_str, "sha1"), + _calc_ref_cksum(contents_str, "sha256"), + ) + + +# here we define file-sets. Each fileset is a sequence of pairs specifying a filename +# and its contents. 
# The idea is to act like these correspond to different grackle
# versions and make sure we can handle them appropriately

_DUMMY_SET_PRIMARY = (
    ("file-0.txt", _dummy_file_contents(1)),
    ("file-1.txt", _dummy_file_contents(2)),
)

_DUMMY_SET_RENAME = (
    _DUMMY_SET_PRIMARY[0],
    ("renamed-file-2.txt", _DUMMY_SET_PRIMARY[1][1]),
)

# this scenario shouldn't come up in practice (we replaced a file with a different one,
# of the same name), but we should still handle it properly
# NOTE: the second filename intentionally matches the one in _DUMMY_SET_PRIMARY so
#       that fetching against the primary registry triggers a checksum mismatch
#       (rather than a missing-file error) -- test_fetch_fail relies on this
_DUMMY_SET_REPLACE = (
    _DUMMY_SET_PRIMARY[0],
    ("file-1.txt", _dummy_file_contents(2, trailing_content="version 2 of file\n")),
)


class FileSetTuple(NamedTuple):
    """Holds an object for each of the filesets"""

    # corresponds to the primary file-set
    primary: Any
    # exactly like the primary file-set, but the 2nd file was renamed
    rename: Any
    # exactly like the primary file-set, but the contents of the second file was changed
    replace: Any

    def get(self, key):
        """Return the fileset object for the field named ``key``.

        Raises
        ------
        KeyError
            When ``key`` doesn't name one of the filesets.
        """
        if key in self._fields:
            return getattr(self, key)
        raise KeyError(key)


_DUMMY_SET_TUPLE = FileSetTuple(
    _DUMMY_SET_PRIMARY, _DUMMY_SET_RENAME, _DUMMY_SET_REPLACE
)


class DummyFileRepository(NamedTuple):
    """Describes the on-disk fixtures used to exercise the grdata tool."""

    # the path reserved for the user to do stuff in
    test_dir: str

    # following variables specify properties for each set of dummy files
    registry_path: FileSetTuple
    src_file_dir: FileSetTuple

    def cli_app_with_overrides(self, ref, kind, version_override=None):
        """Build a CLIApp like ``ref`` that overrides the file registry (and,
        optionally, the reported grackle version) for fileset ``kind``."""
        new_args = ["--testing-override-registry-file", self.registry_path.get(kind)]

        if version_override is not None:
            if not isinstance(version_override, str):
                raise TypeError("version_override must be a str")
            new_args += ["--testing-override-version-grackle", version_override]
        return CLIApp(
            ref.common_args + new_args, use_function_call=ref.use_function_call
        )


@pytest.fixture
def dummy_file_repo(tmp_path, cli_app):
    """Write a registry file and a source-file directory for every dummy
    fileset and yield a DummyFileRepository describing the layout."""
    test_dir = os.path.join(tmp_path, "test-dir")
    os.mkdir(test_dir)

    path = os.path.join(tmp_path, "fixture_dir")

    cksum_kind = cli_app.cksum_alg()

    registry_paths, src_file_dirs = {}, {}

    for kind in _DUMMY_SET_TUPLE._fields:
        file_set = _DUMMY_SET_TUPLE.get(kind)

        registry_path = os.path.join(path, f"{kind}_file_registry.txt")
        src_file_dir = os.path.join(path, f"{kind}-ref-files")
        os.makedirs(src_file_dir)

        src_file_dirs[kind] = src_file_dir
        registry_paths[kind] = registry_path

        pairs = []
        for fname, file_spec in file_set:
            full_path = os.path.join(src_file_dir, fname)
            with open(full_path, "w") as f:
                f.write(file_spec.contents_str)
            pairs.append((fname, getattr(file_spec, cksum_kind)))
        # mirror the `{"fname", "cksum"}` registry format expected by the tool
        with open(registry_path, "w") as f:
            print(
                *["".join(['{"', p[0], '", "', p[1], '"}']) for p in pairs],
                sep=",\n",
                file=f,
            )

    yield DummyFileRepository(
        test_dir=test_dir,
        registry_path=FileSetTuple(**registry_paths),
        src_file_dir=FileSetTuple(**src_file_dirs),
    )


def test_showknownreg(dummy_file_repo, cli_app):
    # essentially, we are checking that the testing override works
    app = dummy_file_repo.cli_app_with_overrides(cli_app, "primary")
    full_registry_str = app.showknownreg().rstrip()

    with open(dummy_file_repo.registry_path.primary, "r") as f:
        ref_full_registry_str = f.read().rstrip()
    assert full_registry_str == ref_full_registry_str


def test_calcreg(dummy_file_repo, cli_app):
    for alg in ["sha1", "sha256"]:
        registry_str = cli_app.calcreg(alg, dummy_file_repo.src_file_dir.primary)
        registry = _parse_file_registry(io.StringIO(registry_str))

        for i, (fname, cksum) in enumerate(sorted(registry.items())):
            if not hasattr(_DUMMY_SET_PRIMARY[i][1], alg):
                raise RuntimeError("This should never happen. Unclear what went wrong")
            ref = getattr(_DUMMY_SET_PRIMARY[i][1], alg)
            if cksum != ref:
                raise AssertionError(
                    f"calculation of the {alg} checksum for the dummy-file, {fname}, "
                    "may have revealed an issue in the command line tool's "
                    "internal checksum logic\n"
                    f"expected: {ref}\nactual: {cksum}"
                )


# helpers that map a managed data-directory path to the standard locations
# created by the grdata tool


def _get_lockfile_path(datadir_path):
    return os.path.join(datadir_path, "lockfile")


def _get_datastore_dir(datadir_path):
    return os.path.join(datadir_path, "data-store-v1")


def _get_version_dir(datadir_path, version):
    return os.path.join(datadir_path, "data-store-v1", version)


def _get_managed_file(datadir_path, version, fname):
    return os.path.join(datadir_path, "data-store-v1", version, fname)


def check_version_data_dir(
    datadir_path, version, lockfile_should_exist=False, file_set=None, *, err_msg=None
):
    """Assert that the data directory has the expected layout and contents
    following an invocation of grdata.

    Parameters
    ----------
    datadir_path : str
        Path to the managed data directory.
    version : str
        The (simulated) grackle version whose version-directory is checked.
    lockfile_should_exist : bool, optional
        Whether the lockfile is expected to be present.
    file_set : sequence of (str, DummyFileSpec), optional
        When given, each named file must exist in the version-directory with
        the expected contents.
    err_msg : str, optional
        Extra context prepended to any assertion-failure message.
    """
    __tracebackhide__ = True  # suppress noisy pytest tracebacks
    required_paths = [
        ("data directory", datadir_path),
        # note: these two labels used to be swapped with one another
        ("version data directory", _get_version_dir(datadir_path, version)),
        ("user-data directory", os.path.join(datadir_path, "user-data")),
    ]

    def prep_assertion_err(nominal_err):
        # returns (rather than raises) so callers can ``raise`` at the call-site
        if err_msg is None:
            return AssertionError(nominal_err)
        return AssertionError(f"{err_msg}\n\n{nominal_err}")

    for descr, path in required_paths:
        if not os.path.isdir(path):
            raise prep_assertion_err(
                f"the {descr}, {path} does not exist following an invocation of grdata"
            )

    lockfile_path = _get_lockfile_path(datadir_path)
    if lockfile_should_exist and not os.path.isfile(lockfile_path):
        raise prep_assertion_err(
            f"the lockfile should exist at {lockfile_path} following "
            "the invocation of grdata."
        )
    elif (not lockfile_should_exist) and os.path.isfile(lockfile_path):
        raise prep_assertion_err(
            "a lockfile shouldn't exist after the last invocation of grdata."
        )

    if file_set is None:
        file_set = []

    for fname, file_spec in file_set:
        full_path = _get_managed_file(datadir_path, version, fname)

        if not os.path.isfile(full_path):
            raise prep_assertion_err(
                f"file, {full_path}, doesn't exist after the last invocation of grdata"
            )
        with open(full_path, "r") as f:
            contents = f.read()
        if contents != file_spec.contents_str:
            raise prep_assertion_err(
                f"the file, {full_path}, doesn't have the correct contents"
            )


@pytest.mark.parametrize(
    "rm_approach", ["rm-implicit-vdata", "rm-explicit-vdata", "rm-data-store"]
)
def test_fetch_and_remove(dummy_file_repo, rm_approach, cli_app):
    version = "1.0"
    app = dummy_file_repo.cli_app_with_overrides(cli_app, "primary", version)
    data_dir = os.path.join(dummy_file_repo.test_dir, "my-data-dir")
    with custom_datadir(data_dir):
        # fetch the data
        app.fetch(dummy_file_repo.src_file_dir.primary, expect_success=True)
        check_version_data_dir(data_dir, version, file_set=_DUMMY_SET_PRIMARY)

        # confirm that if we call fetch again, that we still consider it a success
        app.fetch(dummy_file_repo.src_file_dir.primary, expect_success=True)
        check_version_data_dir(data_dir, version, file_set=_DUMMY_SET_PRIMARY)

        # now, lets remove the version-data
        # -> currently, we explicitly test a handful of scenarios where we expect
        #    things to fail. It might be nice to split these cases out into
        #    separate tests
        if "rm-data-store" == rm_approach:
            rm_method, rm_args, retains_datastore = app.rm_datastore, (), False
        elif "rm-implicit-vdata" == rm_approach:
            rm_method, rm_args, retains_datastore = app.rm_vdata, (None,), True
        elif "rm-explicit-vdata" == rm_approach:
            rm_method, rm_args, retains_datastore = app.rm_vdata, (version,), True

            # extra scenario worth testing - trying to remove values that don't exist
            app.rm_vdata(version="9999999999999.0", expect_success=False)
            check_version_data_dir(data_dir, version, file_set=_DUMMY_SET_PRIMARY)
        else:
            raise RuntimeError("unexpected rm_approach")

        # we expect the remove command to report success, when we omit the force flag,
        # but not actually do anything
        rm_method(*rm_args, omit_force=True, expect_success=True)
        check_version_data_dir(data_dir, version, file_set=_DUMMY_SET_PRIMARY)

        # we expect the following case to fail
        with open(_get_lockfile_path(data_dir), "w") as f:
            f.write("a dummy lockfile")
        rm_method(*rm_args, expect_success=False)
        check_version_data_dir(
            data_dir, version, lockfile_should_exist=True, file_set=_DUMMY_SET_PRIMARY
        )
        os.remove(_get_lockfile_path(data_dir))

        # now we expect it to succeed
        rm_method(*rm_args, expect_success=True)

        version_dir = _get_version_dir(data_dir, version)
        datastore_dir = _get_datastore_dir(data_dir)
        if os.path.isdir(version_dir):
            raise AssertionError(
                f"after a successful remove operation, the version-dir, "
                f"{version_dir}, shouldn't exist"
            )
        elif os.path.isdir(datastore_dir) != retains_datastore:
            raise AssertionError(
                f"the data-store directory, {datastore_dir}, should "
                + ("still exist " if retains_datastore else "not exist ")
                + "after the removal operation"
            )


@pytest.mark.parametrize(
    "src_file_dir_key",
    [
        pytest.param("replace", id="cksum-mismatch"),
        pytest.param("rename", id="no-src-file"),
    ],
)
def test_fetch_fail(src_file_dir_key, dummy_file_repo, cli_app):
    # here we intentionally use a file registry and a mismatched source-directory (where
    # files are fetched from). Essentially we want to ensure correct (and graceful)
    # behavior in 2 failure modes:
    # 1. somehow the checksum is mismatched. This might happen if a file got corrupted
    #    in a download OR (more likely) we made an error while creating the registry
    #    file.
    # 2. somehow the file can't be fetched. This might happen for a range of reasons
    #    such as internet connectivity issues or server issues. Alternatively it could
    #    happen if we make a mistake (e.g. update the registry, but forget to upload
    #    the data file)
    nominal_version = "1.0"
    app = dummy_file_repo.cli_app_with_overrides(cli_app, "primary", nominal_version)
    data_dir = os.path.join(dummy_file_repo.test_dir, "my-data-dir")
    with custom_datadir(data_dir):
        success = app.fetch(dummy_file_repo.src_file_dir.get(src_file_dir_key))
        assert not success

        # the directories should exist, but the file should not be "downloaded". We
        # also confirm that there isn't a lockfile (i.e. we exit gracefully)
        check_version_data_dir(data_dir, nominal_version, lockfile_should_exist=False)
        assert not os.path.isfile(
            _get_managed_file(data_dir, nominal_version, _DUMMY_SET_REPLACE[1][0])
        )


def test_fetch_fail_locked(dummy_file_repo, cli_app):
    # confirm that the fetch command will fail if a lockfile exists
    nominal_version = "1.0"
    app = dummy_file_repo.cli_app_with_overrides(cli_app, "primary", nominal_version)
    data_dir = os.path.join(dummy_file_repo.test_dir, "my-data-dir")
    # let's create a lockfile
    os.makedirs(data_dir)
    with open(_get_lockfile_path(data_dir), "w") as f:
        f.write("a dummy lockfile")

    with custom_datadir(data_dir):
        success = app.fetch(dummy_file_repo.src_file_dir.primary)
    assert not success, "Failure is expected when a lockfile exists"
    if os.path.isdir(_get_version_dir(data_dir, nominal_version)):
        raise AssertionError(
            "the tool should not create a version directory when a lock file exists"
        )


def is_linked(*paths):
    """Return True when every path refers to the same inode (i.e. the files
    are hardlinks of one another) and False if any path is missing."""
    if len(paths) < 2:
        raise TypeError("is_linked() must have at least 2 arguments")
    try:
        # if ref has a value of 0, then we can't actually make a meaningful
        # comparison. There isn't any obvious behavior in this scenario
        ref = os.stat(paths[0], follow_symlinks=True).st_ino
        return all(
            ref == os.stat(path, follow_symlinks=True).st_ino for path in paths[1:]
        )
    except FileNotFoundError:
        return False


def test_multiversion(dummy_file_repo, cli_app):
    # test what happens when we fetch multiple sets of files
    # - in the future, it might be nice to break this test up into smaller pieces

    # from a realism perspective, primary -> rename -> replace may make more sense, but
    # the current order seems more likely to catch an error
    version_kind_map = {"1.0": "primary", "2.0": "replace", "3.0": "rename"}
    versions = tuple(version_kind_map.keys())
    app_v1, app_v2, app_v3 = [
        dummy_file_repo.cli_app_with_overrides(cli_app, kind, version)
        for version, kind in version_kind_map.items()
    ]
    data_dir = os.path.join(dummy_file_repo.test_dir, "my-data-dir")

    def _basic_datastore_check(expected_versions, last_op, last_op_version):
        err_msg = (
            f"This check is performed right after performing the `{last_op}` "
            "operation with grdata, specialized for simulated version "
            f"{last_op_version} of grackle. (That version is associated with the "
            f"{version_kind_map[last_op_version]!r} dummy fileset)"
        )

        for ver in expected_versions:
            check_version_data_dir(
                data_dir,
                ver,
                file_set=_DUMMY_SET_TUPLE.get(version_kind_map[ver]),
                err_msg=err_msg,
            )
        for ver in version_kind_map.keys():
            if ver not in expected_versions:
                ver_dir = _get_version_dir(data_dir, ver)
                if os.path.isdir(ver_dir):
                    raise AssertionError(
                        f"{err_msg}\n\n"
                        f"The version-directory, {ver_dir}, should not exist!"
                    )

    with custom_datadir(data_dir):
        # step 1: load data associated with v1 (the `primary` fileset)
        app_v1.fetch(dummy_file_repo.src_file_dir.primary, expect_success=True)
        _basic_datastore_check(versions[:1], "fetch", last_op_version=versions[0])

        # step 2: load data associated with v2 (the `replace` fileset)
        app_v2.fetch(dummy_file_repo.src_file_dir.replace, expect_success=True)
        _basic_datastore_check(versions[:2], "fetch", last_op_version=versions[1])

        # confirming linking... (it might be better to check disk-usage and be more
        # agnostic about deduplication)
        assert is_linked(
            _get_managed_file(data_dir, versions[0], _DUMMY_SET_TUPLE.primary[0][0]),
            _get_managed_file(data_dir, versions[1], _DUMMY_SET_TUPLE.replace[0][0]),
        ), "the file-0.txt files should all be linked"
        assert not is_linked(
            _get_managed_file(data_dir, versions[0], _DUMMY_SET_TUPLE.primary[1][0]),
            _get_managed_file(data_dir, versions[1], _DUMMY_SET_TUPLE.replace[1][0]),
        ), "the file-1.txt files are expected to hold different contents"

        # step 3: load data associated with v3 (the `rename` fileset)
        app_v3.fetch(dummy_file_repo.src_file_dir.rename, expect_success=True)
        _basic_datastore_check(versions, "fetch", last_op_version=versions[2])
        # checking linking... (it might be better to check disk-usage and be more
        # agnostic about the fact that we use linking for deduplication)
        assert is_linked(
            _get_managed_file(data_dir, versions[0], _DUMMY_SET_TUPLE.primary[0][0]),
            _get_managed_file(data_dir, versions[1], _DUMMY_SET_TUPLE.replace[0][0]),
            _get_managed_file(data_dir, versions[2], _DUMMY_SET_TUPLE.rename[0][0]),
        ), "the file-0.txt files should all be linked"
        assert is_linked(
            _get_managed_file(data_dir, versions[0], _DUMMY_SET_TUPLE.primary[1][0]),
            _get_managed_file(data_dir, versions[2], _DUMMY_SET_TUPLE.rename[1][0]),
        ), "the file-1.txt files from primary and rename filesets should be linked"
        assert not is_linked(
            _get_managed_file(data_dir, versions[0], _DUMMY_SET_TUPLE.primary[1][0]),
            _get_managed_file(data_dir, versions[1], _DUMMY_SET_TUPLE.replace[1][0]),
        ), "file-1.txt file from the replace fileset should not be linked to anything"

        # step 4: remove data associated with v1 (the `primary` fileset)
        # -> we EXPLICITLY use a different app version to remove this data
        app_v3.rm_vdata(versions[0], expect_success=True)
        _basic_datastore_check(
            [versions[1], versions[2]], "rm-vdata", last_op_version=versions[0]
        )
        assert is_linked(
            _get_managed_file(data_dir, versions[1], _DUMMY_SET_TUPLE.replace[0][0]),
            _get_managed_file(data_dir, versions[2], _DUMMY_SET_TUPLE.rename[0][0]),
        ), "remaining file-0.txt files should remain linked"
        assert not is_linked(
            _get_managed_file(data_dir, versions[1], _DUMMY_SET_TUPLE.replace[1][0]),
            _get_managed_file(data_dir, versions[2], _DUMMY_SET_TUPLE.rename[1][0]),
        ), "remaining file-1.txt files should remain unlinked"

        # step 5: remove data associated with v3 (the `rename` fileset)
        # -> we EXPLICITLY use a different app version to remove this data
        app_v2.rm_vdata(versions[2], expect_success=True)
        _basic_datastore_check([versions[1]], "rm-vdata", last_op_version=versions[2])
00:00:00 2001 From: Matthew Abruzzo Date: Sun, 22 Sep 2024 16:49:56 -0400 Subject: [PATCH 07/36] reorganized where we declare the arguments for each subcommand. --- src/python/pygrackle/utilities/grdata.py | 207 ++++++++++++----------- 1 file changed, 110 insertions(+), 97 deletions(-) diff --git a/src/python/pygrackle/utilities/grdata.py b/src/python/pygrackle/utilities/grdata.py index e0db8b75..6516d072 100644 --- a/src/python/pygrackle/utilities/grdata.py +++ b/src/python/pygrackle/utilities/grdata.py @@ -1082,6 +1082,26 @@ def fetch_command(args, tool_config, data_store_config): man.fetch_all(registry) +def _register_fetch_command(subparsers): + parser_fetch = subparsers.add_parser( + "fetch", + help=( + "fetch data files if we don't already have the data for the " + "associated version of grackle" + ), + ) + parser_fetch.add_argument( + "--from-dir", + default=None, + help=( + "optionally specify a path to a directory where we copy the files from " + "(instead of downloading them)" + ), + ) + parser_fetch.set_defaults(func=fetch_command) + + + def direntry_iter(path, *, ftype="file", mismatch="skip", ignore=None): """ Iterate over the contents of a single directory with focus on a @@ -1225,6 +1245,32 @@ def fn(path): fn(target_path) +def _register_rm_command(subparsers): + parser_rm = subparsers.add_parser( + "rm", help="remove data associated with a given version" + ) + parser_rm.add_argument( + "-f", + "--force", + action="store_true", + help="This option must be present to actually remove things", + ) + rm_spec_grp = parser_rm.add_argument_group( + title="Target", description="specifies the target that will be removed" + ).add_mutually_exclusive_group(required=True) + rm_spec_grp.add_argument( + "--data-store", action="store_true", help="remove the full data-store" + ) + rm_spec_grp.add_argument( + "--vdata", + default=_UNSPECIFIED, + nargs="?", + help="remove all data associated with the contemporaneous grackle version", + ) + 
parser_rm.set_defaults(func=rm_command) + + + def lsversions_command(args, tool_config, data_store_config): if not os.path.exists(data_store_config.store_location): print("there is no data") @@ -1238,6 +1284,11 @@ def lsversions_command(args, tool_config, data_store_config): print(*sorted(pair[0] for pair in it), sep="\n") +def _register_lsversions_command(subparsers): + parser_ls = subparsers.add_parser("ls-versions", help="list the versions") + parser_ls.set_defaults(func=lsversions_command) + + def getpath_command(args, tool_config, data_store_config): if args.data_dir: print(data_store_config.data_dir) @@ -1252,7 +1303,7 @@ def getpath_command(args, tool_config, data_store_config): print(os.path.join(data_store_config.store_location, version)) -def _register_getpath_subcommand(subparsers): +def _register_getpath_command(subparsers): parser_getpath = subparsers.add_parser( "getpath", description=( @@ -1301,6 +1352,17 @@ def showknownreg_command(args, tool_config, data_store_config): print(*contents, sep="", end="") +def _register_showknownreg_command(subparsers): + parser_showknownreg = subparsers.add_parser( + "showknownreg", + help=( + "prints the pre-registered file registry expected by the current version " + "of Grackle" + ), + ) + parser_showknownreg.set_defaults(func=showknownreg_command) + + def _fmt_registry_lines(fname_cksum_pairs, hash_alg): length, suffix = len(fname_cksum_pairs), (",\n", "\n") return [ @@ -1346,6 +1408,39 @@ def calcreg_command(args, tool_config, data_store_config): print(*_fmt_registry_lines(pairs, args.hash_name), sep="", end="", file=file) +def _register_calcreg_command(subparsers): + parser_calcregistry = subparsers.add_parser( + "calcreg", + help=( + "prints the file registry (file hash pairs) for a given directory. This " + "computed registry can be used to configure future versions of Grackle." 
+ ), + ) + parser_calcregistry.add_argument( + "-o", + "--output", + metavar="FILE", + help=( + "Write the output to a file instead of stdout. The file will include extra " + "metadata (as comments)." + ), + ) + parser_calcregistry.add_argument( + "--hash-name", + required=True, + metavar="HASH", + choices=hashlib.algorithms_guaranteed, + help=( + "the kind of checksum to compute. Must be one of: " + f"{ ', '.join(sorted(hashlib.algorithms_guaranteed))}" + ), + ) + parser_calcregistry.add_argument( + "path", help="path to the directory containing the files in the registry" + ) + parser_calcregistry.set_defaults(func=calcreg_command) + + def help_command(*args, **kwargs): # it might be nice to pipe to a pager (specified by PAGER env variable or @@ -1368,6 +1463,13 @@ def help_command(*args, **kwargs): print(line) +def _register_help_command(subparsers): + parser_help = subparsers.add_parser( + "help", help="Display detailed help information about this tool" + ) + parser_help.set_defaults(func=help_command) + + def _add_program_prop_query(parser, flag, value, short_descr): """ add a flag to parser to trigger a control flow that: @@ -1427,102 +1529,13 @@ def build_parser(tool_config, prog_name): subparsers = parser.add_subparsers(required=True) - # fetch subcommand - parser_fetch = subparsers.add_parser( - "fetch", - help=( - "fetch data files if we don't already have the data for the " - "associated version of grackle" - ), - ) - parser_fetch.add_argument( - "--from-dir", - default=None, - help=( - "optionally specify a path to a directory where we copy the files from " - "(instead of downloading them)" - ), - ) - parser_fetch.set_defaults(func=fetch_command) - - # ls-versions subcommand - parser_ls = subparsers.add_parser("ls-versions", help="list the versions") - parser_ls.set_defaults(func=lsversions_command) - - # rm subcommand - parser_rm = subparsers.add_parser( - "rm", help="remove data associated with a given version" - ) - parser_rm.add_argument( - "-f", - 
"--force", - action="store_true", - help="This option must be present to actually remove things", - ) - rm_spec_grp = parser_rm.add_argument_group( - title="Target", description="specifies the target that will be removed" - ).add_mutually_exclusive_group(required=True) - rm_spec_grp.add_argument( - "--data-store", action="store_true", help="remove the full data-store" - ) - rm_spec_grp.add_argument( - "--vdata", - default=_UNSPECIFIED, - nargs="?", - help="remove all data associated with the contemporaneous grackle version", - ) - parser_rm.set_defaults(func=rm_command) - - # getpath subcommand - _register_getpath_subcommand(subparsers) - - # showknownreg subcommand - parser_showknownreg = subparsers.add_parser( - "showknownreg", - help=( - "prints the pre-registered file registry expected by the current version " - "of Grackle" - ), - ) - parser_showknownreg.set_defaults(func=showknownreg_command) - - # calcreg subcommand - parser_calcregistry = subparsers.add_parser( - "calcreg", - help=( - "prints the file registry (file hash pairs) for a given directory. This " - "computed registry can be used to configure future versions of Grackle." - ), - ) - parser_calcregistry.add_argument( - "-o", - "--output", - metavar="FILE", - help=( - "Write the output to a file instead of stdout. The file will include extra " - "metadata (as comments)." - ), - ) - parser_calcregistry.add_argument( - "--hash-name", - required=True, - metavar="HASH", - choices=hashlib.algorithms_guaranteed, - help=( - "the kind of checksum to compute. 
Must be one of: " - f"{ ', '.join(sorted(hashlib.algorithms_guaranteed))}" - ), - ) - parser_calcregistry.add_argument( - "path", help="path to the directory containing the files in the registry" - ) - parser_calcregistry.set_defaults(func=calcreg_command) - - # help subcommand - parser_help = subparsers.add_parser( - "help", help="Display detailed help information about this tool" - ) - parser_help.set_defaults(func=help_command) + _register_fetch_command(subparsers) + _register_rm_command(subparsers) + _register_lsversions_command(subparsers) + _register_getpath_command(subparsers) + _register_showknownreg_command(subparsers) + _register_calcreg_command(subparsers) + _register_help_command(subparsers) return parser From af04e97c85fcf4ee33fb97da37a43a4ea434a65b Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Thu, 3 Oct 2024 13:01:11 -0400 Subject: [PATCH 08/36] made it possible to download individual files and to an untracked directory. --- src/python/pygrackle/utilities/grdata.py | 339 +++++++++++++++++------ src/python/tests/test_grdata.py | 197 +++++++++++-- 2 files changed, 421 insertions(+), 115 deletions(-) diff --git a/src/python/pygrackle/utilities/grdata.py b/src/python/pygrackle/utilities/grdata.py index 6516d072..3346c876 100644 --- a/src/python/pygrackle/utilities/grdata.py +++ b/src/python/pygrackle/utilities/grdata.py @@ -8,7 +8,7 @@ # To support scenario 1, this CAN ONLY use python's built in modules. 
import argparse -from contextlib import contextmanager, ExitStack +import contextlib import filecmp import hashlib import io @@ -19,15 +19,11 @@ import stat import sys import traceback -from typing import IO, NamedTuple, Union +from typing import IO, NamedTuple, Optional, Union import urllib.request from urllib.error import URLError, HTTPError import warnings -if (sys.version_info.major, sys.version_info.minor) < (3, 6, 1): - # 3.6.0 doesn't support all NamedTuple features - raise RuntimeError("python 3.6.1 or newer is required") - # Down below, we provide a detailed description that serves 3 purposes # 1. to act as a description of this files contents for developers # 2. to serve as documentation on the website @@ -277,6 +273,29 @@ _UNSPECIFIED = object() +# Version check and define some backports +# ======================================= + +if sys.version_info < (3, 6, 1): + # 3.6.0 doesn't support all NamedTuple features + raise RuntimeError("python 3.6.1 or newer is required") + +elif sys.version_info < (3, 7, 0): + + class nullcontext: + def __init__(self, enter_result=None): + self.enter_result = enter_result + + def __enter__(self): + return self.enter_result + + def __exit__(self, *args): + pass + +else: + nullcontext = contextlib.nullcontext + + class GenericToolError(RuntimeError): pass @@ -300,7 +319,7 @@ def _ensure_all_removed(fnames): _MAX_BAR = 160 * "=" -@contextmanager +@contextlib.contextmanager def _progress_bar(ncols, total_bytes, *, use_dummy=False): """ ContextManager that provides a function used for drawing/updating progress bars @@ -361,7 +380,7 @@ def _retrieve_url(url, dest, fname, *, use_progress_bar=True, chunksize=_CHUNKSI ncols = shutil.get_terminal_size()[0] - 1 req = urllib.request.Request(url) try: - with ExitStack() as stack: + with contextlib.ExitStack() as stack: # enter context managers for http-response, progress-bar, & output-file response = stack.enter_context(urllib.request.urlopen(req)) total_bytes = 
int(response.headers.get("Content-Length", -1)) @@ -543,7 +562,7 @@ def _get_data_dir(): return manual_choice -@contextmanager +@contextlib.contextmanager def _file_openner(f, mode, **kwargs): """Open a file or pass through an already open file""" if (sys.version_info.major, sys.version_info.minor) < (3, 6): @@ -860,7 +879,21 @@ def get_object_dir(data_store_config): class VersionDataManager(NamedTuple): - """Actually manages downloads of files + """ + Actually manages downloads of files to a directory where the + directory and files are associated with single Grackle version. + + Instances of this class support 2 modes of operation: + 1. The instance manages a data file directory that is part of the + larger data-management system. This data-management system may + include multiple version directories and data-files are + deduplicated. + 2. The instance manages a data file directory that is completely + isolated from any data-management system (i.e. there is no + deduplication) + + The first mode is the primary usecase of this class. (The second mode + is mostly provided as a convenience) Warnings -------- @@ -883,87 +916,140 @@ class VersionDataManager(NamedTuple): a file in the object directory) """ - # define attributes holding directory paths where data files are actually stored - - # Path to output directory, where the file-name matches the name given - # in the registry and is known by the associated grackle-version + # Path to output directory, where the file-name matches the name given in the + # registry and is known by the associated grackle-version. version_dir: str - # Path to the object directory. This is the name where checksum names are used as - # filenames. 
(This is the mechanism used to aid deduplication) - object_dir: str # data_store_config holds a little more information than we actually need # -> we may chooise to redefine this in the future - data_store_config: DataStoreConfig + data_store_config: Optional[DataStoreConfig] # encodes the configuration (and logic) for fetching the files fetcher: Fetcher @classmethod - def create(cls, tool_config, data_store_config, *, override_fetcher=None): - """create a new instance""" + def create( + cls, + tool_config, + data_store_config, + *, + untracked_dest_dir=None, + override_fetcher=None, + ): + """ + Create a new instance + + Parameters + ---------- + tool_config : ToolConfig + data_store_config : DataStoreConfig + untracked_dest_dir : str, optional + When specified the constructed object will be used to manage + fetched files are placed in the specified + directory and no attempt is made to track the file as part of + the data-directory. + override_fetcher : Fetcher, optional + When specified, this fetcher is used in place of the standard + default fetcher provided by data_store_config. + """ fetcher = override_fetcher if fetcher is None: fetcher = data_store_config.default_fetcher + if untracked_dest_dir is None: + version_dir = get_version_dir(tool_config, data_store_config) + else: + version_dir = untracked_dest_dir + data_store_config = None + return cls( - version_dir=get_version_dir(tool_config, data_store_config), - object_dir=get_object_dir(data_store_config), + version_dir=version_dir, data_store_config=data_store_config, fetcher=fetcher, ) + def manages_untracked_data(self): + return self.data_store_config is None + + def _object_dir(self): + """ + If the instance manages data as part of a larger data management + system, then this method returns the path to the object directory. + This is the directory where checksum names are used as filenames. 
+ (Linking with files in this directory is the mechanism used to aid + deduplication) + """ + if self.data_store_config is None: + return None + return get_object_dir(self.data_store_config) + def _setup_file_system(self): """ helper function that ensures that the file system is set up for fetching new files and returns the configured lockfile context manager (it isn't locked yet) """ - _ensure_data_dir_exists(self.data_store_config) + if self.manages_untracked_data(): + lockfile_ctx = nullcontext() + else: + _ensure_data_dir_exists(self.data_store_config) + + lockfile_ctx = standard_lockfile(self.data_store_config) + with lockfile_ctx: + # let's validate we can actually use hardlinks + if not hasattr(os, "link"): + raise GenericToolError("operating system doesn't support hardlinks") + elif not _HardlinkStrat.is_supported(self.data_store_config.tmp_dir): + raise GenericToolError("The file system does not support hardlinks") + + # a little more set up + _ensure_exists( + self.data_store_config.store_location, "that holds the data-store" + ) + _ensure_exists(self._object_dir(), "") - lockfile_ctx = standard_lockfile(self.data_store_config) - with lockfile_ctx: - # let's validate we can actually use hardlinks - if not hasattr(os, "link"): - raise GenericToolError("The operating system doesn't support hardlinks") - elif not _HardlinkStrat.is_supported(self.data_store_config.tmp_dir): - raise GenericToolError("The file system does not support hardlinks") + assert not lockfile_ctx.locked() # sanity check! - # a little more set up - _ensure_exists( - self.data_store_config.store_location, "that will hold the data-store" - ) - _ensure_exists(self.object_dir, "") + with lockfile_ctx: _ensure_exists( self.version_dir, "that holds data for current Grackle version" ) - assert not lockfile_ctx.locked() # sanity check! 
return lockfile_ctx def _fetch_file(self, fname, full_checksum_str, *, lockfile_ctx=None): """ - Helper method to fetch a single file and provide the full path + Helper method to fetch a single file. Provides the full path + + Parameters + ---------- + fname : str + The name of the file to be fetched + full_checksum_str : str + The checksum of the file. + lockfile_ctx : LockFileContext, optional + When this is None, the calling process doesn't already own the + lock for the data-directory (and this function will try to + acquire the lock) Returns ------- any_work : bool - ``True`` indicates that we actually needed to go get the file, - while ``False`` indicates that the file already existed - full_path : str - Full path to the file + Indicates whether any work was actually required to fetch the + file. the file was freshly fetched. ``True`` indicates that we + actually needed to go get the file, while ``False`` denotes + that the file already existed + full_fname : str + Specifies the absolute path to the file. When a tracked file + is fetched, then this is path to the file entry where + `os.path.basename(full_fname)` is equal `fname` """ if lockfile_ctx is None: lockfile_ctx = self._setup_file_system() - # get the global checksum kind - cksum_kind = self.data_store_config.checksum_kind - # extract the checksum_kind and string that are stored in the registry - # (we are being a little more careful here than necessary, but if this ever - # becomes library-code, it will pay off) if ":" in full_checksum_str: cur_cksum_kind, checksum = full_checksum_str.split(":") else: @@ -971,69 +1057,97 @@ def _fetch_file(self, fname, full_checksum_str, *, lockfile_ctx=None): f"the checksum for {fname} does not specify the checksum kind" ) - if cur_cksum_kind != cksum_kind: - raise ValueError( - "Currently, we only support downloading from file registries where the " - "checksum algorithm matches the globally used algorithm, " - f"{cksum_kind}. 
The checksum algorithm associated with {fname} is " - f"{cur_cksum_kind}." - ) + # when tracking files as part of the data file management system, there are + # strict requirements on the kind of checksum that is used. + # -> This is because the entries in the object directory (used for + # deduplication) use the checksum string (without the algorithm tag) as + # file names. + # -> here, we check this requirement + if self.manages_untracked_data(): + req_cksum_kind = None + else: + req_cksum_kind = self.data_store_config.checksum_kind + if cur_cksum_kind != req_cksum_kind: + raise ValueError( + "To download a file as part of Grackle's data file management " + "system, we must know the file's checksum computed with the " + f"{req_cksum_kind} checksum algorithm. The provided checksum " + f"for {fname} was computed with the {cur_cksum_kind} algorithm." + ) + # now we actually fetch the file (if necessary) with lockfile_ctx: - # name associated with current file in the current grackle version + # get the full path to the downloaded file full_fname = os.path.join(self.version_dir, fname) # if the file already exists we are done if os.path.exists(full_fname): - if not matches_checksum(full_fname, cksum_kind, checksum): + if not matches_checksum(full_fname, cur_cksum_kind, checksum): raise RuntimeError( f"{full_fname} already exists but has the wrong hash" ) + # when we're handling tracked data, we could theoretically check whether + # this data is properly linked. 
But I don't think I would want to take + # any kind of action if it isn't properly tracked (the most action I + # would recommend is providing the user with a warning) return (False, full_fname) - # download the file (pooch will log a detailed message + # download the file fetcher = self.fetcher fetcher( fname, checksum=checksum, - checksum_kind=cksum_kind, + checksum_kind=cur_cksum_kind, dest_dir=self.version_dir, ) - os.chmod(full_fname, _IMMUTABLE_MODE) - - # now deduplicate - cksum_fname = os.path.join(self.object_dir, checksum) - - try: - _HardlinkStrat.deduplicate(full_fname, cksum_fname) - - # not strictly necessary, but doing this for safety reasons - os.chmod(cksum_fname, _IMMUTABLE_MODE) - - except Exception as err: - # remove full_fname since we don't want users to use it before dealing - # with the larger issue. We also want to make the errors reproducible - os.remove(full_fname) - if (not isinstance(err, ValueError)) and os.path.is_file(cksum_fname): - raise err - - # this should only happens when full_fname and cksum_fname both exist, - # but aren't perfect matches of each other. 
We try to provide a more - # informative error message - if not matches_checksum(cksum_fname, cksum_kind, checksum): - raise GenericToolError(f"""\ + if not self.manages_untracked_data(): + # by changing permissions, certain platforms (like MacOS) will ask + # users "are you sure you want to delete this file" when the ``rm`` + # command is used: + # -> this is desirable for files managed by the grackle data management + # systems (when we don't want users removing individual files) + # -> but, we avoid doing this for untracked data files (because it's + # just an annoyance for the end-user) + os.chmod(full_fname, _IMMUTABLE_MODE) + + if not self.manages_untracked_data(): + # handle deduplication (since the data files are part of the larger + # grackle data file management system) + cksum_fname = os.path.join(self._object_dir(), checksum) + + try: + _HardlinkStrat.deduplicate(full_fname, cksum_fname) + + # not strictly necessary, but doing this for safety reasons + os.chmod(cksum_fname, _IMMUTABLE_MODE) + + except Exception as err: + # remove full_fname + # -> we don't want users to use it before resolving the issues + # -> We also want to make the errors as reproducible as possible + # (ideally, rerunning the command should produce the same error + # if you haven't changed anything) + os.remove(full_fname) + if os.path.is_file(cksum_fname) and not isinstance(err, ValueError): + raise err + + # this should only happens when full_fname and cksum_fname both + # exist, but aren't perfect matches of each other. Here, we try to + # provide a more informative error message + if not matches_checksum(cksum_fname, req_cksum_kind, checksum): + raise GenericToolError(f"""\ A file (used for deduplication) that already existed on disk `{cksum_fname}` which is probably a version of `{fname}`, -doesn't have the appropriate {self.data_store_config.checksum_kind} checksum. --> expected: {calc_checksum(cksum_fname, cksum_kind)} +doesn't have the appropriate {req_cksum_kind} checksum. 
+-> expected: {calc_checksum(cksum_fname, req_cksum_kind)} -> actual: {checksum} -> This implies that the data was corrupted and it needs to be dealt with. To avoid confusion we have deleted the newly downloaded version of `{fname}` -> The safest bet is probably to delete the data directory""") - else: - raise GenericToolError(f"""\ + else: + raise GenericToolError(f"""\ Something bizare (& extremely unlikely) happened: -> a previous invocation of this tool appears to have installed a data file with the same checksum as {fname}, but has different contents. @@ -1043,23 +1157,38 @@ def _fetch_file(self, fname, full_checksum_str, *, lockfile_ctx=None): downloaded version of the file""") return (True, full_fname) - def fetch_all(self, registry): + def fetch_all(self, registry, *, fnames=None): """ - Ensures that all files in the specified registry are downloaded + Ensures that files in the specified registry are downloaded Parameters ---------- registry : dict maps file names to associated checksums + fnames : sequence, optional + Optionally specifies a list of files, with corresponding + registry entries to fetch. When this is ``None``, all files in + the registry are fetched. """ + if fnames is None: + fname_cksum_pairs = registry.items() + else: + for fname in filter(lambda fname: fname not in registry, fnames): + raise ValueError( + f"{fname} is not the name of a file with a registry " + "entry. 
Thus it can't be downloaded.\n\nFiles with " + f"registry entries include: {list(registry.keys())!r}" + ) + fname_cksum_pairs = ((fname, registry[fname]) for fname in fnames) + # ensure all needed directories exist and fetch the lockfile context manager lockfile_ctx = self._setup_file_system() with lockfile_ctx: num_fetched = 0 _pretty_log(f"preparing to fetch files from: {self.fetcher.base_path}") - for fname, full_checksum_str in registry.items(): + for fname, full_checksum_str in fname_cksum_pairs: any_work, _ = self._fetch_file( fname, full_checksum_str, lockfile_ctx=lockfile_ctx ) @@ -1073,13 +1202,17 @@ def fetch_command(args, tool_config, data_store_config): override_fetcher = None if args.from_dir is not None: override_fetcher = Fetcher.configure_src_dir(args.from_dir) + + fnames = None if len(args.fnames) == 0 else args.fnames + man = VersionDataManager.create( tool_config=tool_config, data_store_config=data_store_config, + untracked_dest_dir=args.untracked_dest_dir, override_fetcher=override_fetcher, ) registry = _parse_file_registry(data_store_config.file_registry_file) - man.fetch_all(registry) + man.fetch_all(registry, fnames=fnames) def _register_fetch_command(subparsers): @@ -1090,6 +1223,32 @@ def _register_fetch_command(subparsers): "associated version of grackle" ), ) + parser_fetch.add_argument( + "fnames", + nargs="*", + help=( + "Optionally specify the names of files that should be fetched. " + "Each listed file must have a corresponding entry in the file " + "registry used by this tool. If no files are specified, then the " + "tool will fetch every known file." + ), + ) + parser_fetch.add_argument( + "--untracked-dest-dir", + default=None, + help=( + "This flag can be used to instruct to download files to an " + "arbitrary directory where the data files will NOT be stored as " + "part of the data file management system. This provided as a " + "convenience. The specified directory should NOT be located " + "inside the grackle data directory. 
The tool also won't use any " + "kind of lock files (thus, it's the user's responsibility to " + "ensure that only a single process is modifying the specified " + "directory at any given time. This is provided mostly as a " + "convenience. Management of the files in this directory is " + "outside this tool's scope." + ), + ) parser_fetch.add_argument( "--from-dir", default=None, @@ -1101,7 +1260,6 @@ def _register_fetch_command(subparsers): parser_fetch.set_defaults(func=fetch_command) - def direntry_iter(path, *, ftype="file", mismatch="skip", ignore=None): """ Iterate over the contents of a single directory with focus on a @@ -1270,7 +1428,6 @@ def _register_rm_command(subparsers): parser_rm.set_defaults(func=rm_command) - def lsversions_command(args, tool_config, data_store_config): if not os.path.exists(data_store_config.store_location): print("there is no data") @@ -1386,7 +1543,7 @@ def calcreg_command(args, tool_config, data_store_config): pairs = [(name, calc_checksum(path, args.hash_name)) for name, path in it] - with ExitStack() as stack: + with contextlib.ExitStack() as stack: if args.output is None: file = sys.stdout else: diff --git a/src/python/tests/test_grdata.py b/src/python/tests/test_grdata.py index 19075b34..01255530 100644 --- a/src/python/tests/test_grdata.py +++ b/src/python/tests/test_grdata.py @@ -162,7 +162,14 @@ def __call__(self, subcommand_args, *, expect_success=None): raise GRDataExecErr("".join(msg_lines)) return returncode, stdout.rstrip() - def fetch(self, src_dir=None, *, expect_success=None): + def fetch( + self, + src_dir=None, + *, + file_list=None, + untracked_dest_dir=None, + expect_success=None, + ): # a number of tests care about whether the command fails. (In some case we # actually expect it to fail). We return whether or not it was succesful. 
@@ -173,6 +180,12 @@ def fetch(self, src_dir=None, *, expect_success=None): subcommand_args = ["fetch"] else: subcommand_args = ["fetch", "--from-dir", src_dir] + + if file_list is not None: + subcommand_args += file_list + + if untracked_dest_dir is not None: + subcommand_args += ["--untracked-dest-dir", untracked_dest_dir] return self(subcommand_args, expect_success=expect_success)[0] == 0 def rm_vdata(self, version, *, omit_force=False, expect_success=None): @@ -434,6 +447,39 @@ def _get_managed_file(datadir_path, version, fname): return os.path.join(datadir_path, "data-store-v1", version, fname) +def _dummy_errmsg_writer(msg): + return AssertionError(msg) + + +def check_version_data_dir_contents( + version_dir_path, + file_set, + *, + exhaustive_file_set=True, + errmsg_writer=_dummy_errmsg_writer, +): + __tracebackhide__ = True # suppress noisy pytest tracebacks + for fname, file_spec in file_set: + full_path = os.path.join(version_dir_path, fname) + + if not os.path.isfile(full_path): + raise errmsg_writer( + f"file, {full_path}, doesn't exist after the last invocation of grdata" + ) + with open(full_path, "r") as f: + contents = f.read() + if contents != file_spec.contents_str: + raise errmsg_writer( + f"the file, {full_path}, doesn't have the correct contents" + ) + + if exhaustive_file_set and (len(os.listdir(version_dir_path)) != len(file_set)): + raise errmsg_writer( + f"the directory, {version_dir_path}, doesn't contain the right number of " + "entries." + ) + + def check_version_data_dir( datadir_path, version, lockfile_should_exist=False, file_set=None, *, err_msg=None ): @@ -466,28 +512,41 @@ def prep_assertion_err(nominal_err): "a lockfile shouldn't exist after the last invocation of grdata." 
) + exhaustive_file_set = True if file_set is None: file_set = [] + exhaustive_file_set = False + check_version_data_dir_contents( + version_dir_path=_get_version_dir(datadir_path, version), + file_set=file_set, + exhaustive_file_set=exhaustive_file_set, + errmsg_writer=prep_assertion_err, + ) - for fname, file_spec in file_set: - full_path = _get_managed_file(datadir_path, version, fname) - if not os.path.isfile(full_path): - raise prep_assertion_err( - f"file, {full_path}, doesn't exist after the last invocation of grdata" - ) - with open(full_path, "r") as f: - contents = f.read() - if contents != file_spec.contents_str: - raise prep_assertion_err( - f"the file, {full_path}, doesn't have the correct contents" - ) +def _check_removal(data_dir, version, retains_datastore): + version_dir = _get_version_dir(data_dir, version) + datastore_dir = _get_datastore_dir(data_dir) + if os.path.isdir(version_dir): + raise AssertionError( + f"after a successful remove operation, the version-dir, {version_dir}, " + "shouldn't exist" + ) + elif os.path.isdir(datastore_dir) != retains_datastore: + raise AssertionError( + f"the data-store directory, {datastore_dir}, should " + + ["not", "still"][retains_datastore] + + "after the removal operation" + ) @pytest.mark.parametrize( "rm_approach", ["rm-implicit-vdata", "rm-explicit-vdata", "rm-data-store"] ) def test_fetch_and_remove(dummy_file_repo, rm_approach, cli_app): + # in this test, the fetch operation is used to fetch all files in the registry + # and we vary the rm approach + version = "1.0" app = dummy_file_repo.cli_app_with_overrides(cli_app, "primary", version) data_dir = os.path.join(dummy_file_repo.test_dir, "my-data-dir") @@ -533,19 +592,77 @@ def test_fetch_and_remove(dummy_file_repo, rm_approach, cli_app): # now we expect it to succeed rm_method(*rm_args, expect_success=True) - version_dir = _get_version_dir(data_dir, version) - datastore_dir = _get_datastore_dir(data_dir) - if os.path.isdir(version_dir): - raise 
AssertionError( - f"after a successful remove operation, the version-dir, {version_dir}, " - "shouldn't exist" + _check_removal(data_dir, version, retains_datastore) + + +@pytest.mark.parametrize( + "fetch_subset", + [ + "fetch-single", + "fetch-single-then-all", + "fetch-all-then-single", + "fetch-all-explicit-list", + ], +) +def test_fetch_subset_and_remove(dummy_file_repo, fetch_subset, cli_app): + # in this test, the fetch operation is used to fetch a named subset of files + # in the registry and we use a single rm approach + version = "1.0" + app = dummy_file_repo.cli_app_with_overrides(cli_app, "primary", version) + data_dir = os.path.join(dummy_file_repo.test_dir, "my-data-dir") + with custom_datadir(data_dir): + # fetch the data + if fetch_subset == "fetch-single": + # call it twice to ensure we consider the operation a success each time + for i in range(2): + app.fetch( + dummy_file_repo.src_file_dir.primary, + file_list=[_DUMMY_SET_PRIMARY[1][0]], + expect_success=True, + ) + check_version_data_dir( + data_dir, version, file_set=[_DUMMY_SET_PRIMARY[1]] + ) + + elif fetch_subset == "fetch-single-then-all": + app.fetch( + dummy_file_repo.src_file_dir.primary, + file_list=[_DUMMY_SET_PRIMARY[1][0]], + expect_success=True, + ) + check_version_data_dir(data_dir, version, file_set=[_DUMMY_SET_PRIMARY[1]]) + + # fetch all data + app.fetch(dummy_file_repo.src_file_dir.primary, expect_success=True) + check_version_data_dir(data_dir, version, file_set=_DUMMY_SET_PRIMARY) + + elif fetch_subset == "fetch-all-then-single": + # fetch all data + app.fetch(dummy_file_repo.src_file_dir.primary, expect_success=True) + check_version_data_dir(data_dir, version, file_set=_DUMMY_SET_PRIMARY) + + # fetch a single data file + app.fetch( + dummy_file_repo.src_file_dir.primary, + file_list=[_DUMMY_SET_PRIMARY[1][0]], + expect_success=True, ) - elif os.path.isdir(datastore_dir) != retains_datastore: - raise AssertionError( - f"the data-store directory, {datastore_dir}, should " - + 
["not", "still"][retains_datastore] - + "after the removal operation" + check_version_data_dir(data_dir, version, file_set=_DUMMY_SET_PRIMARY) + + elif fetch_subset == "fetch-all-explicit-list": + # fetch all data files + app.fetch( + dummy_file_repo.src_file_dir.primary, + file_list=[pair[0] for pair in _DUMMY_SET_PRIMARY], + expect_success=True, ) + check_version_data_dir(data_dir, version, file_set=_DUMMY_SET_PRIMARY) + + else: + raise RuntimeError("unexpected fetch_subset") + + app.rm_vdata(None, expect_success=True) + _check_removal(data_dir, version, retains_datastore=True) @pytest.mark.parametrize( @@ -599,6 +716,38 @@ def test_fetch_fail_locked(dummy_file_repo, cli_app): ) +def test_fetch_untracked(dummy_file_repo, cli_app): + # test the fetch operation is used to fetch files to an untracked directory + version = "1.0" + app = dummy_file_repo.cli_app_with_overrides(cli_app, "primary", version) + data_dir = os.path.join(dummy_file_repo.test_dir, "my-data-dir") + with custom_datadir(data_dir): + # first we download everything + dest_dir_1 = os.path.join(dummy_file_repo.test_dir, "my-untracked-dir-all") + for i in range(2): + app.fetch( + dummy_file_repo.src_file_dir.primary, + untracked_dest_dir=dest_dir_1, + expect_success=True, + ) + assert not os.path.isfile(data_dir) + check_version_data_dir_contents(dest_dir_1, file_set=_DUMMY_SET_PRIMARY) + + # now confirm that we can download a subset + dest_dir_2 = os.path.join(dummy_file_repo.test_dir, "my-untracked-dir-single") + for i in range(2): + app.fetch( + dummy_file_repo.src_file_dir.primary, + untracked_dest_dir=dest_dir_2, + file_list=[_DUMMY_SET_PRIMARY[1][0]], + expect_success=True, + ) + assert not os.path.isfile(data_dir) + check_version_data_dir_contents( + dest_dir_2, file_set=[_DUMMY_SET_PRIMARY[1]] + ) + + def is_linked(*paths): if len(paths) < 2: raise TypeError("is_linked() must have at least 2 arguments") From 4c4d926a9ea071eb56f320de5a09e96cb5bf2f03 Mon Sep 17 00:00:00 2001 From: Matthew 
Abruzzo Date: Thu, 10 Oct 2024 09:59:54 -0400 Subject: [PATCH 09/36] tweaked grdata.py to work with configure_file.py We plan to eventually install the grdata tool as a standalone command line program. Essentially the build-system will perform some substitutions (the CMake build system uses CMake's built-in ``configure_file`` command while the classic build system uses the analogous ``configure_file.py`` script) This commit introduces a few minor tweaks to grdata.py so that it can more easily be consumed by the ``configure_file.py`` script. - The ``configure_file.py`` script, itself, will ultimately require a few more tweaks so that it doesn't report occurrences of python's decorator-syntax as errors - However, this commit minimizes the number of required changes --- src/python/pygrackle/utilities/grdata.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/python/pygrackle/utilities/grdata.py b/src/python/pygrackle/utilities/grdata.py index 3346c876..5b139cc8 100644 --- a/src/python/pygrackle/utilities/grdata.py +++ b/src/python/pygrackle/utilities/grdata.py @@ -1792,7 +1792,7 @@ def make_config_objects(grackle_version, file_registry_file): # following procedure to the build-system: # - treat this file as a template-file and configure it with CMake's # ``configure_file`` command (or invoke ``configure_file.py`` under the classic -# build system) in order to substitute the names enclosed by the @ symbols +# build system) in order to substitute the names enclosed by the "at sign" symbol # - make resulting file executable (and maybe drop the .py suffix) # - install it into the bin directory alongside the grackle libraries @@ -1803,9 +1803,12 @@ def make_config_objects(grackle_version, file_registry_file): """ def _check_substitution_problems(var_name, var_value): + # we use unicode escape sequence, \u0040, that python automatically converts + # to the "at sign" to prevent the configure_file.py script (used by Grackle's + # 
build-system) from falsely reporting an error if ( (var_name in var_value) - or ("@" in var_value) + or ("\u0040" in var_value) or (len(var_value) == 0) or (var_value.isspace()) ): From 5d8a11b3dcf5c911eeb2879a9129f0acc9286160 Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Sun, 25 Aug 2024 12:59:53 -0400 Subject: [PATCH 10/36] datafile-readers take path as a separate arg --- src/clib/initialize_UVbackground_data.c | 45 +++++++++++++------------ src/clib/initialize_chemistry_data.c | 13 ++++--- src/clib/initialize_cloudy_data.c | 16 ++++----- 3 files changed, 37 insertions(+), 37 deletions(-) diff --git a/src/clib/initialize_UVbackground_data.c b/src/clib/initialize_UVbackground_data.c index eff87ba8..9c6e7efa 100644 --- a/src/clib/initialize_UVbackground_data.c +++ b/src/clib/initialize_UVbackground_data.c @@ -48,7 +48,8 @@ void initialize_empty_UVBtable_struct(UVBtable *table) // Initialize UV Background data -int initialize_UVbackground_data(chemistry_data *my_chemistry, +int initialize_UVbackground_data(const char* path, + chemistry_data *my_chemistry, chemistry_data_storage *my_rates) { long long Nz; @@ -71,8 +72,8 @@ int initialize_UVbackground_data(chemistry_data *my_chemistry, if (grackle_verbose) fprintf(stdout, "Reading UV background data from %s.\n", - my_chemistry->grackle_data_file); - file_id = H5Fopen(my_chemistry->grackle_data_file, + path); + file_id = H5Fopen(path, H5F_ACC_RDONLY, H5P_DEFAULT); @@ -81,7 +82,7 @@ int initialize_UVbackground_data(chemistry_data *my_chemistry, dset_id = H5Dopen(file_id, "/UVBRates/Info"); if (dset_id == h5_error) { fprintf(stderr, "Can't open 'Info' dataset in %s.\n", - my_chemistry->grackle_data_file); + path); return FAIL; } @@ -107,21 +108,21 @@ int initialize_UVbackground_data(chemistry_data *my_chemistry, dset_id = H5Dopen(file_id, "/UVBRates/z"); if (dset_id == h5_error) { fprintf(stderr, "Can't open redshift dataset ('z') in %s.\n", - my_chemistry->grackle_data_file); + path); return FAIL; } dspace_id = 
H5Dget_space(dset_id); if (dspace_id == h5_error) { fprintf(stderr, "Error opening dataspace for dataset 'z' in %s.\n", - my_chemistry->grackle_data_file); + path); return FAIL; } Nz = H5Sget_simple_extent_npoints(dspace_id); if(Nz <= 0) { fprintf(stderr, "Redshift dataset ('z') has inappropriate size = %lld in %s.\n", - Nz, my_chemistry->grackle_data_file); + Nz, path); return FAIL; } @@ -164,7 +165,7 @@ int initialize_UVbackground_data(chemistry_data *my_chemistry, if(! read_dataset(file_id, "/UVBRates/z", my_rates->UVbackground_table.z) ) { fprintf(stderr, "Error reading dataset 'z' in %s.\n", - my_chemistry->grackle_data_file); + path); return FAIL; } @@ -172,7 +173,7 @@ int initialize_UVbackground_data(chemistry_data *my_chemistry, if(! read_dataset(file_id, "/UVBRates/Chemistry/k24", my_rates->UVbackground_table.k24) ) { fprintf(stderr, "Error reading dataset '/UVBRates/Chemistry/k24' in %s.\n", - my_chemistry->grackle_data_file); + path); return FAIL; } @@ -180,7 +181,7 @@ int initialize_UVbackground_data(chemistry_data *my_chemistry, if(! read_dataset(file_id, "/UVBRates/Chemistry/k25", my_rates->UVbackground_table.k25) ) { fprintf(stderr, "Error reading dataset '/UVBRates/Chemistry/k25' in %s.\n", - my_chemistry->grackle_data_file); + path); return FAIL; } @@ -188,7 +189,7 @@ int initialize_UVbackground_data(chemistry_data *my_chemistry, if(! read_dataset(file_id, "/UVBRates/Chemistry/k26", my_rates->UVbackground_table.k26) ) { fprintf(stderr, "Error reading dataset '/UVBRates/Chemistry/k26' in %s.\n", - my_chemistry->grackle_data_file); + path); return FAIL; } @@ -198,7 +199,7 @@ int initialize_UVbackground_data(chemistry_data *my_chemistry, if(! read_dataset(file_id, "/UVBRates/Chemistry/k27", my_rates->UVbackground_table.k27) ) { fprintf(stderr, "Error reading dataset '/UVBRates/Chemistry/k27' in %s.\n", - my_chemistry->grackle_data_file); + path); return FAIL; } @@ -206,7 +207,7 @@ int initialize_UVbackground_data(chemistry_data *my_chemistry, if(! 
read_dataset(file_id, "/UVBRates/Chemistry/k28", my_rates->UVbackground_table.k28) ) { fprintf(stderr, "Error reading dataset '/UVBRates/Chemistry/k28' in %s.\n", - my_chemistry->grackle_data_file); + path); return FAIL; } @@ -214,7 +215,7 @@ int initialize_UVbackground_data(chemistry_data *my_chemistry, if(! read_dataset(file_id, "/UVBRates/Chemistry/k29", my_rates->UVbackground_table.k29) ) { fprintf(stderr, "Error reading dataset '/UVBRates/Chemistry/k29' in %s.\n", - my_chemistry->grackle_data_file); + path); return FAIL; } @@ -222,7 +223,7 @@ int initialize_UVbackground_data(chemistry_data *my_chemistry, if(! read_dataset(file_id, "/UVBRates/Chemistry/k30", my_rates->UVbackground_table.k30) ) { fprintf(stderr, "Error reading dataset '/UVBRates/Chemistry/k30' in %s.\n", - my_chemistry->grackle_data_file); + path); return FAIL; } @@ -230,7 +231,7 @@ int initialize_UVbackground_data(chemistry_data *my_chemistry, if(! read_dataset(file_id, "/UVBRates/Chemistry/k31", my_rates->UVbackground_table.k31) ) { fprintf(stderr, "Error reading dataset '/UVBRates/Chemistry/k31' in %s.\n", - my_chemistry->grackle_data_file); + path); return FAIL; } @@ -240,7 +241,7 @@ int initialize_UVbackground_data(chemistry_data *my_chemistry, if(! read_dataset(file_id, "/UVBRates/Photoheating/piHI", my_rates->UVbackground_table.piHI) ) { fprintf(stderr, "Error reading dataset '/UVBRates/Photoheating/piHI' in %s.\n", - my_chemistry->grackle_data_file); + path); return FAIL; } @@ -248,7 +249,7 @@ int initialize_UVbackground_data(chemistry_data *my_chemistry, if(! read_dataset(file_id, "/UVBRates/Photoheating/piHeII", my_rates->UVbackground_table.piHeII) ) { fprintf(stderr, "Error reading dataset '/UVBRates/Photoheating/piHeII' in %s.\n", - my_chemistry->grackle_data_file); + path); return FAIL; } @@ -256,7 +257,7 @@ int initialize_UVbackground_data(chemistry_data *my_chemistry, if(! 
read_dataset(file_id, "/UVBRates/Photoheating/piHeI", my_rates->UVbackground_table.piHeI) ) { fprintf(stderr, "Error reading dataset '/UVBRates/Photoheating/piHeI' in %s.\n", - my_chemistry->grackle_data_file); + path); return FAIL; } @@ -266,7 +267,7 @@ int initialize_UVbackground_data(chemistry_data *my_chemistry, if(! read_dataset(file_id, "/UVBRates/CrossSections/hi_avg_crs", my_rates->UVbackground_table.crsHI) ) { fprintf(stderr, "Error reading dataset '/UVBRates/CrossSections/hi_avg_crs' in %s.\n", - my_chemistry->grackle_data_file); + path); fprintf(stderr, "In order to use self-shielding, you must use the shielding datasets\n"); return FAIL; } @@ -275,7 +276,7 @@ int initialize_UVbackground_data(chemistry_data *my_chemistry, if(! read_dataset(file_id, "/UVBRates/CrossSections/heii_avg_crs", my_rates->UVbackground_table.crsHeII) ) { fprintf(stderr, "Error reading dataset '/UVBRates/CrossSections/heii_avg_crs' in %s.\n", - my_chemistry->grackle_data_file); + path); fprintf(stderr, "In order to use self-shielding, you must use the shielding datasets\n"); return FAIL; } @@ -284,7 +285,7 @@ int initialize_UVbackground_data(chemistry_data *my_chemistry, if(! 
read_dataset(file_id, "/UVBRates/CrossSections/hei_avg_crs", my_rates->UVbackground_table.crsHeI) ) { fprintf(stderr, "Error reading dataset '/UVBRates/CrossSections/hei_avg_crs' in %s.\n", - my_chemistry->grackle_data_file); + path); fprintf(stderr, "In order to use self-shielding, you must use the shielding datasets\n"); return FAIL; } diff --git a/src/clib/initialize_chemistry_data.c b/src/clib/initialize_chemistry_data.c index ed51108b..36aa462c 100644 --- a/src/clib/initialize_chemistry_data.c +++ b/src/clib/initialize_chemistry_data.c @@ -40,13 +40,13 @@ grackle_version get_grackle_version(); void show_parameters(FILE *fp, chemistry_data *my_chemistry); int _free_cloudy_data(cloudy_data *my_cloudy, chemistry_data *my_chemistry, int primordial); -int initialize_cloudy_data(chemistry_data *my_chemistry, +int initialize_cloudy_data(const char* path, chemistry_data *my_chemistry, chemistry_data_storage *my_rates, cloudy_data *my_cloudy, char *group_name, code_units *my_units, int read_data); -int initialize_UVbackground_data(chemistry_data *my_chemistry, +int initialize_UVbackground_data(const char* path, chemistry_data *my_chemistry, chemistry_data_storage *my_rates); int local_free_chemistry_data(chemistry_data *my_chemistry, chemistry_data_storage *my_rates); @@ -313,13 +313,16 @@ int local_initialize_chemistry_data(chemistry_data *my_chemistry, //* Call initialise_rates to compute rate tables. initialize_rates(my_chemistry, my_rates, my_units, co_length_units, co_density_units); + // prepare to read data from data files + const char* path = my_chemistry->grackle_data_file; + /* Initialize Cloudy cooling. */ my_rates->cloudy_data_new = 1; int read_data; /* Primordial tables. 
*/ read_data = my_chemistry->primordial_chemistry == 0; - if (initialize_cloudy_data(my_chemistry, my_rates, + if (initialize_cloudy_data(path, my_chemistry, my_rates, &my_rates->cloudy_primordial, "Primordial", my_units, read_data) == GR_FAIL) { fprintf(stderr, "Error in initialize_cloudy_data.\n"); @@ -328,7 +331,7 @@ int local_initialize_chemistry_data(chemistry_data *my_chemistry, /* Metal tables. */ read_data = my_chemistry->metal_cooling == TRUE; - if (initialize_cloudy_data(my_chemistry, my_rates, + if (initialize_cloudy_data(path, my_chemistry, my_rates, &my_rates->cloudy_metal, "Metals", my_units, read_data) == GR_FAIL) { fprintf(stderr, "Error in initialize_cloudy_data.\n"); @@ -337,7 +340,7 @@ int local_initialize_chemistry_data(chemistry_data *my_chemistry, /* Initialize UV Background data. */ initialize_empty_UVBtable_struct(&(my_rates->UVbackground_table)); - if (initialize_UVbackground_data(my_chemistry, my_rates) == GR_FAIL) { + if (initialize_UVbackground_data(path, my_chemistry, my_rates) == GR_FAIL) { fprintf(stderr, "Error in initialize_UVbackground_data.\n"); return GR_FAIL; } diff --git a/src/clib/initialize_cloudy_data.c b/src/clib/initialize_cloudy_data.c index 26d649e9..9d8acaee 100644 --- a/src/clib/initialize_cloudy_data.c +++ b/src/clib/initialize_cloudy_data.c @@ -41,7 +41,7 @@ void initialize_empty_cloudy_data_struct(cloudy_data *my_cloudy) } // Initialize Cloudy cooling data -int initialize_cloudy_data(chemistry_data *my_chemistry, +int initialize_cloudy_data(const char* path, chemistry_data *my_chemistry, chemistry_data_storage *my_rates, cloudy_data *my_cloudy, char *group_name, code_units *my_units, int read_data) @@ -60,7 +60,7 @@ int initialize_cloudy_data(chemistry_data *my_chemistry, if (grackle_verbose) { fprintf(stdout,"Initializing Cloudy cooling: %s.\n", group_name); - fprintf(stdout,"cloudy_table_file: %s.\n",my_chemistry->grackle_data_file); + fprintf(stdout,"cloudy_table_file: %s.\n",path); } /* Get conversion units. 
*/ @@ -91,8 +91,7 @@ int initialize_cloudy_data(chemistry_data *my_chemistry, herr_t status; herr_t h5_error = -1; - file_id = H5Fopen(my_chemistry->grackle_data_file, - H5F_ACC_RDONLY, H5P_DEFAULT); + file_id = H5Fopen(path, H5F_ACC_RDONLY, H5P_DEFAULT); if (H5Aexists(file_id, "old_style")) { my_rates->cloudy_data_new = 0; @@ -105,8 +104,7 @@ int initialize_cloudy_data(chemistry_data *my_chemistry, sprintf(parameter_name, "/CoolingRates/%s/Cooling", group_name); dset_id = H5Dopen(file_id, parameter_name); if (dset_id == h5_error) { - fprintf(stderr,"Can't open Cooling in %s.\n", - my_chemistry->grackle_data_file); + fprintf(stderr,"Can't open Cooling in %s.\n", path); return FAIL; } @@ -249,8 +247,7 @@ int initialize_cloudy_data(chemistry_data *my_chemistry, sprintf(parameter_name, "/CoolingRates/%s/Heating", group_name); dset_id = H5Dopen(file_id, parameter_name); if (dset_id == h5_error) { - fprintf(stderr,"Can't open Heating in %s.\n", - my_chemistry->grackle_data_file); + fprintf(stderr,"Can't open Heating in %s.\n", path); return FAIL; } @@ -288,8 +285,7 @@ int initialize_cloudy_data(chemistry_data *my_chemistry, sprintf(parameter_name, "/CoolingRates/%s/MMW", group_name); dset_id = H5Dopen(file_id, parameter_name); if (dset_id == h5_error) { - fprintf(stderr,"Can't open MMW in %s.\n", - my_chemistry->grackle_data_file); + fprintf(stderr,"Can't open MMW in %s.\n", path); return FAIL; } From 657bee161a49fdda9990ee083130300ef404fc5e Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Sun, 25 Aug 2024 13:02:04 -0400 Subject: [PATCH 11/36] introduce the grackle_data_file_options parameter. 
--- src/clib/grackle_chemistry_data_fields.def | 3 +++ src/include/grackle_chemistry_data.h | 3 +++ src/include/grackle_fortran_interface.def | 3 ++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/clib/grackle_chemistry_data_fields.def b/src/clib/grackle_chemistry_data_fields.def index bcc8aa4a..22df5656 100644 --- a/src/clib/grackle_chemistry_data_fields.def +++ b/src/clib/grackle_chemistry_data_fields.def @@ -53,6 +53,9 @@ ENTRY(UVbackground, INT, 0) /* data file containing cooling and UV background tables */ ENTRY(grackle_data_file, STRING, "") +/* specifies handling of grackle_data_file */ +ENTRY(grackle_data_file_options, INT, -1) + /* Use a CMB temperature floor 0) no, 1) yes */ ENTRY(cmb_temperature_floor, INT, 1) diff --git a/src/include/grackle_chemistry_data.h b/src/include/grackle_chemistry_data.h index 22ce5007..fece76f2 100644 --- a/src/include/grackle_chemistry_data.h +++ b/src/include/grackle_chemistry_data.h @@ -60,6 +60,9 @@ typedef struct /* data file containing cooling and UV background tables */ const char *grackle_data_file; + /* specifies handling of grackle_data_file */ + int grackle_data_file_options; + /* Use a CMB temperature floor 0) no, 1) yes */ int cmb_temperature_floor; diff --git a/src/include/grackle_fortran_interface.def b/src/include/grackle_fortran_interface.def index 6629c69e..b3ecb47a 100644 --- a/src/include/grackle_fortran_interface.def +++ b/src/include/grackle_fortran_interface.def @@ -89,6 +89,7 @@ c This is the fortran definition of grackle_chemistry_data INTEGER(C_INT) :: metal_cooling INTEGER(C_INT) :: UVbackground TYPE(C_PTR) :: grackle_data_file + INTEGER(C_INT) :: grackle_data_file_options INTEGER(C_INT) :: cmb_temperature_floor REAL(C_DOUBLE) :: Gamma INTEGER(C_INT) :: h2_on_dust @@ -252,4 +253,4 @@ c The following define the fortran interfaces to the C routines IMPORT TYPE(grackle_field_data), INTENT(INOUT) :: my_fields END FUNCTION gr_initialize_field_data - END INTERFACE \ No newline at end of 
file + END INTERFACE From 165e062b1794635d650618be346d39f5552248de Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Sun, 25 Aug 2024 13:35:38 -0400 Subject: [PATCH 12/36] introduce a barebones version of determine_data_file_ --- src/clib/CMakeLists.txt | 2 + src/clib/data_file_utils.c | 55 ++++++++++++++++++++++++++++ src/clib/data_file_utils.h | 48 ++++++++++++++++++++++++ src/clib/initialize_chemistry_data.c | 20 ++++++++-- 4 files changed, 121 insertions(+), 4 deletions(-) create mode 100644 src/clib/data_file_utils.c create mode 100644 src/clib/data_file_utils.h diff --git a/src/clib/CMakeLists.txt b/src/clib/CMakeLists.txt index 35aedc94..00e9958e 100644 --- a/src/clib/CMakeLists.txt +++ b/src/clib/CMakeLists.txt @@ -86,6 +86,7 @@ add_library(Grackle_Grackle calculate_gamma.c calculate_pressure.c calculate_temperature.c + data_file_utils.c dynamic_api.c grackle_units.c index_helper.c @@ -135,6 +136,7 @@ add_library(Grackle_Grackle # C private headers cie_thin_cooling_rate_tables.h + data_file_utils.h grackle_chemistry_data_fields.def # <-- acts as a C header grackle_macros.h index_helper.h diff --git a/src/clib/data_file_utils.c b/src/clib/data_file_utils.c new file mode 100644 index 00000000..fff85aba --- /dev/null +++ b/src/clib/data_file_utils.c @@ -0,0 +1,55 @@ +/*********************************************************************** +/ +/ Implement logic used internally by Grackle to determine data files. +/ +/ +/ Copyright (c) 2013, Enzo/Grackle Development Team. +/ +/ Distributed under the terms of the Enzo Public Licence. +/ +/ The full license is in the file LICENSE, distributed with this +/ software. 
+************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "data_file_utils.h"
+
+struct generic_file_props determine_data_file_(const char* grackle_data_file,
+                                               int grackle_data_file_options)
+{
+  // initialize output struct in a format that will denote an error (if is
+  // never modified)
+  struct generic_file_props out = {NULL, 0, NULL, 0};
+
+  if (grackle_data_file == NULL) {
+    fprintf(stderr, "grackle_data_file must not be NULL\n");
+    return out;
+
+  } else if (grackle_data_file_options == -1) { // the legacy case!
+    out.path = grackle_data_file;
+    return out;
+
+  } else {
+    fprintf(stderr, "grackle_data_file_options has an unexpected value: %d\n",
+            grackle_data_file_options);
+    return out;
+
+  }
+
+
+}
+
+void free_generic_file_props_(struct generic_file_props* ptr) {
+  if (ptr != NULL) {
+    if (ptr->path_requires_dealloc){
+      free((char*)ptr->path);
+    }
+    ptr->path = NULL;
+    if (ptr->checksum_requires_dealloc){
+      free((char*)ptr->checksum);
+    }
+    ptr->checksum = NULL;
+  }
+}
diff --git a/src/clib/data_file_utils.h b/src/clib/data_file_utils.h
new file mode 100644
index 00000000..664e3b3c
--- /dev/null
+++ b/src/clib/data_file_utils.h
@@ -0,0 +1,48 @@
+/***********************************************************************
+/
+/ Declare utility functions used internally by Grackle to encapsulate
+/ logic for determining data files
+/
+/
+/ Copyright (c) 2013, Enzo/Grackle Development Team.
+/
+/ Distributed under the terms of the Enzo Public Licence.
+/
+/ The full license is in the file LICENSE, distributed with this
+/ software.
+************************************************************************/
+
+#ifndef DATA_FILE_UTILS_H
+#define DATA_FILE_UTILS_H
+
+/// used as the return type when determining the self-shielding location
+///
+/// if ``path`` is ``NULL``, then there is an error. 
This struct should NEVER
+/// be exposed as part of the public API
+struct generic_file_props {
+  const char* path;
+  int path_requires_dealloc;
+  const char* checksum;
+  int checksum_requires_dealloc;
+};
+
+
+/// Determines the path to the data file.
+///
+/// @param[in] grackle_data_file specified grackle data file
+/// @param[in] grackle_data_file_options specifies how to interpret the first
+///     argument
+///
+/// @note
+/// If this functionality ever gets exposed as part of the public API, we
+/// should stop using generic_file_props as a return type. We should also
+/// make it possible for the caller to pre-allocate any buffers to hold the
+/// file path and the computed checksum (in that case, we should consider
+/// adopting an interface sorta like snprintf)
+struct generic_file_props determine_data_file_(const char* grackle_data_file,
+                                               int grackle_data_file_options);
+
+/// Deallocates the memory held within a given ``struct generic_file_props``
+void free_generic_file_props_(struct generic_file_props* ptr);
+
+#endif /* DATA_FILE_UTILS_H */
diff --git a/src/clib/initialize_chemistry_data.c b/src/clib/initialize_chemistry_data.c
index 36aa462c..858f344f 100644
--- a/src/clib/initialize_chemistry_data.c
+++ b/src/clib/initialize_chemistry_data.c
@@ -16,6 +16,7 @@
 #include <string.h>
 #include <stdio.h>
 #include <math.h>
+#include "data_file_utils.h"
 #include "grackle.h"
 #include "grackle_macros.h"
 #include "grackle_types.h"
@@ -314,7 +315,13 @@ int local_initialize_chemistry_data(chemistry_data *my_chemistry,
     initialize_rates(my_chemistry, my_rates, my_units, co_length_units,
                      co_density_units);
   // prepare to read data from data files
-  const char* path = my_chemistry->grackle_data_file;
+
+  struct generic_file_props file_props =
+    determine_data_file_(my_chemistry->grackle_data_file,
+                         my_chemistry->grackle_data_file_options);
+  if (file_props.path == NULL) {
+    return GR_FAIL;
+  }
 
   /* Initialize Cloudy cooling. 
*/ my_rates->cloudy_data_new = 1; @@ -322,7 +329,7 @@ int local_initialize_chemistry_data(chemistry_data *my_chemistry, /* Primordial tables. */ read_data = my_chemistry->primordial_chemistry == 0; - if (initialize_cloudy_data(path, my_chemistry, my_rates, + if (initialize_cloudy_data(file_props.path, my_chemistry, my_rates, &my_rates->cloudy_primordial, "Primordial", my_units, read_data) == GR_FAIL) { fprintf(stderr, "Error in initialize_cloudy_data.\n"); @@ -331,7 +338,7 @@ int local_initialize_chemistry_data(chemistry_data *my_chemistry, /* Metal tables. */ read_data = my_chemistry->metal_cooling == TRUE; - if (initialize_cloudy_data(path, my_chemistry, my_rates, + if (initialize_cloudy_data(file_props.path, my_chemistry, my_rates, &my_rates->cloudy_metal, "Metals", my_units, read_data) == GR_FAIL) { fprintf(stderr, "Error in initialize_cloudy_data.\n"); @@ -340,11 +347,16 @@ int local_initialize_chemistry_data(chemistry_data *my_chemistry, /* Initialize UV Background data. */ initialize_empty_UVBtable_struct(&(my_rates->UVbackground_table)); - if (initialize_UVbackground_data(path, my_chemistry, my_rates) == GR_FAIL) { + if (initialize_UVbackground_data(file_props.path, my_chemistry, my_rates) + == GR_FAIL) { fprintf(stderr, "Error in initialize_UVbackground_data.\n"); return GR_FAIL; } + // clean up from reading in data files + free_generic_file_props_(&file_props); + + /* store a copy of the initial units */ my_rates->initial_units = *my_units; From 0456441a9b749ea7880f29634ab71ea73fea099f Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Fri, 30 Aug 2024 11:24:01 -0400 Subject: [PATCH 13/36] introduce os_utils.[ch] --- src/clib/CMakeLists.txt | 21 ++++ src/clib/os_utils.c | 273 ++++++++++++++++++++++++++++++++++++++++ src/clib/os_utils.h | 51 ++++++++ 3 files changed, 345 insertions(+) create mode 100644 src/clib/os_utils.c create mode 100644 src/clib/os_utils.h diff --git a/src/clib/CMakeLists.txt b/src/clib/CMakeLists.txt index 00e9958e..7b093282 100644 
--- a/src/clib/CMakeLists.txt +++ b/src/clib/CMakeLists.txt @@ -94,6 +94,7 @@ add_library(Grackle_Grackle initialize_cloudy_data.c initialize_rates.c initialize_UVbackground_data.c + os_utils.c rate_functions.c set_default_chemistry_parameters.c solve_chemistry.c @@ -140,6 +141,7 @@ add_library(Grackle_Grackle grackle_chemistry_data_fields.def # <-- acts as a C header grackle_macros.h index_helper.h + os_utils.h phys_constants.h utils.h @@ -258,6 +260,25 @@ if ("${CMAKE_SYSTEM_NAME}" MATCHES "^(Linux)|(Darwin)$") target_compile_definitions(Grackle_Grackle PRIVATE "LINUX") endif() +# define a macro to specify the correct macro for use within os_utils.c +# -> this is used to define functionality that specifies the default +# location where data is stored (this location is prefered by OS) +# when the GRACKLE_DATA_DIR isn't specified +if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin") + target_compile_definitions(Grackle_Grackle PRIVATE PLATFORM_MACOS) +else() + # check if we can treat the platform as a generic unix-like OS that provides + # a few standard headers + + include(CheckIncludeFiles) + unset(HAS_POSIX_HDRS CACHE) + CHECK_INCLUDE_FILES("unistd.h;sys/types.h;pwd.h" HAS_POSIX_HDRS) + if(UNIX AND "${HAS_POSIX_HDRS}" STREQUAL "1") + target_compile_definitions(Grackle_Grackle PRIVATE PLATFORM_GENERIC_UNIX) + endif() +endif() + + # If we are building a shared library, construct a "configuration Makefile" # that is used to build the code examples # - we're mostly just doing this to let us run the code examples as part of the diff --git a/src/clib/os_utils.c b/src/clib/os_utils.c new file mode 100644 index 00000000..9fef4460 --- /dev/null +++ b/src/clib/os_utils.c @@ -0,0 +1,273 @@ +/*********************************************************************** +/ +/ Implement utility functions used internally by Grackle to related to +/ path manipulation and OS-specific functionality +/ +/ +/ Copyright (c) 2013, Enzo/Grackle Development Team. 
+/
+/ Distributed under the terms of the Enzo Public Licence.
+/
+/ The full license is in the file LICENSE, distributed with this
+/ software.
+************************************************************************/
+
+#include <errno.h>   // ERANGE
+#include <stdio.h>   // fprintf, stderr
+#include <stdlib.h>  // malloc, realloc, free, abort
+#include <string.h>  // memcpy, strlen
+
+#include "os_utils.h"
+#include "grackle.h" // grackle_verbose
+
+/// Just like getenv, except it returns NULL in place of strings of length 0.
+static const char* getenv_nonempty_(const char* name) {
+  const char* out = getenv(name);
+  return ((out == NULL) || (out[0] == '\0')) ? NULL : out;
+}
+
+char* my_strdup_(const char* src) {
+  size_t len_with_nul = strlen(src) + 1;
+  char* out = malloc(sizeof(char) * len_with_nul);
+  memcpy(out, src, len_with_nul);
+  return out;
+}
+
+const char* post_prefix_ptr_(const char* s, const char* prefix) {
+  if ((s == NULL) || (prefix == NULL)) return NULL;
+
+  // these lengths don't include the null terminator
+  size_t len_s = strlen(s);
+  size_t len_prefix = strlen(prefix);
+  if ((len_s < len_prefix) || (len_prefix == 0)) return NULL;
+
+  if (memcmp(s, prefix, len_prefix) != 0) return NULL;
+
+  return s + len_prefix;
+}
+
+char* join_parts_(char sep, const char** parts, int nparts) {
+  if (nparts < 2) return NULL;
+
+  // in principle, we could give sep == '\0' special significance
+
+  size_t total_len = 0;
+  for (int i = 0; i < nparts; i++) {
+    if (parts[i] == NULL) return NULL;
+    total_len += strlen(parts[i]); // we don't include the nul-terminator
+  }
+  total_len += (nparts - 1); // account for the size of sep and
+  total_len++; // account for trailing nul-terminator
+
+  char* out = malloc(total_len);
+  size_t cur_offset = 0;
+  for (int i = 0; i < nparts; i++) {
+    if (i > 0) {
+      out[cur_offset] = sep;
+      cur_offset++;
+    }
+    size_t cur_part_len = strlen(parts[i]);
+    memcpy(out + cur_offset, parts[i], cur_part_len);
+    cur_offset += cur_part_len;
+  }
+  out[cur_offset] = '\0';
+  if 
((cur_offset+1) != total_len) abort();
+  return out;
+}
+
+
+// Platform-Specific Stuff
+// -----------------------
+
+enum platform_kind get_platform_(void) {
+#if defined(PLATFORM_GENERIC_UNIX) && defined(PLATFORM_MACOS)
+  #error "more than 1 platform macro was defined"
+#elif defined(PLATFORM_GENERIC_UNIX)
+  return platform_kind_generic_unix;
+#elif defined(PLATFORM_MACOS)
+  return platform_kind_macos;
+#else
+  return platform_kind_unknown;
+#endif
+}
+
+// define a function to get the home directory
+
+/// returns the user's home directory
+///
+/// If it is defined with a non-empty value, the function honors the value in
+/// the ``HOME`` environment variable. Otherwise, the function falls back to
+/// fetching the value using platform specific apis.
+///
+/// @return a string pointing to the current user's home directory. ``NULL`` is
+///     returned if there was an error. The caller is always responsible for
+///     deallocating this string.
+static char* get_home_dir(void);
+
+#if defined(PLATFORM_GENERIC_UNIX) || defined(PLATFORM_MACOS)
+
+// assume a posix-platform, the following headers are all standard
+
+
+#include <sys/types.h>  // uid_t
+#include <unistd.h>     // getuid, sysconf
+#include <pwd.h>        // getpwuid, struct passwd
+
+static char* get_home_dir(void)
+{
+  // first, try to get the value set in the environment
+  const char* env_str = getenv_nonempty_("HOME");
+  if (env_str != NULL) return my_strdup_(env_str);
+
+  // fall back to checking the user database (standard on posix systems)
+
+  // ask the system for an upper limit on the buffersize to hold the results
+  const long initial_bufsize_guess = sysconf(_SC_GETPW_R_SIZE_MAX);
+
+  // If the system can't give a firm answer, we guess.
+  long bufsize = (initial_bufsize_guess == -1) ? 
2048 : initial_bufsize_guess; + char* buffer = NULL; + + struct passwd pwd, *result; + int return_code; + + do { + if (buffer == NULL) { // our 1st attempt + buffer = malloc(sizeof(char)*bufsize); + } else { // our next attempt + bufsize *= 2; + char* tmp = realloc(buffer, sizeof(char)*bufsize); + if (tmp == NULL) break; + buffer = tmp; + } + return_code = getpwuid_r(getuid(), &pwd, buffer, bufsize, &result); + } while ((return_code == ERANGE) && (bufsize < 1000000000)); + + if (return_code != 0) { + free(buffer); + fprintf(stderr, "ERROR while determining the HOME directory\n"); + return NULL; + } + + char* out = my_strdup_(pwd.pw_dir); + free(buffer); + return out; +} +#else +static char* get_home_dir(void) { + fprintf(stderr, + "Don't know how to determine HOME directory on current platform\n"); + return NULL; +} +#endif + +/// Returns a string specifying the default data directory +/// +/// All of these choices are inspired by the API description of the +/// platformdirs python package +/// * we only looked at online documentation: +/// https://platformdirs.readthedocs.io/en/latest/ +/// * we have NOT read any source code +static char* default_data_dir_(enum platform_kind kind) { + const char* appname = "grackle"; + switch(kind) { + + case platform_kind_unknown: { + fprintf( + stderr, + ("ERROR: can't infer default data dir on unknown platform.\n" + " -> can only infer data directories on macOS and unix systems\n") + ); + return NULL; + } + + case platform_kind_macos: { + // https://developer.apple.com/library/archive/documentation/FileManagement/Conceptual/FileSystemProgrammingGuide/MacOSXDirectories/MacOSXDirectories.html + char* home_dir = get_home_dir(); + const char * parts[3] = { + home_dir, "Library/Application Support", appname + }; + char* out = join_parts_('/', parts, 3); + free(home_dir); + return out; + } + + case platform_kind_generic_unix: { + // https://specifications.freedesktop.org/basedir-spec/latest/ + const char* env_str = 
getenv_nonempty_("XDG_DATA_HOME"); + + // check if we need to fall back to the default + const char* dflt = "~/.local/share"; + if (env_str == NULL) { + env_str = dflt; + } else if ((env_str[0] != '~') && (env_str[0] != '/')) { + // this is what the specification tells us to do + fprintf(stderr, + "WARNING: ignoring XDG_DATA_HOME because it doesn't hold an " + "absolute path\n"); + env_str = dflt; + } + + // now actually infer the absolute path + if (env_str[0] == '~') { + if (post_prefix_ptr_(env_str, "~/") == NULL) { + fprintf(stderr, + "ERROR: can't expand env-variable, XDG_DATA_HOME when it " + "starts with `~user/` or just contains `~`\n"); + return NULL; + } + + char* home_dir = get_home_dir(); + const char* parts[3] = {home_dir, env_str + 1, appname}; + char* out = join_parts_('/', parts, 3); + free(home_dir); + return out; + + } else { + const char* parts[2] = {env_str, appname}; + char* out = join_parts_('/', parts, 2); + return out; + + } + } + + } + + fprintf(stderr, + "ERROR: This part of the function should be unreachable! 
Did you add " + "a new platform_kind and forget to update the function?\n"); + abort(); +} + +char* get_data_dir_(enum platform_kind kind) { + const char* env_str = getenv_nonempty_("GRACKLE_DATA_DIR"); + char* out; + const char* description; + if (env_str != NULL) { + out = my_strdup_(env_str); + description = "from the `GRACKLE_DATA_DIR` environment variable"; + } else { + if (grackle_verbose) { + fprintf(stdout, + ("INFO: looking up system-default for the data directory since " + "`GRACKLE_DATA_DIR` env variable is empty\n")); + fflush(stdout); // flush in case we run into an error in the next call + } + out = default_data_dir_(kind); + description = "inferred from the system defaults"; + } + + // confirm we are providing an absolute path + if (out[0] != '/') { + fprintf(stderr, + "ERROR: the data-directory %s, `%s` is not an absolute path\n", + description, out); + free(out); + return out; + } + if (grackle_verbose) { + fprintf(stdout, "INFO: the data-directory (%s) is: `%s`\n", + description, out); + } + return out; +} diff --git a/src/clib/os_utils.h b/src/clib/os_utils.h new file mode 100644 index 00000000..81c1cf25 --- /dev/null +++ b/src/clib/os_utils.h @@ -0,0 +1,51 @@ +/*********************************************************************** +/ +/ Declare utility functions used internally by Grackle to related to +/ path manipulation and OS-specific functionality +/ +/ +/ Copyright (c) 2013, Enzo/Grackle Development Team. +/ +/ Distributed under the terms of the Enzo Public Licence. +/ +/ The full license is in the file LICENSE, distributed with this +/ software. +************************************************************************/ + +#ifndef OS_UTILS_H +#define OS_UTILS_H + +/// a portable version of strdup, which is provided on posix and in C23 +char* my_strdup_(const char* src); + +/// For a string ``s`` that starts with prefix ``prefix``, this returns +/// the first character in ``s`` after the prefix. Otherwise, it returns NULL. 
+/// +/// If the returned non-NULL ptr points to a '\0' character, then both strings +/// are identical. +/// +/// @param s the full string that may begin with the prefix +/// @param prefix the prefix that the full string may begin with +/// +/// @return ``NULL`` either argument was ``NULL`` or if ``path`` does not +/// start with ``prefix``. Otherwise, this returns ``path + strlen(prefix)`` +const char* post_prefix_ptr_(const char* s, const char* prefix); + +/// join together fragments of a string into 1 newly allocated string +char* join_parts_(char sep, const char** parts, int nparts); + + +/// represents the known platform types (that produce different results) +enum platform_kind { + platform_kind_generic_unix, + platform_kind_macos, + platform_kind_unknown +}; + +/// function that returns the appropriate platform enum +enum platform_kind get_platform_(void); + +/// get the Grackle data directory +char* get_data_dir_(enum platform_kind kind); + +#endif /* OS_UTILS_H */ From f80226c6c9702e24dc8c63eddde1c4a7e78c9004 Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Mon, 26 Aug 2024 09:06:51 -0400 Subject: [PATCH 14/36] add file-search logic to determine_data_file_ Among other things, we started using picohash and using the functions in os_utils.ch --- CMakeLists.txt | 27 ++ external/picohash.h | 754 +++++++++++++++++++++++++++++++++++++ src/clib/CMakeLists.txt | 1 + src/clib/data_file_utils.c | 319 +++++++++++++++- 4 files changed, 1100 insertions(+), 1 deletion(-) create mode 100644 external/picohash.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 6925b127..070c6588 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -171,6 +171,33 @@ if (UNIX AND NOT CMAKE_SYSTEM_NAME STREQUAL "Darwin") set_target_properties(toolchain::m PROPERTIES IMPORTED_LIBNAME "m") endif() + +# picohash is a vendored, self-contained, header-only library +# -> it's a internal dependency of Grackle::Grackle (it shouldn't be exposed) +# -> If it were an `INTERFACE` library, CMake 
would abort with an error when we +# declare rules for installing the "export information" for a static-lib +# build of Grackle::Grackle (there aren't problems for a shared-lib build) +# -> The "export information" includes autogenerated linking logic that gets +# evaluated by external CMake projects that consume Grackle::Grackle via +# `find_package`. This logic infers a list of any other libraries that +# are shipped by this project that need to be linked to use libgrackle.a +# -> When invoked, the logic goes through ALL of Grackle::Grackle's (public +# & private) depenedencies that COULD specify such linking requirements. +# -> CMake will complain if any INTERFACE library used by Grackle::Grackle +# isn't publicaly exported since it COULD specify this information. +# -> the BUILD_LOCAL_INTERFACE generator expression, (in CMake 3.26+) can +# work around this +# -> this isn't an issue for INTERFACE IMPORTED libraries since IMPORTED +# libs should only specify linker requirements of prebuilt external libs, +# if there are any. (i.e. if there are any reqs, CMake expects the developer +# to manually add logic to the installed "export info") +add_library(picohash INTERFACE IMPORTED) + +# we use the SYSTEM option to suppress any warnings +target_include_directories(picohash SYSTEM INTERFACE + ${CMAKE_CURRENT_SOURCE_DIR}/external +) + # Main build targets # ------------------ add_subdirectory(src/clib) diff --git a/external/picohash.h b/external/picohash.h new file mode 100644 index 00000000..28b37422 --- /dev/null +++ b/external/picohash.h @@ -0,0 +1,754 @@ +/* + * The code is placed under public domain by Kazuho Oku . + * + * The MD5 implementation is based on a public domain implementation written by + * Solar Designer in 2001, which is used by Dovecot. + * + * The SHA1 implementation is based on a public domain implementation written + * by Wei Dai and other contributors for libcrypt, used also in liboauth. 
+ *
+ * The SHA224/SHA256 implementation is based on a public domain implementation
+ * by Sam Hocevar for LibTomCrypt.
+ */
+#ifndef _picohash_h_
+#define _picohash_h_
+
+#include <assert.h>
+#include <inttypes.h>
+#include <string.h>
+
+#ifdef __BIG_ENDIAN__
+#define _PICOHASH_BIG_ENDIAN
+#elif defined __LITTLE_ENDIAN__
+/* override */
+#elif defined __BYTE_ORDER
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define _PICOHASH_BIG_ENDIAN
+#endif
+#elif !defined(_WIN32)
+#include <machine/endian.h> // machine/endian.h
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define _PICOHASH_BIG_ENDIAN
+#endif
+#endif
+
+#define PICOHASH_MD5_BLOCK_LENGTH 64
+#define PICOHASH_MD5_DIGEST_LENGTH 16
+
+typedef struct _picohash_md5_ctx_t {
+    uint_fast32_t lo, hi;
+    uint_fast32_t a, b, c, d;
+    unsigned char buffer[64];
+    uint_fast32_t block[PICOHASH_MD5_DIGEST_LENGTH];
+    const void *(*_body)(struct _picohash_md5_ctx_t *ctx, const void *data, size_t size);
+} _picohash_md5_ctx_t;
+
+static void _picohash_md5_init(_picohash_md5_ctx_t *ctx);
+static void _picohash_md5_update(_picohash_md5_ctx_t *ctx, const void *data, size_t size);
+static void _picohash_md5_final(_picohash_md5_ctx_t *ctx, void *digest);
+
+#define PICOHASH_SHA1_BLOCK_LENGTH 64
+#define PICOHASH_SHA1_DIGEST_LENGTH 20
+
+typedef struct {
+    uint32_t buffer[PICOHASH_SHA1_BLOCK_LENGTH / 4];
+    uint32_t state[PICOHASH_SHA1_DIGEST_LENGTH / 4];
+    uint64_t byteCount;
+    uint8_t bufferOffset;
+} _picohash_sha1_ctx_t;
+
+static void _picohash_sha1_init(_picohash_sha1_ctx_t *ctx);
+static void _picohash_sha1_update(_picohash_sha1_ctx_t *ctx, const void *input, size_t len);
+static void _picohash_sha1_final(_picohash_sha1_ctx_t *ctx, void *digest);
+
+#define PICOHASH_SHA256_BLOCK_LENGTH 64
+#define PICOHASH_SHA256_DIGEST_LENGTH 32
+#define PICOHASH_SHA224_BLOCK_LENGTH PICOHASH_SHA256_BLOCK_LENGTH
+#define PICOHASH_SHA224_DIGEST_LENGTH 28
+
+typedef struct {
+    uint64_t length;
+    uint32_t state[PICOHASH_SHA256_DIGEST_LENGTH / 4];
+    uint32_t curlen;
+    unsigned char 
buf[PICOHASH_SHA256_BLOCK_LENGTH]; +} _picohash_sha256_ctx_t; + +static void _picohash_sha256_init(_picohash_sha256_ctx_t *ctx); +static void _picohash_sha256_update(_picohash_sha256_ctx_t *ctx, const void *data, size_t len); +static void _picohash_sha256_final(_picohash_sha256_ctx_t *ctx, void *digest); +static void _picohash_sha224_init(_picohash_sha256_ctx_t *ctx); +static void _picohash_sha224_final(_picohash_sha256_ctx_t *ctx, void *digest); + +#define PICOHASH_MAX_BLOCK_LENGTH 64 +#define PICOHASH_MAX_DIGEST_LENGTH 32 + +typedef struct { + union { + _picohash_md5_ctx_t _md5; + _picohash_sha1_ctx_t _sha1; + _picohash_sha256_ctx_t _sha256; + }; + size_t block_length; + size_t digest_length; + void (*_reset)(void *ctx); + void (*_update)(void *ctx, const void *input, size_t len); + void (*_final)(void *ctx, void *digest); + struct { + unsigned char key[PICOHASH_MAX_BLOCK_LENGTH]; + void (*hash_reset)(void *ctx); + void (*hash_final)(void *ctx, void *digest); + } _hmac; +} picohash_ctx_t; + +static void picohash_init_md5(picohash_ctx_t *ctx); +static void picohash_init_sha1(picohash_ctx_t *ctx); +static void picohash_init_sha224(picohash_ctx_t *ctx); +static void picohash_init_sha256(picohash_ctx_t *ctx); +static void picohash_update(picohash_ctx_t *ctx, const void *input, size_t len); +static void picohash_final(picohash_ctx_t *ctx, void *digest); +static void picohash_reset(picohash_ctx_t *ctx); + +static void picohash_init_hmac(picohash_ctx_t *ctx, void (*initf)(picohash_ctx_t *), const void *key, size_t key_len); + +/* following are private definitions */ + +/* + * The basic MD5 functions. + * + * F is optimized compared to its RFC 1321 definition just like in Colin + * Plumb's implementation. 
+ */ +#define _PICOHASH_MD5_F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define _PICOHASH_MD5_G(x, y, z) ((y) ^ ((z) & ((x) ^ (y)))) +#define _PICOHASH_MD5_H(x, y, z) ((x) ^ (y) ^ (z)) +#define _PICOHASH_MD5_I(x, y, z) ((y) ^ ((x) | ~(z))) + +/* + * The MD5 transformation for all four rounds. + */ +#define _PICOHASH_MD5_STEP(f, a, b, c, d, x, t, s) \ + (a) += f((b), (c), (d)) + (x) + (t); \ + (a) = (((a) << (s)) | (((a)&0xffffffff) >> (32 - (s)))); \ + (a) += (b); + +/* + * SET reads 4 input bytes in little-endian byte order and stores them + * in a properly aligned word in host byte order. + * + * The check for little-endian architectures which tolerate unaligned + * memory accesses is just an optimization. Nothing will break if it + * doesn't work. + */ +#if defined(__i386__) || defined(__x86_64__) || defined(__vax__) +#define _PICOHASH_MD5_SET(n) (*(const uint32_t *)&ptr[(n)*4]) +#define _PICOHASH_MD5_GET(n) _PICOHASH_MD5_SET(n) +#else +#define _PICOHASH_MD5_SET(n) \ + (ctx->block[(n)] = (uint_fast32_t)ptr[(n)*4] | ((uint_fast32_t)ptr[(n)*4 + 1] << 8) | ((uint_fast32_t)ptr[(n)*4 + 2] << 16) | \ + ((uint_fast32_t)ptr[(n)*4 + 3] << 24)) +#define _PICOHASH_MD5_GET(n) (ctx->block[(n)]) +#endif + +/* + * This processes one or more 64-byte data blocks, but does NOT update + * the bit counters. There're no alignment requirements. 
+ */ +static const void *_picohash_md5_body(_picohash_md5_ctx_t *ctx, const void *data, size_t size) +{ + const unsigned char *ptr; + uint_fast32_t a, b, c, d; + uint_fast32_t saved_a, saved_b, saved_c, saved_d; + + ptr = data; + + a = ctx->a; + b = ctx->b; + c = ctx->c; + d = ctx->d; + + do { + saved_a = a; + saved_b = b; + saved_c = c; + saved_d = d; + + /* Round 1 */ + _PICOHASH_MD5_STEP(_PICOHASH_MD5_F, a, b, c, d, _PICOHASH_MD5_SET(0), 0xd76aa478, 7) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_F, d, a, b, c, _PICOHASH_MD5_SET(1), 0xe8c7b756, 12) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_F, c, d, a, b, _PICOHASH_MD5_SET(2), 0x242070db, 17) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_F, b, c, d, a, _PICOHASH_MD5_SET(3), 0xc1bdceee, 22) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_F, a, b, c, d, _PICOHASH_MD5_SET(4), 0xf57c0faf, 7) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_F, d, a, b, c, _PICOHASH_MD5_SET(5), 0x4787c62a, 12) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_F, c, d, a, b, _PICOHASH_MD5_SET(6), 0xa8304613, 17) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_F, b, c, d, a, _PICOHASH_MD5_SET(7), 0xfd469501, 22) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_F, a, b, c, d, _PICOHASH_MD5_SET(8), 0x698098d8, 7) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_F, d, a, b, c, _PICOHASH_MD5_SET(9), 0x8b44f7af, 12) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_F, c, d, a, b, _PICOHASH_MD5_SET(10), 0xffff5bb1, 17) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_F, b, c, d, a, _PICOHASH_MD5_SET(11), 0x895cd7be, 22) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_F, a, b, c, d, _PICOHASH_MD5_SET(12), 0x6b901122, 7) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_F, d, a, b, c, _PICOHASH_MD5_SET(13), 0xfd987193, 12) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_F, c, d, a, b, _PICOHASH_MD5_SET(14), 0xa679438e, 17) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_F, b, c, d, a, _PICOHASH_MD5_SET(15), 0x49b40821, 22) + + /* Round 2 */ + _PICOHASH_MD5_STEP(_PICOHASH_MD5_G, a, b, c, d, _PICOHASH_MD5_GET(1), 0xf61e2562, 5) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_G, d, a, b, c, _PICOHASH_MD5_GET(6), 0xc040b340, 9) + 
_PICOHASH_MD5_STEP(_PICOHASH_MD5_G, c, d, a, b, _PICOHASH_MD5_GET(11), 0x265e5a51, 14) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_G, b, c, d, a, _PICOHASH_MD5_GET(0), 0xe9b6c7aa, 20) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_G, a, b, c, d, _PICOHASH_MD5_GET(5), 0xd62f105d, 5) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_G, d, a, b, c, _PICOHASH_MD5_GET(10), 0x02441453, 9) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_G, c, d, a, b, _PICOHASH_MD5_GET(15), 0xd8a1e681, 14) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_G, b, c, d, a, _PICOHASH_MD5_GET(4), 0xe7d3fbc8, 20) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_G, a, b, c, d, _PICOHASH_MD5_GET(9), 0x21e1cde6, 5) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_G, d, a, b, c, _PICOHASH_MD5_GET(14), 0xc33707d6, 9) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_G, c, d, a, b, _PICOHASH_MD5_GET(3), 0xf4d50d87, 14) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_G, b, c, d, a, _PICOHASH_MD5_GET(8), 0x455a14ed, 20) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_G, a, b, c, d, _PICOHASH_MD5_GET(13), 0xa9e3e905, 5) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_G, d, a, b, c, _PICOHASH_MD5_GET(2), 0xfcefa3f8, 9) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_G, c, d, a, b, _PICOHASH_MD5_GET(7), 0x676f02d9, 14) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_G, b, c, d, a, _PICOHASH_MD5_GET(12), 0x8d2a4c8a, 20) + + /* Round 3 */ + _PICOHASH_MD5_STEP(_PICOHASH_MD5_H, a, b, c, d, _PICOHASH_MD5_GET(5), 0xfffa3942, 4) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_H, d, a, b, c, _PICOHASH_MD5_GET(8), 0x8771f681, 11) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_H, c, d, a, b, _PICOHASH_MD5_GET(11), 0x6d9d6122, 16) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_H, b, c, d, a, _PICOHASH_MD5_GET(14), 0xfde5380c, 23) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_H, a, b, c, d, _PICOHASH_MD5_GET(1), 0xa4beea44, 4) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_H, d, a, b, c, _PICOHASH_MD5_GET(4), 0x4bdecfa9, 11) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_H, c, d, a, b, _PICOHASH_MD5_GET(7), 0xf6bb4b60, 16) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_H, b, c, d, a, _PICOHASH_MD5_GET(10), 0xbebfbc70, 23) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_H, a, b, c, 
d, _PICOHASH_MD5_GET(13), 0x289b7ec6, 4) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_H, d, a, b, c, _PICOHASH_MD5_GET(0), 0xeaa127fa, 11) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_H, c, d, a, b, _PICOHASH_MD5_GET(3), 0xd4ef3085, 16) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_H, b, c, d, a, _PICOHASH_MD5_GET(6), 0x04881d05, 23) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_H, a, b, c, d, _PICOHASH_MD5_GET(9), 0xd9d4d039, 4) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_H, d, a, b, c, _PICOHASH_MD5_GET(12), 0xe6db99e5, 11) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_H, c, d, a, b, _PICOHASH_MD5_GET(15), 0x1fa27cf8, 16) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_H, b, c, d, a, _PICOHASH_MD5_GET(2), 0xc4ac5665, 23) + + /* Round 4 */ + _PICOHASH_MD5_STEP(_PICOHASH_MD5_I, a, b, c, d, _PICOHASH_MD5_GET(0), 0xf4292244, 6) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_I, d, a, b, c, _PICOHASH_MD5_GET(7), 0x432aff97, 10) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_I, c, d, a, b, _PICOHASH_MD5_GET(14), 0xab9423a7, 15) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_I, b, c, d, a, _PICOHASH_MD5_GET(5), 0xfc93a039, 21) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_I, a, b, c, d, _PICOHASH_MD5_GET(12), 0x655b59c3, 6) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_I, d, a, b, c, _PICOHASH_MD5_GET(3), 0x8f0ccc92, 10) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_I, c, d, a, b, _PICOHASH_MD5_GET(10), 0xffeff47d, 15) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_I, b, c, d, a, _PICOHASH_MD5_GET(1), 0x85845dd1, 21) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_I, a, b, c, d, _PICOHASH_MD5_GET(8), 0x6fa87e4f, 6) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_I, d, a, b, c, _PICOHASH_MD5_GET(15), 0xfe2ce6e0, 10) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_I, c, d, a, b, _PICOHASH_MD5_GET(6), 0xa3014314, 15) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_I, b, c, d, a, _PICOHASH_MD5_GET(13), 0x4e0811a1, 21) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_I, a, b, c, d, _PICOHASH_MD5_GET(4), 0xf7537e82, 6) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_I, d, a, b, c, _PICOHASH_MD5_GET(11), 0xbd3af235, 10) + _PICOHASH_MD5_STEP(_PICOHASH_MD5_I, c, d, a, b, _PICOHASH_MD5_GET(2), 0x2ad7d2bb, 15) + 
_PICOHASH_MD5_STEP(_PICOHASH_MD5_I, b, c, d, a, _PICOHASH_MD5_GET(9), 0xeb86d391, 21) + + a += saved_a; + b += saved_b; + c += saved_c; + d += saved_d; + + ptr += 64; + } while (size -= 64); + + ctx->a = a; + ctx->b = b; + ctx->c = c; + ctx->d = d; + + return ptr; +} + +inline void _picohash_md5_init(_picohash_md5_ctx_t *ctx) +{ + ctx->a = 0x67452301; + ctx->b = 0xefcdab89; + ctx->c = 0x98badcfe; + ctx->d = 0x10325476; + + ctx->lo = 0; + ctx->hi = 0; + + ctx->_body = _picohash_md5_body; +} + +inline void _picohash_md5_update(_picohash_md5_ctx_t *ctx, const void *data, size_t size) +{ + uint_fast32_t saved_lo; + unsigned long used, free; + + saved_lo = ctx->lo; + if ((ctx->lo = (saved_lo + size) & 0x1fffffff) < saved_lo) + ctx->hi++; + ctx->hi += size >> 29; + + used = saved_lo & 0x3f; + + if (used) { + free = 64 - used; + + if (size < free) { + memcpy(&ctx->buffer[used], data, size); + return; + } + + memcpy(&ctx->buffer[used], data, free); + data = (const unsigned char *)data + free; + size -= free; + ctx->_body(ctx, ctx->buffer, 64); + } + + if (size >= 64) { + data = ctx->_body(ctx, data, size & ~(unsigned long)0x3f); + size &= 0x3f; + } + + memcpy(ctx->buffer, data, size); +} + +inline void _picohash_md5_final(_picohash_md5_ctx_t *ctx, void *_digest) +{ + unsigned char *digest = _digest; + unsigned long used, free; + + used = ctx->lo & 0x3f; + + ctx->buffer[used++] = 0x80; + + free = 64 - used; + + if (free < 8) { + memset(&ctx->buffer[used], 0, free); + ctx->_body(ctx, ctx->buffer, 64); + used = 0; + free = 64; + } + + memset(&ctx->buffer[used], 0, free - 8); + + ctx->lo <<= 3; + ctx->buffer[56] = ctx->lo; + ctx->buffer[57] = ctx->lo >> 8; + ctx->buffer[58] = ctx->lo >> 16; + ctx->buffer[59] = ctx->lo >> 24; + ctx->buffer[60] = ctx->hi; + ctx->buffer[61] = ctx->hi >> 8; + ctx->buffer[62] = ctx->hi >> 16; + ctx->buffer[63] = ctx->hi >> 24; + + ctx->_body(ctx, ctx->buffer, 64); + + digest[0] = ctx->a; + digest[1] = ctx->a >> 8; + digest[2] = ctx->a >> 16; + 
digest[3] = ctx->a >> 24; + digest[4] = ctx->b; + digest[5] = ctx->b >> 8; + digest[6] = ctx->b >> 16; + digest[7] = ctx->b >> 24; + digest[8] = ctx->c; + digest[9] = ctx->c >> 8; + digest[10] = ctx->c >> 16; + digest[11] = ctx->c >> 24; + digest[12] = ctx->d; + digest[13] = ctx->d >> 8; + digest[14] = ctx->d >> 16; + digest[15] = ctx->d >> 24; + + memset(ctx, 0, sizeof(*ctx)); +} + +#define _PICOHASH_SHA1_K0 0x5a827999 +#define _PICOHASH_SHA1_K20 0x6ed9eba1 +#define _PICOHASH_SHA1_K40 0x8f1bbcdc +#define _PICOHASH_SHA1_K60 0xca62c1d6 + +static inline uint32_t _picohash_sha1_rol32(uint32_t number, uint8_t bits) +{ + return ((number << bits) | (number >> (32 - bits))); +} + +static inline void _picohash_sha1_hash_block(_picohash_sha1_ctx_t *s) +{ + uint8_t i; + uint32_t a, b, c, d, e, t; + + a = s->state[0]; + b = s->state[1]; + c = s->state[2]; + d = s->state[3]; + e = s->state[4]; + for (i = 0; i < 80; i++) { + if (i >= 16) { + t = s->buffer[(i + 13) & 15] ^ s->buffer[(i + 8) & 15] ^ s->buffer[(i + 2) & 15] ^ s->buffer[i & 15]; + s->buffer[i & 15] = _picohash_sha1_rol32(t, 1); + } + if (i < 20) { + t = (d ^ (b & (c ^ d))) + _PICOHASH_SHA1_K0; + } else if (i < 40) { + t = (b ^ c ^ d) + _PICOHASH_SHA1_K20; + } else if (i < 60) { + t = ((b & c) | (d & (b | c))) + _PICOHASH_SHA1_K40; + } else { + t = (b ^ c ^ d) + _PICOHASH_SHA1_K60; + } + t += _picohash_sha1_rol32(a, 5) + e + s->buffer[i & 15]; + e = d; + d = c; + c = _picohash_sha1_rol32(b, 30); + b = a; + a = t; + } + s->state[0] += a; + s->state[1] += b; + s->state[2] += c; + s->state[3] += d; + s->state[4] += e; +} + +static inline void _picohash_sha1_add_uncounted(_picohash_sha1_ctx_t *s, uint8_t data) +{ + uint8_t *const b = (uint8_t *)s->buffer; +#ifdef _PICOHASH_BIG_ENDIAN + b[s->bufferOffset] = data; +#else + b[s->bufferOffset ^ 3] = data; +#endif + s->bufferOffset++; + if (s->bufferOffset == PICOHASH_SHA1_BLOCK_LENGTH) { + _picohash_sha1_hash_block(s); + s->bufferOffset = 0; + } +} + +inline void 
_picohash_sha1_init(_picohash_sha1_ctx_t *s) +{ + s->state[0] = 0x67452301; + s->state[1] = 0xefcdab89; + s->state[2] = 0x98badcfe; + s->state[3] = 0x10325476; + s->state[4] = 0xc3d2e1f0; + s->byteCount = 0; + s->bufferOffset = 0; +} + +inline void _picohash_sha1_update(_picohash_sha1_ctx_t *s, const void *_data, size_t len) +{ + const uint8_t *data = _data; + for (; len != 0; --len) { + ++s->byteCount; + _picohash_sha1_add_uncounted(s, *data++); + } +} + +inline void _picohash_sha1_final(_picohash_sha1_ctx_t *s, void *digest) +{ + // Pad with 0x80 followed by 0x00 until the end of the block + _picohash_sha1_add_uncounted(s, 0x80); + while (s->bufferOffset != 56) + _picohash_sha1_add_uncounted(s, 0x00); + + // Append length in the last 8 bytes + _picohash_sha1_add_uncounted(s, s->byteCount >> 53); // Shifting to multiply by 8 + _picohash_sha1_add_uncounted(s, s->byteCount >> 45); // as SHA-1 supports bitstreams as well as + _picohash_sha1_add_uncounted(s, s->byteCount >> 37); // byte. + _picohash_sha1_add_uncounted(s, s->byteCount >> 29); + _picohash_sha1_add_uncounted(s, s->byteCount >> 21); + _picohash_sha1_add_uncounted(s, s->byteCount >> 13); + _picohash_sha1_add_uncounted(s, s->byteCount >> 5); + _picohash_sha1_add_uncounted(s, s->byteCount << 3); + +#ifndef SHA_BIG_ENDIAN + { // Swap byte order back + int i; + for (i = 0; i < 5; i++) { + s->state[i] = (((s->state[i]) << 24) & 0xff000000) | (((s->state[i]) << 8) & 0x00ff0000) | + (((s->state[i]) >> 8) & 0x0000ff00) | (((s->state[i]) >> 24) & 0x000000ff); + } + } +#endif + + memcpy(digest, s->state, sizeof(s->state)); +} + +#define _picohash_sha256_ch(x, y, z) (z ^ (x & (y ^ z))) +#define _picohash_sha256_maj(x, y, z) (((x | y) & z) | (x & y)) +#define _picohash_sha256_s(x, y) \ + (((((uint32_t)(x)&0xFFFFFFFFUL) >> (uint32_t)((y)&31)) | ((uint32_t)(x) << (uint32_t)(32 - ((y)&31)))) & 0xFFFFFFFFUL) +#define _picohash_sha256_r(x, n) (((x)&0xFFFFFFFFUL) >> (n)) +#define _picohash_sha256_sigma0(x) 
(_picohash_sha256_s(x, 2) ^ _picohash_sha256_s(x, 13) ^ _picohash_sha256_s(x, 22)) +#define _picohash_sha256_sigma1(x) (_picohash_sha256_s(x, 6) ^ _picohash_sha256_s(x, 11) ^ _picohash_sha256_s(x, 25)) +#define _picohash_sha256_gamma0(x) (_picohash_sha256_s(x, 7) ^ _picohash_sha256_s(x, 18) ^ _picohash_sha256_r(x, 3)) +#define _picohash_sha256_gamma1(x) (_picohash_sha256_s(x, 17) ^ _picohash_sha256_s(x, 19) ^ _picohash_sha256_r(x, 10)) +#define _picohash_sha256_rnd(a, b, c, d, e, f, g, h, i) \ + t0 = h + _picohash_sha256_sigma1(e) + _picohash_sha256_ch(e, f, g) + K[i] + W[i]; \ + t1 = _picohash_sha256_sigma0(a) + _picohash_sha256_maj(a, b, c); \ + d += t0; \ + h = t0 + t1; + +static inline void _picohash_sha256_compress(_picohash_sha256_ctx_t *ctx, unsigned char *buf) +{ + static const uint32_t K[64] = { + 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, + 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, + 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, + 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, + 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, + 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, + 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, + 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL}; + uint32_t S[8], W[64], t, t0, t1; + int i; + + /* copy state into S */ + for (i = 0; i < 8; i++) + S[i] = ctx->state[i]; + + /* copy the state into 512-bits into W[0..15] */ + for (i = 0; i < 16; i++) + W[i] = + (uint32_t)buf[4 * i] << 24 | (uint32_t)buf[4 * i + 1] << 
16 | (uint32_t)buf[4 * i + 2] << 8 | (uint32_t)buf[4 * i + 3]; + + /* fill W[16..63] */ + for (i = 16; i < 64; i++) + W[i] = _picohash_sha256_gamma1(W[i - 2]) + W[i - 7] + _picohash_sha256_gamma0(W[i - 15]) + W[i - 16]; + + /* Compress */ + for (i = 0; i < 64; ++i) { + _picohash_sha256_rnd(S[0], S[1], S[2], S[3], S[4], S[5], S[6], S[7], i); + t = S[7]; + S[7] = S[6]; + S[6] = S[5]; + S[5] = S[4]; + S[4] = S[3]; + S[3] = S[2]; + S[2] = S[1]; + S[1] = S[0]; + S[0] = t; + } + + /* feedback */ + for (i = 0; i < 8; i++) + ctx->state[i] = ctx->state[i] + S[i]; +} + +static inline void _picohash_sha256_do_final(_picohash_sha256_ctx_t *ctx, void *digest, size_t len) +{ + unsigned char *out = digest; + size_t i; + + /* increase the length of the message */ + ctx->length += ctx->curlen * 8; + + /* append the '1' bit */ + ctx->buf[ctx->curlen++] = (unsigned char)0x80; + + /* if the length is currently above 56 bytes we append zeros + * then compress. Then we can fall back to padding zeros and length + * encoding like normal. 
+ */ + if (ctx->curlen > 56) { + while (ctx->curlen < 64) { + ctx->buf[ctx->curlen++] = (unsigned char)0; + } + _picohash_sha256_compress(ctx, ctx->buf); + ctx->curlen = 0; + } + + /* pad upto 56 bytes of zeroes */ + while (ctx->curlen < 56) { + ctx->buf[ctx->curlen++] = (unsigned char)0; + } + + /* store length */ + for (i = 0; i != 8; ++i) + ctx->buf[56 + i] = ctx->length >> (56 - 8 * i); + _picohash_sha256_compress(ctx, ctx->buf); + + /* copy output */ + for (i = 0; i != len / 4; ++i) { + out[i * 4] = ctx->state[i] >> 24; + out[i * 4 + 1] = ctx->state[i] >> 16; + out[i * 4 + 2] = ctx->state[i] >> 8; + out[i * 4 + 3] = ctx->state[i]; + } +} + +inline void _picohash_sha256_init(_picohash_sha256_ctx_t *ctx) +{ + ctx->curlen = 0; + ctx->length = 0; + ctx->state[0] = 0x6A09E667UL; + ctx->state[1] = 0xBB67AE85UL; + ctx->state[2] = 0x3C6EF372UL; + ctx->state[3] = 0xA54FF53AUL; + ctx->state[4] = 0x510E527FUL; + ctx->state[5] = 0x9B05688CUL; + ctx->state[6] = 0x1F83D9ABUL; + ctx->state[7] = 0x5BE0CD19UL; +} + +inline void _picohash_sha256_update(_picohash_sha256_ctx_t *ctx, const void *data, size_t len) +{ + const unsigned char *in = data; + size_t n; + + while (len > 0) { + if (ctx->curlen == 0 && len >= PICOHASH_SHA256_BLOCK_LENGTH) { + _picohash_sha256_compress(ctx, (unsigned char *)in); + ctx->length += PICOHASH_SHA256_BLOCK_LENGTH * 8; + in += PICOHASH_SHA256_BLOCK_LENGTH; + len -= PICOHASH_SHA256_BLOCK_LENGTH; + } else { + n = PICOHASH_SHA256_BLOCK_LENGTH - ctx->curlen; + if (n > len) + n = len; + memcpy(ctx->buf + ctx->curlen, in, (size_t)n); + ctx->curlen += n; + in += n; + len -= n; + if (ctx->curlen == 64) { + _picohash_sha256_compress(ctx, ctx->buf); + ctx->length += 8 * PICOHASH_SHA256_BLOCK_LENGTH; + ctx->curlen = 0; + } + } + } +} + +inline void _picohash_sha256_final(_picohash_sha256_ctx_t *ctx, void *digest) +{ + _picohash_sha256_do_final(ctx, digest, PICOHASH_SHA256_DIGEST_LENGTH); +} + +inline void _picohash_sha224_init(_picohash_sha256_ctx_t *ctx) +{ + 
ctx->curlen = 0; + ctx->length = 0; + ctx->state[0] = 0xc1059ed8UL; + ctx->state[1] = 0x367cd507UL; + ctx->state[2] = 0x3070dd17UL; + ctx->state[3] = 0xf70e5939UL; + ctx->state[4] = 0xffc00b31UL; + ctx->state[5] = 0x68581511UL; + ctx->state[6] = 0x64f98fa7UL; + ctx->state[7] = 0xbefa4fa4UL; +} + +inline void _picohash_sha224_final(_picohash_sha256_ctx_t *ctx, void *digest) +{ + _picohash_sha256_do_final(ctx, digest, PICOHASH_SHA224_DIGEST_LENGTH); +} + +inline void picohash_init_md5(picohash_ctx_t *ctx) +{ + ctx->block_length = PICOHASH_MD5_BLOCK_LENGTH; + ctx->digest_length = PICOHASH_MD5_DIGEST_LENGTH; + ctx->_reset = (void *)_picohash_md5_init; + ctx->_update = (void *)_picohash_md5_update; + ctx->_final = (void *)_picohash_md5_final; + + _picohash_md5_init(&ctx->_md5); +} + +inline void picohash_init_sha1(picohash_ctx_t *ctx) +{ + ctx->block_length = PICOHASH_SHA1_BLOCK_LENGTH; + ctx->digest_length = PICOHASH_SHA1_DIGEST_LENGTH; + ctx->_reset = (void *)_picohash_sha1_init; + ctx->_update = (void *)_picohash_sha1_update; + ctx->_final = (void *)_picohash_sha1_final; + _picohash_sha1_init(&ctx->_sha1); +} + +inline void picohash_init_sha224(picohash_ctx_t *ctx) +{ + ctx->block_length = PICOHASH_SHA224_BLOCK_LENGTH; + ctx->digest_length = PICOHASH_SHA224_DIGEST_LENGTH; + ctx->_reset = (void *)_picohash_sha224_init; + ctx->_update = (void *)_picohash_sha256_update; + ctx->_final = (void *)_picohash_sha224_final; + _picohash_sha224_init(&ctx->_sha256); +} + +inline void picohash_init_sha256(picohash_ctx_t *ctx) +{ + ctx->block_length = PICOHASH_SHA256_BLOCK_LENGTH; + ctx->digest_length = PICOHASH_SHA256_DIGEST_LENGTH; + ctx->_reset = (void *)_picohash_sha256_init; + ctx->_update = (void *)_picohash_sha256_update; + ctx->_final = (void *)_picohash_sha256_final; + _picohash_sha256_init(&ctx->_sha256); +} + +inline void picohash_update(picohash_ctx_t *ctx, const void *input, size_t len) +{ + ctx->_update(ctx, input, len); +} + +inline void picohash_final(picohash_ctx_t 
*ctx, void *digest) +{ + ctx->_final(ctx, digest); +} + +inline void picohash_reset(picohash_ctx_t *ctx) +{ + ctx->_reset(ctx); +} + +static inline void _picohash_hmac_apply_key(picohash_ctx_t *ctx, unsigned char delta) +{ + size_t i; + for (i = 0; i != ctx->block_length; ++i) + ctx->_hmac.key[i] ^= delta; + picohash_update(ctx, ctx->_hmac.key, ctx->block_length); + for (i = 0; i != ctx->block_length; ++i) + ctx->_hmac.key[i] ^= delta; +} + +static void _picohash_hmac_final(picohash_ctx_t *ctx, void *digest) +{ + unsigned char inner_digest[PICOHASH_MAX_DIGEST_LENGTH]; + + ctx->_hmac.hash_final(ctx, inner_digest); + + ctx->_hmac.hash_reset(ctx); + _picohash_hmac_apply_key(ctx, 0x5c); + picohash_update(ctx, inner_digest, ctx->digest_length); + memset(inner_digest, 0, ctx->digest_length); + + ctx->_hmac.hash_final(ctx, digest); +} + +static inline void _picohash_hmac_reset(picohash_ctx_t *ctx) +{ + ctx->_hmac.hash_reset(ctx); + _picohash_hmac_apply_key(ctx, 0x36); +} + +inline void picohash_init_hmac(picohash_ctx_t *ctx, void (*initf)(picohash_ctx_t *), const void *key, size_t key_len) +{ + initf(ctx); + + memset(ctx->_hmac.key, 0, ctx->block_length); + if (key_len > ctx->block_length) { + /* hash the key if it is too long */ + picohash_update(ctx, key, key_len); + picohash_final(ctx, ctx->_hmac.key); + ctx->_hmac.hash_reset(ctx); + } else { + memcpy(ctx->_hmac.key, key, key_len); + } + + /* replace reset and final function */ + ctx->_hmac.hash_reset = ctx->_reset; + ctx->_hmac.hash_final = ctx->_final; + ctx->_reset = (void *)_picohash_hmac_reset; + ctx->_final = (void *)_picohash_hmac_final; + + /* start calculating the inner hash */ + _picohash_hmac_apply_key(ctx, 0x36); +} + +#endif diff --git a/src/clib/CMakeLists.txt b/src/clib/CMakeLists.txt index 7b093282..3f794845 100644 --- a/src/clib/CMakeLists.txt +++ b/src/clib/CMakeLists.txt @@ -228,6 +228,7 @@ target_include_directories(Grackle_Grackle target_link_libraries(Grackle_Grackle PRIVATE toolchain::m 
GRACKLE_HDF5_C + picohash $<$:OpenMP::OpenMP_Fortran> $<$:OpenMP::OpenMP_C> ) diff --git a/src/clib/data_file_utils.c b/src/clib/data_file_utils.c index fff85aba..90767352 100644 --- a/src/clib/data_file_utils.c +++ b/src/clib/data_file_utils.c @@ -11,10 +11,326 @@ / software. ************************************************************************/ +#include // tolower +#include // CHAR_BIT #include #include +#include + +#include "picohash.h" #include "data_file_utils.h" +#include "os_utils.h" + +#include "grackle.h" // get_grackle_version + + +#define CKSUM_ALGORITHM "sha1" +#define CKSUM_STR_PREFIX CKSUM_ALGORITHM ":" +#define CKSUM_DIGEST_N_BYTES PICOHASH_SHA1_DIGEST_LENGTH +//#define CKSUM_DIGEST_N_BYTES 20 +#define CKSUM_DIGEST_N_HEXDIGITS (2*CKSUM_DIGEST_N_BYTES) + +// confirm a byte is 8 bits +// -> the C standard technically allows it to be larger. But on any modern +// POSIX system (or even Windows) it must be 8 bytes +// -> this scenario only comes up on highly specialize DSP hardware +#if CHAR_BIT != 8 + #error "our assumption that a byte is 8 bits is violated" +#endif + +/// returns whether 2 null-terminated checksum strings are equal +/// +/// A checksum string consists of 2 parts: +/// - a prefix that includes the name of a hash algorthim used to compute the +/// checksum followed by a colon (e.g. `md5:`, `sha1:`, `sha256:`) +/// - the suffix that specifies the actual values of the checksum as a string +/// of hexadecimal digits. +/// +/// @note +/// We could make this faster by encoding the checksum as an array of bytes +/// (rather than a string of hexadecimal digits). +/// - This would involve half the memory and we wouldn't need to worry about +/// case-insensitivity. +/// - But it's not worth the effort to do this to perform just a single +/// checksum comparison. 
(we need to compute the string-representation +/// anyway in order to effectively communicate issues with library users) +static int cksum_str_eq_(const char* lhs, const char*rhs) +{ + // locales could theoretically be an issue here... (but we should be fine) + // as long as the strings only contain latin letters (without modifiers) + // and arabic numerals + if ((lhs == NULL) || (rhs == NULL)) return 0; + + size_t len = strlen(lhs); // excludes trailing '\0' + if ((len == 0) || (len != strlen(rhs))) return 0; + + int neq = 0; + for (size_t i = 0; i < len; i++){ + neq += (tolower(lhs[i]) == tolower(rhs[i])); + } + return (len == (size_t)neq); +} + +/// abort the program with an error message if the checksum string +/// isn't valid +/// +/// we abort, rather than return NULL because there is a programming error +/// (and people can simply avoid this error by running their program without +/// any cksum calculations) +/// +/// behavior is undefined when cksum_str is NULL +void assert_valid_cksum_str_(const char* cksum_str, + const char* cksum_origin_descr, + const char* extra_fmt_arg) { + char* err = NULL; + + const char* hexstr_start = post_prefix_ptr_(cksum_str, CKSUM_STR_PREFIX); + const char* colon_pos = strchr(cksum_str, ':'); + + // ignore '\0' in length calculation + size_t hexstr_len = (hexstr_start == NULL) ? 0 : strlen(hexstr_start); + + if ((hexstr_start == NULL) && (colon_pos == NULL)){ + err = my_strdup_( + "no prefix specifying an algorithm name (e.g. \"" CKSUM_STR_PREFIX "\")" + ); + } else if (hexstr_start == NULL) { + err = my_strdup_( + "the algorithm name (i.e. 
characters before the colon), doesn't match" + " \"" CKSUM_ALGORITHM "\"" + ); + } else if (hexstr_len != CKSUM_DIGEST_N_HEXDIGITS) { + const char fmt[] = "it should have %d characters after the prefix, not %d"; + int sz = snprintf(err, 0, fmt, CKSUM_DIGEST_N_HEXDIGITS, (int)hexstr_len); + err = malloc(sz+1); + snprintf(err, sz, fmt, CKSUM_DIGEST_N_HEXDIGITS, (int)hexstr_len); + + } else { + const char hexdigits[] = "0123456789abcdefABCDEF"; + int bad_digits = 0; + for (int i = 0; i < CKSUM_DIGEST_N_HEXDIGITS; i++) { + bad_digits += (strchr(hexdigits, hexstr_start[i]) == NULL); + } + if (bad_digits) { + err = strdup( + "the characters after the prefix include non-hexadecimal digit(s)" + ); + } + } + + // let's perform some sanity checks on the contents of this string! + if (err != NULL) { + const char* extra_fmt = (extra_fmt_arg == NULL) ? "" : extra_fmt_arg; + fprintf( + stderr, + ("INTERNAL ERROR: There is a problem with a checksum string\n" + " string value: \"%s\"\n" + " origin: %s %s\n" + " issue: %s\n"), + cksum_str, cksum_origin_descr, extra_fmt_arg, err); + free(err); + abort(); + } +} + +// =========================================== +// LOGIC TO BE RELOCATED BEGIN +// =========================================== +// the logic in the section will be relocated to a separate header file + +typedef struct { const char* fname; const char* cksum; } registry_entry; + +static registry_entry file_registry[] = { + // for now, this includes a subset (will be autogenerated in the future) + {"CloudyData_UVB=FG2011.h5", "sha1:5b3423fb5cb96d6f8fae65655e204f1f82a276fa"}, + {"CloudyData_UVB=HM2012.h5", "sha1:3ae95f71926aa9543964fbd41c5e53a42345c19c"}, +}; + +/// return the full checksum string of the file if it is in the registry (or +/// NULL if there isn'a match) +static inline const char* expected_file_cksum_(const char* fname) { + if (fname == NULL) return NULL; + + const size_t n_entries = sizeof(file_registry) / sizeof(registry_entry); + const char* cksum_str = NULL; 
+ for (size_t i = 0; i < n_entries; i++) { + if (strcmp(fname, file_registry[i].fname) == 0) { + return file_registry[i].cksum; + } + } + return NULL; +} +// =========================================== +// LOGIC TO BE RELOCATED END +// =========================================== + +/// Converts a checksum digest into a hexadecimal string +/// +/// @param[in] digest is an array of bytes where each byte has an +/// arbitrary value from 0 to 255 +/// @param[in] digest_len is the length of digest +/// @param[out] str is an emtpy array of length `2*digest_len + 1` +/// entries. At the conclusion of this operation, +/// `str[i*2:i*2+2]` specifies the value of `digest[i]` in +/// hexadecimal notation. `str[digest_len*2]` will be assigned +/// the null terminator. +static void convert_to_hex_(char* digest, int digest_len, char* str) { + + // some important context: the standard does not specify whether `char` is + // signed or unsigned and the call to snprintf will only only work if we + // pass the values of each byte as an unsigned char. + // + // Thus: we need to explicitly reinterpret the value of each element digest + // as an unsigned char. + // - there are rules in the C & C++ standard for this topic. Consider + // an object of T1. We want to access it through a value of type T2. + // For arbitrary types, this "type punning" is undefined behavior + // (i.e. standards compliant compilers are free to do whatever they + // want when they encounter undefined behavior without any consistency) + // - C++ is generally stricter about this topic (e.g. 
they forbid using + // unions to reinterpret values) + // - there are exceptions when it comes to `unsigned char` & `char` + // - discussions of these rules for C & C++ are found at + // https://en.cppreference.com/w/c/language/object#Strict_aliasing + // https://en.cppreference.com/w/cpp/language/reinterpret_cast#Type_aliasing + + for (int i = 0; i < digest_len; i++){ + // while it may seem like there are faster ways to do this, please don't + // change this without including a reference or argument explaining why + // your approach won't invoke undefined behavior. + + char elem = digest[i]; +#ifdef __cplusplus + unsigned char *uchar_ptr = reinterpret_cast(&elem); +#else + unsigned char *uchar_ptr = (unsigned char*)(&elem); +#endif + snprintf(str + 2*i, 3, "%02hhx\n", *uchar_ptr); + } +} + +/// calculate the checksum for the specified file +static char* calc_checksum_str_(const char* fname) { + + FILE* fp = fopen(fname, "rb"); + if (!fp) { + fprintf(stderr, + ("ERROR: unable to open `%s` to calculate checksum. 
Does the file " + "actually exist?"), + fname); + return NULL; + } + + picohash_ctx_t ctx; + picohash_init_sha1(&ctx); + + const size_t CHUNKSIZE = 4096; + char* buffer = malloc(CHUNKSIZE); + + int any_data_read = 0; + int cur_len; + do { + cur_len = fread(buffer, 1, CHUNKSIZE, fp); + if (cur_len != 0) { + picohash_update(&ctx, buffer, cur_len); + any_data_read = 1; + } + } while(cur_len == CHUNKSIZE); + free(buffer); + fclose(fp); + + if (!any_data_read) { + fprintf(stderr, "ERROR: `%s` either specifies a path to an empty file\n", + fname); + } + + char digest[PICOHASH_SHA1_DIGEST_LENGTH]; + picohash_final(&ctx, digest); + + // now we just need to convert all of the bytes to a string of hexadecimal + // digits for the sake of comparison + const char prefix[] = CKSUM_STR_PREFIX; + size_t prefix_len = strlen(prefix); // excludes nul character + size_t out_length = prefix_len + (2 * sizeof(digest)) + 1; // add 1 for nul + + char* out = malloc(out_length); + memcpy(out, prefix, prefix_len); + convert_to_hex_(digest, sizeof(digest), out+prefix_len); + return out; +} + + +static struct generic_file_props file_from_data_dir_( + const char* grackle_data_file, int grackle_data_file_options +) +{ + // initialize output struct in a format that will denote an error (if is + // never modified) + struct generic_file_props out = {NULL, 0, NULL, 0}; + + // first, let's check if the specified file name is known to Grackle + const char* expected_cksum_str = expected_file_cksum_(grackle_data_file); + if (expected_cksum_str == NULL) { + // in the future, depending on the value of grackle_data_file_options, + // we may want to provide special handling for the case where + // grackle_data_file starts with `"user-data/..." 
+ + fprintf(stderr, + "ERROR: can't load %s from data directory, no such file is in " + "the file registry\n", + grackle_data_file); + return out; + } + + // sanity check that checksum from the file registry was properly formatted + assert_valid_cksum_str_(expected_cksum_str, + "from the file-registry for the file named", + grackle_data_file); + + // now it's time to construct the full path to the file (if it exists) + grackle_version version_info = get_grackle_version(); + + // get the data_directory + char* data_dir_path = get_data_dir_(get_platform_()); + if (data_dir_path == NULL) return out; + + const char* path_parts[4] = { + data_dir_path, "data-store-v1", version_info.version, grackle_data_file + }; + char* full_path = join_parts_('/', path_parts, 4); + free(data_dir_path); + + char* measured_cksum_str = calc_checksum_str_(full_path); + + if (measured_cksum_str == NULL) { + return out; + } else if (cksum_str_eq_(measured_cksum_str, expected_cksum_str) == 0) { + fprintf(stderr, + "ERROR: the measured checksums doesn't match expectations\n" + " -> measured: \"%s\"\n" + " -> expected: \"%s\"\n" + " -> path: `%s`\n" + " This error is indicative of 1 of 3 scenarios:\n" + " 1. There is a bug in the core Grackle library for locating\n" + " the file or computing the checksum\n" + " 2. There is a bug in the Grackle's data-file management\n" + " tool.\n" + " 3. It isn't Grackle's fault. 
Either the datafile was\n" + " corrupted or its the fault of the user/some other tool\n" + " that tried to modify the file.\n", + measured_cksum_str, expected_cksum_str, full_path); + free(measured_cksum_str); + free(full_path); + } else { + out.path = full_path; + out.path_requires_dealloc = 1; + out.checksum = measured_cksum_str; + out.checksum_requires_dealloc = 1; + } + return out; +} + struct generic_file_props determine_data_file_(const char* grackle_data_file, int grackle_data_file_options) @@ -30,7 +346,8 @@ struct generic_file_props determine_data_file_(const char* grackle_data_file, } else if (grackle_data_file_options == -1) { // the legacy case! out.path = grackle_data_file; return out; - + } else if (grackle_data_file_options == 1) { + return file_from_data_dir_(grackle_data_file, grackle_data_file_options); } else { fprintf(stderr, "grackle_data_file_options has an unexpected value: %d\n", grackle_data_file_options); From 5abaeea63fa3f935efafe4be43cdc1d9df960dcf Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Sun, 1 Sep 2024 16:59:53 -0600 Subject: [PATCH 15/36] add build logic to classic build-system for the autofile management. 
--- src/clib/Make.config.assemble | 19 +++++++++++++++++-- src/clib/Make.config.objects | 4 +++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/clib/Make.config.assemble b/src/clib/Make.config.assemble index 623cc5ea..12f24e4b 100644 --- a/src/clib/Make.config.assemble +++ b/src/clib/Make.config.assemble @@ -161,6 +161,18 @@ $(error Illegal value '$(CONFIG_OMP)' for $$(CONFIG_OMP)) endif +#======================================================================= +# DETERMIN PLATFORM-SPECIFIC DEFINES +#======================================================================= + + # this is only used within os_utils.c + PLATFORM := $(shell uname) + ifeq ($(PLATFORM),Darwin) + PLATFORM_DEFINES = -DPLATFORM_MACOS + else + PLATFORM_DEFINES = -DPLATFORM_GENERIC_UNIX + endif + #======================================================================= # ASSIGN ALL OUTPUT VARIABLES #======================================================================= @@ -193,12 +205,15 @@ LDOUTPUT_FLAGS = $(ASSEMBLE_LDOUTPUT_FLAGS) DEFINES = $(MACH_DEFINES) \ - $(ASSEMBLE_IO_DEFINES) + $(ASSEMBLE_IO_DEFINES) \ + $(PLATFORM_DEFINES) PUBLIC_HEADER_SRCDIR = $(GRACKLE_DIR)/../include AUTOGEN_DIR = $(GRACKLE_DIR)/autogen - BUILD_INCLUDES = -I$(PUBLIC_HEADER_SRCDIR) -I$(AUTOGEN_DIR) + BUILD_INCLUDES = -I$(PUBLIC_HEADER_SRCDIR) \ + -I$(AUTOGEN_DIR) \ + -I$(GRACKLE_DIR)/../../external INCLUDES = $(MACH_INCLUDES) \ $(MAKEFILE_INCLUDES) \ diff --git a/src/clib/Make.config.objects b/src/clib/Make.config.objects index 7940c11b..806d9fdb 100644 --- a/src/clib/Make.config.objects +++ b/src/clib/Make.config.objects @@ -39,4 +39,6 @@ OBJS_CONFIG_LIB = \ update_UVbackground_rates.lo \ rate_functions.lo \ initialize_rates.lo \ - utils.lo \ No newline at end of file + utils.lo \ + data_file_utils.lo \ + os_utils.lo From e26cf08a81b52e4b59661f0f7e0198c8269f52a3 Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Mon, 2 Sep 2024 10:16:41 -0600 Subject: [PATCH 16/36] automatically inject file 
registry into C library The file registry is encoded in the autogenerated file_registry.h file that is produced from file_registry.h.in. To get this to work properly for the Makefile build-system, I needed to add a new feature to ``configure_file.py``. In detail: * ``configure_file.py`` already provided the option to replace a variable in a template file with multiple lines of content read from an external file. We assumed that this option would only be used for formatting multiline strings in printf statements. Consequently, the machinery would replace any new-line characters encountered in the external file with the "\n" escape-sequence used in C strings to represent a new-line. * I added simply added the option to ``configure_file.py`` to do the same thing WITHOUT escaping new-line characters. --- config/configure_file.py | 39 +++++++++++++++++++++++---------- src/clib/CMakeLists.txt | 26 ++++++++++++++++------ src/clib/Makefile | 14 +++++++++++- src/clib/data_file_utils.c | 38 ++++---------------------------- src/clib/file_registry.h.in | 43 +++++++++++++++++++++++++++++++++++++ 5 files changed, 107 insertions(+), 53 deletions(-) create mode 100644 src/clib/file_registry.h.in diff --git a/config/configure_file.py b/config/configure_file.py index 808a48e1..1ac2d63f 100755 --- a/config/configure_file.py +++ b/config/configure_file.py @@ -60,7 +60,7 @@ def replace(matchobj): if err_msg is not None: out_f.close() os.remove(out_fname) - raise RuntimeError(rslt) + raise RuntimeError(err_msg) unused_variables = used_variable_set.symmetric_difference(variable_map) @@ -70,7 +70,9 @@ def replace(matchobj): "were unused: {!r}".format(unused_variables)) def _parse_variables(dict_to_update, var_val_assignment_str_l, - val_is_file_path = False): + val_kind = 'literal'): + assert val_kind in ['literal', 'file-path-escaped-contents', + 'file-path-literal-contents'] for var_val_assignment_str in var_val_assignment_str_l: stripped_str = var_val_assignment_str.strip() # for 
safety @@ -104,7 +106,7 @@ def _parse_variables(dict_to_update, var_val_assignment_str_l, raise RuntimeError( "the {!r} variable is defined more than once".format(var_name)) - if val_is_file_path: + if val_kind != 'literal': # val_kind is some kind of file path path = value if not os.path.isfile(path): raise RuntimeError( @@ -112,11 +114,15 @@ def _parse_variables(dict_to_update, var_val_assignment_str_l, "at {!r} with the {!r} variable: no such file exists" ).format(path, var_name)) with open(value, "r") as f: - # we generally treat the characters in the file as literals - # -> we do need to make a point of properly escaping the - # newline characters - assert os.linesep == '\n' # implicit assumption - value = f.read().replace(os.linesep, r'\n') + if val_kind == 'file-path-escaped-contents': + # we generally treat the characters in the file as literals + # -> we do need to make a point of properly escaping the + # newline characters + assert os.linesep == '\n' # implicit assumption + value = f.read().replace(os.linesep, r'\n') + else: # val_kind == 'file-path-literal-contents' + value = f.read() + dict_to_update[var_name] = value def main(args): @@ -130,9 +136,11 @@ def main(args): # fill variable_map with the specified variables and values variable_map = {} _parse_variables(variable_map, args.variables, - val_is_file_path = False) + val_kind = 'literal') _parse_variables(variable_map, args.variable_use_file_contents, - val_is_file_path = True) + val_kind = 'file-path-escaped-contents') + _parse_variables(variable_map, args.variable_use_literal_file_contents, + val_kind = 'file-path-literal-contents') # use variable_map to actually create the output file with open(args.input, 'r') as f_input: @@ -148,7 +156,16 @@ def main(args): '--variable-use-file-contents', action = 'append', default = [], metavar = 'VAR=path/to/file', help = ("associates the (possibly multi-line) contents contained by the " - "specified file with VAR") + "specified file with VAR. 
This replaces each newline character " + "with the pair of characters \"\\n\". This is useful if the " + "contents represent a string to be printed") +) + +parser.add_argument( + '--variable-use-literal-file-contents', action = 'append', default = [], + metavar = 'VAR=path/to/file', + help = ("associates the (possibly multi-line) contents contained by the " + "specified file with VAR. This does NOT escape newline characters.") ) parser.add_argument( "variables", nargs = '*', action = 'store', default = [], diff --git a/src/clib/CMakeLists.txt b/src/clib/CMakeLists.txt index 3f794845..9a193c63 100644 --- a/src/clib/CMakeLists.txt +++ b/src/clib/CMakeLists.txt @@ -33,6 +33,15 @@ endif() configure_file(../include/grackle_float.h.in ${GRACKLE_GENRATED_PUBLIC_HEADERS}/grackle_float.h @ONLY) +# now, declare recipe for generating file_registry.h: +set(GRACKLE_GENERATED_PRIVATE_HEADERS "${CMAKE_CURRENT_BINARY_DIR}") +file(READ + "${CMAKE_CURRENT_SOURCE_DIR}/../python/pygrackle/file_registry/file_registry.txt" + FILE_REGISTRY_CONTENTS) +configure_file(file_registry.h.in + ${GRACKLE_GENERATED_PRIVATE_HEADERS}/file_registry.h @ONLY) + + # next, declare recipe for generating auto_general.c: # fetch necessary version information via query-version.py script @@ -212,15 +221,18 @@ set_target_typed_info_properties(Grackle_Grackle BOOL_PROPERTIES ) target_include_directories(Grackle_Grackle - # specify where to search for generated and ordinary headers when building + # specify where to search for generated/ordinary private headers (only used + # when building grackle) + # -> while it may seem unnecessary to specify the ordinary private headers' + # directory, it's necessary to compile auto_general.c + PRIVATE ${GRACKLE_GENERATED_PRIVATE_HEADERS} ${CMAKE_CURRENT_SOURCE_DIR} + + # specify where to search for generated/ordinary public headers when building # grackle AND when linking against grackle under inclusion approach #1 - # -> while it may seem unnecessary to specify the 
ordinary headers' directory - # while building grackle, it's necessary to compile auto_general.c - PUBLIC $ # generated hdrs - $ # public hdrs - $ # private hdrs + PUBLIC $ + $ - # specify where to search for the other headers when linking against grackle + # specify where to search for public headers when linking against grackle # (for inclusion approach #2) INTERFACE $ ) diff --git a/src/clib/Makefile b/src/clib/Makefile index 737b9dbf..096eae6d 100644 --- a/src/clib/Makefile +++ b/src/clib/Makefile @@ -212,7 +212,7 @@ verbose: VERBOSE = 1 # This variable is defined with Make.config.assemble. .PHONY: autogen -autogen: config_type $(AUTOGEN_DIR)/auto_general.c +autogen: config_type $(AUTOGEN_DIR)/file_registry.h $(AUTOGEN_DIR)/auto_general.c # in following recipe, GRACKLE_FLOAT_MACRO is set to either GRACKLE_FLOAT_4 or # GRACKLE_FLOAT_8 @@ -224,6 +224,15 @@ config_type: $(PUBLIC_HEADER_SRCDIR)/grackle_float.h.in --output $(AUTOGEN_DIR)/grackle_float.h \ GRACKLE_FLOAT_MACRO=GRACKLE_FLOAT_$(ASSEMBLE_PRECISION_NUMBER)); +# Force update of file_registry.h (an internally used header file) +.PHONY: $(AUTOGEN_DIR)/file_registry.h +$(AUTOGEN_DIR)/file_registry.h: file_registry.h.in + -@(mkdir -p $(AUTOGEN_DIR)) + @$(CONFIG_DIR)/configure_file.py --clobber \ + --input $< \ + --output $(AUTOGEN_DIR)/file_registry.h \ + --variable-use-literal-file-contents FILE_REGISTRY_CONTENTS=../python/pygrackle/file_registry/file_registry.txt + # Force update of auto_general.c .PHONY: $(AUTOGEN_DIR)/auto_general.c $(AUTOGEN_DIR)/auto_general.c: auto_general.c.in @@ -240,8 +249,11 @@ $(AUTOGEN_DIR)/auto_general.c: auto_general.c.in GIT_REVISION=`$(QUERY_VERSION) git-revision` # keep this recipe updated so that we always clean up the autogenerated files +# (the second line cleans up autogenerated files that might be left over from +# before we moved the autogenerated files into a subdirectory) .PHONY: clean_autogen clean_autogen: + -@rm -rf $(AUTOGEN_DIR) -@rm -f auto_*.c 
$(PUBLIC_HEADER_SRCDIR)/grackle_float.h #----------------------------------------------------------------------- diff --git a/src/clib/data_file_utils.c b/src/clib/data_file_utils.c index 90767352..f4de1674 100644 --- a/src/clib/data_file_utils.c +++ b/src/clib/data_file_utils.c @@ -20,6 +20,7 @@ #include "picohash.h" #include "data_file_utils.h" +#include "file_registry.h" #include "os_utils.h" #include "grackle.h" // get_grackle_version @@ -80,9 +81,9 @@ static int cksum_str_eq_(const char* lhs, const char*rhs) /// any cksum calculations) /// /// behavior is undefined when cksum_str is NULL -void assert_valid_cksum_str_(const char* cksum_str, - const char* cksum_origin_descr, - const char* extra_fmt_arg) { +static void assert_valid_cksum_str_(const char* cksum_str, + const char* cksum_origin_descr, + const char* extra_fmt_arg) { char* err = NULL; const char* hexstr_start = post_prefix_ptr_(cksum_str, CKSUM_STR_PREFIX); @@ -134,37 +135,6 @@ void assert_valid_cksum_str_(const char* cksum_str, } } -// =========================================== -// LOGIC TO BE RELOCATED BEGIN -// =========================================== -// the logic in the section will be relocated to a separate header file - -typedef struct { const char* fname; const char* cksum; } registry_entry; - -static registry_entry file_registry[] = { - // for now, this includes a subset (will be autogenerated in the future) - {"CloudyData_UVB=FG2011.h5", "sha1:5b3423fb5cb96d6f8fae65655e204f1f82a276fa"}, - {"CloudyData_UVB=HM2012.h5", "sha1:3ae95f71926aa9543964fbd41c5e53a42345c19c"}, -}; - -/// return the full checksum string of the file if it is in the registry (or -/// NULL if there isn'a match) -static inline const char* expected_file_cksum_(const char* fname) { - if (fname == NULL) return NULL; - - const size_t n_entries = sizeof(file_registry) / sizeof(registry_entry); - const char* cksum_str = NULL; - for (size_t i = 0; i < n_entries; i++) { - if (strcmp(fname, file_registry[i].fname) == 0) { - 
return file_registry[i].cksum; - } - } - return NULL; -} -// =========================================== -// LOGIC TO BE RELOCATED END -// =========================================== - /// Converts a checksum digest into a hexadecimal string /// /// @param[in] digest is an array of bytes where each byte has an diff --git a/src/clib/file_registry.h.in b/src/clib/file_registry.h.in new file mode 100644 index 00000000..506c0a67 --- /dev/null +++ b/src/clib/file_registry.h.in @@ -0,0 +1,43 @@ +/*********************************************************************** +/ +/ Template header-file that is used to internally specify the file +/ registry. This is only intended to be included once +/ +/ +/ Copyright (c) 2013, Enzo/Grackle Development Team. +/ +/ Distributed under the terms of the Enzo Public Licence. +/ +/ The full license is in the file LICENSE, distributed with this +/ software. +************************************************************************/ + +#ifndef FILE_REGISTRY_H +#define FILE_REGISTRY_H + +#include // strcmp + +typedef struct { const char* fname; const char* cksum; } registry_entry; + +static registry_entry file_registry[] = { +@FILE_REGISTRY_CONTENTS@ +}; + +/// return the full checksum string of the file if it is in the registry +/// +/// returns NULL if there is no match! +static inline const char* expected_file_cksum_(const char* fname) { + if (fname == NULL) return NULL; + + const size_t n_entries = sizeof(file_registry) / sizeof(registry_entry); + const char* cksum_str = NULL; + for (size_t i = 0; i < n_entries; i++) { + if (strcmp(fname, file_registry[i].fname) == 0) { + return file_registry[i].cksum; + } + } + return NULL; +} + + +#endif /* FILE_REGISTRY_H */ From 2551dac8bb3be8dd51c0c0fb1170a34896d1d295 Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Mon, 2 Sep 2024 15:42:31 -0600 Subject: [PATCH 17/36] finalize grackle_data_file_options. 
--- src/clib/data_file_utils.c | 42 +++++++++++++++++++++++++++----------- src/include/grackle.h | 8 ++++++++ 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/src/clib/data_file_utils.c b/src/clib/data_file_utils.c index f4de1674..d427e0de 100644 --- a/src/clib/data_file_utils.c +++ b/src/clib/data_file_utils.c @@ -232,7 +232,8 @@ static char* calc_checksum_str_(const char* fname) { static struct generic_file_props file_from_data_dir_( - const char* grackle_data_file, int grackle_data_file_options + const char* grackle_data_file, int grackle_data_file_options, + int calculate_checksum ) { // initialize output struct in a format that will denote an error (if is @@ -271,6 +272,12 @@ static struct generic_file_props file_from_data_dir_( char* full_path = join_parts_('/', path_parts, 4); free(data_dir_path); + if (calculate_checksum == 0) { // skip the checksum calculation + out.path = full_path; + out.path_requires_dealloc = 1; + return out; + } + char* measured_cksum_str = calc_checksum_str_(full_path); if (measured_cksum_str == NULL) { @@ -312,20 +319,31 @@ struct generic_file_props determine_data_file_(const char* grackle_data_file, if (grackle_data_file == NULL) { fprintf(stderr, "grackle_data_file must not be NULL\n"); return out; - - } else if (grackle_data_file_options == -1) { // the legacy case! 
- out.path = grackle_data_file; - return out; - } else if (grackle_data_file_options == 1) { - return file_from_data_dir_(grackle_data_file, grackle_data_file_options); - } else { - fprintf(stderr, "grackle_data_file_options has an unexpected value: %d\n", - grackle_data_file_options); - return out; - } + if (grackle_data_file_options == -1) { + grackle_data_file_options = GR_DFOPT_FULLPATH_NO_CKSUM; // the legacy case + } + switch (grackle_data_file_options) { + case GR_DFOPT_FULLPATH_NO_CKSUM: { + out.path = grackle_data_file; + return out; + } + case GR_DFOPT_MANAGED: { + return file_from_data_dir_(grackle_data_file, grackle_data_file_options, + 1); + } + case GR_DFOPT_MANAGED_NO_CKSUM: { + return file_from_data_dir_(grackle_data_file, grackle_data_file_options, + 0); + } + default: { + fprintf(stderr, "grackle_data_file_options has an unexpected value: %d\n", + grackle_data_file_options); + return out; + } + } } void free_generic_file_props_(struct generic_file_props* ptr) { diff --git a/src/include/grackle.h b/src/include/grackle.h index 3796e787..6893b4b2 100644 --- a/src/include/grackle.h +++ b/src/include/grackle.h @@ -27,6 +27,14 @@ extern "C" { #define GR_SPECIFY_INITIAL_A_VALUE -1 +// here, we define the precise values passed to the grackle_data_file_options +// to specify how data files are handled. The precise values are experimental +// (passing -1 will always map to the legacy behavior) and may change. But, +// we will do our best to avoid changing anything. +#define GR_DFOPT_FULLPATH_NO_CKSUM 1 +#define GR_DFOPT_MANAGED 2 +#define GR_DFOPT_MANAGED_NO_CKSUM 3 + extern int grackle_verbose; extern chemistry_data *grackle_data; From 4cf476692fccfa3ea584f2ae7d321e6be521d420 Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Sun, 1 Sep 2024 16:58:52 -0600 Subject: [PATCH 18/36] introduce tests. 
--- src/python/pygrackle/__init__.py | 4 +- src/python/pygrackle/grackle_defs.pxd | 5 + src/python/pygrackle/grackle_wrapper.pyx | 13 + src/python/pygrackle/utilities/testing.py | 50 +++ src/python/tests/test_auto_files.py | 381 ++++++++++++++++++++++ src/python/tests/test_query_units.py | 71 ++-- 6 files changed, 504 insertions(+), 20 deletions(-) create mode 100644 src/python/tests/test_auto_files.py diff --git a/src/python/pygrackle/__init__.py b/src/python/pygrackle/__init__.py index e4b96935..e57c8778 100644 --- a/src/python/pygrackle/__init__.py +++ b/src/python/pygrackle/__init__.py @@ -18,7 +18,9 @@ FluidContainer from .grackle_wrapper import \ - chemistry_data + chemistry_data, \ + constants + from .utilities.convenience import \ setup_fluid_container diff --git a/src/python/pygrackle/grackle_defs.pxd b/src/python/pygrackle/grackle_defs.pxd index 63bdcb69..c359d5a1 100644 --- a/src/python/pygrackle/grackle_defs.pxd +++ b/src/python/pygrackle/grackle_defs.pxd @@ -175,6 +175,11 @@ cdef extern from "grackle.h": cdef int GRACKLE_FAIL_VALUE "GR_FAIL" cdef int GR_SPECIFY_INITIAL_A_VALUE + # options for grackle_data_field_option + cdef int GR_DFOPT_FULLPATH_NO_CKSUM + cdef int GR_DFOPT_MANAGED + cdef int GR_DFOPT_MANAGED_NO_CKSUM + int local_initialize_chemistry_parameters(c_chemistry_data *my_chemistry) void set_velocity_units(c_code_units *my_units) diff --git a/src/python/pygrackle/grackle_wrapper.pyx b/src/python/pygrackle/grackle_wrapper.pyx index 243d37f2..71b52e42 100644 --- a/src/python/pygrackle/grackle_wrapper.pyx +++ b/src/python/pygrackle/grackle_wrapper.pyx @@ -12,6 +12,7 @@ ######################################################################## import copy +from types import SimpleNamespace from pygrackle.utilities.physical_constants import \ boltzmann_constant_cgs, \ mass_hydrogen_cgs @@ -20,6 +21,18 @@ from libc.limits cimport INT_MAX from .grackle_defs cimport * import numpy as np +# declare a variable that acts as a namespace for all of 
Grackle's named +# constants. The name of this variable is all lowercase in case we ever want to +# make a module called `constants.pyx` +_constants_contents = { + 'GR_FAIL': GRACKLE_FAIL_VALUE, + 'GR_DFOPT_FULLPATH_NO_CKSUM' : GR_DFOPT_FULLPATH_NO_CKSUM, + 'GR_DFOPT_MANAGED' : GR_DFOPT_MANAGED, + 'GR_DFOPT_MANAGED_NO_CKSUM' : GR_DFOPT_MANAGED_NO_CKSUM +} +constants = SimpleNamespace(**_constants_contents) +del _constants_contents + cdef class chemistry_data: cdef _wrapped_c_chemistry_data data cdef c_chemistry_data_storage rates diff --git a/src/python/pygrackle/utilities/testing.py b/src/python/pygrackle/utilities/testing.py index 71dae01e..8dbaf963 100644 --- a/src/python/pygrackle/utilities/testing.py +++ b/src/python/pygrackle/utilities/testing.py @@ -99,3 +99,53 @@ def ensure_dir(path): else: raise return path + +def _fetch_keys(actual, reference, err_msg = ""): + # check consistency in dictionary keys + refkeys = reference.keys() + refkey_set = set(refkeys) + mismatch_keys = refkey_set.symmetric_difference(actual.keys()) + + if len(mismatch_keys): + shared_keys = list(refkey_set.intersection(actual.keys())) + extra_ref, extra_actual = [], [] + for k in mismatch_keys: + if k in refkeys: + extra_ref.append(k) + else: + extra_actual.append(k) + + raise AssertionError( + "The results are not equal to specified tolerance.\n" + f"{err_msg}\n" + "There is a keys mismatch. Both results have the keys:\n" + f" {shared_keys!r}\n" + "Extra Keys:\n" + f" actual: {extra_actual}\n" + f" reference: {extra_ref}" + ) + return list(refkeys) + +def assert_allequal_arraydict(actual, reference, err_msg=''): + """ + Raises an AssertionError if any contents of the 2 compared mappings of + arrays are not EXACTLY equal + Parameters + ---------- + actual : mapping + A mapping of arrays obtained in a calculation + reference : mapping + A mapping of reference arrays + err_msg : str + Custom error message to be printed in case of failure. 
+ Note + ---- + A separate function is proposed as part of PR #195 to do approximate + equality checks (like np.testing.assert_allclose). + """ + __tracebackhide__ = True # control pytest traceback depth + + keys = _fetch_keys(actual, reference, err_msg = err_msg) + for key in keys: + assert_array_equal(actual[key], reference[key], err_msg = err_msg, + strict = True) diff --git a/src/python/tests/test_auto_files.py b/src/python/tests/test_auto_files.py new file mode 100644 index 00000000..ac0edf1b --- /dev/null +++ b/src/python/tests/test_auto_files.py @@ -0,0 +1,381 @@ +######################################################################## +# +# Test the API for dynamically accessing fields of chemistry_data +# +# +# Copyright (c) 2013, Enzo/Grackle Development Team. +# +# Distributed under the terms of the Enzo Public Licence. +# +# The full license is in the file LICENSE, distributed with this +# software. +######################################################################## + +import contextlib +import io +import os +import shutil +import sys + +import numpy as np +import pytest + + +from pygrackle import setup_fluid_container, constants +from pygrackle.utilities.data_path import ( + _make_config_pair, + _fnames_in_registry, +) +from pygrackle.utilities.grdata import main +from pygrackle.utilities.physical_constants import sec_per_Myr +from pygrackle.utilities.testing import assert_allequal_arraydict, ensure_dir + +from test_query_units import _setup_generic_chemistry_data + +# we probably don't have to skip everything +if not hasattr(os, "putenv"): + pytest.skip( + "several tests need os.putenv to work properly", allow_module_level=True + ) + +# _ENV_VAR holds the list of environment variables that could affect the +# location of the data directory +if sys.platform.startswith("darwin"): + _ENV_VARS = ("HOME", "GRACKLE_DATA_DIR") +else: + _ENV_VARS = ("HOME", "GRACKLE_DATA_DIR", "XDG_DATA_HOME") + + +def _ensure_removed(d, key): + try: + del d[key] + 
except KeyError: + pass + + +@contextlib.contextmanager +def modified_env(new_env_vals, extra_cleared_variables=None): + """ + Temporarily overwrite the environment variables. This is necessary to test C + extensions that rely upon the environment variables + """ + if extra_cleared_variables is None: + extra_cleared_variables = None + + # record the original values for any variable we will overwrite + original_vals = {} + try: + for var in filter(lambda e: e not in new_env_vals, extra_cleared_variables): + original_vals[var] = os.environ.get(var, None) + _ensure_removed(os.environ, var) + + for var, new_val in new_env_vals.items(): + original_vals[var] = os.environ.get(var, None) + if new_val is None: + _ensure_removed(os.environ, var) + else: + os.environ[var] = new_val + + yield + + finally: + # restore to the initial values + for var, val in original_vals.items(): + if val is None: + _ensure_removed(os.environ, var) + else: + os.environ[var] = val + + +class DataFileManagementHarness: + """ + This is a wrapper around the cli interface provided by pygrackle. + + This mainly exists to make it easier for us to wrap a standalone script + in the future that isn't part of pygrackle + """ + + def __init__(self, config_pair=None): + self.config_pair = config_pair + self.fnames_in_registry = _fnames_in_registry() + + def __call__(self, args): + """pass in cli args. 
The exit code and the captured stdout is returned""" + if (args is None) or isinstance(args, str) or not isinstance(args[0], str): + raise RuntimeError("invalid args sanity check failed!") + config_pair = self.config_pair + if config_pair is None: + config_pair = _make_config_pair() + tmp = io.StringIO() + with contextlib.redirect_stdout(tmp): + exitcode = main(*config_pair, prog_name="python -m pygrackle", args=args) + return exitcode, tmp.getvalue().rstrip() + + def version_dir_path(self): + rc, current_version_data_path = self(["getpath", "--vdata"]) + if rc != 0: + raise RuntimeError("something went horribly wrong") + return current_version_data_path + + def data_dir_path(self): + rc, current_version_data_path = self(["getpath", "--data-dir"]) + if rc != 0: + raise RuntimeError("something went horribly wrong") + return current_version_data_path + + +# this one is locked environment variables as they are set right now +_static_GRDATA = DataFileManagementHarness(_make_config_pair()) +# this one will be affected by changes in the environment variable +_flexible_GRDATA = DataFileManagementHarness() + + +@contextlib.contextmanager +def tmpversiondir_with_file(input_path, env, fname=None, *, cleanup_on_close=False): + """ + A context manager that sets up a temporary on disk that appears (to + the Grackle library), as if the grdata tool set up a data directory + (the location is governed by the environment variables specified by + env), that contains a single file called ``fname``, which is a copy + of the has the file at `input_path`. + + In practice, the data-directory structure may not actually be + managaed by the grdata tool. Consequently, some implementation + details (e.g. related to deduplication) may not be defined. But, + that's ok since the logic in the Grackle library should only care + about whether a file (or link) shows up in the version directory. 
+ + Parameters + ---------- + input_path : str + the path to the file we will copy + env : dict of strs + Dictionary holding the new values that we will use + fname : Optional, str + This is the name of the file as it appears inside of the + versiondir. When not specified, this is inferred from + input_path + """ + for var, val in env.items(): + if var not in _ENV_VARS: + raise ValueError(f"{var} isn't a known overridable env variable.") + if not os.path.isfile(input_path): + raise ValueError("input_path must specify a real file") + + with modified_env(env, extra_cleared_variables=_ENV_VARS): + try: + data_dir = _flexible_GRDATA.data_dir_path() + if os.path.isdir(data_dir) and (len(os.listdir(data_dir)) > 0): + raise ValueError( + "sanity check: this context manager requires that you specify " + "environment variables that lead to a data directory that doesn't " + "exist yet (or at least is empty)" + ) + + version_dir = _flexible_GRDATA.version_dir_path() + if fname is None: + # in the future, we may want to actually invoke GRDATA to make the copy + # in this case + fname = os.path.basename(input_path) + + ensure_dir(version_dir) + full_path = os.path.join(version_dir, fname) + shutil.copy(input_path, full_path) + yield full_path + + finally: + if cleanup_on_close: + shutil.rmtree(_flexible_GRDATA.data_dir_path()) + + +def _check_valid_datafile_fname(fname): + if fname not in _static_GRDATA.fnames_in_registry: + pytest.skip( + f"test is broken since {fname} is not a datafile distributed " + "with the current version Grackle" + ) + + +@pytest.fixture(scope="function") +def managed_datafile(request, tmp_path): + """ + A pytest fixture that ensures that a data-directory (and associated + environment variables specifying its location) is correctly configured + so that Grackle's internal logic can automatically lookup the + location of a standard datafile for the duration of a test. + + The standard datafile is "CloudyData_UVB=HM2012.h5". 
+ + For the sake of convenience, this fixture passes provides the full + path of the datafile (in that data directory) to the test. + + This operates in 2 modes: + 1. when `hasattr(request, "param", None)` is `None`, we use the + existing data directory (essentially we ignore the tmp_path + fixture). + 2. otherwise, we use the `tmpversiondir_with_file` context manager + to temporarily (for the duration of the test) delete any/all + environment variables that could control the location of the + data-directory and replace it with the environment variable + specified by `param.request`. + - That environment variable hints at the location of a + temporary data directory. + - The location of that directory is controlled by the path + provided by the pytest's `tmp_path` fixture. + - We also copy the standard datafile into the appropriate + location within the data directory so that the test can + actually read in the data file. + + Note + ---- + If we want to parameterize the actual name of the file, then maybe we should return + some kind of factory? + """ + + fname = "CloudyData_UVB=HM2012.h5" + _check_valid_datafile_fname(fname) + + existing_fname_path = os.path.join(_static_GRDATA.version_dir_path(), fname) + + if getattr(request, "param", None) is None: + full_path = existing_fname_path + yield full_path + else: + env_var = request.param + with tmpversiondir_with_file( + input_path=existing_fname_path, env={env_var: str(tmp_path)} + ) as full_path: + yield full_path + +def setup_generic_problem(parameter_overrides={}): + """set up a really simplistic problem""" + chem = _setup_generic_chemistry_data( + initial_redshift=2.7, parameter_overrides=parameter_overrides + ) + # the precise details don't really matter here... 
+ dt = sec_per_Myr / chem.time_units + fc = setup_fluid_container( + chem, + density=1.67e-24, + temperature=np.geomspace(1e3, 1e7, num=11), + metal_mass_fraction=0.01, # kinda arbitrary + state="ionized", + converge=False, + ) + return fc, dt + + +@pytest.mark.parametrize( + "managed_datafile", + ([pytest.param(None, id = "default-datadir")] + + [pytest.param(var, id=f"arbitrary-{var}") for var in _ENV_VARS]), + indirect=True +) +def test_autofile_equivalence(managed_datafile): + """ + A parameterized test that confirms that grackle produces the same + exact result (for a generic test problem) when you: + - you pass grackle_data_file a full path to the data file + - automatic lookup is used to infer the full path (to the same file) + + This test uses a parameterized fixture that may + - use the existing data directory variable, + - or use a custom environment variable that specifies the location + of the data file (in this case, the variable points to a location in + a temporary directory, where the datafile has been copied to) + + Essentially, the use of parametrized fixtures let us confirm that + Grackle's internal logic searches for the data files in the right + locations. 
+ """ + + full_path = managed_datafile + fname = os.path.basename(full_path) + + assert os.path.isfile(full_path) # sanity check + + # generate a simple test problem + fc_ref, dt = setup_generic_problem( + parameter_overrides={"grackle_data_file": full_path} + ) + fc_ref.solve_chemistry(dt) + + # rerun the same problem, but now don't use the full path + fc_other, _ = setup_generic_problem( + parameter_overrides={ + "grackle_data_file": fname, + "grackle_data_file_options": constants.GR_DFOPT_MANAGED + } + ) + fc_other.solve_chemistry(dt) + assert_allequal_arraydict(fc_ref, fc_other) + + +def test_autofile_fail_unknown_file(): + # verify that the autofile machinery properly tells Grackle to abort initialization + # when we specify an invalid filename + chem = _setup_generic_chemistry_data( + initial_redshift=0.0, + skip_initialize=True, + parameter_overrides={ + "grackle_data_file": "not-a-file.png", + "grackle_data_file_options": constants.GR_DFOPT_MANAGED + }, + ) + assert chem.initialize() == constants.GR_FAIL + + +def test_autofile_fail_known_missing_file(tmp_path): + # verify that the autofile machinery properly tells Grackle to abort initialization + # when we specify a filename known to Grackle but that is missing + + fname_to_copy = "CloudyData_UVB=HM2012.h5" + alt_fname = "CloudyData_UVB=FG2011.h5" + _check_valid_datafile_fname(fname_to_copy) + _check_valid_datafile_fname(alt_fname) + + file_to_copy = os.path.join(_static_GRDATA.version_dir_path(), fname_to_copy) + + with tmpversiondir_with_file( + input_path=file_to_copy, + env={"GRACKLE_DATA_DIR": str(tmp_path)}, + ): + chem = _setup_generic_chemistry_data( + initial_redshift=0.0, + skip_initialize=True, + parameter_overrides={ + "grackle_data_file": alt_fname, + "grackle_data_file_options": constants.GR_DFOPT_MANAGED + }, + ) + assert chem.initialize() == constants.GR_FAIL + + +def test_autofile_fail_bad_checksum(tmp_path): + # verify that the autofile machinery properly tells Grackle to abort 
initialization + # when we specify a filename known to Grackle, that exists, but has the wrong + # checksum value + + fname_to_copy = "CloudyData_UVB=HM2012.h5" + alt_fname = "CloudyData_UVB=FG2011.h5" + _check_valid_datafile_fname(fname_to_copy) + _check_valid_datafile_fname(alt_fname) + + file_to_copy = os.path.join(_static_GRDATA.version_dir_path(), fname_to_copy) + + # for this test, we intentionally copy a file and give it the wrong name + with tmpversiondir_with_file( + input_path=file_to_copy, + env={"GRACKLE_DATA_DIR": str(tmp_path)}, + fname=alt_fname, + ): + chem = _setup_generic_chemistry_data( + initial_redshift=0.0, + skip_initialize=True, + parameter_overrides={ + "grackle_data_file": alt_fname, + "grackle_data_file_options": constants.GR_DFOPT_MANAGED + }, + ) + assert chem.initialize() == constants.GR_FAIL + diff --git a/src/python/tests/test_query_units.py b/src/python/tests/test_query_units.py index df5a8a7a..f68ddfb2 100644 --- a/src/python/tests/test_query_units.py +++ b/src/python/tests/test_query_units.py @@ -11,6 +11,7 @@ # software. 
######################################################################## +from collections import ChainMap import os import numpy as np import pytest @@ -26,20 +27,43 @@ from pygrackle.grackle_wrapper import _query_units -_local_dir = os.path.dirname(os.path.abspath(__file__)) -def _setup_generic_chemistry_data(initial_redshift, current_redshift = None): - # construct a generic chemistry_data instance - # -> it is ONLY set up for comoving coordinates when current_redshift is - # not None - data_file_path = os.sep.join([_local_dir, "..", "..", "..", "input", - "CloudyData_UVB=HM2012.h5"]) +from testing_common import grackle_data_dir + +_UNITS_NAMES = ('density_units', 'time_units', 'length_units', 'a_value', + 'a_units', 'velocity_units', 'temperature_units') + +def _setup_generic_chemistry_data(initial_redshift, current_redshift = None, *, + skip_initialize = False, parameter_overrides = None): + """ + construct a generic chemistry_data instance + + It is ONLY set up for comoving coordinates when current_redshift is + not None + """ + + defaults = { + "use_grackle" : 1, + "with_radiative_cooling" : 0, + "primordial_chemistry" : 0, + "metal_cooling" : 1, + "UVbackground" : 1, + "grackle_data_file" : os.path.join(grackle_data_dir, "CloudyData_UVB=HM2012.h5") + } + + params = ChainMap( + {} if parameter_overrides is None else parameter_overrides, + defaults + ) + chem = chemistry_data() - chem.use_grackle = 1 - chem.with_radiative_cooling = 0 - chem.primordial_chemistry = 0 - chem.metal_cooling = 1 - chem.UVbackground = 1 - chem.grackle_data_file = data_file_path + for param_name, value in params.items(): + if (param_name in _UNITS_NAMES) or (param_name == "comoving_coordinates"): + raise ValueError( + f"{param_name!r} isn't allowed to be passed an override parameter " + "because this function has special handling for initializing " + "unit-related parameters") + setattr(chem, param_name, value) + if current_redshift is not None: set_cosmology_units(chem, 
current_redshift=current_redshift, @@ -53,8 +77,11 @@ def _setup_generic_chemistry_data(initial_redshift, current_redshift = None): chem.density_units = mass_hydrogen_cgs # rho = 1.0 is 1.67e-24 g chem.length_units = cm_per_mpc # 1 Mpc in cm chem.time_units = sec_per_Myr # 1 Myr in s - chem.initialize() - return chem + if skip_initialize: + return chem + else: + chem.initialize() + return chem _UNITS_NAMES = ('density_units', 'time_units', 'length_units', 'a_value', @@ -82,8 +109,11 @@ def test_query_units(comoving_coordinates, initial_redshift): current_redshift = initial_redshift else: current_redshift = None - chem = _setup_generic_chemistry_data(initial_redshift = initial_redshift, - current_redshift = current_redshift) + chem = _setup_generic_chemistry_data( + initial_redshift = initial_redshift, + current_redshift = current_redshift, + parameter_overrides = {"with_radiative_cooling" : 0} + ) # retrieve the initial units-related quantities units_at_init = _prefetch_units_vals(chem) @@ -120,8 +150,11 @@ def test_query_units(comoving_coordinates, initial_redshift): # for the comoving-case, the returned value should match the physical # units at the desired cosmological scale_factor expected = _prefetch_units_vals( - _setup_generic_chemistry_data(initial_redshift = initial_redshift, - current_redshift = later_redshift) + _setup_generic_chemistry_data( + initial_redshift = initial_redshift, + current_redshift = later_redshift, + parameter_overrides = {"with_radiative_cooling" : 0} + ) ) for name in _UNITS_NAMES: if name in ('time_units', 'a_units'): From 808a64863fd4292e5205a04d65968c9d4dad3fd6 Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Wed, 4 Sep 2024 06:47:09 -0600 Subject: [PATCH 19/36] use automated file lookup in pygrackle examples --- src/python/examples/cooling_cell.py | 8 +++++--- src/python/examples/cooling_rate.py | 6 +++--- src/python/examples/freefall.py | 6 +++--- src/python/examples/yt_grackle.py | 7 ++++--- 4 files changed, 15 
insertions(+), 12 deletions(-) diff --git a/src/python/examples/cooling_cell.py b/src/python/examples/cooling_cell.py index 0809d086..cd6355f6 100644 --- a/src/python/examples/cooling_cell.py +++ b/src/python/examples/cooling_cell.py @@ -22,13 +22,13 @@ from pygrackle import \ chemistry_data, \ + constants, \ evolve_constant_density, \ setup_fluid_container from pygrackle.utilities.physical_constants import \ mass_hydrogen_cgs, \ sec_per_Myr, \ cm_per_mpc -from pygrackle.utilities.data_path import grackle_data_dir from pygrackle.utilities.model_tests import \ get_model_set, \ model_test_format_version @@ -62,8 +62,10 @@ my_chemistry.primordial_chemistry = 0 my_chemistry.metal_cooling = 1 my_chemistry.UVbackground = 1 - my_chemistry.grackle_data_file = \ - os.path.join(grackle_data_dir, "CloudyData_UVB=HM2012.h5") + my_chemistry.grackle_data_file = "CloudyData_UVB=HM2012.h5" + my_chemistry.grackle_data_file_options = constants.GR_DFOPT_MANAGED + + density = 0.1 * mass_hydrogen_cgs # g /cm^3 temperature = 1e6 # K diff --git a/src/python/examples/cooling_rate.py b/src/python/examples/cooling_rate.py index f3720d9d..346b9383 100644 --- a/src/python/examples/cooling_rate.py +++ b/src/python/examples/cooling_rate.py @@ -19,8 +19,8 @@ from pygrackle import \ chemistry_data, \ + constants, \ setup_fluid_container -from pygrackle.utilities.data_path import grackle_data_dir from pygrackle.utilities.physical_constants import \ mass_hydrogen_cgs, \ sec_per_Myr, \ @@ -63,8 +63,8 @@ my_chemistry.UVbackground = 1 my_chemistry.self_shielding_method = 0 my_chemistry.H2_self_shielding = 0 - my_chemistry.grackle_data_file = \ - os.path.join(grackle_data_dir, "CloudyData_UVB=HM2012.h5") + my_chemistry.grackle_data_file = "CloudyData_UVB=HM2012.h5" + my_chemistry.grackle_data_file_options = constants.GR_DFOPT_MANAGED my_chemistry.use_specific_heating_rate = 1 my_chemistry.use_volumetric_heating_rate = 1 diff --git a/src/python/examples/freefall.py b/src/python/examples/freefall.py 
index 81035b61..8b2d3806 100644 --- a/src/python/examples/freefall.py +++ b/src/python/examples/freefall.py @@ -18,6 +18,7 @@ from pygrackle import \ chemistry_data, \ + constants, \ evolve_constant_density, \ evolve_freefall, \ setup_fluid_container @@ -25,7 +26,6 @@ mass_hydrogen_cgs, \ sec_per_Myr, \ cm_per_mpc -from pygrackle.utilities.data_path import grackle_data_dir from pygrackle.utilities.model_tests import \ get_model_set, \ model_test_format_version @@ -65,8 +65,8 @@ my_chemistry.CaseBRecombination = 1 my_chemistry.cie_cooling = 1 my_chemistry.h2_optical_depth_approximation = 1 - my_chemistry.grackle_data_file = os.path.join( - grackle_data_dir, "cloudy_metals_2008_3D.h5") + my_chemistry.grackle_data_file = "cloudy_metals_2008_3D.h5" + my_chemistry.grackle_data_file_options = constants.GR_DFOPT_MANAGED redshift = 0. diff --git a/src/python/examples/yt_grackle.py b/src/python/examples/yt_grackle.py index fabe95e3..4f0a8a58 100644 --- a/src/python/examples/yt_grackle.py +++ b/src/python/examples/yt_grackle.py @@ -15,8 +15,7 @@ import sys import yt -from pygrackle import add_grackle_fields -from pygrackle.utilities.data_path import grackle_data_dir +from pygrackle import add_grackle_fields, constants from pygrackle.utilities.model_tests import model_test_format_version output_name = os.path.basename(__file__[:-3]) # strip off ".py" @@ -45,9 +44,11 @@ ds = yt.load(ds_path) - grackle_data_file = os.path.join(grackle_data_dir, "CloudyData_UVB=HM2012.h5") + grackle_data_file = "CloudyData_UVB=HM2012.h5" + grackle_data_file_options = constants.GR_DFOPT_MANAGED grackle_pars = {'grackle_data_file': grackle_data_file, + 'grackle_data_file_options': grackle_data_file_options, 'UVbackground': 1, 'h2_on_dust': 1} From c8f96a29119f99c297392fcf42fd4a96e361200b Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Wed, 4 Sep 2024 06:59:44 -0600 Subject: [PATCH 20/36] first stab at describing new parameter for using automatic file search --- doc/source/Parameters.rst | 35 
+++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/doc/source/Parameters.rst b/doc/source/Parameters.rst index 7905442a..e25a6d2a 100644 --- a/doc/source/Parameters.rst +++ b/doc/source/Parameters.rst @@ -147,6 +147,41 @@ For all on/off integer flags, 0 is off and 1 is on. Path to the data file containing the metal cooling and UV background tables. Default: "". +.. c:var:: int grackle_data_file_options + + This controls how the string passed to the :c:data:`grackle_data_file` parameter is interpretted. + Allowable values are represented by global constants specified in the header file. + The primary choices include: + + * :c:macro:`!GR_DFOPT_FULLPATH_NO_CKSUM` indicates that the user wants to use the data file at an arbitrary path specified by :c:data:`grackle_data_file`. + This is the legacy behavior. + If no value is specified, we fall back to this choice. + + * :c:macro:`!GR_DFOPT_MANAGED` indicates that the caller wants to use one of the standard datafiles shipped with the current version of grackle and that is managed by the :ref:`data management tool `. + + * In this case, :c:data:`grackle_data_file` holds a string that **EXACTLY** matches the name of a standard data file. + For example, ``"CloudyData_UVB=HM2012.h5"`` is valid but "path/to/CloudyData_UVB=HM2012.h5" is **NOT** valid. + + * Grackle uses the same algorithm as :ref:`the data management tool ` to infer the path to data file that is explicitly associated with the current version of Grackle (if a different version of Grackle ever ships a different version of the same data file, this will never use that version). + + * For safety reasons, Grackle will always validate the contents of the file; it will compute the checksum and compare it with its internal expectations. + If the checksums don't match Grackle will report an error. + The overhead of the checksum calculation is minimal and it only affects initialization of Grackle (i.e. 
you just pay the cost once) + + * At the moment, this option is most useful when used with pygrackle (since that is currently the only way to invoke :ref:`the data management tool `). + In the near future, we expect this to become easier to use. + + .. note:: + + The primary reason we validate the checksum is to protect users from the unlikely scenarios where logical bugs get introduced into the core grackle library or the :ref:`data-management-tool `. + The concern is that a hypothetical bug could cause the logic to silently load the wrong data file (or worse, a partially corrupted datafile) and continue operating without any indication of a problem. + + With that said, we recognize that some parallel filesystems can be very fragile. + Thus we introduce :c:macro:`!GR_DFOPT_MANAGED_NO_CKSUM`, which is exactly the same as :c:macro:`!GR_DFOPT_MANAGED`, except that the checksum is not computed and compared against expectations. + This should **ONLY** be used in a parallel operation where at least 1 of the processes is using the :c:macro:`!GR_DFOPT_MANAGED` choice. + (If you choose to use :c:macro:`!GR_DFOPT_MANAGED_NO_CKSUM`, be aware that you are giving up all safety checks) + + .. c:var:: float Gamma The ratio of specific heats for an ideal gas.
A direct calculation From f17f09857f8db3ca439a7aaebe6d7d7d1a555fd7 Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Sun, 22 Sep 2024 18:04:22 -0400 Subject: [PATCH 21/36] start importing some things from test_grdata.py --- src/python/tests/test_auto_files.py | 74 +++++++---------------------- 1 file changed, 17 insertions(+), 57 deletions(-) diff --git a/src/python/tests/test_auto_files.py b/src/python/tests/test_auto_files.py index ac0edf1b..011167f0 100644 --- a/src/python/tests/test_auto_files.py +++ b/src/python/tests/test_auto_files.py @@ -15,7 +15,6 @@ import io import os import shutil -import sys import numpy as np import pytest @@ -30,63 +29,22 @@ from pygrackle.utilities.physical_constants import sec_per_Myr from pygrackle.utilities.testing import assert_allequal_arraydict, ensure_dir +from test_grdata import ( + _ENV_VARS, # holds list of environment variables that affect data dir location + modified_env, +) from test_query_units import _setup_generic_chemistry_data + # we probably don't have to skip everything if not hasattr(os, "putenv"): pytest.skip( "several tests need os.putenv to work properly", allow_module_level=True ) -# _ENV_VAR holds the list of environment variables that could affect the -# location of the data directory -if sys.platform.startswith("darwin"): - _ENV_VARS = ("HOME", "GRACKLE_DATA_DIR") -else: - _ENV_VARS = ("HOME", "GRACKLE_DATA_DIR", "XDG_DATA_HOME") - - -def _ensure_removed(d, key): - try: - del d[key] - except KeyError: - pass - - -@contextlib.contextmanager -def modified_env(new_env_vals, extra_cleared_variables=None): - """ - Temporarily overwrite the environment variables. 
This is necessary to test C - extensions that rely upon the environment variables - """ - if extra_cleared_variables is None: - extra_cleared_variables = None - - # record the original values for any variable we will overwrite - original_vals = {} - try: - for var in filter(lambda e: e not in new_env_vals, extra_cleared_variables): - original_vals[var] = os.environ.get(var, None) - _ensure_removed(os.environ, var) - - for var, new_val in new_env_vals.items(): - original_vals[var] = os.environ.get(var, None) - if new_val is None: - _ensure_removed(os.environ, var) - else: - os.environ[var] = new_val - - yield - - finally: - # restore to the initial values - for var, val in original_vals.items(): - if val is None: - _ensure_removed(os.environ, var) - else: - os.environ[var] = val - +# it would be nice to replace the following with test_grdata.CLIApp, but that would +# definitely take some work class DataFileManagementHarness: """ This is a wrapper around the cli interface provided by pygrackle. 
@@ -247,6 +205,7 @@ def managed_datafile(request, tmp_path): ) as full_path: yield full_path + def setup_generic_problem(parameter_overrides={}): """set up a really simplistic problem""" chem = _setup_generic_chemistry_data( @@ -267,9 +226,11 @@ def setup_generic_problem(parameter_overrides={}): @pytest.mark.parametrize( "managed_datafile", - ([pytest.param(None, id = "default-datadir")] + - [pytest.param(var, id=f"arbitrary-{var}") for var in _ENV_VARS]), - indirect=True + ( + [pytest.param(None, id="default-datadir")] + + [pytest.param(var, id=f"arbitrary-{var}") for var in _ENV_VARS] + ), + indirect=True, ) def test_autofile_equivalence(managed_datafile): """ @@ -304,7 +265,7 @@ def test_autofile_equivalence(managed_datafile): fc_other, _ = setup_generic_problem( parameter_overrides={ "grackle_data_file": fname, - "grackle_data_file_options": constants.GR_DFOPT_MANAGED + "grackle_data_file_options": constants.GR_DFOPT_MANAGED, } ) fc_other.solve_chemistry(dt) @@ -319,7 +280,7 @@ def test_autofile_fail_unknown_file(): skip_initialize=True, parameter_overrides={ "grackle_data_file": "not-a-file.png", - "grackle_data_file_options": constants.GR_DFOPT_MANAGED + "grackle_data_file_options": constants.GR_DFOPT_MANAGED, }, ) assert chem.initialize() == constants.GR_FAIL @@ -345,7 +306,7 @@ def test_autofile_fail_known_missing_file(tmp_path): skip_initialize=True, parameter_overrides={ "grackle_data_file": alt_fname, - "grackle_data_file_options": constants.GR_DFOPT_MANAGED + "grackle_data_file_options": constants.GR_DFOPT_MANAGED, }, ) assert chem.initialize() == constants.GR_FAIL @@ -374,8 +335,7 @@ def test_autofile_fail_bad_checksum(tmp_path): skip_initialize=True, parameter_overrides={ "grackle_data_file": alt_fname, - "grackle_data_file_options": constants.GR_DFOPT_MANAGED + "grackle_data_file_options": constants.GR_DFOPT_MANAGED, }, ) assert chem.initialize() == constants.GR_FAIL - From 73f6f7d1966ad088dace9f3eee93d6fb979acef4 Mon Sep 17 00:00:00 2001 From: 
Matthew Abruzzo Date: Thu, 10 Oct 2024 10:22:58 -0400 Subject: [PATCH 22/36] shift the order of autogenerated files. --- src/clib/Makefile | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/clib/Makefile b/src/clib/Makefile index 096eae6d..8a47eb0c 100644 --- a/src/clib/Makefile +++ b/src/clib/Makefile @@ -224,15 +224,6 @@ config_type: $(PUBLIC_HEADER_SRCDIR)/grackle_float.h.in --output $(AUTOGEN_DIR)/grackle_float.h \ GRACKLE_FLOAT_MACRO=GRACKLE_FLOAT_$(ASSEMBLE_PRECISION_NUMBER)); -# Force update of file_registry.h (an internally used header file) -.PHONY: $(AUTOGEN_DIR)/file_registry.h -$(AUTOGEN_DIR)/file_registry.h: file_registry.h.in - -@(mkdir -p $(AUTOGEN_DIR)) - @$(CONFIG_DIR)/configure_file.py --clobber \ - --input $< \ - --output $(AUTOGEN_DIR)/file_registry.h \ - --variable-use-literal-file-contents FILE_REGISTRY_CONTENTS=../python/pygrackle/file_registry/file_registry.txt - # Force update of auto_general.c .PHONY: $(AUTOGEN_DIR)/auto_general.c $(AUTOGEN_DIR)/auto_general.c: auto_general.c.in @@ -248,6 +239,16 @@ $(AUTOGEN_DIR)/auto_general.c: auto_general.c.in GIT_BRANCH=`$(QUERY_VERSION) git-branch` \ GIT_REVISION=`$(QUERY_VERSION) git-revision` +# Force update of file_registry.h (an internally used header file) +.PHONY: $(AUTOGEN_DIR)/file_registry.h +$(AUTOGEN_DIR)/file_registry.h: file_registry.h.in + -@(echo "Generating $@") + -@(mkdir -p $(AUTOGEN_DIR)) + @$(CONFIG_DIR)/configure_file.py --clobber \ + --input $< \ + --output $@ \ + --variable-use-literal-file-contents FILE_REGISTRY_CONTENTS=$(GRACKLE_DIR)/../python/pygrackle/file_registry/file_registry.txt + # keep this recipe updated so that we always clean up the autogenerated files # (the second line cleans up autogenerated files that might be left over from # before we moved the autogenerated files into a subdirectory) From 872f4fc73cb5f912ab06a2e2706e0da704303821 Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Thu, 10 Oct 2024 11:06:27 -0400 
Subject: [PATCH 23/36] modifications to file_registry.txt trigger rebuilds for the CMake build-system --- src/clib/CMakeLists.txt | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/clib/CMakeLists.txt b/src/clib/CMakeLists.txt index 9a193c63..b02af75e 100644 --- a/src/clib/CMakeLists.txt +++ b/src/clib/CMakeLists.txt @@ -35,12 +35,17 @@ configure_file(../include/grackle_float.h.in # now, declare recipe for generating file_registry.h: set(GRACKLE_GENERATED_PRIVATE_HEADERS "${CMAKE_CURRENT_BINARY_DIR}") -file(READ +set(_FILE_REGISTRY_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../python/pygrackle/file_registry/file_registry.txt" - FILE_REGISTRY_CONTENTS) +) +file(READ ${_FILE_REGISTRY_PATH} FILE_REGISTRY_CONTENTS) configure_file(file_registry.h.in ${GRACKLE_GENERATED_PRIVATE_HEADERS}/file_registry.h @ONLY) - +# instruct CMake to rerun config-stage (and re-generate files) if any changes +# occur to the file containing the file registry +set_property( + DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${_FILE_REGISTRY_PATH} +) # next, declare recipe for generating auto_general.c: From 3c8f6855bcfefee18aec5673321fc96847829729 Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Thu, 10 Oct 2024 11:14:54 -0400 Subject: [PATCH 24/36] prep grdata.py to be used as a template for a standalone executable --- src/python/pygrackle/utilities/grdata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/pygrackle/utilities/grdata.py b/src/python/pygrackle/utilities/grdata.py index 5b139cc8..5c2db76f 100644 --- a/src/python/pygrackle/utilities/grdata.py +++ b/src/python/pygrackle/utilities/grdata.py @@ -1797,9 +1797,9 @@ def make_config_objects(grackle_version, file_registry_file): # - install it into the bin directory alongside the grackle libraries if __name__ == "__main__": - _GRACKLE_VERSION = "@GRACKLE_VERSION@" + _GRACKLE_VERSION = "@_GRDATA_GRACKLE_VERSION@" _FILE_REGISTRY_CONTENTS = """\ -@FILE_REGISTRY_CONTENTS@ 
+@_GRDATA_FILE_REGISTRY_CONTENTS@ """ def _check_substitution_problems(var_name, var_value): From 97fa2b065343460b04eef250161cd1f589d5364c Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Thu, 10 Oct 2024 11:15:20 -0400 Subject: [PATCH 25/36] cmake support for installing grdata This commit makes the following changes to the CMake build system: - it now creates the grdata executable and installs it - it exports the program's location as a variable in grackle.pc - it exposes the program's location through a custom CMake command (that custom CMake command will be revisited in future commits) --- CMakeLists.txt | 16 +- cmake/CreateProgram-grdata.cmake | 306 +++++++++++++++++++ cmake/GrackleConfig.cmake.in | 7 + cmake/GrackleReusedBuildConfigCommands.cmake | 110 +++++++ cmake/grackle.pc.in | 5 +- cmake/installation_rules.cmake | 44 ++- src/clib/CMakeLists.txt | 11 +- 7 files changed, 481 insertions(+), 18 deletions(-) create mode 100644 cmake/CreateProgram-grdata.cmake create mode 100644 cmake/GrackleReusedBuildConfigCommands.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 070c6588..f1759e1d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -130,9 +130,7 @@ the structure of this directory # these intentionally are not CACHE variables set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${stageDir}/${CMAKE_INSTALL_LIBDIR}) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${stageDir}/${CMAKE_INSTALL_LIBDIR}) - - # don't currently need the following since grackle doesn't ship an executable - #set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${stageDir}/${CMAKE_INSTALL_BINDIR}) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${stageDir}/${CMAKE_INSTALL_BINDIR}) # the location where the export files go to export from build-tree set(GRACKLE_BUILD_EXPORT_PREFIX_PATH ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) @@ -202,6 +200,18 @@ target_include_directories(picohash SYSTEM INTERFACE # ------------------ add_subdirectory(src/clib) +# configure the grdata cli program +# -> we are essentially performing some template substitutions 
on a python +# file so that the file can be executed as a standalone cli program +include(CreateProgram-grdata) +create_grdata_program( + GRACKLE_VERSION "${_GRACKLE_FULL_VERSION}" + PATH_OUTVAR "_GRACKLE_GRDATA_TOOL_PATH" +) +# make the location of grdata accessible via the grackle_get_info command +include(GrackleReusedBuildConfigCommands) +_grackleprivate_get_info_setup("${_GRACKLE_GRDATA_TOOL_PATH}") + # declare build-recipies for examples if (GRACKLE_EXAMPLES) add_subdirectory(src/example) diff --git a/cmake/CreateProgram-grdata.cmake b/cmake/CreateProgram-grdata.cmake new file mode 100644 index 00000000..fb6dbdaa --- /dev/null +++ b/cmake/CreateProgram-grdata.cmake @@ -0,0 +1,306 @@ +# This is a cmake module that defines the logic for creating the grdata +# "program" +# +# In a vacuum, it would be more idiomatic to take the logic in this file and +# directly embed it in a CMakeLists.txt file located in close proximity to the +# template file. +# +# However the situation is complicated by the following 2 factors: +# 1. grdata.py is used both as a template file and as part of the pygrackle +# package. The grdata.py file is written in such away that it can be used +# without as part of pygrackle without any sort of variable substitution +# +# 2. Moreover, the pygrackle package is built with the scikit-build-core +# backend, which requires a set of CMake files. 
Since there is no reason to +# ever directly execute this file's logic as part of building the python +# package it makes even more sense to try to keep the logic separate + + +# load the file_registry information for the current version of grackle into a +# string held by the variable specified by the outvar argument +function(load_file_registry_string UPDATE_CONFIGURE_DEPENDS outvar) + set(path + "${PROJECT_SOURCE_DIR}/src/python/pygrackle/file_registry/file_registry.txt" + ) + if ("${UPDATE_CONFIGURE_DEPENDS}") + set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${path}") + endif() + file(READ ${path} contents) + set("${outvar}" "${contents}" PARENT_SCOPE) +endfunction() + +set(_GRDATA_CACHE_PREFIX "_GRACKLEGRDATAPRIVATE_") + +# create the grdata program +# +# the program's resulting location is controlled by the global +# CMAKE_RUNTIME_OUTPUT_DIRECTORY variable, if it is set. Otherwise, we put it +# in the CURRENT_BINARY_DIR. +# +# Arguments +# --------- +# +# GRACKLE_VERSION: the full grackle version number of the grackle version that +# the program is associated with (not just the truncated number understood +# by cmake) +# PATH_OUTVAR: specifies the name of the variable where the path to the +# resulting program is stored +function(create_grdata_program) + + set(options) + set(oneValueArgs DESTINATION GRACKLE_VERSION PATH_OUTVAR) + set(multiValueArgs) + + cmake_parse_arguments(PARSE_ARGV 0 + CREATE_GRDATA "${options}" "${oneValueArgs}" "${multiValueArgs}") + + # some basic error-handling + set(_funcname "create_grdata_program") + if (DEFINED CREATE_GRDATA_UNPARSED_ARGUMENTS) + message(FATAL_ERROR + "${_funcname} recieved invalid arguments: " + "\"${CREATE_GRDATA_UNPARSED_ARGUMENTS}\"") + elseif (DEFINED CREATE_GRDATA_KEYWORDS_MISSING_VALUES) + message(FATAL_ERROR + "${_funcname} received the ${CREATE_GRDATA_KEYWORDS_MISSING_VALUES} " + "keyword(s) without any associated arguments.") + endif() + + if (DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY) + 
set(output_path "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/grdata") + else() + set(output_path "${CMAKE_CURRENT_BINARY_DIR}/grdata") + endif() + + load_file_registry_string(TRUE "_GRDATA_FILE_REGISTRY_CONTENTS") + set(_GRDATA_GRACKLE_VERSION "${CREATE_GRDATA_GRACKLE_VERSION}") + configure_file( + "${PROJECT_SOURCE_DIR}/src/python/pygrackle/utilities/grdata.py" + "${output_path}" + @ONLY + ) + + if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.19") + file(CHMOD ${output_path} FILE_PERMISSIONS + OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ + WORLD_EXECUTE + ) + else() + execute_process(COMMAND chmod a+rx ${output_path}) + endif() + + set("${CREATE_GRDATA_PATH_OUTVAR}" "${output_path}" PARENT_SCOPE) + + set(_GRACKLEGRDATAPRIVATE_TOOL_PATH "${output_path}" CACHE INTERNAL + "cached location of the grdata tool") +endfunction() + + +# note that cmake normalizes paths on all platforms to use forward-slashes +function(_num_path_segments path outCount) + set(arg_descr "the path argument, of the _num_path_segments command,") + if (path MATCHES "^/.*") + message(FATAL_ERROR "${arg_descr} can't start with `/`") + elseif (path STREQUAL ".") + message(FATAL_ERROR "${arg_descr} can't currently be `.`") + elseif ((path MATCHES "^\./.*$") OR (path MATCHES "^.*/\./.*$")) + message(FATAL_ERROR "${arg_descr} can't currently hold a `./` segment") + elseif (path STREQUAL "..") + message(FATAL_ERROR "${arg_descr} can't currently be `..`") + elseif ((path MATCHES "^\.\./.*$") OR (path MATCHES "^.*/\.\./.*$")) + message(FATAL_ERROR "${arg_descr} can't currently hold a `../` segment") + elseif (path STREQUAL "") + set(${outCount} "0" PARENT_SCOPE) + return() + endif() + + set(count 1) + set(remainder "${path}") + + while(NOT (remainder MATCHES "^[^/]+/*$")) # exit loop if no slashes or all + # slashes are trailing + math(EXPR count "${count} + 1") + + # remove trailing slash(es) + if (remainder MATCHES "^(.*/[^/])/+$") + set(remainder "${CMAKE_MATCH_1}") + endif() + 
get_filename_component(remainder "${remainder}" DIRECTORY) + endwhile() + + set(${outCount} "${count}" PARENT_SCOPE) +endfunction() + +# helper function to create an export-file (that will be installed) and is used +# to provide some basic properties about the grdata tool +# EXPORT_FILE_DESTINATION_DIR specifies directory where the export-file will +# be installed relative to the root-install path +# TOOL_RELATIVE_INSTALL_PATH where the export-file will be +# installed relative to the root-install path +function(_grdata_get_export_file_contents EXPORT_FILE_DESTINATION_DIR TOOL_RELATIVE_INSTALL_PATH outVar) + # a sanity check! + if ((EXPORT_FILE_DESTINATION_DIR MATCHES "^/.*") OR + (TOOL_RELATIVE_INSTALL_PATH MATCHES "^/.*")) + message( + FATAL_ERROR + "_grdata_get_export_file_contents can't handle an argument that starts " + "with a forward slash") + endif() + + _num_path_segments("${EXPORT_FILE_DESTINATION_DIR}" num_segments) + + if (num_segments EQUAL 0) + set(REL_PATH_TO_PREFIX "") + else() + string(REPEAT "../" "${num_segments}" REL_PATH_TO_PREFIX) + endif() + + # now, we will sanitize EXPORT_FILE_DESTINATION_DIR (and remove any trailing + # slashes) + + string(CONFIGURE [=[ +# Autogenerated file that stores the location of the grdata tool +# -> this is directly analogous to a file that would be defined with cmake's +# install(EXPORT ...) command. +# -> since the grdata tool is a weird sort of pseudo target (i.e. 
it isn't +# compiled), we store the path to the file +# -> like `install(EXPORT ...)` (and in contrast to `export(EXPORT ...)`), +# we use a relative path to the grdata tool +set(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}/@REL_PATH_TO_PREFIX@") +set(_GRACKLE_GRDATA_TOOL_PATH "${_IMPORT_PREFIX}/@TOOL_RELATIVE_INSTALL_PATH@") +unset(_IMPORT_PREFIX) +]=] contents @ONLY +) + set("${outVar}" "${contents}" PARENT_SCOPE) + +endfunction() + + +# due to the "weird" nature of the grdata tool, (we treat it like its an +# executable even though it isn't compiled), this is a command to help with +# standard export/installation options +# +# This command operates in 3 modes: +# +# 1. In ``INSTALL_TOOL`` mode, the following command +# ``` +# grdata_install_export_helper_(INSTALL_TOOL DESTINATION +# COMPONENT ) +# ``` +# acts as a substitute for the `install(TARGETS grdata-target ...)` command, +# if the grdata tool were a typicial cmake target (i.e. that got compiled). +# In practice, this does some bookkeeping and wraps the following command: +# ``` +# install(PROGRAMS path/to/grdata DESTINATION COMPONENT ) +# ``` +# +# 2. In ``INSTALL_EXPORT`` mode, the following command +# ``` +# grdata_install_export_helper_(INSTALL_EXPORT DESTINATION +# FILE .cmake TMPDIR ) +# ``` +# acts as an analog to the ``install(EXPORT ...)`` command, but with 1 minor +# difference: +# - the ``install(EXPORT ...)`` command doesn't obviously do anything at +# configuration time (in practice it does create the export files and +# stores them within /CMakeFiles/Export) +# - we explicitly generate the export files at configuration time & store +# them within directory specified by TMPDIR +# +# 3. 
In ``EXPORT_EXPORT`` mode, the following command +# ``` +# grdata_install_export_helper_(EXPORT_EXPORT FILE .cmake) +# ``` +# acts as an analog to the ``export(EXPORT ...)`` command +# +macro(grdata_install_export_helper_ mode) + # we only take 1-value arguments (after the mode argument) + + set(_GRDATA_IEH_name "grdata_install_export_helper_") + + # get the kwargs for the current mode + if("INSTALL_TOOL" STREQUAL "${mode}") + set(_GRDATA_IEH_oneValueArgs DESTINATION COMPONENT) + elseif("INSTALL_EXPORT" STREQUAL "${mode}") + set(_GRDATA_IEH_oneValueArgs DESTINATION FILE TMPDIR) + elseif("EXPORT_EXPORT" STREQUAL "${mode}") + set(_GRDATA_IEH_oneValueArgs FILE) + else() + message(FATAL_ERROR + "${_GRDATA_IEH_name} command invoked with unexpected mode: \"${mode}\"" + ) + endif() + + # parse the arguments + cmake_parse_arguments(_GRDATA_IEH "" "${_GRDATA_IEH_oneValueArgs}" "" + ${ARGN}) + + if (DEFINED _GRDATA_IEH_UNPARSED_ARGUMENTS) + message(FATAL_ERROR + "${_GRDATA_IEH_name}(${mode}) recieved invalid arguments: " + "\"${_GRDATA_IEH_UNPARSED_ARGUMENTS}\"") + elseif (DEFINED _GRDATA_IEH_KEYWORDS_MISSING_VALUES) + message(FATAL_ERROR + "${_GRDATA_IEH_name}(${mode}) received the " + "${_GRDATA_IEH_KEYWORDS_MISSING_VALUES} keyword(s) without any " + "associated arguments.") + endif() + + if (NOT DEFINED _GRACKLEGRDATAPRIVATE_TOOL_PATH) + message(FATAL_ERROR + "${_GRDATA_IEH_name}(${mode}) can only be called AFTER a call to the " + "create_grdata_program command.") + endif() + + # now, actually complete the command + if ("INSTALL_TOOL" STREQUAL "${mode}") + install(PROGRAMS ${_GRACKLEGRDATAPRIVATE_TOOL_PATH} + COMPONENT ${_GRDATA_IEH_COMPONENT} + DESTINATION ${_GRDATA_IEH_DESTINATION} + ) + + set(_GRACKLEGRDATAPRIVATE_RELATIVE_INSTALL_PATH + "${_GRDATA_IEH_DESTINATION}/grdata" CACHE INTERNAL + "install location of the grdata tool relative to base install path") + + elseif("INSTALL_EXPORT" STREQUAL "${mode}") + if (NOT DEFINED _GRACKLEGRDATAPRIVATE_RELATIVE_INSTALL_PATH) + 
message(FATAL_ERROR + "${_GRDATA_IEH_name}(${mode}) can only be called AFTER a call to the " + "${_GRDATA_IEH_name}(INSTALL_TOOL) command.") + endif() + + _grdata_get_export_file_contents( + ${_GRDATA_IEH_DESTINATION} + ${_GRACKLEGRDATAPRIVATE_RELATIVE_INSTALL_PATH} + "_GRDATA_IEH_EXPORT_CONTENTS" + ) + + file(WRITE "${_GRDATA_IEH_TMPDIR}/${_GRDATA_IEH_FILE}" + "${_GRDATA_IEH_EXPORT_CONTENTS}") + install(FILES + "${_GRDATA_IEH_TMPDIR}/${_GRDATA_IEH_FILE}" + DESTINATION "${_GRDATA_IEH_DESTINATION}" + ) + + elseif ("EXPORT_EXPORT" STREQUAL "${mode}") + string(CONFIGURE [=[ +# Autogenerated file that stores the location of the grdata tool +# -> this is directly analogous to a file that would be defined with cmake's +# export(EXPORT ...) command. +# -> since the grdata tool is a weird sort of pseudo target (i.e. it isn't +# compiled), we store the path to the file +# -> like `export(EXPORT ...)` (and in contrast to `install(EXPORT ...)`), +# we use an absolute path + +set(_GRACKLE_GRDATA_TOOL_PATH "@_GRACKLEGRDATAPRIVATE_TOOL_PATH@") +"]=] _GRDATA_IEH_EXPORT_CONTENTS @ONLY) + + file(WRITE "${_GRDATA_IEH_FILE}" "${_GRDATA_IEH_EXPORT_CONTENTS}") + + else() + message(FATAL_ERROR + "something went horribly wrong within ${_GRDATA_IEH_name}(${mode})") + endif() + +endmacro() diff --git a/cmake/GrackleConfig.cmake.in b/cmake/GrackleConfig.cmake.in index 0241836a..27fb16c2 100644 --- a/cmake/GrackleConfig.cmake.in +++ b/cmake/GrackleConfig.cmake.in @@ -355,6 +355,13 @@ set_target_properties(Grackle::Grackle PROPERTIES @_GRACKLE_INFO_PROPERTIES@ ) +# define the grackle_get_info command +include(${CMAKE_CURRENT_LIST_DIR}/Grackle_grdata_pseudotarget.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/GrackleReusedBuildConfigCommands.cmake) +_grackleprivate_get_info_setup("${_GRACKLE_GRDATA_TOOL_PATH}") +unset(_GRACKLE_GRDATA_TOOL_PATH) + + # Finally, let's do some cleanup so people don't rely upon these variables # (specifically, let's cleanup the record of the build-flags) 
unset(_GRACKLEBUILD_USE_DOUBLE) diff --git a/cmake/GrackleReusedBuildConfigCommands.cmake b/cmake/GrackleReusedBuildConfigCommands.cmake new file mode 100644 index 00000000..b0626d60 --- /dev/null +++ b/cmake/GrackleReusedBuildConfigCommands.cmake @@ -0,0 +1,110 @@ +# ======== +# Overview +# ======== +# +# This is a cmake module that defines commands that are used in 2 cases: +# 1. as part of the build logic +# 2. as part of the configuration logic +# +# Since commands have global scope, these commands (functions or macros) +# defined in this file will always be visible to CMake projects that consume +# Grackle (regardless of how Grackle is consumed). Only a subset of commands +# are intended to be used directly by consumers. These commands are explicitly +# documented in the documentations +# +# IMPORTANT +# --------- +# All code in this file must be executable by the minimum supported cmake +# version described in `GrackleConfig.cmake.in` (at this time of writing, that +# is version 3.3) + + +# Commands Motivated by the grdata tool +# ------------------------------------- +# Here, we turn our attention to commands originally created with the intention +# of supporting the grdata cli tool (the commands are named generically enough +# that they can be used to support other things in the future, too). +# +# Some background: +# - the grdata program can be very useful for downstream codes for testing +# purposes (and for the hypothetical scenario where we support downloading +# precompiled copies of Grackle). But it's a little weird in a CMake context +# - we want people to think of it essentially as a command line program (and +# be indifferent to the fact that it is implemented as an executable python +# executable python script behind the scenes) +# - since we want it to be available to downstream CMake applications whether +# people embed grackle as part of their build or use find_package, we +# probably shouldn't declare it as an IMPORTED executable. 
If we just cared +# about supporting the latter case, then this would be absolutely fine. +# But, IMPORTED executables aren't really intended for the former case +# (especially since we are producing the file with configure_file). While +# it could work out in the former case, I worry about unforseen side +# effects. +# - Prior to CMake 3.0, we might have made this information available in a +# package variable (like GRACKLE_GRDATA_TOOL_PATH), but the Professional +# CMake book, written by Craig Scott (one of the primary CMake developer), +# makes it clear we should prefer to avoid package-variables in order to make +# the project as consumable as possible. +# - This advice is offered in multiple versions of the book, but you can +# specifically find it in section 40.4 of the 18th edition +# - essentially, using package variables produces complication when people +# want to use cmake 3.24 or newer (specifically related to the integration +# between find_package and FetchContent). +# - Following the book's suggestion, we instead make this information +# available through the grackle_get_info command + +function(_grackleprivate_get_propertyprefix outVar) # private helper function + set("${outVar}" "_PrivateGrackleGlobalPropDoNotUse_" PARENT_SCOPE) +endfunction() + +function(_grackleprivate_get_info_setup grdata_tool_path) + # this is a private command to setup for the grackle_get_info command + # ARGS: + # grdata_tool_path: specifies the path to the grdata tool + + # - the whole point here is to make the grackle_get_info without reading any + # information related to grdata_tool_path from a global/package variable. + # (It might be possible to define grackle_get_info so that it does read + # this info from global/package variables and is portable across all + # contexts, but that very much isn't clear). + # - This leaves us with a few alternatives (th + # 1. 
Write a file (maybe to ${PROJECT_BINARY_DIR}/__grackle_get_info.cmake) + # that defines the grackle_get_info function (while substituting the + # grdata_tool_path information) and then include that file in order to + # declare the function (we would need to make this into a macro) + # 2. Store the grdata_tool_path information as an attribute on a target + # a) we could do this on an existing target, like Grackle::Grackle (in + # this case, we'd name the property so its clear that it's not part + # of the public interface) + # b) we could create a new interface target (like Grackle::_PRIVATE) for + # this explicit purpose + # - It really doesn't which choice we pick since this is an implementation + # detail we could change at any time. For simplicity, we go with 2a. + + set(_target_name "Grackle::Grackle") + get_target_property(_aliased "${_target_name}" ALIASED_TARGET) + if(_aliased) + set(_target_name "${_aliased}") + endif() + + _grackleprivate_get_propertyprefix( "_prefix" ) + + set_property(TARGET "${_target_name}" + PROPERTY "${_prefix}GRACKLE_GRDATA_TOOL_PATH" "${grdata_tool_path}" + ) +endfunction() + + +# a public function for accessing extra grackle information +# +# An empty string is returned if the property is not known +# +# ARGUMENTS +# --------- +# key: the name of the grackle information to be accessed +# outVar: the name of the variable where the accessed info is stored +function(grackle_get_info key outVar) + _grackleprivate_get_propertyprefix( "_prefix" ) + get_target_property(val Grackle::Grackle "${_prefix}${key}") + set("${outVar}" "${val}" PARENT_SCOPE) +endfunction() diff --git a/cmake/grackle.pc.in b/cmake/grackle.pc.in index baba0c43..18c3e44b 100644 --- a/cmake/grackle.pc.in +++ b/cmake/grackle.pc.in @@ -30,12 +30,15 @@ # commonly used to make the build relocatable (there's not a ton of docs on it) prefix=${pcfiledir}/../.. 
-libdir=${prefix}/lib +libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@ includedir=${prefix}/include # define Grackle-specific variables conveying extra information @_PC_INFO_PROPERTIES@ +# define other variables conveying extra Grackle-related information +GRACKLE_GRDATA_TOOL_PATH=${prefix}/@CMAKE_INSTALL_BINDIR@/grdata + Name: grackle Description: chemistry and radiative cooling library for astrophysical simulations and models Version: @Grackle_VERSION@ diff --git a/cmake/installation_rules.cmake b/cmake/installation_rules.cmake index f91b5a8b..03c42c4c 100644 --- a/cmake/installation_rules.cmake +++ b/cmake/installation_rules.cmake @@ -141,6 +141,18 @@ install(TARGETS Grackle_Grackle COMPONENT Grackle_Development ) +include(CreateProgram-grdata) +# define installation rules for installation of grdata cli tool. This is +# analogous to install(TARGETS ...), but is needed since grdata isn't a +# compiled executable +# -> in the future, maybe we should make this part of the Grackle_Development +# "installation component?" 
(from my perspective it's probably better err +# on the side of being a little too atomic here) +grdata_install_export_helper_(INSTALL_TOOL + DESTINATION ${CMAKE_INSTALL_BINDIR} + COMPONENT Grackle_Tools +) + if (BUILD_SHARED_LIBS) # (As noted above) Because we renamed the shared library so its called # `libgrackle-{VERSION_NUM}.so` (rather than `libgrackle.so`), we need an @@ -329,6 +341,13 @@ endif() # Define the cmake Package Config File #------------------------------------- +# create variable that holds the path to the current directory +set(LOCAL_CMAKE_MODULE_DIR "${CMAKE_CURRENT_LIST_DIR}") + +# create variable storing where copies of the cmake files are stored so that +# they can be used without a full installation +set(BUILDTREE_CMAKE_DIR ${GRACKLE_BUILD_EXPORT_PREFIX_PATH}/cmake/Grackle) + include(CMakePackageConfigHelpers) # The following function implements standardized logic for determining @@ -363,7 +382,7 @@ write_basic_package_version_file( get_info_properties_export_str(Grackle_Grackle CMAKE_CONFIG _GRACKLE_INFO_PROPERTIES) configure_file( - ${PROJECT_SOURCE_DIR}/cmake/GrackleConfig.cmake.in + ${LOCAL_CMAKE_MODULE_DIR}/GrackleConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/install-metadata/GrackleConfig.cmake @ONLY ) @@ -371,6 +390,7 @@ configure_file( install(FILES ${CMAKE_CURRENT_BINARY_DIR}/install-metadata/GrackleConfig.cmake ${CMAKE_CURRENT_BINARY_DIR}/install-metadata/GrackleConfigVersion.cmake + ${LOCAL_CMAKE_MODULE_DIR}/GrackleReusedBuildConfigCommands.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Grackle ) @@ -390,15 +410,24 @@ install(EXPORT GrackleTargets FILE Grackle_${GRACKLE_CONFIG_FILL_VAL}_targets.cmake ) -# generate and configure some cmake files in the build-tree so that external -# cmake projects can use find_package to directly import Grackle::Grackle from -# the build-tree (without requiring a full installation) +# call the analog of install(EXPORT ...) 
for the grdata tool +grdata_install_export_helper_(INSTALL_EXPORT + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Grackle + FILE Grackle_grdata_pseudotarget.cmake + TMPDIR ${CMAKE_CURRENT_BINARY_DIR}/install-metadata +) + + +# copy some files to the build tree to locations specified by the +# BUILDTREE_CMAKE_DIR variable so so that external cmake projects can use +# find_package to directly import Grackle::Grackle from the build-tree (without +# requiring a full installation) -set(BUILDTREE_CMAKE_DIR ${GRACKLE_BUILD_EXPORT_PREFIX_PATH}/cmake/Grackle) file(COPY ${CMAKE_CURRENT_BINARY_DIR}/install-metadata/GrackleConfig.cmake ${CMAKE_CURRENT_BINARY_DIR}/install-metadata/GrackleConfigVersion.cmake + ${LOCAL_CMAKE_MODULE_DIR}/GrackleReusedBuildConfigCommands.cmake DESTINATION ${BUILDTREE_CMAKE_DIR} ) @@ -406,3 +435,8 @@ export(EXPORT GrackleTargets FILE ${BUILDTREE_CMAKE_DIR}/Grackle_${GRACKLE_CONFIG_FILL_VAL}_targets.cmake NAMESPACE Grackle:: ) + +# analog to export(EXPORT ...) for the grdata tool +grdata_install_export_helper_(EXPORT_EXPORT + FILE ${BUILDTREE_CMAKE_DIR}/Grackle_grdata_pseudotarget.cmake +) diff --git a/src/clib/CMakeLists.txt b/src/clib/CMakeLists.txt index b02af75e..87c5d324 100644 --- a/src/clib/CMakeLists.txt +++ b/src/clib/CMakeLists.txt @@ -34,18 +34,11 @@ configure_file(../include/grackle_float.h.in ${GRACKLE_GENRATED_PUBLIC_HEADERS}/grackle_float.h @ONLY) # now, declare recipe for generating file_registry.h: +include(CreateProgram-grdata) set(GRACKLE_GENERATED_PRIVATE_HEADERS "${CMAKE_CURRENT_BINARY_DIR}") -set(_FILE_REGISTRY_PATH - "${CMAKE_CURRENT_SOURCE_DIR}/../python/pygrackle/file_registry/file_registry.txt" -) -file(READ ${_FILE_REGISTRY_PATH} FILE_REGISTRY_CONTENTS) +load_file_registry_string(TRUE FILE_REGISTRY_CONTENTS) configure_file(file_registry.h.in ${GRACKLE_GENERATED_PRIVATE_HEADERS}/file_registry.h @ONLY) -# instruct CMake to rerun config-stage (and re-generate files) if any changes -# occur to the file containing the file registry 
-set_property( - DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${_FILE_REGISTRY_PATH} -) # next, declare recipe for generating auto_general.c: From 203118e47d1cb8d4106be4156905c93d6b27a238 Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Wed, 9 Oct 2024 12:35:00 -0400 Subject: [PATCH 26/36] alter installation of grdata tool and add docs --- CMakeLists.txt | 5 +- cmake/CreateProgram-grdata.cmake | 200 ++++++++++++++----- cmake/GrackleConfig.cmake.in | 8 +- cmake/GrackleReusedBuildConfigCommands.cmake | 110 ---------- cmake/installation_rules.cmake | 2 - doc/source/Integration.rst | 11 +- doc/source/Tools.rst | 102 ++++++++-- 7 files changed, 248 insertions(+), 190 deletions(-) delete mode 100644 cmake/GrackleReusedBuildConfigCommands.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index f1759e1d..3d1b54b5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -206,11 +206,8 @@ add_subdirectory(src/clib) include(CreateProgram-grdata) create_grdata_program( GRACKLE_VERSION "${_GRACKLE_FULL_VERSION}" - PATH_OUTVAR "_GRACKLE_GRDATA_TOOL_PATH" + TARGET_NAME Grackle::grdata ) -# make the location of grdata accessible via the grackle_get_info command -include(GrackleReusedBuildConfigCommands) -_grackleprivate_get_info_setup("${_GRACKLE_GRDATA_TOOL_PATH}") # declare build-recipies for examples if (GRACKLE_EXAMPLES) diff --git a/cmake/CreateProgram-grdata.cmake b/cmake/CreateProgram-grdata.cmake index fb6dbdaa..84eed508 100644 --- a/cmake/CreateProgram-grdata.cmake +++ b/cmake/CreateProgram-grdata.cmake @@ -29,9 +29,10 @@ function(load_file_registry_string UPDATE_CONFIGURE_DEPENDS outvar) set("${outvar}" "${contents}" PARENT_SCOPE) endfunction() -set(_GRDATA_CACHE_PREFIX "_GRACKLEGRDATAPRIVATE_") - -# create the grdata program +# create the grdata program (and represent it as an executable target) +# +# To install the program, and properly expose export information, see the +# grdata_install_export_helper_ command # # the program's resulting location is controlled by 
the global # CMAKE_RUNTIME_OUTPUT_DIRECTORY variable, if it is set. Otherwise, we put it @@ -39,16 +40,68 @@ set(_GRDATA_CACHE_PREFIX "_GRACKLEGRDATAPRIVATE_") # # Arguments # --------- -# # GRACKLE_VERSION: the full grackle version number of the grackle version that -# the program is associated with (not just the truncated number understood -# by cmake) -# PATH_OUTVAR: specifies the name of the variable where the path to the -# resulting program is stored +# the program is associated with (not just the truncated +# number understood by cmake) +# TARGET_NAME: specifies the name of the target that is created to represent +# the program. +# +# Notes +# ----- +# The grdata program can be very useful for downstream codes for testing +# purposes (and for the hypothetical scenario where we support downloading +# precompiled copies of Grackle). Due to the nature of the grdata program, +# providing the program details (i.e. namely providing the path to it) to +# downstream projects is not straight-forward. +# - In more detail, the program is a little weird in a CMake context since we +# want people to essentially think of it as a generic command line program +# even though it isn't technically a compiled program (people shouldn't care +# that it is actually a portable, executable python script). +# - I was originally hesitant to declare it as an IMPORTED executable. I was +# primarily concerned about the scenario where other CMake-built projects +# might consume Grackle by embedding it as a part of the build. Since the +# documentation explains that IMPORTED executable machinery is intended to +# represent machinery from outside the CMake, I was concerned that we could +# encounter some unforseen side-effects. +# - Prior to CMake 3.0, we might have instead provided the tool's path in a +# package variable (like GRACKLE_GRDATA_TOOL_PATH). 
While the details of how
+#   we achieve this in modern CMake differ, we can still provide the path in a
+#   manner similar to variable-access
+#   - it is still possible to provide package variables, but the *Professional
+#     CMake* book, by Craig Scott (a primary CMake developer), makes it clear
+#     we should avoid package-variables to make Grackle easily consumable.
+#     In the book's 18th edition this advice is in section 40.4.
+#   - essentially, we would need to take extra steps for every package variable
+#     that we introduce to support downstream projects that employ newer
+#     dependency management machinery introduced in 2022 (cmake 3.24).
+#   - the advice is to make this information accessible through a function
+#     (that returns values for known keys) or as properties of a target. We
+#     experimented with the function approach -- see commits just before this
+#     documentation was written. It requires a somewhat involved solution
+#     to completely avoid global/package variables.
+#
+# After giving this some more thought, it became clear that the IMPORTED
+# executable approach is superior for 2 reasons:
+# 1. Even if there is some unforeseen side-effect of the IMPORTED executable, we
+#    will be no worse-off than the variable-like approach.
+#    - The worst imaginable side-effect is that somebody consuming Grackle
+#      in an embedded manner, might find that some of CMake's source-file
+#      dependency magic doesn't work right after updating the files that
+#      compose the grdata tool.
+#    - This scenario seems extremely pathological (and should probably never
+#      arise), since it could only occur if people alter their source files
+#      based on the output of the grdata tool.
+#    - Regardless of the practicality, the variable-like approach definitely
+#      wouldn't offer any benefits here (the same issues would still occur)
+# 2. The use of targets is generally more idiomatic than variables. 
While we +# still require some custom logic to get everything to work right, we need +# less of it than the alternative. Moreover, the custom-code will be much +# more directly analogous to standard cmake logic (making it easier to +# understand) function(create_grdata_program) set(options) - set(oneValueArgs DESTINATION GRACKLE_VERSION PATH_OUTVAR) + set(oneValueArgs DESTINATION GRACKLE_VERSION TARGET_NAME) set(multiValueArgs) cmake_parse_arguments(PARSE_ARGV 0 @@ -89,12 +142,18 @@ function(create_grdata_program) execute_process(COMMAND chmod a+rx ${output_path}) endif() - set("${CREATE_GRDATA_PATH_OUTVAR}" "${output_path}" PARENT_SCOPE) + set(_GRACKLEGRDATAPRIVATE_TARGETNAME "${CREATE_GRDATA_TARGET_NAME}" + CACHE INTERNAL "name of the target representing the grdata tool") + + add_executable(${_GRACKLEGRDATAPRIVATE_TARGETNAME} IMPORTED GLOBAL) + set_target_properties(${_GRACKLEGRDATAPRIVATE_TARGETNAME} PROPERTIES + IMPORTED_LOCATION "${output_path}" + ) - set(_GRACKLEGRDATAPRIVATE_TOOL_PATH "${output_path}" CACHE INTERNAL - "cached location of the grdata tool") endfunction() +# Helper Functions used to define export files +# -------------------------------------------- # note that cmake normalizes paths on all platforms to use forward-slashes function(_num_path_segments path outCount) @@ -131,13 +190,19 @@ function(_num_path_segments path outCount) set(${outCount} "${count}" PARENT_SCOPE) endfunction() -# helper function to create an export-file (that will be installed) and is used -# to provide some basic properties about the grdata tool -# EXPORT_FILE_DESTINATION_DIR specifies directory where the export-file will -# be installed relative to the root-install path -# TOOL_RELATIVE_INSTALL_PATH where the export-file will be -# installed relative to the root-install path -function(_grdata_get_export_file_contents EXPORT_FILE_DESTINATION_DIR TOOL_RELATIVE_INSTALL_PATH outVar) +# create a relocatable export-file (to be placed in the installation directory) +# that 
declares a target representing the grdata tool +# +# Arguments +# --------- +# EXPORT_FILE_DESTINATION_DIR specifies directory where the export-file will +# be installed, relative to the root-install path +# TOOL_RELATIVE_INSTALL_PATH specifies path where the grdata tool will be +# installed, relative to the root-install path +# TMP_FILE_LOCATION Where to put the file (right after we create it) +function(_grdata_write_installdir_export_file + EXPORT_FILE_DESTINATION_DIR TOOL_RELATIVE_INSTALL_PATH TMP_FILE_LOCATION +) # a sanity check! if ((EXPORT_FILE_DESTINATION_DIR MATCHES "^/.*") OR (TOOL_RELATIVE_INSTALL_PATH MATCHES "^/.*")) @@ -155,10 +220,7 @@ function(_grdata_get_export_file_contents EXPORT_FILE_DESTINATION_DIR TOOL_RELAT string(REPEAT "../" "${num_segments}" REL_PATH_TO_PREFIX) endif() - # now, we will sanitize EXPORT_FILE_DESTINATION_DIR (and remove any trailing - # slashes) - - string(CONFIGURE [=[ + set(template [======================[ # Autogenerated file that stores the location of the grdata tool # -> this is directly analogous to a file that would be defined with cmake's # install(EXPORT ...) command. 
@@ -166,15 +228,46 @@ function(_grdata_get_export_file_contents EXPORT_FILE_DESTINATION_DIR TOOL_RELAT # compiled), we store the path to the file # -> like `install(EXPORT ...)` (and in contrast to `export(EXPORT ...)`), # we use a relative path to the grdata tool + set(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}/@REL_PATH_TO_PREFIX@") -set(_GRACKLE_GRDATA_TOOL_PATH "${_IMPORT_PREFIX}/@TOOL_RELATIVE_INSTALL_PATH@") + +add_executable(@_GRACKLEGRDATAPRIVATE_TARGETNAME@ IMPORTED) +set_target_properties(@_GRACKLEGRDATAPRIVATE_TARGETNAME@ PROPERTIES + IMPORTED_LOCATION "${_IMPORT_PREFIX}/@TOOL_RELATIVE_INSTALL_PATH@" +) + unset(_IMPORT_PREFIX) -]=] contents @ONLY +]======================] ) - set("${outVar}" "${contents}" PARENT_SCOPE) + + string(CONFIGURE "${template}" contents @ONLY) + file(WRITE ${TMP_FILE_LOCATION} "${contents}") endfunction() +# helper function to create a export-file to support find_package using the +# build-directory (analogous to the of `export(EXPORT)` command) +function(_grdata_write_builddir_export_file IMMEDIATE_EXPORT_FILE_PATH) + + set(template [======================[ +# Autogenerated file that stores the location of the grdata tool +# -> this is directly analogous to a file that would be defined with cmake's +# export(EXPORT ...) command. +# -> since the grdata tool is a weird sort of pseudo target (i.e. 
it isn't +# compiled), we store the path to the file +# -> like `export(EXPORT ...)` (and in contrast to `install(EXPORT ...)`), +# we use an absolute path + +set(_GRACKLE_GRDATA_TOOL_PATH "@absolute_tool_path@") +]======================] +) + get_target_property(absolute_tool_path + ${_GRACKLEGRDATAPRIVATE_TARGETNAME} IMPORTED_LOCATION) + + string(CONFIGURE "${template}" contents @ONLY) + file(WRITE ${IMMEDIATE_EXPORT_FILE_PATH} "${contents}") + +endfunction() # due to the "weird" nature of the grdata tool, (we treat it like its an # executable even though it isn't compiled), this is a command to help with @@ -206,6 +299,7 @@ endfunction() # stores them within /CMakeFiles/Export) # - we explicitly generate the export files at configuration time & store # them within directory specified by TMPDIR +# NOTE: the namespace used when we originally defined the target is reused # # 3. In ``EXPORT_EXPORT`` mode, the following command # ``` @@ -218,13 +312,13 @@ macro(grdata_install_export_helper_ mode) set(_GRDATA_IEH_name "grdata_install_export_helper_") - # get the kwargs for the current mode + # get the kwargs for the current mode (all kwargs expect a single arg) if("INSTALL_TOOL" STREQUAL "${mode}") - set(_GRDATA_IEH_oneValueArgs DESTINATION COMPONENT) + set(_GRDATA_IEH_Args DESTINATION COMPONENT) elseif("INSTALL_EXPORT" STREQUAL "${mode}") - set(_GRDATA_IEH_oneValueArgs DESTINATION FILE TMPDIR) + set(_GRDATA_IEH_Args DESTINATION FILE TMPDIR) elseif("EXPORT_EXPORT" STREQUAL "${mode}") - set(_GRDATA_IEH_oneValueArgs FILE) + set(_GRDATA_IEH_Args FILE) else() message(FATAL_ERROR "${_GRDATA_IEH_name} command invoked with unexpected mode: \"${mode}\"" @@ -232,9 +326,9 @@ macro(grdata_install_export_helper_ mode) endif() # parse the arguments - cmake_parse_arguments(_GRDATA_IEH "" "${_GRDATA_IEH_oneValueArgs}" "" - ${ARGN}) + cmake_parse_arguments(_GRDATA_IEH "" "${_GRDATA_IEH_Args}" "" ${ARGN}) + # check argument validity if (DEFINED _GRDATA_IEH_UNPARSED_ARGUMENTS) 
message(FATAL_ERROR "${_GRDATA_IEH_name}(${mode}) recieved invalid arguments: " @@ -245,8 +339,15 @@ macro(grdata_install_export_helper_ mode) "${_GRDATA_IEH_KEYWORDS_MISSING_VALUES} keyword(s) without any " "associated arguments.") endif() + foreach(_GRDATA_IEH_ARG IN ITEMS ${_GRDATA_IEH_Args}) + if (NOT DEFINED "_GRDATA_IEH_${_GRDATA_IEH_ARG}") + message(FATAL_ERROR + "${_GRDATA_IEH_name}(${mode}) requires the `${_GRDATA_IEH_ARG}` kwarg") + endif() + endforeach() - if (NOT DEFINED _GRACKLEGRDATAPRIVATE_TOOL_PATH) + # check a precondition + if (NOT DEFINED _GRACKLEGRDATAPRIVATE_TARGETNAME) message(FATAL_ERROR "${_GRDATA_IEH_name}(${mode}) can only be called AFTER a call to the " "create_grdata_program command.") @@ -254,7 +355,7 @@ macro(grdata_install_export_helper_ mode) # now, actually complete the command if ("INSTALL_TOOL" STREQUAL "${mode}") - install(PROGRAMS ${_GRACKLEGRDATAPRIVATE_TOOL_PATH} + install(PROGRAMS $ COMPONENT ${_GRDATA_IEH_COMPONENT} DESTINATION ${_GRDATA_IEH_DESTINATION} ) @@ -270,37 +371,34 @@ macro(grdata_install_export_helper_ mode) "${_GRDATA_IEH_name}(INSTALL_TOOL) command.") endif() - _grdata_get_export_file_contents( + # create the export file + _grdata_write_installdir_export_file( + # where we'll put export file during install (relative to install-prefix) ${_GRDATA_IEH_DESTINATION} + # where we'll put grdata tool during install (relative to install-prefix) ${_GRACKLEGRDATAPRIVATE_RELATIVE_INSTALL_PATH} - "_GRDATA_IEH_EXPORT_CONTENTS" + # where in the build-directly we'll put the export file immediately after + # we create it (we will copy it from here when we install) + "${_GRDATA_IEH_TMPDIR}/${_GRDATA_IEH_FILE}" ) - file(WRITE "${_GRDATA_IEH_TMPDIR}/${_GRDATA_IEH_FILE}" - "${_GRDATA_IEH_EXPORT_CONTENTS}") + # define the rule to copy the export-file during installation install(FILES "${_GRDATA_IEH_TMPDIR}/${_GRDATA_IEH_FILE}" DESTINATION "${_GRDATA_IEH_DESTINATION}" ) elseif ("EXPORT_EXPORT" STREQUAL "${mode}") - string(CONFIGURE 
[=[ -# Autogenerated file that stores the location of the grdata tool -# -> this is directly analogous to a file that would be defined with cmake's -# export(EXPORT ...) command. -# -> since the grdata tool is a weird sort of pseudo target (i.e. it isn't -# compiled), we store the path to the file -# -> like `export(EXPORT ...)` (and in contrast to `install(EXPORT ...)`), -# we use an absolute path - -set(_GRACKLE_GRDATA_TOOL_PATH "@_GRACKLEGRDATAPRIVATE_TOOL_PATH@") -"]=] _GRDATA_IEH_EXPORT_CONTENTS @ONLY) - - file(WRITE "${_GRDATA_IEH_FILE}" "${_GRDATA_IEH_EXPORT_CONTENTS}") + _grdata_write_builddir_export_file("${_GRDATA_IEH_FILE}") else() message(FATAL_ERROR "something went horribly wrong within ${_GRDATA_IEH_name}(${mode})") endif() + # need to do some cleanup (since we are in a macro): + foreach(_GRDATA_IEH_ARG IN LISTS _GRDATA_IEH_Args) + unset("_GRDATA_IEH_${_GRDATA_IEH_ARG}") + endforeach() + endmacro() diff --git a/cmake/GrackleConfig.cmake.in b/cmake/GrackleConfig.cmake.in index 27fb16c2..7f5b65b9 100644 --- a/cmake/GrackleConfig.cmake.in +++ b/cmake/GrackleConfig.cmake.in @@ -23,7 +23,7 @@ # -> when the logic in this file is executed, it is intended to define # variables necessary for the external project to make use of grackle. # Historically, this involved defining new variables. But, we adopt the -# modern convention of ONLY defining library-targets +# modern convention of ONLY defining targets # -> if we include `return()`, anywhere in the top-level scope of this file # (i.e. not in a function definition), the evaluation of this file # abruptly ends. 
We can also define some specific variables to inform @@ -355,12 +355,8 @@ set_target_properties(Grackle::Grackle PROPERTIES @_GRACKLE_INFO_PROPERTIES@ ) -# define the grackle_get_info command +# define the Grackle::grdata executable include(${CMAKE_CURRENT_LIST_DIR}/Grackle_grdata_pseudotarget.cmake) -include(${CMAKE_CURRENT_LIST_DIR}/GrackleReusedBuildConfigCommands.cmake) -_grackleprivate_get_info_setup("${_GRACKLE_GRDATA_TOOL_PATH}") -unset(_GRACKLE_GRDATA_TOOL_PATH) - # Finally, let's do some cleanup so people don't rely upon these variables # (specifically, let's cleanup the record of the build-flags) diff --git a/cmake/GrackleReusedBuildConfigCommands.cmake b/cmake/GrackleReusedBuildConfigCommands.cmake deleted file mode 100644 index b0626d60..00000000 --- a/cmake/GrackleReusedBuildConfigCommands.cmake +++ /dev/null @@ -1,110 +0,0 @@ -# ======== -# Overview -# ======== -# -# This is a cmake module that defines commands that are used in 2 cases: -# 1. as part of the build logic -# 2. as part of the configuration logic -# -# Since commands have global scope, these commands (functions or macros) -# defined in this file will always be visible to CMake projects that consume -# Grackle (regardless of how Grackle is consumed). Only a subset of commands -# are intended to be used directly by consumers. These commands are explicitly -# documented in the documentations -# -# IMPORTANT -# --------- -# All code in this file must be executable by the minimum supported cmake -# version described in `GrackleConfig.cmake.in` (at this time of writing, that -# is version 3.3) - - -# Commands Motivated by the grdata tool -# ------------------------------------- -# Here, we turn our attention to commands originally created with the intention -# of supporting the grdata cli tool (the commands are named generically enough -# that they can be used to support other things in the future, too). 
-# -# Some background: -# - the grdata program can be very useful for downstream codes for testing -# purposes (and for the hypothetical scenario where we support downloading -# precompiled copies of Grackle). But it's a little weird in a CMake context -# - we want people to think of it essentially as a command line program (and -# be indifferent to the fact that it is implemented as an executable python -# executable python script behind the scenes) -# - since we want it to be available to downstream CMake applications whether -# people embed grackle as part of their build or use find_package, we -# probably shouldn't declare it as an IMPORTED executable. If we just cared -# about supporting the latter case, then this would be absolutely fine. -# But, IMPORTED executables aren't really intended for the former case -# (especially since we are producing the file with configure_file). While -# it could work out in the former case, I worry about unforseen side -# effects. -# - Prior to CMake 3.0, we might have made this information available in a -# package variable (like GRACKLE_GRDATA_TOOL_PATH), but the Professional -# CMake book, written by Craig Scott (one of the primary CMake developer), -# makes it clear we should prefer to avoid package-variables in order to make -# the project as consumable as possible. -# - This advice is offered in multiple versions of the book, but you can -# specifically find it in section 40.4 of the 18th edition -# - essentially, using package variables produces complication when people -# want to use cmake 3.24 or newer (specifically related to the integration -# between find_package and FetchContent). 
-# - Following the book's suggestion, we instead make this information -# available through the grackle_get_info command - -function(_grackleprivate_get_propertyprefix outVar) # private helper function - set("${outVar}" "_PrivateGrackleGlobalPropDoNotUse_" PARENT_SCOPE) -endfunction() - -function(_grackleprivate_get_info_setup grdata_tool_path) - # this is a private command to setup for the grackle_get_info command - # ARGS: - # grdata_tool_path: specifies the path to the grdata tool - - # - the whole point here is to make the grackle_get_info without reading any - # information related to grdata_tool_path from a global/package variable. - # (It might be possible to define grackle_get_info so that it does read - # this info from global/package variables and is portable across all - # contexts, but that very much isn't clear). - # - This leaves us with a few alternatives (th - # 1. Write a file (maybe to ${PROJECT_BINARY_DIR}/__grackle_get_info.cmake) - # that defines the grackle_get_info function (while substituting the - # grdata_tool_path information) and then include that file in order to - # declare the function (we would need to make this into a macro) - # 2. Store the grdata_tool_path information as an attribute on a target - # a) we could do this on an existing target, like Grackle::Grackle (in - # this case, we'd name the property so its clear that it's not part - # of the public interface) - # b) we could create a new interface target (like Grackle::_PRIVATE) for - # this explicit purpose - # - It really doesn't which choice we pick since this is an implementation - # detail we could change at any time. For simplicity, we go with 2a. 
- - set(_target_name "Grackle::Grackle") - get_target_property(_aliased "${_target_name}" ALIASED_TARGET) - if(_aliased) - set(_target_name "${_aliased}") - endif() - - _grackleprivate_get_propertyprefix( "_prefix" ) - - set_property(TARGET "${_target_name}" - PROPERTY "${_prefix}GRACKLE_GRDATA_TOOL_PATH" "${grdata_tool_path}" - ) -endfunction() - - -# a public function for accessing extra grackle information -# -# An empty string is returned if the property is not known -# -# ARGUMENTS -# --------- -# key: the name of the grackle information to be accessed -# outVar: the name of the variable where the accessed info is stored -function(grackle_get_info key outVar) - _grackleprivate_get_propertyprefix( "_prefix" ) - get_target_property(val Grackle::Grackle "${_prefix}${key}") - set("${outVar}" "${val}" PARENT_SCOPE) -endfunction() diff --git a/cmake/installation_rules.cmake b/cmake/installation_rules.cmake index 03c42c4c..9bd3f647 100644 --- a/cmake/installation_rules.cmake +++ b/cmake/installation_rules.cmake @@ -390,7 +390,6 @@ configure_file( install(FILES ${CMAKE_CURRENT_BINARY_DIR}/install-metadata/GrackleConfig.cmake ${CMAKE_CURRENT_BINARY_DIR}/install-metadata/GrackleConfigVersion.cmake - ${LOCAL_CMAKE_MODULE_DIR}/GrackleReusedBuildConfigCommands.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Grackle ) @@ -427,7 +426,6 @@ grdata_install_export_helper_(INSTALL_EXPORT file(COPY ${CMAKE_CURRENT_BINARY_DIR}/install-metadata/GrackleConfig.cmake ${CMAKE_CURRENT_BINARY_DIR}/install-metadata/GrackleConfigVersion.cmake - ${LOCAL_CMAKE_MODULE_DIR}/GrackleReusedBuildConfigCommands.cmake DESTINATION ${BUILDTREE_CMAKE_DIR} ) diff --git a/doc/source/Integration.rst b/doc/source/Integration.rst index a84f9c15..66839884 100644 --- a/doc/source/Integration.rst +++ b/doc/source/Integration.rst @@ -172,13 +172,16 @@ The following snippet shows a sample Makefile for compiling a sample application pkg-config also provides additional functionality, like querying version numbers, 
enforcing version requirements, etc. Most of that functionality is described in `this guide `__.
 
-You can also query Grackle-specific details, such as:
+You can also query Grackle-related details, such as:
 
 * the full version string (to determine if it's a dev-version or not) via ``pkg-config --variable=GRACKLE_VERSION_STR grackle``
 
 * whether Grackle was compiled with double precision, via ``pkg-config --variable=GRACKLE_USE_DOUBLE grackle``
 
-* whether grackle was compiled with openmp, via ``pkg-config --variable=GRACKLE_USE_OPENMP grackle``
+* whether Grackle was compiled with openmp, via ``pkg-config --variable=GRACKLE_USE_OPENMP grackle``
+
+* the path to the :ref:`grdata cli tool ` associated with this version of Grackle, via ``pkg-config --variable=GRACKLE_GRDATA_TOOL_PATH grackle`` (this might be useful for testing purposes)
+
 
 .. warning:: 
@@ -240,6 +243,8 @@ These properties include:
 * ``GRACKLE_USE_DOUBLE`` -- stores whether Grackle was compiled with single or double precision
 * ``GRACKLE_USE_OPENMP`` -- stores whether Grackle was compiled with OpenMP
 
+Information about the :ref:`grdata cli tool ` that is created and built alongside this version of Grackle is exposed via the ``Grackle::grdata`` executable target.
+This can be useful for testing purposes.
 
 .. _Embed_Grackle_in_Sim_Build:
 
@@ -308,6 +313,8 @@ Care has been taken while designing the CMake build-system to ensure that the ``
 In both cases, the target provides the same custom properties to describe information about the build.
 See the :ref:`section ` about ``find_package`` for more details.
 
+Additionally, information about the :ref:`grdata cli tool ` that is created and built alongside this version of Grackle is exposed via the ``Grackle::grdata`` executable target.
+
 .. rubric:: Footnotes
 
 .. [#f1] This is required by CMake. 
diff --git a/doc/source/Tools.rst b/doc/source/Tools.rst index 1a3d4aac..fb5b2136 100644 --- a/doc/source/Tools.rst +++ b/doc/source/Tools.rst @@ -5,32 +5,97 @@ Datafile Management =================== We provide a command line tool to optionally manage Grackle's datafiles. +We call this the ``grdata`` tool. At a Quick Glance ----------------- -Currently, this command line tool is only accessible when :ref:`pygrackle is installed `. -To execute the tool execute +We provide 2 ways to access this tool: -.. code-block:: shell-session +1. As a standalone command line application installed alongside of grackle. +2. As a command line tool shipped :ref:`as a part of pygrackle `. - $ python -m pygrackle ... +To execute the tool: -Where ``...`` is replaced with one or more command-line arguments. +.. tabs:: + + .. group-tab:: As a Standalone CLI + + When you build grackle as a standalone application with the CMake build-system, the program can generally be found inside the build directory at */grackle/bin/grdata*. [#df1]_ + Thus, you can invoke the tool with: + + .. code-block:: shell-session + + $ .//grackle/bin/grdata ... + + When you install grackle (with either the traditional build-system OR the CMake build-system), the program will be installed alongside the core Grackle library. + If the core Grackle library is at */lib/libgrackle...*, then the tool is probably installed to */bin/grdata*. [#df2]_ + In this case, you can invoke the tool with: + + + .. code-block:: shell-session + + $ /bin/grdata ... + + + .. note:: + + In detail, the tool is implemented as an executable python script (it only uses the python standard library) and it relies upon a `shebang `__ to launch the program. + + In the event that, that your machine finds an invalid python version (at this time of writing, the minimum required version is 3.6.1 is r) or can't find any python interpretter, you have 2 options: + + 1. you can modify the shebang path OR + + 2. 
you can directly invoke the tool with python `path/to/python path/to/grdata ...`. + + + .. group-tab:: As a part of Pygrackle + + When Pygrackle is installed, you can invoke + + .. code-block:: shell-session + + $ python -m pygrackle ... + + .. note:: + + If you choose to install pygrackle without manually downloading the grackle repository, this tool is the most efficient way to download the files. + + + +In the sample snippets ``...`` is replaced with one or more command-line arguments. For example, ``fetch`` will invoke a subcommand that downloads all associated files (if they aren't already downloaded). You can use the ``--help`` option to get a list of all subcommands. You can also pass the ``--help`` option after the name of a subcommand (e.g. you can use ``fetch --help``) to get more details about subcommand-specific options. -.. note:: - At the moment, this functionality is most useful for pygrackle. - In the near future [#df1]_\ , it will be possible install pygrackle without manually downloading the grackle repository. - At that time, this will be the most efficient way to retrieve the files. - The pygrackle examples and some of the pygrackle tests rely upon this functionality. - However, you are free to completely ignore this functionality for your own purposes. +The pygrackle examples and the pygrackle tests all rely upon this functionality. +The Grackle C library has support for accessing the datafiles managed by this tool. +Some of the examples may soon rely upon this functionality. + +.. important:: + + Instances of the grdata tool are associated with a single version of Grackle (if you are using Pygrackle, the version of the core Grackle c-library is the relevant version number). - There is ongoing work to implement functionality for the Grackle C library to directly access the datafiles managed by this tool.
- When these efforts are finished, we plan to additionally provide this command-line-tool as a standalone program that is always installed alongside Grackle (so that you can access this functionality without installing pygrackle) + Internal (Py)Grackle logic that automatically locates and uses the files managed by this tool will **only** work if the files have been "fetched" by versions of the grdata tool that **exactly** match. + In detail, matching version strings have identical major, minor, and micro version numbers. + Both version strings must either have an identical suffix (namely "-dev") or they must both lack a suffix. + For development versions of Grackle, we do **NOT** require the commit-hashes to exactly match. + + Consider the following hypothetical scenario: + + * You install version 3.4.0 of Grackle and use the associated version of ``grdata`` to install datafiles. + + * The version of Pygrackle that wraps 3.4.0 will also be able to automatically locate these datafiles + + * If you install a different version of Grackle (say version 3.3.1-dev or 3.4.0-dev or 3.4.1 or 3.4.1-dev) or a copy of Pygrackle that wraps a different version, you will need to use the copy of ``grdata`` associated with that copy to fetch the datafiles. + + * You install the version of Pygrackle that wraps version 3.4.0 of Grackle and use it to install the associated datafiles + + * Any build/installation of Grackle version 3.4.0 (whether or not it is wrapped by Pygrackle) will also be able to locate and use these datafiles + + As we will discuss, the grdata tool takes steps to deduplicate the storage used by the datafiles for different Grackle versions. + For example, if you use ``grdata`` instances associated with 5 distinct grackle versions, only one copy of the **CloudyData_UVB=HM2012_high_density.h5** file's data will be stored on disk (i.e.
that data will only take ~6.74 MB of disk space rather than ~33.7 MB). Description ----------- @@ -71,5 +136,12 @@ Down below, we sketch out what the directory-structure might look like: .. rubric:: Footnotes -.. [#df1] Once `GH-#208 `__ is merged, you will be able to instruct pip to install pygrackle by just specifying the URL of the GitHub repository. - We also have plans to upload pygrackle to pip. +.. [#df1] The precise location is sensitive to CMake's standard `CMAKE_INSTALL_BINDIR` variable. + On just about all platforms this defaults to *bin*, but it is plausible that it could default to a different value. + You are also free to override the value of the `CMAKE_INSTALL_BINDIR` variable. + +.. [#df2] Again, the precise location of ``grdata`` installed by the CMake build-system is sensitive to CMake's standard `CMAKE_INSTALL_BINDIR` variable. + Additionally, the precise location of the core Grackle library is sensitive to CMake's standard `CMAKE_INSTALL_LIBDIR` variable, which defaults to the platform's preferred default value (again, you can override it). + CMake's `CMAKE_INSTALL_LIBDIR` variable is known to vary considerably more between platforms. + Common defaults include *lib* (most common), *lib64*, *lib32*, or *libx32*.
+ From d514cabd81c79fb57803acdb854f2bb4e5f4e608 Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Thu, 10 Oct 2024 09:48:32 -0400 Subject: [PATCH 27/36] tweak configure_file.py so that it can be applied to the grdata.py script --- config/configure_file.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/config/configure_file.py b/config/configure_file.py index 1ac2d63f..82b5213f 100755 --- a/config/configure_file.py +++ b/config/configure_file.py @@ -19,6 +19,9 @@ "alphanumeric character is an uppercase or lowercase letter (A-Z or a-z), " "a digit (0-9) or an underscore (_)") +# simple pattern to detect simple occurences python decorator syntax +_PY_DECORATOR_PATTERN = re.compile(r"^[ \t]*@[^\s@]+[^@]*$") + def is_valid_varname(s, start = None, stop = None): return re.fullmatch(_VALID_VARNAME_STR, s[slice(start, stop)]) is not None @@ -55,7 +58,13 @@ def replace(matchobj): line = line[:-1] match_count = 0 - out_f.write(_PATTERN.sub(replace,line)) + if _PY_DECORATOR_PATTERN.match(line) is not None: + # this is a crude workaround to support python decorators. + # - if we didn't have this, then our eager error-handling would + # classify this line as an error + out_f.write(line) + else: + out_f.write(_PATTERN.sub(replace,line)) out_f.write('\n') if err_msg is not None: out_f.close() From 491b4e9fa0531ce96587ef49e92168b3e9b5f4fd Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Thu, 10 Oct 2024 09:58:17 -0400 Subject: [PATCH 28/36] laid the ground work for installing the grdata tool with classic build system. 
--- src/clib/Make.config.assemble | 6 ++++++ src/clib/Make.mach.darwin | 3 ++- src/clib/Make.mach.linux-gnu | 1 + src/clib/Make.mach.nasa-aitken-rome | 1 + src/clib/Make.mach.nasa-pleiades | 1 + src/clib/Make.mach.ncsa-bluewaters-cray | 1 + src/clib/Make.mach.ncsa-bluewaters-gnu | 1 + src/clib/Make.mach.summit | 3 ++- src/clib/Make.mach.tacc-stampede-gnu | 1 + src/clib/Make.mach.tacc-stampede-intel | 1 + src/clib/Make.mach.tigercpu | 1 + src/clib/Make.mach.uiuc-campus-gnu | 1 + src/clib/Make.mach.uiuc-campus-intel | 1 + src/clib/Make.mach.uiuc-campus-pgi | 1 + src/clib/Make.mach.unknown | 1 + src/clib/Make.mach.wheeler-intel | 1 + 16 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/clib/Make.config.assemble b/src/clib/Make.config.assemble index 12f24e4b..519b1fe1 100644 --- a/src/clib/Make.config.assemble +++ b/src/clib/Make.config.assemble @@ -229,10 +229,12 @@ INSTALL_LIB_DIR = $(DEFAULT_INSTALL_PREFIX)/lib INSTALL_INCLUDE_DIR = $(DEFAULT_INSTALL_PREFIX)/include + INSTALL_BIN_DIR = $(DEFAULT_INSTALL_PREFIX)/bin ifdef MACH_INSTALL_PREFIX INSTALL_LIB_DIR = $(MACH_INSTALL_PREFIX)/lib INSTALL_INCLUDE_DIR = $(MACH_INSTALL_PREFIX)/include + INSTALL_BIN_DIR = $(MACH_INSTALL_PREFIX)/bin endif ifdef MACH_INSTALL_LIB_DIR @@ -242,3 +244,7 @@ ifdef MACH_INSTALL_INCLUDE_DIR INSTALL_INCLUDE_DIR = $(MACH_INSTALL_INCLUDE_DIR) endif + + ifdef MACH_INSTALL_BIN_DIR + INSTALL_BIN_DIR = $(MACH_INSTALL_BIN_DIR) + endif diff --git a/src/clib/Make.mach.darwin b/src/clib/Make.mach.darwin index 26268727..640cf36b 100644 --- a/src/clib/Make.mach.darwin +++ b/src/clib/Make.mach.darwin @@ -109,4 +109,5 @@ MACH_LIBS = $(LOCAL_LIBS_HDF5) $(LOCAL_LIBS_MACH) MACH_INSTALL_PREFIX = $(HOME)/grackle_install MACH_INSTALL_LIB_DIR = -MACH_INSTALL_INCLUDE_DIR = \ No newline at end of file +MACH_INSTALL_INCLUDE_DIR = +MACH_INSTALL_BIN_DIR = diff --git a/src/clib/Make.mach.linux-gnu b/src/clib/Make.mach.linux-gnu index d21e452b..abf90c0c 100644 --- a/src/clib/Make.mach.linux-gnu +++ 
b/src/clib/Make.mach.linux-gnu @@ -83,3 +83,4 @@ MACH_LIBS = $(LOCAL_LIBS_HDF5) $(LOCAL_LIBS_MACH) MACH_INSTALL_PREFIX = $(HOME)/local MACH_INSTALL_LIB_DIR = MACH_INSTALL_INCLUDE_DIR = +MACH_INSTALL_BIN_DIR = diff --git a/src/clib/Make.mach.nasa-aitken-rome b/src/clib/Make.mach.nasa-aitken-rome index eab28523..15ca034a 100644 --- a/src/clib/Make.mach.nasa-aitken-rome +++ b/src/clib/Make.mach.nasa-aitken-rome @@ -99,3 +99,4 @@ MACH_LIBS = $(LOCAL_LIBS_HDF5) $(LOCAL_LIBS_MACH) #$(LOCAL_LIBS_PYTHON) MACH_INSTALL_PREFIX = $(HOME)/local MACH_INSTALL_LIB_DIR = MACH_INSTALL_INCLUDE_DIR = +MACH_INSTALL_BIN_DIR = diff --git a/src/clib/Make.mach.nasa-pleiades b/src/clib/Make.mach.nasa-pleiades index 3dcbe2fb..19194bdc 100644 --- a/src/clib/Make.mach.nasa-pleiades +++ b/src/clib/Make.mach.nasa-pleiades @@ -105,3 +105,4 @@ MACH_LIBS = $(LOCAL_LIBS_HDF5) $(LOCAL_LIBS_MACH) #$(LOCAL_LIBS_PYTHON) MACH_INSTALL_PREFIX = $(HOME)/local MACH_INSTALL_LIB_DIR = MACH_INSTALL_INCLUDE_DIR = +MACH_INSTALL_BIN_DIR = diff --git a/src/clib/Make.mach.ncsa-bluewaters-cray b/src/clib/Make.mach.ncsa-bluewaters-cray index f5770934..7528eebf 100644 --- a/src/clib/Make.mach.ncsa-bluewaters-cray +++ b/src/clib/Make.mach.ncsa-bluewaters-cray @@ -95,3 +95,4 @@ MACH_LIBS = $(LOCAL_LIBS_HDF5) $(LOCAL_LIBS_MACH) MACH_INSTALL_PREFIX = $(HOME)/local/cray MACH_INSTALL_LIB_DIR = MACH_INSTALL_INCLUDE_DIR = +MACH_INSTALL_BIN_DIR = diff --git a/src/clib/Make.mach.ncsa-bluewaters-gnu b/src/clib/Make.mach.ncsa-bluewaters-gnu index 68f7c87c..7efe6315 100644 --- a/src/clib/Make.mach.ncsa-bluewaters-gnu +++ b/src/clib/Make.mach.ncsa-bluewaters-gnu @@ -97,3 +97,4 @@ MACH_LIBS = $(LOCAL_LIBS_HDF5) $(LOCAL_LIBS_MACH) MACH_INSTALL_PREFIX = $(HOME)/local MACH_INSTALL_LIB_DIR = MACH_INSTALL_INCLUDE_DIR = +MACH_INSTALL_BIN_DIR = diff --git a/src/clib/Make.mach.summit b/src/clib/Make.mach.summit index 8aaf1252..16afb920 100644 --- a/src/clib/Make.mach.summit +++ b/src/clib/Make.mach.summit @@ -81,4 +81,5 @@ MACH_LIBS = 
$(LOCAL_LIBS_HDF5) $(LOCAL_LIBS_MACH) MACH_INSTALL_PREFIX = $(HOME)/local MACH_INSTALL_LIB_DIR = -MACH_INSTALL_INCLUDE_DIR = \ No newline at end of file +MACH_INSTALL_INCLUDE_DIR = +MACH_INSTALL_BIN_DIR = diff --git a/src/clib/Make.mach.tacc-stampede-gnu b/src/clib/Make.mach.tacc-stampede-gnu index cddf1f5c..7d54e288 100644 --- a/src/clib/Make.mach.tacc-stampede-gnu +++ b/src/clib/Make.mach.tacc-stampede-gnu @@ -86,3 +86,4 @@ MACH_LIBS = $(LOCAL_LIBS_HDF5) $(LOCAL_LIBS_MACH) MACH_INSTALL_PREFIX = $(HOME)/local MACH_INSTALL_LIB_DIR = MACH_INSTALL_INCLUDE_DIR = +MACH_INSTALL_BIN_DIR = diff --git a/src/clib/Make.mach.tacc-stampede-intel b/src/clib/Make.mach.tacc-stampede-intel index 5adb9a16..6c0f5b11 100644 --- a/src/clib/Make.mach.tacc-stampede-intel +++ b/src/clib/Make.mach.tacc-stampede-intel @@ -96,3 +96,4 @@ MACH_LIBS = $(LOCAL_LIBS_HDF5) $(LOCAL_LIBS_MACH) MACH_INSTALL_PREFIX = $(HOME)/local MACH_INSTALL_LIB_DIR = MACH_INSTALL_INCLUDE_DIR = +MACH_INSTALL_BIN_DIR = diff --git a/src/clib/Make.mach.tigercpu b/src/clib/Make.mach.tigercpu index 2979d2ee..fd4e256b 100644 --- a/src/clib/Make.mach.tigercpu +++ b/src/clib/Make.mach.tigercpu @@ -85,3 +85,4 @@ MACH_LIBS = $(LOCAL_LIBS_HDF5) $(LOCAL_LIBS_MACH) MACH_INSTALL_PREFIX = $(HOME)/grackle-build MACH_INSTALL_LIB_DIR = MACH_INSTALL_INCLUDE_DIR = +MACH_INSTALL_BIN_DIR = diff --git a/src/clib/Make.mach.uiuc-campus-gnu b/src/clib/Make.mach.uiuc-campus-gnu index ee17f2d9..8c7e66a4 100644 --- a/src/clib/Make.mach.uiuc-campus-gnu +++ b/src/clib/Make.mach.uiuc-campus-gnu @@ -92,3 +92,4 @@ MACH_LIBS = $(LOCAL_LIBS_HDF5) $(LOCAL_LIBS_MACH) MACH_INSTALL_PREFIX = $(HOME)/local MACH_INSTALL_LIB_DIR = MACH_INSTALL_INCLUDE_DIR = +MACH_INSTALL_BIN_DIR = diff --git a/src/clib/Make.mach.uiuc-campus-intel b/src/clib/Make.mach.uiuc-campus-intel index e18aee9e..cf1e8a8f 100644 --- a/src/clib/Make.mach.uiuc-campus-intel +++ b/src/clib/Make.mach.uiuc-campus-intel @@ -91,3 +91,4 @@ MACH_LIBS = $(LOCAL_LIBS_HDF5) $(LOCAL_LIBS_MACH) 
MACH_INSTALL_PREFIX = $(HOME)/local MACH_INSTALL_LIB_DIR = MACH_INSTALL_INCLUDE_DIR = +MACH_INSTALL_BIN_DIR = diff --git a/src/clib/Make.mach.uiuc-campus-pgi b/src/clib/Make.mach.uiuc-campus-pgi index a3013c45..fd0da040 100644 --- a/src/clib/Make.mach.uiuc-campus-pgi +++ b/src/clib/Make.mach.uiuc-campus-pgi @@ -92,3 +92,4 @@ MACH_LIBS = $(LOCAL_LIBS_HDF5) $(LOCAL_LIBS_MACH) MACH_INSTALL_PREFIX = $(HOME)/local MACH_INSTALL_LIB_DIR = MACH_INSTALL_INCLUDE_DIR = +MACH_INSTALL_BIN_DIR = diff --git a/src/clib/Make.mach.unknown b/src/clib/Make.mach.unknown index d78dce0d..acbdfa43 100644 --- a/src/clib/Make.mach.unknown +++ b/src/clib/Make.mach.unknown @@ -82,3 +82,4 @@ MACH_LIBS = $(LOCAL_LIBS_HDF5) $(LOCAL_LIBS_MACH) MACH_INSTALL_PREFIX = $(HOME)/local MACH_INSTALL_LIB_DIR = MACH_INSTALL_INCLUDE_DIR = +MACH_INSTALL_BIN_DIR = diff --git a/src/clib/Make.mach.wheeler-intel b/src/clib/Make.mach.wheeler-intel index c301faef..27e23e82 100644 --- a/src/clib/Make.mach.wheeler-intel +++ b/src/clib/Make.mach.wheeler-intel @@ -91,3 +91,4 @@ MACH_LIBS = $(LOCAL_LIBS_HDF5) $(LOCAL_LIBS_MACH) MACH_INSTALL_PREFIX = $(HOME)/local MACH_INSTALL_LIB_DIR = MACH_INSTALL_INCLUDE_DIR = +MACH_INSTALL_BIN_DIR = From 1723f430cdf1179a9ce87940e4c6fbf780cbfb7e Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Thu, 10 Oct 2024 10:37:00 -0400 Subject: [PATCH 29/36] classic build system support for grdata --- src/clib/Makefile | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/src/clib/Makefile b/src/clib/Makefile index 8a47eb0c..dde0406c 100644 --- a/src/clib/Makefile +++ b/src/clib/Makefile @@ -212,7 +212,7 @@ verbose: VERBOSE = 1 # This variable is defined with Make.config.assemble. 
.PHONY: autogen -autogen: config_type $(AUTOGEN_DIR)/file_registry.h $(AUTOGEN_DIR)/auto_general.c +autogen: config_type $(AUTOGEN_DIR)/auto_general.c $(AUTOGEN_DIR)/file_registry.h $(AUTOGEN_DIR)/grdata # in following recipe, GRACKLE_FLOAT_MACRO is set to either GRACKLE_FLOAT_4 or # GRACKLE_FLOAT_8 @@ -239,6 +239,8 @@ $(AUTOGEN_DIR)/auto_general.c: auto_general.c.in GIT_BRANCH=`$(QUERY_VERSION) git-branch` \ GIT_REVISION=`$(QUERY_VERSION) git-revision` +FILE_REGISTRY_PATH = $(GRACKLE_DIR)/../python/pygrackle/file_registry/file_registry.txt + # Force update of file_registry.h (an internally used header file) .PHONY: $(AUTOGEN_DIR)/file_registry.h $(AUTOGEN_DIR)/file_registry.h: file_registry.h.in @@ -247,7 +249,19 @@ $(AUTOGEN_DIR)/file_registry.h: file_registry.h.in @$(CONFIG_DIR)/configure_file.py --clobber \ --input $< \ --output $@ \ - --variable-use-literal-file-contents FILE_REGISTRY_CONTENTS=$(GRACKLE_DIR)/../python/pygrackle/file_registry/file_registry.txt + --variable-use-literal-file-contents FILE_REGISTRY_CONTENTS=$(FILE_REGISTRY_PATH) + +# Force update of the grdata cli tool +.PHONY: $(AUTOGEN_DIR)/grdata +$(AUTOGEN_DIR)/grdata: $(GRACKLE_DIR)/../python/pygrackle/utilities/grdata.py + -@(echo "Generating $@") + -@(mkdir -p $(AUTOGEN_DIR)) + @$(CONFIG_DIR)/configure_file.py --clobber \ + --input $< \ + --output $@ \ + _GRDATA_GRACKLE_VERSION=$(LIB_RELEASE_VERSION) \ + --variable-use-literal-file-contents _GRDATA_FILE_REGISTRY_CONTENTS=$(FILE_REGISTRY_PATH) + -@(chmod a+rx $@) # keep this recipe updated so that we always clean up the autogenerated files # (the second line cleans up autogenerated files that might be left over from @@ -315,6 +329,13 @@ install: fi) @echo "Installing grackle library files to $(INSTALL_LIB_DIR)." $(LIBTOOL) --mode=install install -c libgrackle.la $(INSTALL_LIB_DIR)/libgrackle.la + @echo "Installing grackle tools to $(INSTALL_BIN_DIR)." + @(if [ ! 
-d $(INSTALL_BIN_DIR) ]; then \ + mkdir $(INSTALL_BIN_DIR); \ + fi) + @echo "-- Copying $(AUTOGEN_DIR)/grdata to $(INSTALL_BIN_DIR)/grdata" + @cp $(AUTOGEN_DIR)/grdata $(INSTALL_BIN_DIR)/grdata + #----------------------------------------------------------------------- From 2c101720dbad87c23debb604fb9c2ba8fddfde44 Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Thu, 10 Oct 2024 13:06:21 -0400 Subject: [PATCH 30/36] fix some typos --- doc/source/Tools.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/Tools.rst b/doc/source/Tools.rst index fb5b2136..570977ed 100644 --- a/doc/source/Tools.rst +++ b/doc/source/Tools.rst @@ -42,11 +42,11 @@ To execute the tool: In detail, the tool is implemented as an executable python script (it only uses the python standard library) and it relies upon a `shebang `__ to launch the program. - In the event that, that your machine finds an invalid python version (at this time of writing, the minimum required version is 3.6.1 is r) or can't find any python interpretter, you have 2 options: + In the event that, that your machine finds an invalid python version (at this time of writing, the minimum required version is 3.6.1) or can't find any python interpretter, you have 2 options: 1. you can modify the shebang path OR - 2. you can directly invoke the tool with python `path/to/python path/to/grdata ...`. + 2. you can directly invoke the tool with python: `` ...``. .. 
group-tab:: As a part of Pygrackle From 1816ff5cdb3d95528ab2183a30191c4bb5655d4e Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Thu, 10 Oct 2024 13:07:02 -0400 Subject: [PATCH 31/36] more-typo fixes --- src/python/pygrackle/utilities/grdata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/pygrackle/utilities/grdata.py b/src/python/pygrackle/utilities/grdata.py index 5c2db76f..42036e37 100644 --- a/src/python/pygrackle/utilities/grdata.py +++ b/src/python/pygrackle/utilities/grdata.py @@ -42,7 +42,7 @@ available data, and delete the data. The system stores the data files at a single global location. (Grackle, -itself, will soon be able to access files from this location). +itself, can access files from this location). The key feature of this system is its support for versioning: From db32df693c9daf24f290d000936f1f941f40809d Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Thu, 10 Oct 2024 20:29:09 -0400 Subject: [PATCH 32/36] moved the examples section. --- doc/source/Examples.rst | 141 ++++++++++++++++++++++++++++++++++++ doc/source/Installation.rst | 118 ++++++++++-------------------- doc/source/Interaction.rst | 49 +------------ doc/source/index.rst | 1 + 4 files changed, 183 insertions(+), 126 deletions(-) create mode 100644 doc/source/Examples.rst diff --git a/doc/source/Examples.rst b/doc/source/Examples.rst new file mode 100644 index 00000000..8e096b8e --- /dev/null +++ b/doc/source/Examples.rst @@ -0,0 +1,141 @@ + +.. _examples: + +Example Executables +=================== + +The Grackle repository provides example C, C++, and Fortran code for interacting with Grackle. + +Example Descriptions +-------------------- + +The examples are located in the **src/example** directory. 
+The examples include: + + * **c_example.c** - C example + + * **c_local_example.c** - C example using only :ref:`local_functions` + + * **cxx_example.C** - C++ example + + * **cxx_omp_example.C** - C++ example using OpenMP + + * **fortran_example.F** - Fortran example + +.. _how-to-run-example: + +Preparing and Executing the Examples +------------------------------------ + +In this section, we explain how to prepare and execute the example executables. +Running an example is a useful way to quickly check whether Grackle is functioning correctly +(to more rigorously check that Grackle is fully functional, you can try :ref:`running the +test suite `). + +The instructions for building and executing the examples vary based on the build-system. +In both cases, the examples require that you haven't cleaned up from your build. + +1. Compile the example: + + .. tabs:: + + .. group-tab:: Classic Build System + + Once you have already installed the grackle library, you can build the examples by typing ``make`` and the name of the file without extension. + Assuming that you were just in the **src/clib** subdirectory, you would execute the following to build the C++ example: + + .. code-block:: shell-session + + ~/grackle/src/clib $ cd ../example + ~/grackle/src/example $ make clean + ~/grackle/src/example $ make + + Compiling cxx_example.C + Linking + Success! + + .. group-tab:: CMake Build System + + By default, the examples are automatically built with the rest of Grackle. + The compiled example binaries can be found within **/example**, where **** is the arbitrary build-directory that you need to specify when compiling Grackle. + + .. warning:: + + It's important that **** is a top-level directory in the grackle repository (e.g. something like **~/grackle/my-build** is fine, but choices like **~/grackle/../my-grackle-build** and **~/grackle/my_builds/my-first-build** are problematic).
+ If this isn't the case, then the examples won't be able to locate the input data files. + + .. important:: + + If you're using the Classic build system, make sure to add the path to the directory containing the installed **libgrackle.so** to your LD_LIBRARY_PATH (or DYLD_LIBRARY_PATH on Mac). + This is **NOT** necessary for the CMake build system. + More information is provided below.\ [#f1]_ + +2. Now we execute the example + + .. note:: + + The examples make certain assumptions about the location of the data files. + To ensure that the data files can be found, you should execute each example-binary from the same directory where the example binary is produced. + + To execute the example, invoke: + + .. tabs:: + + .. group-tab:: Classic Build System + + .. code-block:: shell-session + + ~/grackle/src/example $ ./cxx_example + + .. group-tab:: CMake Build System + + .. code-block:: shell-session + + ~/grackle $ cd /examples + ~/grackle//examples $ ./cxx_example + + The output will look like the following: + + .. code-block:: shell-session + + The Grackle Version 2.2 + Mercurial Branch default + Mercurial Revision b4650914153d + + Initializing grackle data. + with_radiative_cooling: 1. + primordial_chemistry: 3. + metal_cooling: 1. + UVbackground: 1. + Initializing Cloudy cooling: Metals. + cloudy_table_file: ../../input/CloudyData_UVB=HM2012.h5. + Cloudy cooling grid rank: 3. + Cloudy cooling grid dimensions: 29 26 161. + Parameter1: -10 to 4 (29 steps). + Parameter2: 0 to 14.849 (26 steps). + Temperature: 1 to 9 (161 steps). + Reading Cloudy Cooling dataset. + Reading Cloudy Heating dataset. + Initializing UV background. + Reading UV background data from ../../input/CloudyData_UVB=HM2012.h5. + UV background information: + Haardt & Madau (2012, ApJ, 746, 125) [Galaxies & Quasars] + z_min = 0.000 + z_max = 15.130 + Setting UVbackground_redshift_on to 15.130000. + Setting UVbackground_redshift_off to 0.000000. + Cooling time = -1.434987e+13 s. 
+ Temperature = 4.637034e+02 K. + Pressure = 3.345738e+34. + gamma = 1.666645e+00. + + +.. rubric:: Footnotes + +.. [#f1] In more detail, both build-systems use copies of the grackle-library within the build directory while compiling the example. + + * the Classic build-system **always** links Grackle against the shared-library version of Grackle and requires that Grackle is fully installed in a location known by the system (either a standard system location OR a location specified by ``LD_LIBRARY_PATH``/``DYLD_LIBRARY_PATH``). + * In contrast, cmake automatically takes special-steps to try to ensure that each example-binary will link to the copy of the Grackle library (whether it is shared or static) that is in the ````; in fact, Grackle doesn't even need to be installed to run the Grackle library. + * With that said, if you compile Grackle as a shared library in a cmake build, an example-binary **might** try to use a copy of a shared grackle library found in a directory specified by ``LD_LIBRARY_PATH``/``DYLD_LIBRARY_PATH`` if one exists. + The exact behavior may be platform dependent and also depends on whether CMake instructs the linker to use RPATH or RUNPATH (this is not specified by the cmake docs). + diff --git a/doc/source/Installation.rst b/doc/source/Installation.rst index 2931a7bb..ce109eb0 100644 --- a/doc/source/Installation.rst +++ b/doc/source/Installation.rst @@ -261,54 +261,13 @@ Then, to install: 5. Test your Installation Once installed, you can test your installation with the provided example to -assure it is functioning correctly. If something goes wrong in this process, +assure it is functioning correctly. +More details are provided :ref:`here `. +If something goes wrong in this process, check the ``out.compile`` file to see what went wrong during compilation, or use ``ldd`` (``otool -L`` on Mac) on your executable to determine what went wrong during linking. 
-:: - - ~/grackle/src/clib $ cd ../example - ~/grackle/src/example $ make clean - ~/grackle/src/example $ make - - Compiling cxx_example.C - Linking - Success! - - ~/grackle/src/example $ ./cxx_example - - The Grackle Version 2.2 - Mercurial Branch default - Mercurial Revision b4650914153d - - Initializing grackle data. - with_radiative_cooling: 1. - primordial_chemistry: 3. - metal_cooling: 1. - UVbackground: 1. - Initializing Cloudy cooling: Metals. - cloudy_table_file: ../../input/CloudyData_UVB=HM2012.h5. - Cloudy cooling grid rank: 3. - Cloudy cooling grid dimensions: 29 26 161. - Parameter1: -10 to 4 (29 steps). - Parameter2: 0 to 14.849 (26 steps). - Temperature: 1 to 9 (161 steps). - Reading Cloudy Cooling dataset. - Reading Cloudy Heating dataset. - Initializing UV background. - Reading UV background data from ../../input/CloudyData_UVB=HM2012.h5. - UV background information: - Haardt & Madau (2012, ApJ, 746, 125) [Galaxies & Quasars] - z_min = 0.000 - z_max = 15.130 - Setting UVbackground_redshift_on to 15.130000. - Setting UVbackground_redshift_off to 0.000000. - Cooling time = -1.434987e+13 s. - Temperature = 4.637034e+02 K. - Pressure = 3.345738e+34. - gamma = 1.666645e+00. - In order to verify that Grackle is fully functional, try :ref:`running the test suite `. @@ -326,11 +285,37 @@ Steps have also been taken simplify integration of Grackle into simulation codes More details about integration are provided :doc:`on this page `. This current section focuses on installation. -For the uninitiated, the CMake build-system performs an out-of-source build. -An out-of-source build places all build artifacts (auto-generated source/header files, object files, etc.) into a "build-directory." -The build-directory is at a user-specified location that is organized into a hierarchy that resembles the source directory hierarchy. -Cleaning up from a CMake-build is as simple as deleting this build-directory. 
-In contrast, the "classic build system" performs an in-source build (because that type of build distributes build artifacts throughout the source directory hierarchy, clean up requires more complex logic encapsulated by the ``make clean`` command). +Basic Definitions ++++++++++++++++++ + +For the uninitiated, the CMake build-system performs an *out-of-source* build. +To introduce what this means we define the terms **source directory** and **build directory** and touch on the idea of an **install destination**. +For concreteness, we continue to assume that the root of the cloned Grackle repository is located at **~/grackle**. + +.. COMMENT: The following is a RST "Definition List" structure (with a label so we can reference it later) + +.. _dir-defs: + +source directory + The root directory holding all of Grackle's source files. + We generally consider this to be **~/grackle/src** (a case could be made that it's actually **~/grackle**). + +build directory + The root directory where we put all build artifacts (auto-generated source/header files, object files, libraries, executables, etc.) + + * for an *in-source-build* (e.g. a build performed by the classic build system) the build and source directories are comingled (i.e. build artifacts are distributed throughout the source directory hierarchy). + * for an *out-of-source* build, this is a location chosen so that no build artifacts are placed within the source directory + * for the CMake build system, this is an arbitrary, user-specified location. + It is conventionally placed within the root of the grackle repository and called something like **~/grackle/build**. + We commonly denote this location as **** + +install destination + Specifies where the primary products of the build process are copied during a build system's installation phase. + Properties of the copied products (e.g. file owners, file permissions, executable/shared library properties) may be altered. + More information will be provided later. 
+ +While cleaning up from an *in-source-build* requires special logic (commonly encoded in a ``make clean`` command), cleaning up from an *out-of-source-build* is much more straight-forward. +To clean up from an *out-of-source-build*, you can simply delete the build directory. .. warning:: @@ -357,9 +342,8 @@ The remainder of this subsection is primarily intended for readers who are relat For now, we make 2 basic decisions: - #. Decide on the directory, ````, where you want to build Grackle. [#f1]_ - This is referred to as the build-directory and is generally placed at the root level of the grackle repository. - A common choice is ``build`` (but this is fairly arbitrary). + #. Decide on the :ref:`build-directory `, ````, where you want to build Grackle.\ [#f1]_ + This is generally placed at the root level of the grackle repository and commonly named ``build`` (but this is fairly arbitrary). #. Decide on the installation directory prefix, ````, where Grackle will be installed. This is be specified via the ``CMAKE_INSTALL_PREFIX`` cmake configuration variable. @@ -404,33 +388,9 @@ The remainder of this subsection is primarily intended for readers who are relat 4. Test your Build. - - Once you have compiled Grackle, you can run one of the provided example to test if it functions correctly. - These examples are automatically compiled with Grackle. - - .. code-block:: shell-session - - ~/grackle $ cd /examples - ~/grackle//examples $ ./cxx_example - - .. warning:: - - The examples make certain assumptions about the location of the input files. - The examples are only guaranteed to work if both: - - 1. you execute the example-binary from the same-directory where the example-binary is found - - 2. ```` is a top-level directory in the grackle repository (e.g. something like ``my-build`` is fine, but choices like ``../my-grackle-build`` and ``my_builds/my-first-build`` are problematic). - - .. 
note:: - - For reference, the Classic build-system always links Grackle against the shared-library version of Grackle and requires that Grackle is fully installed in a location known by the system (either a standard system location OR a location specified by ``LD_LIBRARY_PATH``/``DYLD_LIBRARY_PATH``). - In contrast, cmake automatically takes special-steps to try to ensure that each example-binary will link to the copy of the Grackle library (whether it is shared or static) that is in the ````; in fact, Grackle doesn't even need to be installed to run the Grackle library. - - With that said, if you compile Grackle as a shared library in a cmake build, an example-binary **might** try to use a copy of a shared grackle library found in a directory specified by ``LD_LIBRARY_PATH``/``DYLD_LIBRARY_PATH`` if one exists. - The exact behavior may be platform dependent and also depends on whether CMake instructs the linker to use RPATH or RUNPATH (this is not specified by the cmake docs). - -In order to verify that Grackle is fully functional, you can try :ref:`running the test suite `. + Once you have compiled Grackle, you can run one of the provided examples to test if it functions correctly. + More details are provided :ref:`here `. + In order to verify that Grackle is fully functional, you can try :ref:`running the test suite `. .. _how_to_configure: diff --git a/doc/source/Interaction.rst b/doc/source/Interaction.rst index 3f93b2ea..9ad961a6 100644 --- a/doc/source/Interaction.rst +++ b/doc/source/Interaction.rst @@ -15,57 +15,12 @@ The :ref:`Primary API `, manages some of Grackle's data struc In contrast, the :ref:`Local API `, requires that the downstream application explicitly manage pointers to these same data-structures and requires that the pointers are provided as arguments to each function. The latter API is explicitly thread-safe as it involves no global data. -.. 
_examples: - Example Executables ------------------- The grackle source code contains examples for C, C++, and Fortran codes. -They are located in the **src/example** directory and provide examples -of calling all of grackle's functions. - - * **c_example.c** - C example - - * **c_local_example.c** - C example using only :ref:`local_functions` - - * **cxx_example.C** - C++ example - - * **cxx_omp_example.C** - C++ example using OpenMP - - * **fortran_example.F** - Fortran example - -The instructions for building and executing the examples vary based on the build-system: - -.. tabs:: - - .. tab:: Classic Build System - - Once you have already installed the grackle library, you can build the examples by typing *make* and the name of the file without extension. - For example, to build the C++ example, type: - - .. code-block:: shell-session - - $ make cxx_example - - To run the example, make sure to add the path to the directory containing - the installed **libgrackle.so** to your LD_LIBRARY_PATH (or - DYLD_LIBRARY_PATH on Mac). - - - .. tab:: CMake Build System - - By default, the examples are automatically built with the rest of Grackle. - The compiled example binaries can be found within */example*, where ** is the arbitrary build-directory that you need to specify when compiling Grackle. - - It's important that ** is a top-level directory in the grackle repository (e.g. something like *my-build* is fine, but choices like *../my-grackle-build* and *my_builds/my-first-build* are problematic). - If this isn't the case, then the examples won't be able to locate the input data files. - - You don't need to worry about using LD_LIBRARY_PATH (or DYLD_LIBRARY_PATH on Mac) to run these examples with this build-system. - -.. important:: - - The examples make certain assumptions about the location of the input files. - To ensure that the input files can be found, you should execute each example-binary from the same directory where the example binary is produced. 
+These files illustrate how to call all of grackle's functions. +More details are provided :ref:`here `. Header Files ------------ diff --git a/doc/source/index.rst b/doc/source/index.rst index a859235b..51f314aa 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -39,6 +39,7 @@ Contents: :maxdepth: 2 Installation.rst + Examples.rst Testing.rst Integration.rst Interaction.rst From 0e58fafd12d061d16b92f113e014a8e2cd4c1071 Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Fri, 11 Oct 2024 14:03:27 -0400 Subject: [PATCH 33/36] a bunch of progress. --- doc/source/Installation.rst | 87 ++++++++++++++++++++++++++++++++++--- doc/source/Integration.rst | 3 ++ doc/source/Interaction.rst | 2 + 3 files changed, 86 insertions(+), 6 deletions(-) diff --git a/doc/source/Installation.rst b/doc/source/Installation.rst index ce109eb0..bc36f0f0 100644 --- a/doc/source/Installation.rst +++ b/doc/source/Installation.rst @@ -17,7 +17,7 @@ There are 3 steps to setting up Grackle on your system Given a smooth roll-out of the :ref:`CMake build system `, it is our intention to deprecate and remove the :ref:`classic build system `. If you encounter any problems with the CMake system or anticipate any issues with this plan, :doc:`please let us know `. -We include a :ref:`note on compiler toolchain compatability ` at the end of this page. +We include a :ref:`description of the installed products ` and a :ref:`note on compiler toolchain compatability ` at the end of this page. .. _install_grackle_dependencies: @@ -116,7 +116,17 @@ Linux systems, and an unformatted ``Make.mach.unknown``. If you have a make file prepared for an Enzo install, it cannot be used straight away, but is a very good place to start. -Once you have chosen the make file to be used, a few variables should be set: +.. 
COMMENT BLOCK + + To support cross-referencing of the following block of text with sphinx's + `:ref:` construct (while suppressing warnings about referencing plain text), + we enclose the text in RST's container directive (it won't impact rendering) + https://docutils.sourceforge.io/docs/ref/rst/directives.html#toc-entry-19 + +.. container:: + :name: classic-makefile-variable-list + + Once you have chosen the make file to be used, a few variables should be set: * ``MACH_LIBTOOL`` - path to ``libtool`` executable. Note, on a Mac, this should point to ``glibtool``, which can be installed with macports @@ -136,6 +146,10 @@ Once you have chosen the make file to be used, a few variables should be set: * ``MACH_INSTALL_INCLUDE_DIR`` - path where grackle header files will be installed (only set if different from MACH_INSTALL_PREFIX/include). + * ``MACH_INSTALL_BIN_DIR`` - path where grackle-related utility executables + will be installed (only set if different from MACH_INSTALL_PREFIX/include). + + Once the proper variables are set, they are loaded into the build system by doing the following: @@ -283,7 +297,7 @@ An overview of our design philosophy is provided :ref:`here `. -This current section focuses on installation. +The current section focuses on building and installation. Basic Definitions +++++++++++++++++ @@ -312,7 +326,7 @@ build directory install destination Specifies where the primary products of the build process are copied during a build system's installation phase. Properties of the copied products (e.g. file owners, file permissions, executable/shared library properties) may be altered. - More information will be provided later. + More information will be provided :ref:`below `. While cleaning up from an *in-source-build* requires special logic (commonly encoded in a ``make clean`` command), cleaning up from an *out-of-source-build* is much more straight-forward. To clean up from an *out-of-source-build*, you can simply delete the build directory. 
@@ -467,6 +481,7 @@ This second table highlights a subset of standardized CMake options that may als .. list-table:: Standard CMake Options :widths: 12 30 5 :header-rows: 1 + :name: standard-cmake-options * - Name - Description @@ -477,7 +492,7 @@ This second table highlights a subset of standardized CMake options that may als - ```` * - ``CMAKE_BUILD_TYPE`` - - Specifies the desired build configuration (for single-configuration generators [#f3]_). + - Specifies the desired build configuration (for single-configuration generators\ [#f3]_). Grackle currently supports the standard choices ``Debug``, ``Release``, ``RelWithDebInfo`` and ``MinSizeRel``. - ```` @@ -499,6 +514,22 @@ This second table highlights a subset of standardized CMake options that may als This is commonly set by host-files. - ```` + +.. COMMENT BLOCK + + To support cross-referencing the following block of text with sphinx's `ref` + construct (while suppressing warnings about referencing plain text), we + enclose the text in RST's container directive (it won't impact rendering) + +.. container:: + :name: cmake-granular-install-vars + + In the (unlikely) event you need more control over :ref:`installation locations `, the build-system honors values specified for standard variables like ``CMAKE_INSTALL_BINDIR``, ``CMAKE_INSTALL_LIBDIR``, ``CMAKE_INSTALL_INCLUDEDIR``. + More information is provided `here `__ about these cmake variables. + + + + There are also additional standard options for BOTH configuring other aspects of the build and for finding the correct/preferred HDF5 library and configuring the correct openmp library. Addtionally, CMake will also respect the values of certain environment variables. @@ -592,6 +623,48 @@ While embedded builds currently respect ``GRACKLE_OPTIMIZATION_FLIST_INIT``, tha * after we update the minimum required CMake version for compiling Grackle to at least 3.19, we may transition to using these features. +.. 
_install-products: + +Installation Products +--------------------- + +We now give an overview of the products of an installation (e.g. the result of commands like ``make install`` or ``cmake --install ``). + +We describe these products in terms of the :ref:`installation destination `. +Organization of the installed products has a similar description on all major platforms.\ [#installproducts1]_ +Essentially, products are distributed among a standard set of directories contained within a single root directory. +This root directory is often called the "installation prefix" (or simply the "prefix"). +The **include** subdirectory typically holds headers, the **bin** subdirectory typically holds executables, and the **lib** subdirectory (some platforms use similar names like **lib64**) holds libraries. + +.. tabs:: + + .. group-tab:: Classic Build System + + Unless overridden, the **lib** subdirectory is *always* called **lib**. + + Overrides are specified with :ref:`Makefile variables ` + ``MACH_INSTALL_PREFIX`` controls the prefix while ``MACH_INSTALL_LIB_DIR``, ``MACH_INSTALL_INCLUDE_DIR``, and ``MACH_INSTALL_BIN_DIR`` give finer-grained control over the other variables. + + .. group-tab:: CMake Build System + + The default value for the **lib** subdirectory is `platform dependent `__ (currently either **lib** or **lib64**). + + The standard ``CMAKE_INSTALL_PREFIX`` option :ref:`controls the prefix ` while + ``CMAKE_INSTALL_LIBDIR``, ``CMAKE_INSTALL_INCLUDEDIR``, and ``CMAKE_INSTALL_BINDIR`` options :ref:`provide finer control `. + +A vanilla, standalone Grackle installation provides: + +- The Grackle library (in the **lib** subdirectory). + Depending on how build system (and your choices), installation provides it as a shared library, a static library, or both. + +- Header files (in the **include** subdirectory). + More details about the header files (e.g. public headers vs. implementation details) are provided :ref:`here `. 
+ +- Utility Executables (in the **bin** subdirectory). + At the moment, this just includes the ``grdata`` command line tool for :ref:`managing grackle data files`. + +- If you used the CMake build system, some metadata files are also included to make it :ref:`easy for other projects to consume Grackle ` + .. _compiler_toolchain_compatability: @@ -641,5 +714,7 @@ For example, adding GPU-support with the likes of CUDA or HIP would involve link .. [#f4] Aside: performing these 2 separate CMake builds compiles the source files the same number of times as the Classic build system. Behind the scenes, the classic build system always compile each source file twice (once with position independent code and once without). - +.. [#installproducts1] The primary exception is for MacOS software distributed through official Apple channels. + For our purposes, we (like most open-source science software) get away with treating MacOS as a generic Unix-like system. + Ironically, while Windows (which we definitely don't support) may prefer some alternative organization, it is much less of an exception than MacOS. diff --git a/doc/source/Integration.rst b/doc/source/Integration.rst index 66839884..93230707 100644 --- a/doc/source/Integration.rst +++ b/doc/source/Integration.rst @@ -1,3 +1,6 @@ + +.. _integration-consuming-grackle: + Integrating Grackle into your Application ========================================= diff --git a/doc/source/Interaction.rst b/doc/source/Interaction.rst index 9ad961a6..3aabf128 100644 --- a/doc/source/Interaction.rst +++ b/doc/source/Interaction.rst @@ -22,6 +22,8 @@ The grackle source code contains examples for C, C++, and Fortran codes. These files illustrate how to call all of grackle's functions. More details are provided :ref:`here `. +.. 
_public-header-files: + Header Files ------------ From ff9d4acb2c0600f4b0c3d56ccb529e058ed2917f Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Fri, 11 Oct 2024 20:35:49 -0400 Subject: [PATCH 34/36] updating information about where to find the grdata tool. --- CMakeLists.txt | 9 +++++++++ doc/source/Installation.rst | 40 ++++++++++++++++++++++++++++++++++++- doc/source/Tools.rst | 29 ++++----------------------- 3 files changed, 52 insertions(+), 26 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3d1b54b5..c6211c75 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -209,6 +209,15 @@ create_grdata_program( TARGET_NAME Grackle::grdata ) +file(GENERATE + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/grackle-buildpaths-$.txt + CONTENT [[ +# This file lists paths where useful build products can be found in the build +# directory (if/when the products are actually built) +$ +]]) + + # declare build-recipies for examples if (GRACKLE_EXAMPLES) add_subdirectory(src/example) diff --git a/doc/source/Installation.rst b/doc/source/Installation.rst index bc36f0f0..de743773 100644 --- a/doc/source/Installation.rst +++ b/doc/source/Installation.rst @@ -566,6 +566,28 @@ The following code snippet illustrates how you might do this (for concreteness, ~ grackle $ cmake --build build-shared ~ grackle $ cmake --install build-shared +.. _build-dir-product-locations: + +Build Directory Product Locations ++++++++++++++++++++++++++++++++++ + +Up until now, we have been pretty vague about how the build-products are organized within the build directory. +**This is intentional!** +The paths to files within the build directory are an implementation detail that can and will change at any time in the future (some files could be removed entirely from the build-directory). +We generally expect consumers to interact with most of Grackle's build products :ref:`once they are installed `. 
+ +With that said, we recognize that it can be useful to make use of certain build-products from a standalone Grackle build without requiring a full installation. +We provide 2 mechanisms for doing this: + +1. The root of the build directory contains a file called **grackle-buildpaths-.txt** that contains paths specifying where some useful grackle utilities can be found in the build-directory (if/when they are built) + + - if you have been following the above compilation instructions, you can only have 1 file in your build-directory called **grackle-buildpaths-*.txt** at a time.\ [#buildproducts1]_ + + - at the moment, this just contains the ``grdata`` command line tool for :ref:`managing grackle data files`. + +2. We provide an experimental :ref:`approach for integrating grackle ` in a downstream project using the build directory of a Grackle build and we may add another approach in the future.\ [#buildproducts2]_ + (Technically, the :ref:`embedded install approach ` also lets you avoid fully installing Grackle, but this is a special case) + .. _cmake_host-files: More About Host-Files @@ -652,7 +674,7 @@ The **include** subdirectory typically holds headers, the **bin** subdirectory t The standard ``CMAKE_INSTALL_PREFIX`` option :ref:`controls the prefix ` while ``CMAKE_INSTALL_LIBDIR``, ``CMAKE_INSTALL_INCLUDEDIR``, and ``CMAKE_INSTALL_BINDIR`` options :ref:`provide finer control `. -A vanilla, standalone Grackle installation provides: +A vanilla, standalone (i.e. :ref:`not an embedded build `) Grackle installation provides: - The Grackle library (in the **lib** subdirectory). Depending on how build system (and your choices), installation provides it as a shared library, a static library, or both. @@ -663,6 +685,15 @@ A vanilla, standalone Grackle installation provides: - Utility Executables (in the **bin** subdirectory). At the moment, this just includes the ``grdata`` command line tool for :ref:`managing grackle data files`. + .. note:: + + .. 
COMMENT: (Maybe we should just put a redirect at the root of the build-directory?) + + When you use the CMake build-system, you can reliably find the ``grdata`` command line program within the build directory at **/grackle/bin/grdata** (this assumes that you performed a stand-alone build, with default configuration settings) + + **REMINDER: Unless explicitly noted, the locations of all other installation products (and any other contents) within the build directory are considered implementation details -- they can/will change at ANY time.** + + - If you used the CMake build system, some metadata files are also included to make it :ref:`easy for other projects to consume Grackle ` @@ -714,6 +745,13 @@ For example, adding GPU-support with the likes of CUDA or HIP would involve link .. [#f4] Aside: performing these 2 separate CMake builds compiles the source files the same number of times as the Classic build system. Behind the scenes, the classic build system always compile each source file twice (once with position independent code and once without). +.. [#buildproducts1] In principle, you can get multiple files if you are using a multi-configuration generator. + If you don't know what this means, you really don't need to worry about it. + +.. [#buildproducts2] The common property to a supported integration approach that lets you use grackle from the build-directory is that they don't require hardcoding assumptions about Grackle's build-directory into a downstream project's build-system. + Instead, they introduce a standardized way for us, the Grackle developers, to communicate Grackle's usage requirements (and any assumptions about paths) to the downstream build system. + + .. [#installproducts1] The primary exception is for MacOS software distributed through official Apple channels. For our purposes, we (like most open-source science software) get away with treating MacOS as a generic Unix-like system. 
Ironically, while Windows (which we definitely don't support) may prefer some alternative organization, it is much less of an exception than MacOS. diff --git a/doc/source/Tools.rst b/doc/source/Tools.rst index 570977ed..4aec567e 100644 --- a/doc/source/Tools.rst +++ b/doc/source/Tools.rst @@ -21,22 +21,14 @@ To execute the tool: .. group-tab:: As a Standalone CLI - When you build grackle as a standalone application with the CMake build-system, the program can generally be found inside the build directory at */grackle/bin/grdata*. [#df1]_ - Thus, you can invoke the tool with: - - .. code-block:: shell-session - - $ .//grackle/bin/grdata ... - - When you install grackle (with either the traditional build-system OR the CMake build-system), the program will be installed alongside the core Grackle library. - If the core Grackle library is at */lib/libgrackle...*, then the tool is probably installed to */bin/grdata*. [#df2]_ - In this case, you can invoke the tool with: + In a full, standalone Grackle installation (regardless of build-system), the ``grdata`` tool will be :ref:`one of the installed components `. + If you build a standalone copy of Grackle with the CMake build-system, the build-system provides details about where to find a copy of the ``grdata`` tool :ref:`within the build-directory `. + Once you locate the tool, you can invoke it with: .. code-block:: shell-session - $ /bin/grdata ... - + $ ./ ... .. note:: @@ -132,16 +124,3 @@ Down below, we sketch out what the directory-structure might look like: :language: none :start-after: [[[BEGIN:DIRECTORY-CARTOON]]] :end-before: [[[END:DIRECTORY-CARTOON]]] - - -.. rubric:: Footnotes - -.. [#df1] The precise location is sensitive to CMake's standard `CMAKE_INSTALL_BINDIR` variable. - On just about all platforms this defaults to *bin*, but it is plausible that it could default to a different value. - You are also free to override the value of the `CMAKE_INSTALL_BINDIR` variable. - -.. 
[#df2] Again, the precise location of ``grdata`` installed by the CMake build-system is sensitive to CMake's standard `CMAKE_INSTALL_BINDIR` variable. - Additionally the precise location of the core Grackle library is sensitive to the CMake's standard `CMAKE_INSTALL_LIBDIR` variable, which defaults to platform's preferred default values (again, you can overwrite it). - CMake's `CMAKE_INSTALL_LIBDIR` variable is known to vary considerably more between platforms. - Common defaults include *lib* (most common), *lib64*, *lib32*, or *libx32*. - From a190a086a88bf158d4a0756c830aef25aad54d0c Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Fri, 11 Oct 2024 20:54:55 -0400 Subject: [PATCH 35/36] prompt users to use grdata when running examples. --- doc/source/Examples.rst | 47 +++++++++++++++++++++++++++++++---------- doc/source/Tools.rst | 2 ++ 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/doc/source/Examples.rst b/doc/source/Examples.rst index 8e096b8e..10e42e46 100644 --- a/doc/source/Examples.rst +++ b/doc/source/Examples.rst @@ -34,8 +34,28 @@ test suite `). The instructions for building and executing the examples vary based on the build-system. In both cases, the examples require that you haven't cleanup up from your build. +If you used the classic build-system, the examples require that Grackle has been fully installed. -1. Compile the example: +1. Fetch datafiles with the ``grdata`` tool. + + * some of the examples require that the :ref:`datafiles have been fetched and managed ` by the ``grdata`` tool. + + * In a full, standalone Grackle installation (regardless of build-system), the ``grdata`` tool will be :ref:`one of the installed components `. + If you build a standalone copy of Grackle with the CMake build-system, the build-system provides details about where to find a copy of the ``grdata`` tool :ref:`within the build-directory `. + + Once you locate ``grdata`` you should invoke: + + .. code-block:: shell-session + + $ ./ fetch + + .. 
tip:: + + Even if you don't think it will be necessary, it is always worth fetching the data files with the copy of ``grdata`` that was created alongside Grackle. + Grackle will only access managed files if you have invoked this command with a copy of ``grdata`` that exactly matches the Grackle version :ref:`(steps are taken to deduplicate files on disk) `. + At worst, the command will confirm that nothing needs to be done. + +2. Compile the example (if necessary): .. tabs:: @@ -57,20 +77,20 @@ In both cases, the examples require that you haven't cleanup up from your build. .. group-tab:: CMake Build System By default, the examples are automatically built with the rest of Grackle. - The compiled example binaries can be found within **/example**, where **** is the arbitrary build-directory that you need to specify when compiling Grackle. + The compiled example binaries can be found within **/example**, where **** is the arbitrary :ref:`build-directory ` that you previously specified while compiling Grackle. .. warning:: It's important that **** is a top-level directory in the grackle repository (e.g. something like **~/grackle/my-build** is fine, but choices like **~/grackle/../my-grackle-build** and **~/grackle/my_builds/my-first-build** are problematic). - If this isn't the case, then the examples won't be able to locate the input data files. + If this isn't the case, then the examples (that don't use automatically managed input data files) won't be able to locate the data files. .. important:: If you're using the Classic build system, make sure to add the path to the directory containing the installed **libgrackle.so** to your LD_LIBRARY_PATH (or DYLD_LIBRARY_PATH on Mac). This is **NOT** necessary for the CMake build system. - More information is provided below.\ [#f1]_ + :ref:`More information is provided below.` -2. Now we execute the example +3. Now we execute the example .. 
note:: @@ -130,12 +150,17 @@ In both cases, the examples require that you haven't cleanup up from your build. gamma = 1.666645e+00. -.. rubric:: Footnotes +.. _how-examples-are-built: + +More details about how examples are built +----------------------------------------- + +In more detail, both build-systems use copies of the grackle-library within the build directory while compiling the example. + +* the Classic build-system **always** links Grackle against the shared-library version of Grackle and requires that Grackle is fully installed in a location known by the system (either a standard system location OR a location specified by ``LD_LIBRARY_PATH``/``DYLD_LIBRARY_PATH``). -.. [#f1] In more detail, both build-systems use copies of the grackle-library within the build directory while compiling the example. +* In contrast, CMake automatically takes special-steps to try to ensure that each example-binary will link to the copy of the Grackle library (whether it is shared or static) that is in the ````; in fact, Grackle doesn't even need to be installed to run the Grackle library. - * the Classic build-system **always** links Grackle against the shared-library version of Grackle and requires that Grackle is fully installed in a location known by the system (either a standard system location OR a location specified by ``LD_LIBRARY_PATH``/``DYLD_LIBRARY_PATH``). - * In contrast, cmake automatically takes special-steps to try to ensure that each example-binary will link to the copy of the Grackle library (whether it is shared or static) that is in the ````; in fact, Grackle doesn't even need to be installed to run the Grackle library. - * With that said, if you compile Grackle as a shared library in a cmake build, an example-binary **might** try to use a copy of a shared grackle library found in a directory specified by ``LD_LIBRARY_PATH``/``DYLD_LIBRARY_PATH`` if one exists. 
- The exact behavior may be platform dependent and also depends on whether CMake instructs the linker to use RPATH or RUNPATH (this is not specified by the cmake docs). +* With that said, if you compile Grackle as a shared library in a cmake build, an example-binary **might** try to use a copy of a shared grackle library found in a directory specified by ``LD_LIBRARY_PATH``/``DYLD_LIBRARY_PATH`` if one exists. + The exact behavior may be platform dependent and also depends on whether CMake instructs the linker to use RPATH or RUNPATH (this is not specified by the cmake docs). diff --git a/doc/source/Tools.rst b/doc/source/Tools.rst index 4aec567e..65b66468 100644 --- a/doc/source/Tools.rst +++ b/doc/source/Tools.rst @@ -65,6 +65,8 @@ The pygrackle examples and the pygrackle tests all rely upon this functionality. The Grackle C library has support for access the datafiles managed by this tool. Some of the examples may soon rely upon the functionality. +.. _grdata-versioning-and-deduplication: + .. important:: Instances of the grdata tool are associated with a single version of Grackle (if you are using Pygrackle, the version of the core Grackle c-library is the relevant version number). From c9b5cd129601cf234e185c546db4bcdcc604bc33 Mon Sep 17 00:00:00 2001 From: Matthew Abruzzo Date: Fri, 11 Oct 2024 20:59:41 -0400 Subject: [PATCH 36/36] Modified cxx_example.C to make use of managed data files. 
--- src/example/cxx_example.C | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/example/cxx_example.C b/src/example/cxx_example.C index f064f669..baa33961 100644 --- a/src/example/cxx_example.C +++ b/src/example/cxx_example.C @@ -65,7 +65,10 @@ int main(int argc, char *argv[]) grackle_data->dust_chemistry = 1; grackle_data->metal_cooling = 1; // metal cooling on grackle_data->UVbackground = 1; // UV background on - grackle_data->grackle_data_file = "../../input/CloudyData_UVB=HM2012.h5"; // data file + + // assume that the grdata tool was previously used to fetch data files + grackle_data->grackle_data_file_options = GR_DFOPT_MANAGED; + grackle_data->grackle_data_file = "CloudyData_UVB=HM2012.h5"; // data file // Finally, initialize the chemistry object. if (initialize_chemistry_data(&my_units) == 0) {