From 07559a11b9c6cbd24f1f90947071f0258dd7b72c Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Mon, 22 Jul 2024 17:42:03 -0700
Subject: [PATCH] Ignore cache if Nextclade or dataset version is different

Currently checks the Nextclade and dataset versions in the first row of
the nextclade.tsv file and formats them as the proposed JSON. Once the
version JSON file is in place, it should be easy to swap the check out
for the new file.
---
 bin/fetch-cache-version                | 16 ++++++++++++
 bin/use-nextclade-cache                | 35 ++++++++++++++++++++++++--
 workflow/snakemake_rules/nextclade.smk | 17 +++++++++++++
 3 files changed, 66 insertions(+), 2 deletions(-)
 create mode 100755 bin/fetch-cache-version

diff --git a/bin/fetch-cache-version b/bin/fetch-cache-version
new file mode 100755
index 00000000..572956f1
--- /dev/null
+++ b/bin/fetch-cache-version
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+s3_url="${1:?An S3 URL is required as the first argument}"
+
+
+trap '' SIGPIPE
+
+(aws s3 cp "${s3_url}" - \
+    | zstd -T0 -dcq \
+    | head -n 2 \
+    | tsv-select -H -f 'nextclade_version,dataset_version' \
+    | tail -n 1 \
+    | jq --raw-input -c '
+        split("\t")
+        | { "nextclade_version": .[0], "nextclade_dataset_version": .[1] }') \
+    2> /dev/null
diff --git a/bin/use-nextclade-cache b/bin/use-nextclade-cache
index 0a783dd1..f353f053 100755
--- a/bin/use-nextclade-cache
+++ b/bin/use-nextclade-cache
@@ -1,19 +1,40 @@
 #!/bin/bash
 set -euo pipefail
 
-vendored="$(dirname "$0")"/../vendored
+bin="$(dirname "$0")"
+vendored="$bin"/../vendored
 
 main() {
     s3_dst="${1:?A destination s3:// URL where the renew file is hosted is required as the first argument.}"
     s3_src="${2:?A source s3:// URL where the fallback renew file is hosted is required as the second argument.}"
+    nextclade="${3:?A path to the Nextclade executable is required as the third argument}"
+    nextclade_dataset="${4:?A path to a Nextclade dataset ZIP archive is required as the fourth argument}"
 
     # Nextclade dataset reference wildcard
-    reference="${3:-}"
+    reference="${5:-}"
+
     if renew-flag-exists; then
         echo "[INFO] Found renew flag" >&2
         echo "false"
         exit 0
     fi
 
+    cache_versions="$(get-cache-version-info)"
+    cache_nextclade_version="$(echo "$cache_versions" | jq -r .nextclade_version)"
+    current_nextclade_version="$("$nextclade" --version)"
+    if [[ "$cache_nextclade_version" != "$current_nextclade_version" ]]; then
+        echo "[INFO] Current Nextclade version (${current_nextclade_version}) is different from cache version (${cache_nextclade_version})" >&2
+        echo "false"
+        exit 0
+    fi
+
+    cache_dataset_version="$(echo "$cache_versions" | jq -r .nextclade_dataset_version)"
+    current_dataset_version="$(unzip -p "${nextclade_dataset}" pathogen.json | jq -r '.version.tag')"
+    if [[ "$cache_dataset_version" != "$current_dataset_version" ]]; then
+        echo "[INFO] Current Nextclade dataset version (${current_dataset_version}) is different from cache version (${cache_dataset_version})" >&2
+        echo "false"
+        exit 0
+    fi
+
     echo "true"
 }
@@ -25,4 +46,14 @@ renew-flag-exists() {
     "$vendored"/s3-object-exists "${dst_renew_file}" ||
         "$vendored"/s3-object-exists "${src_renew_file}"
 }
+get-cache-version-info() {
+    # TODO: Update to check a separate file for version info
+    # Currently just checks the first row of the nextclade.tsv file
+    local version_file="nextclade${reference}.tsv.zst"
+    local dst_version_file="${s3_dst}/${version_file}"
+    local src_version_file="${s3_src}/${version_file}"
+
+    "$bin"/fetch-cache-version "$dst_version_file" || "$bin"/fetch-cache-version "$src_version_file"
+}
+
 main "$@"
diff --git a/workflow/snakemake_rules/nextclade.smk b/workflow/snakemake_rules/nextclade.smk
index db56ecbf..9a2ff6cc 100644
--- a/workflow/snakemake_rules/nextclade.smk
+++ b/workflow/snakemake_rules/nextclade.smk
@@ -62,7 +62,22 @@ if config.get("s3_dst") and config.get("s3_src"):
     ruleorder: download_nextclade_tsv_from_s3 > create_empty_nextclade_info
     ruleorder: download_previous_alignment_from_s3 > create_empty_nextclade_aligned
 
+    def _convert_dataset_name(wildcards):
+        if wildcards.reference == '':
+            dataset_name="sars-cov-2"
+        elif wildcards.reference == '_21L':
+            dataset_name="sars-cov-2-21L"
+        else:
+            # We shouldn't run into this since we have wildcard_constraints,
+            # but doesn't hurt to include it in case that changes
+            raise ValueError(f"Cannot convert unsupported reference {wildcards.reference!r} to Nextclade dataset name")
+
+        return f"data/nextclade_data/{dataset_name}.zip",
+
     rule use_nextclade_cache:
+        input:
+            nextclade="./nextclade",
+            nextclade_dataset=_convert_dataset_name,
         params:
             dst_source=config["s3_dst"],
             src_source=config["s3_src"],
@@ -73,6 +88,8 @@ if config.get("s3_dst") and config.get("s3_src"):
             ./bin/use-nextclade-cache \
                 {params.dst_source:q} \
                 {params.src_source:q} \
+                {input.nextclade:q} \
+                {input.nextclade_dataset:q} \
                 {wildcards.reference:q} \
                 > {output.use_nextclade_cache}
             """
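
For illustration only, a rough sketch of how the new pieces are expected to
fit together; the S3 URLs and version strings below are made-up placeholders
rather than values from this patch:

    # Hypothetical run of the new helper: it prints a one-line JSON object with
    # the versions recorded in the first data row of the cached nextclade.tsv.
    $ ./bin/fetch-cache-version s3://example-bucket/nextclade.tsv.zst
    {"nextclade_version":"3.8.2","nextclade_dataset_version":"2024-07-17--12-57-03Z"}

    # use-nextclade-cache compares those fields against the local Nextclade
    # executable and dataset archive and prints "true" only when both match
    # (and no renew flag exists); the Snakemake rule redirects that answer to a file.
    $ ./bin/use-nextclade-cache s3://example-dst s3://example-src ./nextclade data/nextclade_data/sars-cov-2.zip
    true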