diff --git a/bin/fetch-cache-version b/bin/fetch-cache-version
new file mode 100755
index 00000000..572956f1
--- /dev/null
+++ b/bin/fetch-cache-version
@@ -0,0 +1,25 @@
+#!/bin/bash
+#
+# Print the Nextclade version and dataset version recorded in the first data
+# row of a zstd-compressed Nextclade TSV stored on S3, as one compact JSON
+# object with the keys "nextclade_version" and "nextclade_dataset_version".
+#
+# Usage: fetch-cache-version <s3-url>
+
+s3_url="${1:?An S3 URL is required as the first argument}"
+
+
+# Only the header and first data row are read, so `head` closes the pipe
+# early. NOTE(review): SIGPIPE is ignored and stderr is discarded below,
+# presumably to keep the resulting broken-pipe noise out of the output.
+trap '' SIGPIPE
+
+(aws s3 cp "${s3_url}" - \
+    | zstd -T0 -dcq \
+    | head -n 2 \
+    | tsv-select -H -f 'nextclade_version,dataset_version' \
+    | tail -n 1 \
+    | jq --raw-input -c '
+        split("\t")
+        | { "nextclade_version": .[0], "nextclade_dataset_version": .[1] }') \
+    2> /dev/null
diff --git a/bin/use-nextclade-cache b/bin/use-nextclade-cache
index 0a783dd1..f353f053 100755
--- a/bin/use-nextclade-cache
+++ b/bin/use-nextclade-cache
@@ -1,19 +1,45 @@
 #!/bin/bash
 set -euo pipefail
 
-vendored="$(dirname "$0")"/../vendored
+bin="$(dirname "$0")"
+vendored="$bin"/../vendored
 
+# Prints "false" if a renew flag exists or if the cached Nextclade/dataset
+# versions differ from the current ones; prints "true" otherwise.
 main() {
     s3_dst="${1:?A destination s3:// URL where the renew file is hosted is required as the first argument.}"
     s3_src="${2:?A source s3:// URL where the fallback renew file is hosted is required as the second argument.}"
+    nextclade="${3:?A path to the Nextclade executable is required as the third argument}"
+    nextclade_dataset="${4:?A path to a Nextclade dataset ZIP archive is required as the fourth argument}"
     # Nextclade dataset reference wildcard
-    reference="${3:-}"
+    reference="${5:-}"
 
+
     if renew-flag-exists; then
         echo "[INFO] Found renew flag" >&2
         echo "false"
         exit 0
     fi
 
+    # Use the cache only if the Nextclade version and the dataset version
+    # recorded in it both match the current ones; any mismatch prints "false"
+    # and exits 0.
+    cache_versions="$(get-cache-version-info)"
+    cache_nextclade_version="$(echo "$cache_versions" | jq -r .nextclade_version)"
+    current_nextclade_version="$("$nextclade" --version)"
+    if [[ "$cache_nextclade_version" != "$current_nextclade_version" ]]; then
+        echo "[INFO] Current Nextclade version (${current_nextclade_version}) is different from cache version (${cache_nextclade_version})" >&2
+        echo "false"
+        exit 0
+    fi
+
+    cache_dataset_version="$(echo "$cache_versions" | jq -r .nextclade_dataset_version)"
+    current_dataset_version="$(unzip -p "${nextclade_dataset}" pathogen.json | jq -r '.version.tag')"
+    if [[ "$cache_dataset_version" != "$current_dataset_version" ]]; then
+        echo "[INFO] Current Nextclade dataset version (${current_dataset_version}) is different from cache version (${cache_dataset_version})" >&2
+        echo "false"
+        exit 0
+    fi
+
     echo "true"
 }
@@ -25,4 +51,16 @@ renew-flag-exists() {
     "$vendored"/s3-object-exists "${dst_renew_file}" || "$vendored"/s3-object-exists "${src_renew_file}"
 }
 
+# Prints the version info emitted by fetch-cache-version for the cached
+# nextclade${reference}.tsv.zst under s3_dst, or under s3_src if that fails.
+get-cache-version-info() {
+    # TODO: Update to check a separate file for version info
+    # Currently just checks the first row of the nextclade.tsv file
+    local version_file="nextclade${reference}.tsv.zst"
+    local dst_version_file="${s3_dst}/${version_file}"
+    local src_version_file="${s3_src}/${version_file}"
+
+    "$bin"/fetch-cache-version "$dst_version_file" || "$bin"/fetch-cache-version "$src_version_file"
+}
+
 main "$@"
diff --git a/workflow/snakemake_rules/nextclade.smk b/workflow/snakemake_rules/nextclade.smk
index db56ecbf..9a2ff6cc 100644
--- a/workflow/snakemake_rules/nextclade.smk
+++ b/workflow/snakemake_rules/nextclade.smk
@@ -62,7 +62,23 @@ if config.get("s3_dst") and config.get("s3_src"):
     ruleorder: download_nextclade_tsv_from_s3 > create_empty_nextclade_info
     ruleorder: download_previous_alignment_from_s3 > create_empty_nextclade_aligned
 
+    def _convert_dataset_name(wildcards):
+        """Map the rule's reference wildcard to the path of its Nextclade dataset ZIP."""
+        if wildcards.reference == '':
+            dataset_name="sars-cov-2"
+        elif wildcards.reference == '_21L':
+            dataset_name="sars-cov-2-21L"
+        else:
+            # We shouldn't run into this since we have wildcard_constraints,
+            # but doesn't hurt to include it in case that changes
+            raise ValueError(f"Cannot convert unsupported reference {wildcards.reference!r} to Nextclade dataset name")
+
+        return f"data/nextclade_data/{dataset_name}.zip"
+
     rule use_nextclade_cache:
+        input:
+            nextclade="./nextclade",
+            nextclade_dataset=_convert_dataset_name,
         params:
             dst_source=config["s3_dst"],
             src_source=config["s3_src"],
@@ -73,6 +89,8 @@ if config.get("s3_dst") and config.get("s3_src"):
             ./bin/use-nextclade-cache \
                 {params.dst_source:q} \
                 {params.src_source:q} \
+                {input.nextclade:q} \
+                {input.nextclade_dataset:q} \
                 {wildcards.reference:q} \
                 > {output.use_nextclade_cache}
             """