Skip to content

Commit

Permalink
Ignore cache if Nextclade or dataset version is different
Browse files Browse the repository at this point in the history
Currently checks the Nextclade and dataset versions in the first row of the
nextclade.tsv file and formats them as the proposed JSON. Once the
version JSON file is in place, it should be easy to swap out the check
for the new file.
  • Loading branch information
joverlee521 committed Jul 24, 2024
1 parent 055c5b7 commit 136c5c8
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 6 deletions.
16 changes: 16 additions & 0 deletions bin/fetch-cache-version
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
#
# Print the Nextclade and dataset versions recorded in a cached Nextclade TSV
# as one compact JSON object with the keys "nextclade_version" and
# "nextclade_dataset_version".
#
# Usage: fetch-cache-version <s3-url>
#
# Only the header and the first data row of the zstd-compressed TSV at
# <s3-url> are read. Everything the pipeline writes to stderr is discarded,
# and the exit status is that of the final `jq` stage.

cache_url="${1:?An S3 URL is required as the first argument}"

# jq program: split the selected row on tabs and label the two fields.
jq_program='
split("\t")
| { "nextclade_version": .[0], "nextclade_dataset_version": .[1] }'

# Ignore SIGPIPE in this shell. Presumably this is because `head` stops
# reading after two lines while the upstream stages are still writing
# -- TODO confirm.
trap '' SIGPIPE

{
    aws s3 cp "$cache_url" - \
        | zstd --threads=0 --decompress --stdout --quiet \
        | head -n 2 \
        | tsv-select --header --fields 'nextclade_version,dataset_version' \
        | tail -n 1 \
        | jq -R --compact-output "$jq_program"
} 2> /dev/null
43 changes: 37 additions & 6 deletions bin/use-nextclade-cache
Original file line number Diff line number Diff line change
@@ -1,28 +1,59 @@
#!/bin/bash
#
# Print "true" to stdout when the cached Nextclade results on S3 can be
# reused and "false" when they cannot; the reason for a "false" is logged
# to stderr.
set -euo pipefail

# Directory that holds this script; sibling helper scripts are run from here.
bin="$(dirname "$0")"
# Directory of vendored helper scripts, resolved relative to this script.
vendored="$bin"/../vendored

#######################################
# Decide whether the cached Nextclade results may be reused.
# Globals:
#   s3_dst, s3_src, reference (written here, read by the helper functions)
#   nextclade, nextclade_dataset (written here)
# Arguments:
#   $1 - destination s3:// URL
#   $2 - source (fallback) s3:// URL
#   $3 - path to the Nextclade executable
#   $4 - path to a Nextclade dataset ZIP archive
#   $5 - optional Nextclade dataset reference wildcard (default: empty)
# Outputs:
#   "true" or "false" on stdout; an [INFO] line on stderr for every "false"
# Returns:
#   Exits 0 after printing "false"; returns 0 after printing "true".
#######################################
main() {
    s3_dst="${1:?A destination s3:// URL where the renew file is hosted is required as the first argument.}"
    s3_src="${2:?A source s3:// URL where the fallback renew file is hosted is required as the second argument.}"
    nextclade="${3:?A path to the Nextclade executable is required as the third argument}"
    nextclade_dataset="${4:?A path to a Nextclade dataset ZIP archive is required as the fourth argument}"
    # Nextclade dataset reference wildcard
    reference="${5:-}"

    # Declared separately from the assignments below so that a failing
    # command substitution is not masked by the exit status of `local`.
    local cache_versions
    local cache_nextclade_version current_nextclade_version
    local cache_dataset_version current_dataset_version

    if renew-flag-exists; then
        echo "[INFO] Found renew flag" >&2
        echo "false"
        exit 0
    fi

    cache_versions="$(get-cache-version-info)"

    cache_nextclade_version="$(jq -r .nextclade_version <<<"$cache_versions")"
    current_nextclade_version="$("$nextclade" --version)"
    if [[ "$cache_nextclade_version" != "$current_nextclade_version" ]]; then
        echo "[INFO] Current Nextclade version ($current_nextclade_version) is different from cache version ($cache_nextclade_version)" >&2
        echo "false"
        exit 0
    fi

    cache_dataset_version="$(jq -r .nextclade_dataset_version <<<"$cache_versions")"
    # The dataset version is the `.version.tag` field of the archive's pathogen.json
    current_dataset_version="$(unzip -p "$nextclade_dataset" pathogen.json | jq -r '.version.tag')"
    if [[ "$cache_dataset_version" != "$current_dataset_version" ]]; then
        echo "[INFO] Current Nextclade dataset version ($current_dataset_version) is different from cache version ($cache_dataset_version)" >&2
        echo "false"
        exit 0
    fi

    echo "true"
}

#######################################
# Check whether a renew flag file exists for the current reference.
# Globals:
#   reference, s3_dst, s3_src, vendored (all read)
# Returns:
#   0 if the vendored s3-object-exists helper succeeds for the flag under
#   the destination URL or, failing that, under the source URL; otherwise
#   the status of the last helper call.
#######################################
renew-flag-exists() {
    local renew_file="nextclade$reference.tsv.zst.renew"
    local dst_renew_file="$s3_dst/$renew_file"
    local src_renew_file="$s3_src/$renew_file"

    "$vendored"/s3-object-exists "$dst_renew_file" || "$vendored"/s3-object-exists "$src_renew_file"
}

#######################################
# Print the version info recorded for the cached Nextclade results.
# Globals:
#   bin, reference, s3_dst, s3_src (all read)
# Outputs:
#   The first non-empty result of fetch-cache-version on stdout, trying the
#   destination URL before the source URL; nothing if neither yields one.
# Returns:
#   0 always. An empty result is left for the caller to interpret, so a
#   missing cache does not abort the script under `set -e`.
#######################################
get-cache-version-info() {
    # TODO: Update to check a separate file for version info
    # Currently just checks the first row of the nextclade.tsv file
    local version_file="nextclade$reference.tsv.zst"
    local s3_base versions

    for s3_base in "$s3_dst" "$s3_src"; do
        # fetch-cache-version may exit 0 without printing anything, so an
        # empty result is treated like a failure and the next URL is tried.
        versions="$("$bin"/fetch-cache-version "$s3_base/$version_file")" || versions=""
        if [[ -n "$versions" ]]; then
            printf '%s\n' "$versions"
            return 0
        fi
    done

    return 0
}

# Script entry point: pass all command-line arguments through to main.
main "$@"
17 changes: 17 additions & 0 deletions workflow/snakemake_rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,22 @@ if config.get("s3_dst") and config.get("s3_src"):
ruleorder: download_nextclade_tsv_from_s3 > create_empty_nextclade_info
ruleorder: download_previous_alignment_from_s3 > create_empty_nextclade_aligned

def _convert_dataset_name(wildcards):
if wildcards.reference == '':
dataset_name="sars-cov-2"
elif wildcards.reference == '_21L':
dataset_name="sars-cov-2-21L"
else:
# We shouldn't run into this since we have wildcard_constraints,
# but doesn't hurt to include it in case that changes
raise ValueError(f"Cannot convert unsupported reference {wildcards.reference!r} to Nextclade dataset name")

return f"data/nextclade_data/{dataset_name}.zip",

rule use_nextclade_cache:
input:
nextclade="./nextclade",
nextclade_dataset=_convert_dataset_name,
params:
dst_source=config["s3_dst"],
src_source=config["s3_src"],
Expand All @@ -73,6 +88,8 @@ if config.get("s3_dst") and config.get("s3_src"):
./bin/use-nextclade-cache \
{params.dst_source:q} \
{params.src_source:q} \
{input.nextclade:q} \
{input.nextclade_dataset:q} \
{wildcards.reference:q} \
> {output.use_nextclade_cache}
"""
Expand Down

0 comments on commit 136c5c8

Please sign in to comment.