Patch summary (free text before the first "diff --git" header is not part of
any hunk):

  * bin/fetch-cache-version now takes two S3 URLs (Nextclade version JSON and
    Nextclade TSV). It prints the version JSON only when the sha256sum
    recorded inside the JSON equals the sha256sum stored in the TSV's S3
    object metadata; otherwise it prints "{}".
  * bin/use-nextclade-cache passes both URLs for the dst and src locations.
  * workflow/snakemake_rules/nextclade.smk drops the
    nextclade_tsv_concat_versions rule; the Nextclade rules write
    nextclade*_new.tsv directly.

The JSON hash is read with `jq -r` from the *contents* of
$nextclade_version_json so that it is a raw string comparable with the
`--output text` value returned by `aws s3api head-object`.

diff --git a/bin/fetch-cache-version b/bin/fetch-cache-version
index debd4662..cb8a03e2 100755
--- a/bin/fetch-cache-version
+++ b/bin/fetch-cache-version
@@ -1,20 +1,21 @@
 #!/bin/bash
+set -euo pipefail
 
-# this script intentionally doesn't `set -euo pipefail`
-# because otherwise the `head -n 2` step triggers SIGPIPE
-# causing the script to exit before it is done.
+nextclade_version="${1:?An S3 URL for the Nextclade version JSON is required as the first argument}"
+nextclade_tsv="${2:?An S3 URL for the Nextclade TSV is required as the second argument}"
 
-s3_url="${1:?An S3 URL is required as the first argument}"
+no_hash=0000000000000000000000000000000000000000000000000000000000000000
 
+s3path="${nextclade_tsv#s3://}"
+bucket="${s3path%%/*}"
+key="${s3path#*/}"
+nextclade_tsv_sha256sum="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")"
+nextclade_version_json="$(aws s3 cp "$nextclade_version" -)"
+version_json_sha256sum="$(jq -r '.nextclade_tsv_sha256sum' <<<"$nextclade_version_json")"
 
-trap '' SIGPIPE
-
-(aws s3 cp "$s3_url" - \
-    | zstd -T0 -dcq \
-    | head -n 2 \
-    | tsv-select -H -f 'nextclade_version,dataset_version' \
-    | tail -n 1 \
-    | jq --raw-input -c '
-        split("\t")
-        | { "nextclade_version": .[0], "nextclade_dataset_version": .[1] }') \
-    2> /dev/null
+if [[ "$nextclade_tsv_sha256sum" != "$version_json_sha256sum" ]]; then
+    echo "[INFO] Ignoring version JSON because the Nextclade TSV sha256sum values do not match" >&2
+    echo "{}"
+else
+    echo "$nextclade_version_json"
+fi
diff --git a/bin/use-nextclade-cache b/bin/use-nextclade-cache
index 068b4c8b..543aaafb 100755
--- a/bin/use-nextclade-cache
+++ b/bin/use-nextclade-cache
@@ -47,13 +47,15 @@ renew-flag-exists() {
 }
 
 get-cache-version-info() {
-    # TODO: Update check a separate file for version info
-    # Currently just checks the first row of the nextclade.tsv file
-    local version_file="nextclade$reference.tsv.zst"
+    local version_file="nextclade${reference}_version.json"
+    local nextclade_tsv="nextclade$reference.tsv.zst"
     local dst_version_file="$s3_dst/$version_file"
+    local dst_nextclade_tsv="$s3_dst/$nextclade_tsv"
     local src_version_file="$s3_src/$version_file"
+    local src_nextclade_tsv="$s3_src/$nextclade_tsv"
 
-    "$bin"/fetch-cache-version "$dst_version_file" || "$bin"/cache-version "$src_version_file"
+    "$bin"/fetch-cache-version "$dst_version_file" "$dst_nextclade_tsv" \
+        || "$bin"/cache-version "$src_version_file" "$src_nextclade_tsv"
 }
 
 main "$@"
diff --git a/workflow/snakemake_rules/nextclade.smk b/workflow/snakemake_rules/nextclade.smk
index 1fe637d8..800a89bc 100644
--- a/workflow/snakemake_rules/nextclade.smk
+++ b/workflow/snakemake_rules/nextclade.smk
@@ -218,7 +218,7 @@ rule run_wuhan_nextclade:
             f"--output-translations=data/{database}/nextclade.translation_{{cds}}.upd.fasta"
         ),
     output:
-        info=f"data/{database}/nextclade_new_raw.tsv",
+        info=f"data/{database}/nextclade_new.tsv",
         alignment=temp(f"data/{database}/nextclade.aligned.upd.fasta"),
         translations=[
             temp(f"data/{database}/nextclade.translation_{gene}.upd.fasta")
@@ -249,7 +249,7 @@ rule run_21L_nextclade:
         dataset=lambda w: f"data/nextclade_data/sars-cov-2-21L.zip",
         sequences=f"data/{database}/nextclade_21L.sequences.fasta",
     output:
-        info=f"data/{database}/nextclade_21L_new_raw.tsv",
+        info=f"data/{database}/nextclade_21L_new.tsv",
     threads:
         workflow.cores * 0.5
     benchmark:
@@ -264,47 +264,13 @@ rule run_21L_nextclade:
         """
 
 
-rule nextclade_tsv_concat_versions:
-    input:
-        nextclade="data/nextclade",
-        tsv=f"data/{database}/nextclade{{reference}}_new_raw.tsv",
-        dataset=lambda w: f"data/nextclade_data/sars-cov-2{w.reference.replace('_','-')}.zip",
-    output:
-        tsv=f"data/{database}/nextclade{{reference}}_new.tsv",
-    benchmark:
-        f"benchmarks/nextclade_tsv_concat_versions_{database}{{reference}}.txt"
-    shell:
-        """
-        if [ -s {input.tsv} ]; then
-            # Get version numbers
-            nextclade_version="$({input.nextclade:q} --version)"
-            dataset_version="$(unzip -p {input.dataset} pathogen.json | jq -r '.version.tag')"
-            timestamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
-
-            # Combine input file with version numbers and write to output
-            printf "%s\tnextclade_version\tdataset_version\trun_timestamp\n" \
-                "$(head -n 1 {input.tsv})" \
-                > {output.tsv}
-
-            tail -n +2 {input.tsv} | \
-                awk -v v1="$nextclade_version" \
-                    -v v2="$dataset_version" \
-                    -v v3="$timestamp" \
-                    -v OFS='\t' '{{print $0, v1, v2, v3}}' \
-                >> {output.tsv}
-        else
-            cp {input.tsv} {output.tsv}
-        fi
-        """
-
-
 rule nextclade_info:
     """
     Generates nextclade info TSV for all sequences (new + old)
     """
     input:
         old_info=f"data/{database}/nextclade{{reference}}_old.tsv",
-        new_info=rules.nextclade_tsv_concat_versions.output.tsv,
+        new_info=f"data/{database}/nextclade{{reference}}_new.tsv",
     output:
         nextclade_info=f"data/{database}/nextclade{{reference}}.tsv",
     benchmark: