From 029f79ab63ee4327d8bd04097013a6c1a03151d0 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 26 Jul 2024 16:58:02 -0700 Subject: [PATCH 1/2] fetch-cache-version: Use the Nextclade version JSON Now that we are creating a Nextclade version JSON for the Nextclade TSV, we can use it to fetch the cache version. If the Nextclade TSV sha256sum doesn't match the value in the version JSON, then we ignore the version JSON which results in the workflow ignoring the Nextclade cache. --- bin/fetch-cache-version | 31 ++++++++++++++++--------------- bin/use-nextclade-cache | 10 ++++++---- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/bin/fetch-cache-version b/bin/fetch-cache-version index debd4662..cb8a03e2 100755 --- a/bin/fetch-cache-version +++ b/bin/fetch-cache-version @@ -1,20 +1,21 @@ #!/bin/bash +set -euo pipefail -# this script intentionally doesn't `set -euo pipefail` -# because otherwise the `head -n 2` step triggers SIGPIPE -# causing the script to exit before it is done. 
+nextclade_version="${1:?An S3 URL for the Nextclade version JSON is required as the first argument}" +nextclade_tsv="${2:?An S3 URL for the Nextclade TSV is required as the second argument}" -s3_url="${1:?An S3 URL is required as the first argument}" +no_hash=0000000000000000000000000000000000000000000000000000000000000000 +s3path="${nextclade_tsv#s3://}" +bucket="${s3path%%/*}" +key="${s3path#*/}" +nextclade_tsv_sha256sum="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" +nextclade_version_json="$(aws s3 cp "$nextclade_version" -)" +version_json_sha256sum="$(echo "$nextclade_version_json" | jq -r '.nextclade_tsv_sha256sum')" -trap '' SIGPIPE - -(aws s3 cp "$s3_url" - \ - | zstd -T0 -dcq \ - | head -n 2 \ - | tsv-select -H -f 'nextclade_version,dataset_version' \ - | tail -n 1 \ - | jq --raw-input -c ' - split("\t") - | { "nextclade_version": .[0], "nextclade_dataset_version": .[1] }') \ - 2> /dev/null +if [[ "$nextclade_tsv_sha256sum" != "$version_json_sha256sum" ]]; then + echo "[INFO] Ignoring version JSON because the Nextclade TSV sha256sum values do not match" >&2 + echo "{}" +else + echo "$nextclade_version_json" +fi diff --git a/bin/use-nextclade-cache b/bin/use-nextclade-cache index 068b4c8b..543aaafb 100755 --- a/bin/use-nextclade-cache +++ b/bin/use-nextclade-cache @@ -47,13 +47,15 @@ renew-flag-exists() { } get-cache-version-info() { - # TODO: Update check a separate file for version info - # Currently just checks the first row of the nextclade.tsv file - local version_file="nextclade$reference.tsv.zst" + local version_file="nextclade${reference}_version.json" + local nextclade_tsv="nextclade$reference.tsv.zst" local dst_version_file="$s3_dst/$version_file" + local dst_nextclade_tsv="$s3_dst/$nextclade_tsv" local src_version_file="$s3_src/$version_file" + local src_nextclade_tsv="$s3_src/$nextclade_tsv" - "$bin"/fetch-cache-version "$dst_version_file" || "$bin"/cache-version 
"$src_version_file" + "$bin"/fetch-cache-version "$dst_version_file" "$dst_nextclade_tsv" \ + || "$bin"/cache-version "$src_version_file" "$src_nextclade_tsv" } main "$@" From 3e86a9fc1cc142605c8681f1444ab46488ec679f Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 26 Jul 2024 15:06:44 -0700 Subject: [PATCH 2/2] Remove version columns from Nextclade TSV We now check the Nextclade versions using the separate version JSON, so we no longer need to track version per row. This is a breaking change for the cache, so will need to be run with the renew flag to manually force a full-rerun. --- workflow/snakemake_rules/nextclade.smk | 40 ++------------------------ 1 file changed, 3 insertions(+), 37 deletions(-) diff --git a/workflow/snakemake_rules/nextclade.smk b/workflow/snakemake_rules/nextclade.smk index 1fe637d8..800a89bc 100644 --- a/workflow/snakemake_rules/nextclade.smk +++ b/workflow/snakemake_rules/nextclade.smk @@ -218,7 +218,7 @@ rule run_wuhan_nextclade: f"--output-translations=data/{database}/nextclade.translation_{{cds}}.upd.fasta" ), output: - info=f"data/{database}/nextclade_new_raw.tsv", + info=f"data/{database}/nextclade_new.tsv", alignment=temp(f"data/{database}/nextclade.aligned.upd.fasta"), translations=[ temp(f"data/{database}/nextclade.translation_{gene}.upd.fasta") @@ -249,7 +249,7 @@ rule run_21L_nextclade: dataset=lambda w: f"data/nextclade_data/sars-cov-2-21L.zip", sequences=f"data/{database}/nextclade_21L.sequences.fasta", output: - info=f"data/{database}/nextclade_21L_new_raw.tsv", + info=f"data/{database}/nextclade_21L_new.tsv", threads: workflow.cores * 0.5 benchmark: @@ -264,47 +264,13 @@ rule run_21L_nextclade: """ -rule nextclade_tsv_concat_versions: - input: - nextclade="data/nextclade", - tsv=f"data/{database}/nextclade{{reference}}_new_raw.tsv", - dataset=lambda w: f"data/nextclade_data/sars-cov-2{w.reference.replace('_','-')}.zip", - output: - tsv=f"data/{database}/nextclade{{reference}}_new.tsv", - benchmark: - 
f"benchmarks/nextclade_tsv_concat_versions_{database}{{reference}}.txt" - shell: - """ - if [ -s {input.tsv} ]; then - # Get version numbers - nextclade_version="$({input.nextclade:q} --version)" - dataset_version="$(unzip -p {input.dataset} pathogen.json | jq -r '.version.tag')" - timestamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" - - # Combine input file with version numbers and write to output - printf "%s\tnextclade_version\tdataset_version\trun_timestamp\n" \ - "$(head -n 1 {input.tsv})" \ - > {output.tsv} - - tail -n +2 {input.tsv} | \ - awk -v v1="$nextclade_version" \ - -v v2="$dataset_version" \ - -v v3="$timestamp" \ - -v OFS='\t' '{{print $0, v1, v2, v3}}' \ - >> {output.tsv} - else - cp {input.tsv} {output.tsv} - fi - """ - - rule nextclade_info: """ Generates nextclade info TSV for all sequences (new + old) """ input: old_info=f"data/{database}/nextclade{{reference}}_old.tsv", - new_info=rules.nextclade_tsv_concat_versions.output.tsv, + new_info=f"data/{database}/nextclade{{reference}}_new.tsv", output: nextclade_info=f"data/{database}/nextclade{{reference}}.tsv", benchmark: