From 055c5b71b95e9103e20d7de370e0e98ce37a89a1 Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Fri, 19 Jul 2024 17:49:12 -0700
Subject: [PATCH] nextclade: check use of cache in separate script/rule

Doing this in preparation for adding version checks to the decision
tree of whether we should use the Nextclade cache.

Replaces download of the empty .renew file with just a check that the
S3 object exists to limit shuffling of files.
---
Review notes (this section is ignored by `git am`):

* Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of the patch. Context lines assume 4-space nesting under the
  `if config.get("s3_dst") and config.get("s3_src"):` block; verify against
  the target file before applying.
* Both download rules keep the previous `|| touch <output>` best-effort
  fallback, so a missing cache object in both buckets still yields an empty
  file instead of failing the rule.
* Hunk headers and the diffstat were recomputed for the added lines. The
  first nextclade.smk hunk carried one more blank added line than its header
  declared; it now adds two blank lines, matching the declared 31 lines.
* The `index` blob ids below come from the original patch and no longer
  match the amended content.

 bin/use-nextclade-cache                | 39 ++++++++++++++++++
 workflow/snakemake_rules/nextclade.smk | 59 ++++++++++++++++++++-------
 2 files changed, 83 insertions(+), 15 deletions(-)
 create mode 100755 bin/use-nextclade-cache

diff --git a/bin/use-nextclade-cache b/bin/use-nextclade-cache
new file mode 100755
index 00000000..0a783dd1
--- /dev/null
+++ b/bin/use-nextclade-cache
@@ -0,0 +1,39 @@
+#!/bin/bash
+#
+# Decide whether the cached Nextclade results on S3 should be used.
+#
+# usage: use-nextclade-cache <s3_dst> <s3_src> [<reference>]
+#
+# Prints "false" to stdout when the vendored s3-object-exists helper succeeds
+# for nextclade<reference>.tsv.zst.renew under either URL, "true" otherwise.
+set -euo pipefail
+
+vendored="$(dirname "$0")"/../vendored
+
+main() {
+    s3_dst="${1:?A destination s3:// URL where the renew file is hosted is required as the first argument.}"
+    s3_src="${2:?A source s3:// URL where the fallback renew file is hosted is required as the second argument.}"
+    # Nextclade dataset reference wildcard; defaults to an empty string
+    reference="${3:-}"
+    if renew-flag-exists; then
+        echo "[INFO] Found renew flag" >&2
+        echo "false"
+        exit 0
+    fi
+
+    echo "true"
+}
+
+# Check for the renew flag under the destination URL first, then the source URL.
+# Reads s3_dst, s3_src and reference, which main assigns as globals.
+# NOTE(review): any non-zero exit from s3-object-exists is treated as "no flag";
+# confirm that helper cannot fail for other reasons (e.g. credentials).
+renew-flag-exists() {
+    local renew_file="nextclade${reference}.tsv.zst.renew"
+    local dst_renew_file="${s3_dst}/${renew_file}"
+    local src_renew_file="${s3_src}/${renew_file}"
+
+    "$vendored"/s3-object-exists "${dst_renew_file}" || "$vendored"/s3-object-exists "${src_renew_file}"
+}
+
+main "$@"
diff --git a/workflow/snakemake_rules/nextclade.smk b/workflow/snakemake_rules/nextclade.smk
index 5e71e310..db56ecbf 100644
--- a/workflow/snakemake_rules/nextclade.smk
+++ b/workflow/snakemake_rules/nextclade.smk
@@ -62,15 +62,31 @@ if config.get("s3_dst") and config.get("s3_src"):
     ruleorder: download_nextclade_tsv_from_s3 > create_empty_nextclade_info
     ruleorder: download_previous_alignment_from_s3 > create_empty_nextclade_aligned

+    rule use_nextclade_cache:
+        params:
+            dst_source=config["s3_dst"],
+            src_source=config["s3_src"],
+        output:
+            use_nextclade_cache=f"data/{database}/use_nextclade_cache{{reference}}.txt",
+        shell:
+            """
+            ./bin/use-nextclade-cache \
+                {params.dst_source:q} \
+                {params.src_source:q} \
+                {wildcards.reference:q} \
+                > {output.use_nextclade_cache}
+            """
+
+
     rule download_nextclade_tsv_from_s3:
         """
         If there's a .renew touchfile, do not use the cache
         """
+        input:
+            use_nextclade_cache=f"data/{database}/use_nextclade_cache{{reference}}.txt",
         params:
             dst_source=config["s3_dst"] + "/nextclade{reference}.tsv.zst",
             src_source=config["s3_src"] + "/nextclade{reference}.tsv.zst",
-            dst_rerun_touchfile=config["s3_dst"] + "/nextclade{reference}.tsv.zst.renew",
-            src_rerun_touchfile=config["s3_dst"] + "/nextclade{reference}.tsv.zst.renew",
             lines=config.get("subsample", {}).get("nextclade", 0),
         output:
             nextclade=f"data/{database}/nextclade{{reference}}_old.tsv",
@@ -78,22 +94,29 @@ if config.get("s3_dst") and config.get("s3_src"):
             f"benchmarks/download_nextclade_tsv_from_s3_{database}{{reference}}.txt"
         shell:
             """
-            ./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.nextclade} 0 || \
-            ./vendored/download-from-s3 {params.src_rerun_touchfile} {output.nextclade} 0 || \
-            ./vendored/download-from-s3 {params.dst_source} {output.nextclade} {params.lines} || \
-            ./vendored/download-from-s3 {params.src_source} {output.nextclade} {params.lines} || \
-            touch {output.nextclade}
+            use_nextclade_cache=$(cat {input.use_nextclade_cache})
+
+            if [[ "$use_nextclade_cache" == 'true' ]]; then
+                echo "[INFO] Downloading cached nextclade{wildcards.reference}.tsv.zst"
+                # Best effort: fall back to an empty file when neither download succeeds.
+                ./vendored/download-from-s3 {params.dst_source} {output.nextclade} {params.lines} || \
+                ./vendored/download-from-s3 {params.src_source} {output.nextclade} {params.lines} || \
+                touch {output.nextclade}
+            else
+                echo "[INFO] Ignoring cached nextclade{wildcards.reference}.tsv.zst"
+                touch {output.nextclade}
+            fi
             """

     rule download_previous_alignment_from_s3:
         ## NOTE two potential bugs with this implementation:
         ## (1) race condition. This file may be updated on the remote after download_nextclade has run but before this rule
         ## (2) we may get `download_nextclade` and `download_previous_alignment` from different s3 buckets
+        input:
+            use_nextclade_cache=f"data/{database}/use_nextclade_cache.txt",
         params:
             dst_source=config["s3_dst"] + "/{seqtype}.fasta.zst",
             src_source=config["s3_src"] + "/{seqtype}.fasta.zst",
-            dst_rerun_touchfile=config["s3_dst"] + "/nextclade.tsv.zst.renew",
-            src_rerun_touchfile=config["s3_dst"] + "/nextclade.tsv.zst.renew",
             lines=config.get("subsample", {}).get("nextclade", 0),
         output:
             alignment=temp(f"data/{database}/nextclade.{{seqtype}}.old.fasta"),
@@ -101,13 +124,19 @@ if config.get("s3_dst") and config.get("s3_src"):
             f"benchmarks/download_previous_alignment_from_s3_{database}{{seqtype}}.txt"
         shell:
             """
-            ./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.alignment} 0 || \
-            ./vendored/download-from-s3 {params.src_rerun_touchfile} {output.alignment} 0 || \
-            ./vendored/download-from-s3 {params.dst_source} {output.alignment} {params.lines} || \
-            ./vendored/download-from-s3 {params.src_source} {output.alignment} {params.lines} || \
-            touch {output.alignment}
-            """
+            use_nextclade_cache=$(cat {input.use_nextclade_cache})

+            if [[ "$use_nextclade_cache" == 'true' ]]; then
+                echo "[INFO] Downloading cached Nextclade {wildcards.seqtype}.fasta.zst"
+                # Best effort: fall back to an empty file when neither download succeeds.
+                ./vendored/download-from-s3 {params.dst_source} {output.alignment} {params.lines} || \
+                ./vendored/download-from-s3 {params.src_source} {output.alignment} {params.lines} || \
+                touch {output.alignment}
+            else
+                echo "[INFO] Ignoring cached Nextclade {wildcards.seqtype}.fasta.zst"
+                touch {output.alignment}
+            fi
+            """

     rule get_sequences_without_nextclade_annotations:
         """Find sequences in FASTA which don't have clades assigned yet"""