Skip to content

Commit

Permalink
nextclade: check use of cache in separate script/rule
Browse files Browse the repository at this point in the history
Doing this in preparation for adding version checks to the decision
tree of whether we should use the Nextclade cache.

Replaces download of the empty .renew file with just a check that the
S3 object exists to limit shuffling of files.
  • Loading branch information
joverlee521 committed Jul 24, 2024
1 parent fd9d7e1 commit 055c5b7
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 15 deletions.
28 changes: 28 additions & 0 deletions bin/use-nextclade-cache
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash
# Decide whether the cached Nextclade results should be used.
# Prints "false" to stdout when a `.renew` flag object is found at either of
# the two given S3 locations, and "true" otherwise.
#
# Strict mode: abort on command failure, on use of unset variables, and on a
# failure anywhere inside a pipeline.
set -euo pipefail

# Directory of vendored helper scripts (e.g. s3-object-exists), resolved
# relative to the location of this script.
vendored="$(dirname "$0")"/../vendored

main() {
    # Positional arguments. The first two are mandatory; the `:?` expansion
    # aborts with the given message when one is missing or empty.
    # These are deliberately left as globals: renew-flag-exists reads them.
    s3_dst="${1:?A destination s3:// URL where the renew file is hosted is required as the first argument.}"
    s3_src="${2:?A source s3:// URL where the fallback renew file is hosted is required as the second argument.}"
    # Nextclade dataset reference wildcard (optional, defaults to empty)
    reference="${3:-}"

    # No renew flag anywhere: the cache may be used.
    if ! renew-flag-exists; then
        printf '%s\n' "true"
        return
    fi

    # A renew flag was found: log to stderr so that stdout carries only the
    # machine-readable answer, then stop.
    printf '%s\n' "[INFO] Found renew flag" >&2
    printf '%s\n' "false"
    exit 0
}

renew-flag-exists() {
    # Succeeds when the `.renew` flag object for the current reference exists
    # under the destination URL or, failing that, under the source URL.
    # Reads the globals s3_dst, s3_src, reference and vendored.
    local flag_name="nextclade${reference}.tsv.zst.renew"
    local flag_at_dst="${s3_dst}/${flag_name}"
    local flag_at_src="${s3_src}/${flag_name}"

    # Destination is checked first; the source is only consulted on a miss.
    if "$vendored"/s3-object-exists "${flag_at_dst}"; then
        return 0
    fi

    # The function's status is that of the fallback check.
    "$vendored"/s3-object-exists "${flag_at_src}"
}

# Entry point: forward all command-line arguments to main.
main "$@"
55 changes: 40 additions & 15 deletions workflow/snakemake_rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -62,52 +62,77 @@ if config.get("s3_dst") and config.get("s3_src"):
ruleorder: download_nextclade_tsv_from_s3 > create_empty_nextclade_info
ruleorder: download_previous_alignment_from_s3 > create_empty_nextclade_aligned

# Runs ./bin/use-nextclade-cache with the two configured S3 URLs and the
# `reference` wildcard, and stores whatever the script prints on stdout in a
# per-reference text file. The download rules in this file take that file as
# an input and compare its content against 'true'.
rule use_nextclade_cache:
params:
dst_source=config["s3_dst"],
src_source=config["s3_src"],
output:
use_nextclade_cache=f"data/{database}/use_nextclade_cache{{reference}}.txt",
# All three arguments are passed through Snakemake's `:q` shell quoting.
# NOTE(review): the redirect target {output.use_nextclade_cache} is not quoted.
shell:
"""
./bin/use-nextclade-cache \
{params.dst_source:q} \
{params.src_source:q} \
{wildcards.reference:q} \
> {output.use_nextclade_cache}
"""


# NOTE(review): this rule appears here as a rendered diff without +/- markers.
# The lines that look superseded (the *_rerun_touchfile params and the
# five-step `download-from-s3 ... || touch` chain) are interleaved with what
# look like their replacements (the use_nextclade_cache input and the if/else
# in the shell command). Confirm against the actual file before relying on it.
rule download_nextclade_tsv_from_s3:
"""
If there's a .renew touchfile, do not use the cache
"""
input:
use_nextclade_cache=f"data/{database}/use_nextclade_cache{{reference}}.txt",
params:
dst_source=config["s3_dst"] + "/nextclade{reference}.tsv.zst",
src_source=config["s3_src"] + "/nextclade{reference}.tsv.zst",
# NOTE(review): src_rerun_touchfile below is built from config["s3_dst"],
# not config["s3_src"], so both touchfile params point at the same location.
dst_rerun_touchfile=config["s3_dst"] + "/nextclade{reference}.tsv.zst.renew",
src_rerun_touchfile=config["s3_dst"] + "/nextclade{reference}.tsv.zst.renew",
# Passed as the third argument to download-from-s3; defaults to 0 when
# config has no subsample.nextclade entry.
lines=config.get("subsample", {}).get("nextclade", 0),
output:
nextclade=f"data/{database}/nextclade{{reference}}_old.tsv",
benchmark:
f"benchmarks/download_nextclade_tsv_from_s3_{database}{{reference}}.txt"
# The if/else reads the content of the use_nextclade_cache input file:
# 'true' downloads from dst, falling back to src; anything else only touches
# the output so that it exists but is empty.
# NOTE(review): unlike the `|| touch` chain, the 'true' branch has no touch
# fallback, so the command fails when both downloads fail. Confirm intended.
shell:
"""
./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.nextclade} 0 || \
./vendored/download-from-s3 {params.src_rerun_touchfile} {output.nextclade} 0 || \
./vendored/download-from-s3 {params.dst_source} {output.nextclade} {params.lines} || \
./vendored/download-from-s3 {params.src_source} {output.nextclade} {params.lines} || \
touch {output.nextclade}
use_nextclade_cache=$(cat {input.use_nextclade_cache})
if [[ "$use_nextclade_cache" == 'true' ]]; then
echo "[INFO] Downloading cached nextclade{wildcards.reference}.tsv.zst"
./vendored/download-from-s3 {params.dst_source} {output.nextclade} {params.lines} || \
./vendored/download-from-s3 {params.src_source} {output.nextclade} {params.lines}
else
echo "[INFO] Ignoring cached nextclade{wildcards.reference}.tsv.zst"
touch {output.nextclade}
fi
"""

# NOTE(review): this rule appears here as a rendered diff without +/- markers.
# The lines that look superseded (the *_rerun_touchfile params and the
# `download-from-s3 ... || touch` chain with its closing quotes) are
# interleaved with what look like their replacements (the use_nextclade_cache
# input and the if/else). Confirm against the actual file before relying on it.
rule download_previous_alignment_from_s3:
## NOTE two potential bugs with this implementation:
## (1) race condition. This file may be updated on the remote after download_nextclade has run but before this rule
## (2) we may get `download_nextclade` and `download_previous_alignment` from different s3 buckets
input:
# NOTE(review): this path has no {reference} part, while the
# use_nextclade_cache rule writes use_nextclade_cache{reference}.txt. It can
# only match when the reference wildcard is empty; presumably a wildcard
# constraint elsewhere allows that. Verify.
use_nextclade_cache=f"data/{database}/use_nextclade_cache.txt",
params:
dst_source=config["s3_dst"] + "/{seqtype}.fasta.zst",
src_source=config["s3_src"] + "/{seqtype}.fasta.zst",
# NOTE(review): src_rerun_touchfile below is built from config["s3_dst"],
# not config["s3_src"], so both touchfile params point at the same location.
dst_rerun_touchfile=config["s3_dst"] + "/nextclade.tsv.zst.renew",
src_rerun_touchfile=config["s3_dst"] + "/nextclade.tsv.zst.renew",
# Passed as the third argument to download-from-s3; defaults to 0 when
# config has no subsample.nextclade entry.
lines=config.get("subsample", {}).get("nextclade", 0),
output:
alignment=temp(f"data/{database}/nextclade.{{seqtype}}.old.fasta"),
benchmark:
f"benchmarks/download_previous_alignment_from_s3_{database}{{seqtype}}.txt"
# The if/else reads the content of the use_nextclade_cache input file:
# 'true' downloads from dst, falling back to src; anything else only touches
# the output so that it exists but is empty.
# NOTE(review): unlike the `|| touch` chain, the 'true' branch has no touch
# fallback, so the command fails when both downloads fail. Confirm intended.
shell:
"""
./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.alignment} 0 || \
./vendored/download-from-s3 {params.src_rerun_touchfile} {output.alignment} 0 || \
./vendored/download-from-s3 {params.dst_source} {output.alignment} {params.lines} || \
./vendored/download-from-s3 {params.src_source} {output.alignment} {params.lines} || \
touch {output.alignment}
"""
use_nextclade_cache=$(cat {input.use_nextclade_cache})
if [[ "$use_nextclade_cache" == 'true' ]]; then
echo "[INFO] Downloading cached Nextclade {wildcards.seqtype}.fasta.zst"
./vendored/download-from-s3 {params.dst_source} {output.alignment} {params.lines} || \
./vendored/download-from-s3 {params.src_source} {output.alignment} {params.lines}
else
echo "[INFO] Ignoring cached Nextclade {wildcards.seqtype}.fasta.zst"
touch {output.alignment}
fi
"""

rule get_sequences_without_nextclade_annotations:
"""Find sequences in FASTA which don't have clades assigned yet"""
Expand Down

0 comments on commit 055c5b7

Please sign in to comment.