Skip to content

Commit

Permalink
Merge pull request #424 from nextstrain/fix-upload-diff-race
Browse files Browse the repository at this point in the history
Fix upload diff race
  • Loading branch information
joverlee521 authored Jan 29, 2024
2 parents f7b7184 + 5564a77 commit 433bb82
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 12 deletions.
5 changes: 1 addition & 4 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,7 @@ if config.get("s3_dst"):
# and the `s3_src` is provided in config since some notify scripts depend
# do diffs with files on S3 from previous runs
if send_notifications and config.get("s3_src"):
all_targets.extend([
f"data/{database}/notify-on-record-change.done",
f"data/{database}/notify.done"
])
all_targets.append(f"data/{database}/notify.done")

rule all:
input: all_targets
Expand Down
3 changes: 2 additions & 1 deletion workflow/snakemake_rules/slack_notifications.smk
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ rule notify_on_record_change:

rule notify_gisaid:
input:
notify_on_record_change = "data/gisaid/notify-on-record-change.done",
flagged_annotations = rules.transform_gisaid_data.output.flagged_annotations,
additional_info = "data/gisaid/additional_info.tsv",
flagged_metadata = "data/gisaid/flagged_metadata.txt"
Expand All @@ -52,6 +53,7 @@ rule notify_gisaid:

rule notify_genbank:
input:
notify_on_record_change = "data/genbank/notify-on-record-change.done",
flagged_annotations = rules.transform_genbank_data.output.flagged_annotations,
duplicate_biosample = "data/genbank/duplicate_biosample.txt"
params:
Expand All @@ -63,4 +65,3 @@ rule notify_genbank:
# TODO - which rule produces data/genbank/problem_data.tsv? (was not explicit in `ingest-genbank` bash script)
shell("./bin/notify-on-problem-data data/genbank/problem_data.tsv")
shell("./bin/notify-on-duplicate-biosample-change {input.duplicate_biosample} {params.s3_bucket}/duplicate_biosample.txt.gz")

20 changes: 13 additions & 7 deletions workflow/snakemake_rules/upload.smk
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def compute_files_to_upload():

}
files_to_upload = files_to_upload | {
f"translation_{gene}.fasta.zst" : f"data/{database}/translation_{gene}.fasta"
f"translation_{gene}.fasta.zst" : f"data/{database}/translation_{gene}.fasta"
for gene in GENE_LIST
}

Expand All @@ -52,7 +52,7 @@ def compute_files_to_upload():

files_to_upload["additional_info.tsv.zst"] = f"data/{database}/additional_info.tsv"
files_to_upload["flagged_metadata.txt.zst"] = f"data/{database}/flagged_metadata.txt"

# Include upload of raw NDJSON if we are fetching new sequences from database
if config.get("fetch_from_database", False):
files_to_upload.update({
Expand All @@ -76,9 +76,15 @@ def compute_files_to_upload():

files_to_upload = compute_files_to_upload()


rule upload_single:
input: lambda w: files_to_upload[w.remote_filename]
input:
file_to_upload = lambda w: files_to_upload[w.remote_filename],
# Include the notifications touch file as an input to ensure that
# uploads only run after the notifications rules have run.
# This prevents the race condition between diffs and uploads described
# in https://github.com/nextstrain/ncov-ingest/issues/423
# -Jover, 2024-01-26
notifications_flag = f"data/{database}/notify.done" if send_notifications else [],
output:
"data/{database}/{remote_filename}.upload",
params:
Expand All @@ -89,7 +95,7 @@ rule upload_single:
"""
./vendored/upload-to-s3 \
{params.quiet} \
{input:q} \
{input.file_to_upload:q} \
{params.s3_bucket:q}/{wildcards.remote_filename:q} \
{params.cloudfront_domain} 2>&1 | tee {output}
"""
Expand All @@ -98,7 +104,7 @@ rule remove_rerun_touchfile:
"""
Remove the rerun touchfile if such a file is present
"""
input:
input:
f"data/{database}/{{remote_filename}}.upload",
output:
f"data/{database}/{{remote_filename}}.renew.deleted",
Expand All @@ -115,7 +121,7 @@ rule upload:
Requests one touch file for each uploaded remote file
Dynamically determines that list of files
"""
input:
input:
uploads = [f"data/{database}/{remote_file}.upload" for remote_file in files_to_upload.keys()],
touchfile_removes=[
f"data/{database}/{remote_file}.renew.deleted" for remote_file in [
Expand Down

0 comments on commit 433bb82

Please sign in to comment.