diff --git a/Snakefile b/Snakefile index f7479877..a5f21b59 100644 --- a/Snakefile +++ b/Snakefile @@ -41,10 +41,7 @@ if config.get("s3_dst"): # and the `s3_src` is provided in config since some notify scripts depend # do diffs with files on S3 from previous runs if send_notifications and config.get("s3_src"): - all_targets.extend([ - f"data/{database}/notify-on-record-change.done", - f"data/{database}/notify.done" - ]) + all_targets.append(f"data/{database}/notify.done") rule all: input: all_targets diff --git a/workflow/snakemake_rules/slack_notifications.smk b/workflow/snakemake_rules/slack_notifications.smk index b3f50970..5fe52eb6 100644 --- a/workflow/snakemake_rules/slack_notifications.smk +++ b/workflow/snakemake_rules/slack_notifications.smk @@ -38,6 +38,7 @@ rule notify_on_record_change: rule notify_gisaid: input: + notify_on_record_change = "data/gisaid/notify-on-record-change.done", flagged_annotations = rules.transform_gisaid_data.output.flagged_annotations, additional_info = "data/gisaid/additional_info.tsv", flagged_metadata = "data/gisaid/flagged_metadata.txt" @@ -52,6 +53,7 @@ rule notify_gisaid: rule notify_genbank: input: + notify_on_record_change = "data/genbank/notify-on-record-change.done", flagged_annotations = rules.transform_genbank_data.output.flagged_annotations, duplicate_biosample = "data/genbank/duplicate_biosample.txt" params: @@ -63,4 +65,3 @@ rule notify_genbank: # TODO - which rule produces data/genbank/problem_data.tsv? (was not explicit in `ingest-genbank` bash script) shell("./bin/notify-on-problem-data data/genbank/problem_data.tsv") shell("./bin/notify-on-duplicate-biosample-change {input.duplicate_biosample} {params.s3_bucket}/duplicate_biosample.txt.gz") - diff --git a/workflow/snakemake_rules/upload.smk b/workflow/snakemake_rules/upload.smk index 7f139037..2002cbf5 100644 --- a/workflow/snakemake_rules/upload.smk +++ b/workflow/snakemake_rules/upload.smk @@ -35,7 +35,7 @@ def compute_files_to_upload(): } files_to_upload = files_to_upload | { - f"translation_{gene}.fasta.zst" : f"data/{database}/translation_{gene}.fasta" + f"translation_{gene}.fasta.zst" : f"data/{database}/translation_{gene}.fasta" for gene in GENE_LIST } @@ -52,7 +52,7 @@ def compute_files_to_upload(): files_to_upload["additional_info.tsv.zst"] = f"data/{database}/additional_info.tsv" files_to_upload["flagged_metadata.txt.zst"] = f"data/{database}/flagged_metadata.txt" - + # Include upload of raw NDJSON if we are fetching new sequences from database if config.get("fetch_from_database", False): files_to_upload.update({ @@ -76,9 +76,15 @@ def compute_files_to_upload(): files_to_upload = compute_files_to_upload() - rule upload_single: - input: lambda w: files_to_upload[w.remote_filename] + input: + file_to_upload = lambda w: files_to_upload[w.remote_filename], + # Include the notifications touch file as an input to ensure that + # uploads only run after the notifications rules have run. + # This prevents the race condition between diffs and uploads described + # in https://github.com/nextstrain/ncov-ingest/issues/423 + # -Jover, 2024-01-26 + notifications_flag = f"data/{database}/notify.done" if send_notifications else [], output: "data/{database}/{remote_filename}.upload", params: @@ -89,7 +95,7 @@ rule upload_single: """ ./vendored/upload-to-s3 \ {params.quiet} \ - {input:q} \ + {input.file_to_upload:q} \ {params.s3_bucket:q}/{wildcards.remote_filename:q} \ {params.cloudfront_domain} 2>&1 | tee {output} """ @@ -98,7 +104,7 @@ rule remove_rerun_touchfile: """ Remove the rerun touchfile if such a file is present """ - input: + input: f"data/{database}/{{remote_filename}}.upload", output: f"data/{database}/{{remote_filename}}.renew.deleted", @@ -115,7 +121,7 @@ rule upload: Requests one touch file for each uploaded remote file Dynamically determines that list of files """ - input: + input: uploads = [f"data/{database}/{remote_file}.upload" for remote_file in files_to_upload.keys()], touchfile_removes=[ f"data/{database}/{remote_file}.renew.deleted" for remote_file in [