From b628019f872ee5a4724b7af50d5ac972d8abe824 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 26 Jan 2024 13:34:44 -0800 Subject: [PATCH 1/3] Remove excess whitespace --- workflow/snakemake_rules/upload.smk | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflow/snakemake_rules/upload.smk b/workflow/snakemake_rules/upload.smk index 7f139037..9ed00edb 100644 --- a/workflow/snakemake_rules/upload.smk +++ b/workflow/snakemake_rules/upload.smk @@ -35,7 +35,7 @@ def compute_files_to_upload(): } files_to_upload = files_to_upload | { - f"translation_{gene}.fasta.zst" : f"data/{database}/translation_{gene}.fasta" + f"translation_{gene}.fasta.zst" : f"data/{database}/translation_{gene}.fasta" for gene in GENE_LIST } @@ -52,7 +52,7 @@ def compute_files_to_upload(): files_to_upload["additional_info.tsv.zst"] = f"data/{database}/additional_info.tsv" files_to_upload["flagged_metadata.txt.zst"] = f"data/{database}/flagged_metadata.txt" - + # Include upload of raw NDJSON if we are fetching new sequences from database if config.get("fetch_from_database", False): files_to_upload.update({ @@ -98,7 +98,7 @@ rule remove_rerun_touchfile: """ Remove the rerun touchfile if such a file is present """ - input: + input: f"data/{database}/{{remote_filename}}.upload", output: f"data/{database}/{{remote_filename}}.renew.deleted", @@ -115,7 +115,7 @@ rule upload: Requests one touch file for each uploaded remote file Dynamically determines that list of files """ - input: + input: uploads = [f"data/{database}/{remote_file}.upload" for remote_file in files_to_upload.keys()], touchfile_removes=[ f"data/{database}/{remote_file}.renew.deleted" for remote_file in [ From 74323847d08a7c5590753c4665e327ee02b10c87 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 26 Jan 2024 13:00:38 -0800 Subject: [PATCH 2/3] slack_notifications: Simplify DAG Add the `notify_on_record_change` output as an input to the `notify_gisaid` and `notify_genbank` rules to simplify the DAG. Ensures the "notify.done" flag file signals the completion of all notification rules. This will make it easier to track when the upload rules can run in the following commit. --- Snakefile | 5 +---- workflow/snakemake_rules/slack_notifications.smk | 3 ++- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/Snakefile b/Snakefile index f7479877..a5f21b59 100644 --- a/Snakefile +++ b/Snakefile @@ -41,10 +41,7 @@ if config.get("s3_dst"): # and the `s3_src` is provided in config since some notify scripts depend # do diffs with files on S3 from previous runs if send_notifications and config.get("s3_src"): - all_targets.extend([ - f"data/{database}/notify-on-record-change.done", - f"data/{database}/notify.done" - ]) + all_targets.append(f"data/{database}/notify.done") rule all: input: all_targets diff --git a/workflow/snakemake_rules/slack_notifications.smk b/workflow/snakemake_rules/slack_notifications.smk index b3f50970..5fe52eb6 100644 --- a/workflow/snakemake_rules/slack_notifications.smk +++ b/workflow/snakemake_rules/slack_notifications.smk @@ -38,6 +38,7 @@ rule notify_on_record_change: rule notify_gisaid: input: + notify_on_record_change = "data/gisaid/notify-on-record-change.done", flagged_annotations = rules.transform_gisaid_data.output.flagged_annotations, additional_info = "data/gisaid/additional_info.tsv", flagged_metadata = "data/gisaid/flagged_metadata.txt" @@ -52,6 +53,7 @@ rule notify_gisaid: rule notify_genbank: input: + notify_on_record_change = "data/genbank/notify-on-record-change.done", flagged_annotations = rules.transform_genbank_data.output.flagged_annotations, duplicate_biosample = "data/genbank/duplicate_biosample.txt" params: @@ -63,4 +65,3 @@ rule notify_genbank: # TODO - which rule produces data/genbank/problem_data.tsv? (was not explicit in `ingest-genbank` bash script) shell("./bin/notify-on-problem-data data/genbank/problem_data.tsv") shell("./bin/notify-on-duplicate-biosample-change {input.duplicate_biosample} {params.s3_bucket}/duplicate_biosample.txt.gz") - From 5564a77b247597024c40e3b91a96a28d84a93eb6 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 26 Jan 2024 13:43:15 -0800 Subject: [PATCH 3/3] upload: Include notification touch file as input Ensure that uploads only run after the notifications rules have run. This prevents the race condition between diffs and uploads described in https://github.com/nextstrain/ncov-ingest/issues/423. This will slightly delay the uploads of files but is necessary to support the diffs of local files and files on S3. --- workflow/snakemake_rules/upload.smk | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/workflow/snakemake_rules/upload.smk b/workflow/snakemake_rules/upload.smk index 9ed00edb..2002cbf5 100644 --- a/workflow/snakemake_rules/upload.smk +++ b/workflow/snakemake_rules/upload.smk @@ -76,9 +76,15 @@ def compute_files_to_upload(): files_to_upload = compute_files_to_upload() - rule upload_single: - input: lambda w: files_to_upload[w.remote_filename] + input: + file_to_upload = lambda w: files_to_upload[w.remote_filename], + # Include the notifications touch file as an input to ensure that + # uploads only run after the notifications rules have run. + # This prevents the race condition between diffs and uploads described + # in https://github.com/nextstrain/ncov-ingest/issues/423 + # -Jover, 2024-01-26 + notifications_flag = f"data/{database}/notify.done" if send_notifications else [], output: "data/{database}/{remote_filename}.upload", params: @@ -89,7 +95,7 @@ rule upload_single: """ ./vendored/upload-to-s3 \ {params.quiet} \ - {input:q} \ + {input.file_to_upload:q} \ {params.s3_bucket:q}/{wildcards.remote_filename:q} \ {params.cloudfront_domain} 2>&1 | tee {output} """