diff --git a/README.md b/README.md
index ecf7d8c7..5e877ea3 100644
--- a/README.md
+++ b/README.md
@@ -135,8 +135,10 @@ Whenever the underlying nextclade dataset (reference tree, QC rules) and/or next
 In order to tell ingest to not use the cached `nextclade.tsv`/`aligned.fasta` and instead perform a full rerun, you need to add an (empty) touchfile to the s3 bucket:
 
 ```bash
-aws s3 cp - s3://nextstrain-ncov-private/nextclade.tsv.zst.renew < /dev/null
-aws s3 cp - s3://nextstrain-data/files/ncov/open/nextclade.tsv.zst.renew < /dev/null
+for file in nextclade.tsv.zst.renew version_sars-cov-2.txt; do
+    aws s3 cp - s3://nextstrain-ncov-private/$file < /dev/null
+    aws s3 cp - s3://nextstrain-data/files/ncov/open/$file < /dev/null
+done
 ```
 
 Ingest will automatically remove the touchfiles after it has completed the rerun.
@@ -144,8 +146,10 @@ Ingest will automatically remove the touchfiles after it has completed the rerun
 To rerun Nextclade using the `sars-cov-2-21L` dataset - which is only necessary when the calculation of `immune_escape` and `ace2_binding` changes - you need to add an (empty) touchfile to the s3 bucket:
 
 ```bash
-aws s3 cp - s3://nextstrain-ncov-private/nextclade_21L.tsv.zst.renew < /dev/null
-aws s3 cp - s3://nextstrain-data/files/ncov/open/nextclade_21L.tsv.zst.renew < /dev/null
+for file in nextclade_21L.tsv.zst.renew version_sars-cov-2-21L.txt; do
+    aws s3 cp - s3://nextstrain-ncov-private/$file < /dev/null
+    aws s3 cp - s3://nextstrain-data/files/ncov/open/$file < /dev/null
+done
 ```
 
 ## Required environment variables
@@ -157,3 +161,7 @@ aws s3 cp - s3://nextstrain-data/files/ncov/open/nextclade_21L.tsv.zst.renew < /
 - `AWS_SECRET_ACCESS_KEY`
 - `SLACK_TOKEN`
 - `SLACK_CHANNELS`
+
+## Unstable files produced by workflow
+
+- `version_sars-cov-2.txt` and `version_sars-cov-2-21L.txt`: track the version of the Nextclade dataset used to generate the `nextclade.tsv` and `nextclade_21L.tsv` files. Format: `timestamp dataset_version` per line (e.g. `2023-02-06T14:40:23Z 2023-02-01T12:00:00Z`), with one line for each run since and including the last full run.
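For reference, the renew touchfiles and version files added above live at predictable S3 paths, so their state can be inspected directly. A minimal sketch, assuming AWS credentials with read access to the open bucket (the private bucket works the same way with its own prefix):

```bash
# List any pending renew touchfiles, then print the dataset versions recorded
# since the last full run (one "timestamp dataset_version" entry per line).
aws s3 ls s3://nextstrain-data/files/ncov/open/ | grep '\.renew$' || echo "no pending renew touchfiles"
aws s3 cp s3://nextstrain-data/files/ncov/open/version_sars-cov-2.txt -
```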
diff --git a/workflow/snakemake_rules/nextclade.smk b/workflow/snakemake_rules/nextclade.smk
index 3d183c17..5549ee81 100644
--- a/workflow/snakemake_rules/nextclade.smk
+++ b/workflow/snakemake_rules/nextclade.smk
@@ -138,13 +138,26 @@ rule download_nextclade_executable:
         """
 
 rule download_nextclade_dataset:
-    """Download Nextclade dataset"""
+    """
+    Download Nextclade dataset
+    Append the dataset version used for this run, with the download timestamp, to the version file
+    """
     input: "nextclade"
     output:
-        dataset = "data/nextclade_data/{dataset_name}.zip"
+        dataset = "data/nextclade_data/{dataset_name}.zip",
+        version = "data/nextclade_data/version_{dataset_name}.txt",
+    params:
+        dst_version_file=config["s3_dst"] + "/version_{dataset_name}.txt",
+        src_version_file=config["s3_src"] + "/version_{dataset_name}.txt",
     shell:
         """
+        ./bin/download-from-s3 {params.dst_version_file} {output.version} 0 || \
+        ./bin/download-from-s3 {params.src_version_file} {output.version} 0 || \
+        touch {output.version}
+
         ./nextclade dataset get --name="{wildcards.dataset_name}" --output-zip={output.dataset} --verbose
+        printf %s "$(date --utc +%FT%TZ) " >> {output.version}
+        ./nextclade dataset list --name="{wildcards.dataset_name}" --json | jq -r '.[0].attributes.tag.value' >> {output.version}
         """
 
 GENES = "E,M,N,ORF1a,ORF1b,ORF3a,ORF6,ORF7a,ORF7b,ORF8,ORF9b,S"
diff --git a/workflow/snakemake_rules/upload.smk b/workflow/snakemake_rules/upload.smk
index 5be9ee9c..bf51fa96 100644
--- a/workflow/snakemake_rules/upload.smk
+++ b/workflow/snakemake_rules/upload.smk
@@ -105,6 +105,29 @@ rule remove_rerun_touchfile:
         touch {output}
         """
 
+rule upload_dataset_version:
+    """
+    Upload the Nextclade dataset version file
+    """
+    input:
+        metadata_upload = f"data/{database}/metadata.tsv.zst.upload",
+        version_file = "data/nextclade_data/version_{dataset_name}.txt"
+    output:
+        touch("data/nextclade_data/version_{dataset_name}.upload")
+    params:
+        quiet = "" if send_notifications else "--quiet",
+        s3_bucket = config.get("s3_dst", ""),
+        cloudfront_domain = config.get("cloudfront_domain", ""),
+        remote_filename = "version_{dataset_name}.txt",
+    shell:
+        """
+        ./bin/upload-to-s3 \
+            {params.quiet} \
+            {input.version_file:q} \
+            {params.s3_bucket:q}/{params.remote_filename:q} \
+            {params.cloudfront_domain} 2>&1 | tee {output}
+        """
+
 rule upload:
     """
     Requests one touch file for each uploaded remote file
@@ -117,6 +140,7 @@ rule upload:
                 "nextclade.tsv.zst",
                 "nextclade_21L.tsv.zst",
             ]
-        ]
+        ],
+        dataset_version = [f"data/nextclade_data/version_{dataset_name}.upload" for dataset_name in ["sars-cov-2", "sars-cov-2-21L"]],
     output:
         touch(f"data/{database}/upload.done")
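The version-file bookkeeping in `download_nextclade_dataset` is plain shell around the Nextclade CLI and `jq`. A minimal standalone sketch of that append step, assuming a local `./nextclade` v2 binary and `jq` on `PATH` (the `dataset_name` value here is only an example):

```bash
#!/bin/bash
# Sketch of the "append one version entry per run" step from the rule above.
set -euo pipefail

dataset_name="sars-cov-2"
version_file="data/nextclade_data/version_${dataset_name}.txt"
mkdir -p "$(dirname "$version_file")"

# Stand-in for the download-from-s3 fallback chain: make sure the file exists
# even when no previous version file could be fetched.
[ -f "$version_file" ] || touch "$version_file"

# Append "download-timestamp dataset-tag", matching the README's
# "timestamp dataset_version" format.
printf '%s ' "$(date --utc +%FT%TZ)" >> "$version_file"
./nextclade dataset list --name="$dataset_name" --json \
    | jq -r '.[0].attributes.tag.value' >> "$version_file"
```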