From 3850af5acab433338ded565a170d737708b25a49 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 18 Oct 2024 15:05:22 -0700 Subject: [PATCH 1/8] ingest: Update to nextclade3 Since the v3 Nextclade datasets for MPXV and hMPXV both use NC_063383.1 as the reference, we can just run Nextclade once using the MPXV dataset. --- ingest/rules/nextclade.smk | 56 +++++++++++++++----------------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 9008398..354b2e4 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -1,54 +1,42 @@ -rule nextclade_dataset: +rule get_nextclade_dataset: output: temp("mpxv.zip"), + params: + dataset_name="MPXV", shell: - """ - nextclade2 dataset get --name MPXV --output-zip {output} - """ - - -rule nextclade_dataset_hMPXV: - output: - temp("hmpxv.zip"), - shell: - """ - nextclade2 dataset get --name hMPXV --output-zip {output} + r""" + nextclade3 dataset get \ + --name {params.dataset_name:q} \ + --output-zip {output:q} """ -rule align: +rule run_nextclade: input: sequences="results/sequences.fasta", - dataset="hmpxv.zip", + dataset="mpxv.zip", output: + nextclade="data/nextclade.tsv", alignment="data/alignment.fasta", - insertions="data/insertions.csv", translations="data/translations.zip", params: # The lambda is used to deactivate automatic wildcard expansion. # https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000 - translations=lambda w: "data/translations/{gene}.fasta", + translations=lambda w: "data/translations/{cds}.fasta", threads: 4 shell: - """ - nextclade2 run -D {input.dataset} -j {threads} --retry-reverse-complement \ - --output-fasta {output.alignment} --output-translations {params.translations} \ - --output-insertions {output.insertions} {input.sequences} - zip -rj {output.translations} data/translations - """ - + r""" + nextclade3 run \ + {input.sequences:q} \ + --jobs {threads:q} \ + --retry-reverse-complement \ + --input-dataset {input.dataset:q} \ + --output-tsv {output.nextclade:q} \ + --output-fasta {output.alignment:q} \ + --output-translations {params.translations:q} -rule nextclade: - input: - sequences="results/sequences.fasta", - dataset="mpxv.zip", - output: - "data/nextclade.tsv", - threads: 4 - shell: - """ - nextclade2 run -D {input.dataset} -j {threads} --output-tsv {output} {input.sequences} --retry-reverse-complement + zip -rj {output.translations:q} data/translations """ @@ -63,7 +51,7 @@ rule join_metadata_clades: id_field=config["curate"]["id_field"], nextclade_id_field=config["nextclade"]["id_field"], shell: - """ + r""" export SUBSET_FIELDS=`awk 'NR>1 {{print $1}}' {input.nextclade_field_map} | tr '\n' ',' | sed 's/,$//g'` csvtk -tl cut -f $SUBSET_FIELDS \ From cbcc7d4d9ab42e34b34caa2539a48acffc83578d Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 18 Oct 2024 15:51:16 -0700 Subject: [PATCH 2/8] ingest: move Nextclade dataset to `data/` and outputs to `results/` Move the downloaded Nextclade dataset to the `data/` directory so that it's automatically ignored by git. Move Nextclade outputs to `results` as they are part of the final outputs of the ingest workflow. --- ingest/rules/nextclade.smk | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 354b2e4..9e50caf 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -1,7 +1,7 @@ rule get_nextclade_dataset: output: - temp("mpxv.zip"), + temp("data/mpxv.zip"), params: dataset_name="MPXV", shell: @@ -15,15 +15,15 @@ rule get_nextclade_dataset: rule run_nextclade: input: sequences="results/sequences.fasta", - dataset="mpxv.zip", + dataset="data/mpxv.zip", output: - nextclade="data/nextclade.tsv", - alignment="data/alignment.fasta", - translations="data/translations.zip", + nextclade="results/nextclade.tsv", + alignment="results/alignment.fasta", + translations="results/translations.zip", params: # The lambda is used to deactivate automatic wildcard expansion. # https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000 - translations=lambda w: "data/translations/{cds}.fasta", + translations=lambda w: "results/translations/{cds}.fasta", threads: 4 shell: r""" @@ -36,13 +36,13 @@ rule run_nextclade: --output-fasta {output.alignment:q} \ --output-translations {params.translations:q} - zip -rj {output.translations:q} data/translations + zip -rj {output.translations:q} results/translations """ rule join_metadata_clades: input: - nextclade="data/nextclade.tsv", + nextclade="results/nextclade.tsv", metadata="data/subset_metadata.tsv", nextclade_field_map=config["nextclade"]["field_map"], output: From 123447943f67282248733f3ed92ea2d15b79ffef Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 18 Oct 2024 15:28:46 -0700 Subject: [PATCH 3/8] ingest: Rename Nextclade metadata fields with augur curate rename Modified from . I switched the order of `augur curate rename` and `tsv-select` because the Nextclade data includes fields that we don't use that hit the csv field size limit. This commit also includes automated reformatting by `snakefmt`. --- ingest/defaults/config.yaml | 22 ++++++++++++++-- ingest/defaults/nextclade-field-map.tsv | 16 ----------- ingest/rules/nextclade.smk | 35 +++++++++++++++++-------- 3 files changed, 44 insertions(+), 29 deletions(-) delete mode 100644 ingest/defaults/nextclade-field-map.tsv diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index f35485b..935c210 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -80,5 +80,23 @@ curate: nextclade: # Field to use as the sequence ID in the Nextclade file id_field: 'seqName' - # Fields from a Nextclade file to be renamed (if desired) and appended to a metadata file - field_map: 'defaults/nextclade-field-map.tsv' + # The first column should be the original column name of the Nextclade TSV + # The second column should be the new column name to use in the final metadata TSV + # Nextclade can have pathogen specific output columns so make sure to check which + # columns would be useful for your downstream phylogenetic analysis. + field_map: + seqName: "seqName" + clade: "clade" + outbreak: "outbreak" + lineage: "lineage" + coverage: "coverage" + totalMissing: "missing_data" + totalSubstitutions: "divergence" + totalNonACGTNs: "nonACGTN" + qc.missingData.status: "QC_missing_data" + qc.mixedSites.status: "QC_mixed_sites" + qc.privateMutations.status: "QC_rare_mutations" + qc.frameShifts.status: "QC_frame_shifts" + qc.stopCodons.status: "QC_stop_codons" + frameShifts: "frame_shifts" + isReverseComplement: "is_reverse_complement" diff --git a/ingest/defaults/nextclade-field-map.tsv b/ingest/defaults/nextclade-field-map.tsv deleted file mode 100644 index a495da3..0000000 --- a/ingest/defaults/nextclade-field-map.tsv +++ /dev/null @@ -1,16 +0,0 @@ -key value -seqName seqName -clade clade -outbreak outbreak -lineage lineage -coverage coverage -totalMissing missing_data -totalSubstitutions divergence -totalNonACGTNs nonACGTN -qc.missingData.status QC_missing_data -qc.mixedSites.status QC_mixed_sites -qc.privateMutations.status QC_rare_mutations -qc.frameShifts.status QC_frame_shifts -qc.stopCodons.status QC_stop_codons -frameShifts frame_shifts -isReverseComplement is_reverse_complement \ No newline at end of file diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 9e50caf..6841671 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -1,3 +1,5 @@ +import sys + rule get_nextclade_dataset: output: @@ -40,28 +42,39 @@ rule run_nextclade: """ +if isinstance(config["nextclade"]["field_map"], str): + print( + f"Converting config['nextclade']['field_map'] from TSV file ({config['nextclade']['field_map']}) to dictionary; " + f"consider putting the field map directly in the config file.", + file=sys.stderr, + ) + with open(config["nextclade"]["field_map"], "r") as f: + config["nextclade"]["field_map"] = dict( + line.rstrip("\n").split("\t", 1) for line in f if not line.startswith("#") + ) + + rule join_metadata_clades: input: nextclade="results/nextclade.tsv", metadata="data/subset_metadata.tsv", - nextclade_field_map=config["nextclade"]["field_map"], output: metadata="results/metadata.tsv", params: id_field=config["curate"]["id_field"], nextclade_id_field=config["nextclade"]["id_field"], + nextclade_field_map=[ + f"{old}={new}" for old, new in config["nextclade"]["field_map"].items() + ], + nextclade_fields=",".join(config["nextclade"]["field_map"].keys()), shell: r""" - export SUBSET_FIELDS=`awk 'NR>1 {{print $1}}' {input.nextclade_field_map} | tr '\n' ',' | sed 's/,$//g'` - - csvtk -tl cut -f $SUBSET_FIELDS \ - {input.nextclade} \ - | csvtk -tl rename2 \ - -F \ - -f '*' \ - -p '(.+)' \ - -r '{{kv}}' \ - -k {input.nextclade_field_map} \ + tsv-select --header --fields {params.nextclade_fields:q} {input.nextclade} \ + | augur curate rename \ + --metadata - \ + --id-column {params.nextclade_id_field:q} \ + --field-map {params.nextclade_field_map:q} \ + --output-metadata - \ | tsv-join -H \ --filter-file - \ --key-fields {params.nextclade_id_field} \ From 54ca356e098baf969abfb496fbae700e86ab2698 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 18 Oct 2024 16:26:16 -0700 Subject: [PATCH 4/8] ingest: Merge Nextclade metadata with `augur merge` Modified from --- ingest/rules/nextclade.smk | 40 +++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 6841671..cd547f7 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -54,14 +54,12 @@ if isinstance(config["nextclade"]["field_map"], str): ) -rule join_metadata_clades: +rule nextclade_metadata: input: nextclade="results/nextclade.tsv", - metadata="data/subset_metadata.tsv", output: - metadata="results/metadata.tsv", + nextclade_metadata=temp("results/nextclade_metadata.tsv"), params: - id_field=config["curate"]["id_field"], nextclade_id_field=config["nextclade"]["id_field"], nextclade_field_map=[ f"{old}={new}" for old, new in config["nextclade"]["field_map"].items() @@ -74,14 +72,28 @@ rule join_metadata_clades: --metadata - \ --id-column {params.nextclade_id_field:q} \ --field-map {params.nextclade_field_map:q} \ - --output-metadata - \ - | tsv-join -H \ - --filter-file - \ - --key-fields {params.nextclade_id_field} \ - --data-fields {params.id_field} \ - --append-fields '*' \ - --write-all ? \ - {input.metadata} \ - | tsv-select -H --exclude {params.nextclade_id_field} \ - > {output.metadata} + --output-metadata {output.nextclade_metadata:q} + """ + + +rule join_metadata_and_nextclade: + input: + metadata="data/subset_metadata.tsv", + nextclade_metadata="results/nextclade_metadata.tsv", + output: + metadata="results/metadata.tsv", + params: + metadata_id_field=config["curate"]["id_field"], + nextclade_id_field=config["nextclade"]["id_field"], + shell: + r""" + augur merge \ + --metadata \ + metadata={input.metadata:q} \ + nextclade={input.nextclade_metadata:q} \ + --metadata-id-columns \ + metadata={params.metadata_id_field:q} \ + nextclade={params.nextclade_id_field:q} \ + --output-metadata {output.metadata:q} \ + --no-source-columns """ From 475b64f952590fb6ebf02847c9e9902abeb73cb1 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 18 Oct 2024 16:31:45 -0700 Subject: [PATCH 5/8] ingest/nextstrain_automation: Update `files_to_upload` Update the `files_to_upload` to match the output files from Nextclade v3. --- ingest/build-configs/nextstrain-automation/config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ingest/build-configs/nextstrain-automation/config.yaml b/ingest/build-configs/nextstrain-automation/config.yaml index 82bba07..5f26423 100644 --- a/ingest/build-configs/nextstrain-automation/config.yaml +++ b/ingest/build-configs/nextstrain-automation/config.yaml @@ -18,9 +18,9 @@ upload: all_sequences.ndjson.xz: data/sequences.ndjson metadata.tsv.gz: results/metadata.tsv sequences.fasta.xz: results/sequences.fasta - alignment.fasta.xz: data/alignment.fasta - insertions.csv.gz: data/insertions.csv - translations.zip: data/translations.zip + nextclade.tsv.zst: results/nextclade.tsv + alignment.fasta.xz: results/alignment.fasta + translations.zip: results/translations.zip cloudfront_domain: 'data.nextstrain.org' From 6b5153c4fd4ef09235ef52b73f9905a0f740faf3 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 18 Oct 2024 16:36:39 -0700 Subject: [PATCH 6/8] phylogenetic: Update description.md for Nextclade v3 Remove reference to nextalign and update available files to match the output files from Nextclade v3. --- phylogenetic/defaults/description.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/phylogenetic/defaults/description.md b/phylogenetic/defaults/description.md index 558e663..29ae51f 100644 --- a/phylogenetic/defaults/description.md +++ b/phylogenetic/defaults/description.md @@ -12,7 +12,7 @@ The fourth is [`mpox/clade-I`](https://nextstrain.org/mpox/clade-I), which focus #### Analysis Our bioinformatic processing workflow can be found at [github.com/nextstrain/mpox](https://github.com/nextstrain/mpox) and includes: -- sequence alignment by [nextalign](https://docs.nextstrain.org/projects/nextclade/en/stable/user/nextalign-cli.html) +- sequence alignment by [Nextclade](https://docs.nextstrain.org/projects/nextclade/en/stable/user/nextclade-cli/index.html) - masking several regions of the genome, including the first 1350 and last 6422 base pairs and multiple repetitive regions of variable length - phylogenetic reconstruction using [IQTREE-2](http://www.iqtree.org/) - ancestral state reconstruction and temporal inference using [TreeTime](https://github.com/neherlab/treetime) @@ -26,9 +26,9 @@ Curated sequences and metadata are available as flat files at: - [data.nextstrain.org/files/workflows/mpox/sequences.fasta.xz](https://data.nextstrain.org/files/workflows/mpox/sequences.fasta.xz) - [data.nextstrain.org/files/workflows/mpox/metadata.tsv.gz](https://data.nextstrain.org/files/workflows/mpox/metadata.tsv.gz) -Pairwise alignments with [Nextclade](https://clades.nextstrain.org/) against the [reference sequence MPXV-M5312_HM12_Rivers](https://www.ncbi.nlm.nih.gov/nuccore/NC_063383), insertions relative to the reference, and translated ORFs are available at +Pairwise alignments with [Nextclade](https://clades.nextstrain.org/) against the [reference sequence MPXV-M5312_HM12_Rivers](https://www.ncbi.nlm.nih.gov/nuccore/NC_063383), Nextclade analysis results, and translated ORFs are available at - [data.nextstrain.org/files/workflows/mpox/alignment.fasta.xz](https://data.nextstrain.org/files/workflows/mpox/alignment.fasta.xz) -- [data.nextstrain.org/files/workflows/mpox/insertions.csv.gz](https://data.nextstrain.org/files/workflows/mpox/insertions.csv.gz) +- [data.nextstrain.org/files/workflows/mpox/nextclade.tsv.zst](https://data.nextstrain.org/files/workflows/mpox/nextclade.tsv.zst) - [data.nextstrain.org/files/workflows/mpox/translations.zip](https://data.nextstrain.org/files/workflows/mpox/translations.zip) --- From 42c10401366712a3894c067656b3b46b33c8a7fa Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 22 Oct 2024 11:19:55 -0700 Subject: [PATCH 7/8] ingest/nextclade: Add log and benchmark Adding log/benchmark to all rules in nextclade.smk according to the Nextstrain Snakemake style guide --- ingest/rules/nextclade.smk | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index cd547f7..6af90bf 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -6,11 +6,15 @@ rule get_nextclade_dataset: temp("data/mpxv.zip"), params: dataset_name="MPXV", + log: + "logs/get_nextclade_dataset.txt", + benchmark: + "benchmarks/get_nextclade_dataset.txt" shell: r""" nextclade3 dataset get \ --name {params.dataset_name:q} \ - --output-zip {output:q} + --output-zip {output:q} 2>&1 | tee {log} """ @@ -27,6 +31,10 @@ rule run_nextclade: # https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000 translations=lambda w: "results/translations/{cds}.fasta", threads: 4 + log: + "logs/run_nextclade.txt", + benchmark: + "benchmarks/run_nextclade.txt" shell: r""" nextclade3 run \ @@ -36,7 +44,7 @@ rule run_nextclade: --input-dataset {input.dataset:q} \ --output-tsv {output.nextclade:q} \ --output-fasta {output.alignment:q} \ - --output-translations {params.translations:q} + --output-translations {params.translations:q} 2>&1 | tee {log} zip -rj {output.translations:q} results/translations """ @@ -65,14 +73,18 @@ rule nextclade_metadata: f"{old}={new}" for old, new in config["nextclade"]["field_map"].items() ], nextclade_fields=",".join(config["nextclade"]["field_map"].keys()), + log: + "logs/nextclade_metadata.txt", + benchmark: + "benchmarks/nextclade_metadata.txt" shell: r""" - tsv-select --header --fields {params.nextclade_fields:q} {input.nextclade} \ + (tsv-select --header --fields {params.nextclade_fields:q} {input.nextclade} \ | augur curate rename \ --metadata - \ --id-column {params.nextclade_id_field:q} \ --field-map {params.nextclade_field_map:q} \ - --output-metadata {output.nextclade_metadata:q} + --output-metadata {output.nextclade_metadata:q} ) 2>&1 | tee {log} """ @@ -85,6 +97,10 @@ rule join_metadata_and_nextclade: params: metadata_id_field=config["curate"]["id_field"], nextclade_id_field=config["nextclade"]["id_field"], + log: + "logs/join_metadata_and_nextclade.txt", + benchmark: + "benchmarks/join_metadata_and_nextclade.txt" shell: r""" augur merge \ @@ -95,5 +111,5 @@ rule join_metadata_and_nextclade: metadata={params.metadata_id_field:q} \ nextclade={params.nextclade_id_field:q} \ --output-metadata {output.metadata:q} \ - --no-source-columns + --no-source-columns 2>&1 | tee {log} """ From dab86fff10bd92199103cb70d7217f3a050a5ab3 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 22 Oct 2024 15:19:39 -0700 Subject: [PATCH 8/8] ingest/nextclade: Use all available workflow cores Chatter on Slack regarding workflow runtimes made me realize that since we only run a single Nextclade job now, we no longer need to put a hard limit on the threads. --- ingest/rules/nextclade.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 6af90bf..c80a750 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -30,7 +30,7 @@ rule run_nextclade: # The lambda is used to deactivate automatic wildcard expansion. # https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000 translations=lambda w: "results/translations/{cds}.fasta", - threads: 4 + threads: workflow.cores log: "logs/run_nextclade.txt", benchmark: