Skip to content

Commit

Permalink
ingest: Add optional Nextclade rules
Browse files Browse the repository at this point in the history
Add rules for running Nextclade as a part of the ingest workflow. These
rules are optional because not every pathogen will have a Nextclade
dataset to be able to run Nextclade as a part of ingest.
  • Loading branch information
joverlee521 committed Oct 6, 2023
1 parent 9a18402 commit dec4dd5
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 0 deletions.
3 changes: 3 additions & 0 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ configfile: "config/defaults.yaml"

include: "rules/fetch_from_ncbi.smk"
include: "rules/curate.smk"

# The Nextclade rules are optional: they are only included when the workflow
# config defines a "nextclade" section.
if "nextclade" in config:
    include: "rules/nextclade.smk"
9 changes: 9 additions & 0 deletions ingest/config/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,12 @@ curate:
output_sequence_field: ""
# The list of metadata columns to keep in the final output of the curation pipeline.
metadata_columns: []


# Nextclade parameters to include if you are running Nextclade as a part of your ingest workflow
# Note that this requires a Nextclade dataset to already exist for your pathogen.
# Remove the following parameters if you do not plan to run Nextclade.
nextclade:
  # The name of the Nextclade dataset to use for running nextclade.
  # Run `nextclade dataset list` to get a full list of available Nextclade datasets
  dataset_name: ""
46 changes: 46 additions & 0 deletions ingest/rules/nextclade.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""
This part of the workflow handles running Nextclade on the curated metadata
and sequences.
See Nextclade docs for more details on usage, inputs, and outputs if you would
like to customize the rules:
https://docs.nextstrain.org/projects/nextclade/en/stable/user/nextclade-cli.html
"""
# Name of the Nextclade dataset, read from the `nextclade.dataset_name` config
# entry. It is used below to build the path of the downloaded dataset zip and
# is passed to `nextclade dataset get`.
DATASET_NAME = config["nextclade"]["dataset_name"]


rule get_nextclade_dataset:
    """
    Download the Nextclade dataset named by DATASET_NAME as a zip archive.

    Output:
        dataset: zip archive written to data/nextclade_data/<DATASET_NAME>.zip
    """
    output:
        dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
    params:
        # Passed through `params` so the name can be shell-quoted with `:q`
        # in the command below.
        dataset_name=DATASET_NAME,
    shell:
        """
        nextclade dataset get \
            --name={params.dataset_name:q} \
            --output-zip={output.dataset} \
            --verbose
        """


rule run_nextclade:
    """
    Run Nextclade on the curated sequences using the downloaded dataset.

    Input:
        dataset: Nextclade dataset zip
        sequences: FASTA of sequences to analyse
    Output:
        nextclade: Nextclade results TSV
        alignment: aligned sequences FASTA

    The translation FASTA files are written by Nextclade using the
    `params.translations` path template. They are not declared as Snakemake
    outputs because their names depend on the `{gene}` placeholder, which
    Nextclade itself fills in.
    """
    input:
        dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
        sequences="results/sequences.fasta",
    output:
        nextclade="results/nextclade.tsv",
        alignment="results/alignment.fasta",
    params:
        # The lambda is used to deactivate automatic wildcard expansion, so
        # that "{gene}" reaches Nextclade as a literal template placeholder.
        # Callables are only accepted for input and params, not for output.
        # https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000
        translations=lambda w: "results/translations_{gene}.fasta",
    shell:
        """
        nextclade run \
            {input.sequences} \
            --input-dataset {input.dataset} \
            --output-tsv {output.nextclade} \
            --output-fasta {output.alignment} \
            --output-translations {params.translations}
        """

0 comments on commit dec4dd5

Please sign in to comment.