Skip to content

Commit

Permalink
Merge pull request #7 from nextstrain/add-nextclade-to-ingest
Browse files Browse the repository at this point in the history
Add Nextclade to ingest
  • Loading branch information
joverlee521 authored Oct 13, 2023
2 parents 3493a93 + 798c516 commit d49aeeb
Show file tree
Hide file tree
Showing 5 changed files with 153 additions and 0 deletions.
12 changes: 12 additions & 0 deletions ingest/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,18 @@ If you have another data source or private data that needs to be formatted for
the phylogenetic workflow, then you can use a similar workflow to curate your
own data.

## Run

From within the `ingest` directory, run the workflow with:

```
nextstrain build .
```

This produces a `results` directory with the following outputs:
- sequences.fasta
- metadata.tsv

## Config

The config directory contains all of the default configurations for the ingest workflow.
Expand Down
25 changes: 25 additions & 0 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,31 @@
# Load the default configuration values. They can be overridden with
# Snakemake's --configfile/--config options.
configfile: "config/defaults.yaml"


# Default target: the final sequences FASTA and the final metadata TSV.
rule all:
    input:
        "results/sequences.fasta",
        "results/metadata.tsv",


include: "rules/fetch_from_ncbi.smk"
include: "rules/curate.smk"


# The final metadata TSV is produced in one of two ways:
# - Without a "nextclade" section in the config, the subset metadata TSV is
#   simply renamed to the final metadata TSV.
# - With a "nextclade" section, the nextclade rules create the final metadata
#   TSV by joining the Nextclade output with the metadata.
if "nextclade" not in config:

    rule create_final_metadata:
        input:
            metadata="results/subset_metadata.tsv",
        output:
            metadata="results/metadata.tsv",
        shell:
            """
            mv {input.metadata} {output.metadata}
            """

else:

    include: "rules/nextclade.smk"
15 changes: 15 additions & 0 deletions ingest/config/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,18 @@ curate:
output_sequence_field: ""
# The list of metadata columns to keep in the final output of the curation pipeline.
metadata_columns: []


# Settings for the optional Nextclade step of the ingest workflow.
# A Nextclade dataset must already exist for your pathogen.
# Delete this whole section if you do not plan to run Nextclade.
nextclade:
  # Name of the Nextclade dataset to run against.
  # `nextclade dataset list` prints the full list of available datasets.
  dataset_name: ""
  # Path, relative to the ingest directory, to the TSV that maps Nextclade
  # output columns to their new names.
  field_map: "config/nextclade_field_map.tsv"
  # ID field used to match the Nextclade output with the record metadata.
  # Use the new name that you have defined in your field map.
  id_field: "seqName"
18 changes: 18 additions & 0 deletions ingest/config/nextclade_field_map.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# TSV file that maps column names of the Nextclade output TSV to new column names.
# The first column should be the original column name in the Nextclade TSV.
# The second column should be the new column name to use in the final metadata TSV.
# Nextclade can have pathogen-specific output columns, so make sure to check which
# columns would be useful for your downstream phylogenetic analysis.
seqName seqName
clade clade
lineage lineage
coverage coverage
totalMissing missing_data
totalSubstitutions divergence
totalNonACGTNs nonACGTN
qc.missingData.status QC_missing_data
qc.mixedSites.status QC_mixed_sites
qc.privateMutations.status QC_rare_mutations
qc.frameShifts.status QC_frame_shifts
qc.stopCodons.status QC_stop_codons
frameShifts frame_shifts
83 changes: 83 additions & 0 deletions ingest/rules/nextclade.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""
This part of the workflow handles running Nextclade on the curated metadata
and sequences.
See Nextclade docs for more details on usage, inputs, and outputs if you would
like to customize the rules:
https://docs.nextstrain.org/projects/nextclade/en/stable/user/nextclade-cli.html
"""
# Name of the Nextclade dataset, read from the "nextclade" section of the config.
DATASET_NAME = config["nextclade"]["dataset_name"]


rule get_nextclade_dataset:
    """Fetch the configured Nextclade dataset as a zip archive"""
    params:
        dataset_name=DATASET_NAME,
    output:
        dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
    shell:
        """
        nextclade dataset get \
            --name={params.dataset_name:q} \
            --output-zip={output.dataset} \
            --verbose
        """


rule run_nextclade:
    """Run Nextclade on results/sequences.fasta and zip the translations"""
    input:
        sequences="results/sequences.fasta",
        dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
    output:
        nextclade="results/nextclade.tsv",
        alignment="results/alignment.fasta",
        translations="results/translations.zip",
    params:
        # Returning the template from a function deactivates Snakemake's
        # automatic wildcard expansion for this value.
        # https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000
        translations=lambda wildcards: "results/translations/{gene}.fasta",
    shell:
        """
        nextclade run \
            {input.sequences} \
            --input-dataset {input.dataset} \
            --output-tsv {output.nextclade} \
            --output-fasta {output.alignment} \
            --output-translations {params.translations}

        zip -rj {output.translations} results/translations
        """


rule join_metadata_and_nextclade:
    """
    Create the final metadata TSV by appending renamed Nextclade columns to
    the subset metadata.

    Steps performed by the shell command:
    1. Read the first column of the field map to get the Nextclade columns
       to keep. Comment lines (starting with '#') and blank lines are
       skipped wherever they occur, so a field map without a leading comment
       line does not lose its first mapping.
    2. Cut those columns from the Nextclade TSV and rename them according to
       the field map.
    3. Join the renamed columns onto the metadata by the configured ID
       fields. Metadata records without a matching Nextclade record are
       kept, with '?' written in the appended columns.
    4. Drop the Nextclade ID column from the joined output.
    """
    input:
        nextclade="results/nextclade.tsv",
        metadata="results/subset_metadata.tsv",
        nextclade_field_map=config["nextclade"]["field_map"],
    output:
        metadata="results/metadata.tsv",
    params:
        metadata_id_field=config["curate"]["output_id_field"],
        nextclade_id_field=config["nextclade"]["id_field"],
    shell:
        """
        # Comma-separated list of original Nextclade column names.
        SUBSET_FIELDS=$(awk '!/^#/ && NF {{print $1}}' {input.nextclade_field_map} | tr '\n' ',' | sed 's/,$//g')

        # The '?' fill value is quoted so the shell cannot expand it as a glob
        # matching a single-character filename in the working directory.
        csvtk -tl cut -f "$SUBSET_FIELDS" \
            {input.nextclade} \
        | csvtk -tl rename2 \
            -F \
            -f '*' \
            -p '(.+)' \
            -r '{{kv}}' \
            -k {input.nextclade_field_map} \
        | tsv-join -H \
            --filter-file - \
            --key-fields {params.nextclade_id_field:q} \
            --data-fields {params.metadata_id_field:q} \
            --append-fields '*' \
            --write-all '?' \
            {input.metadata} \
        | tsv-select -H --exclude {params.nextclade_id_field:q} \
        > {output.metadata}
        """

0 comments on commit d49aeeb

Please sign in to comment.