Merge pull request #7 from nextstrain/add-nextclade-to-ingest

Add Nextclade to ingest
nextstrain · Oct 13, 2023 · d49aeeb · d49aeeb
2 parents 3493a93 + 798c516
commit d49aeeb
Show file tree

Hide file tree

Showing 5 changed files with 153 additions and 0 deletions.
diff --git a/ingest/README.md b/ingest/README.md
@@ -7,6 +7,18 @@ If you have another data source or private data that needs to be formatted for
 the phylogenetic workflow, then you can use a similar workflow to curate your
 own data.
 
+## Run
+
+From within the `ingest` directory, run the workflow with:
+
+```
+nextstrain build .
+```
+
+This produces a `results` directory with the following outputs:
+- sequences.fasta
+- metadata.tsv
+
 ## Config
 
 The config directory contains all of the default configurations for the ingest workflow.

diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -1,6 +1,31 @@
 # Use default configuration values. Override with Snakemake's --configfile/--config options.
 configfile: "config/defaults.yaml"
 
+rule all:
+    """Request the final outputs of the ingest workflow: sequences FASTA and metadata TSV."""
+    input:
+        "results/sequences.fasta",
+        "results/metadata.tsv",
+
 
 include: "rules/fetch_from_ncbi.smk"
 include: "rules/curate.smk"
+
+
+# If included, the nextclade rules will create the final metadata TSV by
+# joining the Nextclade output with the metadata.
+# However, if not including nextclade, we have to rename the subset metadata TSV
+# to the final metadata TSV.
+if "nextclade" in config:
+    include: "rules/nextclade.smk"
+
+else:
+
+    rule create_final_metadata:
+        """Rename the subset metadata TSV to the final metadata TSV."""
+        input:
+            metadata="results/subset_metadata.tsv"
+        output:
+            metadata="results/metadata.tsv"
+        # `mv` moves the file rather than copying it, so the input path no
+        # longer exists once this rule has run.
+        shell:
+            """
+            mv {input.metadata} {output.metadata}
+            """
diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml
@@ -73,3 +73,18 @@ curate:
   output_sequence_field: ""
   # The list of metadata columns to keep in the final output of the curation pipeline.
   metadata_columns: []
+
+
+# Nextclade parameters to include if you are running Nextclade as a part of your ingest workflow
+# Note that this requires a Nextclade dataset to already exist for your pathogen.
+# Remove this entire `nextclade` section (including the `nextclade:` key) if you do not plan
+# to run Nextclade: the Snakefile includes the Nextclade rules whenever the key is present.
+nextclade:
+  # The name of the Nextclade dataset to use for running nextclade.
+  # Run `nextclade dataset list` to get a full list of available Nextclade datasets
+  dataset_name: ""
+  # Path to the mapping for renaming Nextclade output columns
+  # The path should be relative to the ingest directory
+  field_map: "config/nextclade_field_map.tsv"
+  # This is the ID field you would use to match the Nextclade output with the record metadata.
+  # This should be the new name that you have defined in your field map.
+  id_field: "seqName"
diff --git a/ingest/config/nextclade_field_map.tsv b/ingest/config/nextclade_field_map.tsv
@@ -0,0 +1,18 @@
+# TSV file that is a mapping of column names for Nextclade output TSV
+# The first column should be the original column name of the Nextclade TSV
+# The second column should be the new column name to use in the final metadata TSV
+# Nextclade can have pathogen specific output columns so make sure to check which
+# columns would be useful for your downstream phylogenetic analysis.
+seqName	seqName
+clade	clade
+lineage	lineage
+coverage	coverage
+totalMissing	missing_data
+totalSubstitutions	divergence
+totalNonACGTNs	nonACGTN
+qc.missingData.status	QC_missing_data
+qc.mixedSites.status	QC_mixed_sites
+qc.privateMutations.status	QC_rare_mutations
+qc.frameShifts.status	QC_frame_shifts
+qc.stopCodons.status	QC_stop_codons
+frameShifts	frame_shifts
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
@@ -0,0 +1,83 @@
+"""
+This part of the workflow handles running Nextclade on the curated metadata
+and sequences.
+
+See Nextclade docs for more details on usage, inputs, and outputs if you would
+like to customize the rules:
+https://docs.nextstrain.org/projects/nextclade/en/stable/user/nextclade-cli.html
+"""
+DATASET_NAME = config["nextclade"]["dataset_name"]
+
+
+rule get_nextclade_dataset:
+    """
+    Download the Nextclade dataset named by `DATASET_NAME` and save it as a
+    ZIP archive under `data/nextclade_data/`.
+    """
+    output:
+        dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
+    params:
+        dataset_name=DATASET_NAME
+    # The dataset name is passed through Snakemake's `:q` formatter below so
+    # that it is quoted for the shell.
+    shell:
+        """
+        nextclade dataset get \
+            --name={params.dataset_name:q} \
+            --output-zip={output.dataset} \
+            --verbose
+        """
+
+
+rule run_nextclade:
+    """
+    Run Nextclade on the sequences FASTA using the downloaded dataset ZIP.
+
+    Writes the Nextclade TSV and the alignment FASTA, plus one translation
+    FASTA per `{gene}` under `results/translations`, which are then bundled
+    into a single ZIP archive.
+    """
+    input:
+        dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
+        sequences="results/sequences.fasta",
+    output:
+        nextclade="results/nextclade.tsv",
+        alignment="results/alignment.fasta",
+        translations="results/translations.zip",
+    params:
+        # The lambda is used to deactivate automatic wildcard expansion.
+        # https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000
+        # NOTE(review): `{gene}` is passed through literally for Nextclade to
+        # fill in; newer Nextclade releases may expect a different placeholder
+        # (e.g. `{cds}`) — confirm against the installed version.
+        translations=lambda w: "results/translations/{gene}.fasta",
+    # `zip -rj` recurses into results/translations and stores the files
+    # without their directory paths.
+    shell:
+        """
+        nextclade run \
+            {input.sequences} \
+            --input-dataset {input.dataset} \
+            --output-tsv {output.nextclade} \
+            --output-fasta {output.alignment} \
+            --output-translations {params.translations}
+
+        zip -rj {output.translations} results/translations
+        """
+
+
+rule join_metadata_and_nextclade:
+    """
+    Append selected Nextclade output columns to the subset metadata to create
+    the final metadata TSV.
+
+    The first column of the field map selects which Nextclade columns to keep
+    and the second column gives their new names. The renamed Nextclade records
+    are joined to the metadata on the configured ID fields. All metadata
+    records are written; Nextclade columns of records without a match are
+    filled with `?`. The Nextclade ID column is dropped from the final output.
+    """
+    input:
+        nextclade="results/nextclade.tsv",
+        metadata="results/subset_metadata.tsv",
+        nextclade_field_map=config["nextclade"]["field_map"],
+    output:
+        metadata="results/metadata.tsv",
+    params:
+        metadata_id_field=config["curate"]["output_id_field"],
+        nextclade_id_field=config["nextclade"]["id_field"],
+    # Every non-comment, non-blank line of the field map contributes a column,
+    # including the first line of the file (no header line is assumed).
+    # The `?` fill value is quoted so the shell cannot expand it as a glob.
+    shell:
+        """
+        export SUBSET_FIELDS=`awk '!/^#/ && NF {{print $1}}' {input.nextclade_field_map} | tr '\n' ',' | sed 's/,$//g'`
+
+        csvtk -tl cut -f "$SUBSET_FIELDS" \
+            {input.nextclade} \
+        | csvtk -tl rename2 \
+            -F \
+            -f '*' \
+            -p '(.+)' \
+            -r '{{kv}}' \
+            -k {input.nextclade_field_map} \
+        | tsv-join -H \
+            --filter-file - \
+            --key-fields {params.nextclade_id_field} \
+            --data-fields {params.metadata_id_field} \
+            --append-fields '*' \
+            --write-all '?' \
+            {input.metadata} \
+        | tsv-select -H --exclude {params.nextclade_id_field} \
+            > {output.metadata}
+        """