-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #7 from nextstrain/add-nextclade-to-ingest
Add Nextclade to ingest
- Loading branch information
Showing
5 changed files
with
153 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,31 @@ | ||
# Use default configuration values. Override with Snakemake's --configfile/--config options. | ||
configfile: "config/defaults.yaml" | ||
|
||
# Target rule: requests the final sequences FASTA and the final metadata TSV. | ||
rule all: | ||
    input: | ||
        "results/sequences.fasta", | ||
        "results/metadata.tsv", | ||
|
||
|
||
include: "rules/fetch_from_ncbi.smk" | ||
include: "rules/curate.smk" | ||
|
||
|
||
# If included, the nextclade rules will create the final metadata TSV by | ||
# joining the Nextclade output with the metadata. | ||
# However, if not including nextclade, we have to rename the subset metadata TSV | ||
# to the final metadata TSV. | ||
if "nextclade" in config: | ||
    # A "nextclade" section is present in the config: load the Nextclade rules. | ||
    include: "rules/nextclade.smk" | ||
|
||
else: | ||
|
||
    # No "nextclade" section in the config: results/metadata.tsv is produced | ||
    # directly from results/subset_metadata.tsv. | ||
    # NOTE: `mv` renames the file, so results/subset_metadata.tsv no longer | ||
    # exists on disk once this rule has run. | ||
    rule create_final_metadata: | ||
        input: | ||
            metadata="results/subset_metadata.tsv" | ||
        output: | ||
            metadata="results/metadata.tsv" | ||
        shell: | ||
            """ | ||
            mv {input.metadata} {output.metadata} | ||
            """ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# TSV file that is a mapping of column names for Nextclade output TSV | ||
# The first column should be the original column name of the Nextclade TSV | ||
# The second column should be the new column name to use in the final metadata TSV | ||
# Nextclade can have pathogen-specific output columns, so make sure to check which | ||
# columns would be useful for your downstream phylogenetic analysis. | ||
seqName seqName | ||
clade clade | ||
lineage lineage | ||
coverage coverage | ||
totalMissing missing_data | ||
totalSubstitutions divergence | ||
totalNonACGTNs nonACGTN | ||
qc.missingData.status QC_missing_data | ||
qc.mixedSites.status QC_mixed_sites | ||
qc.privateMutations.status QC_rare_mutations | ||
qc.frameShifts.status QC_frame_shifts | ||
qc.stopCodons.status QC_stop_codons | ||
frameShifts frame_shifts |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
""" | ||
This part of the workflow handles running Nextclade on the curated metadata | ||
and sequences. | ||
See Nextclade docs for more details on usage, inputs, and outputs if you would | ||
like to customize the rules: | ||
https://docs.nextstrain.org/projects/nextclade/en/stable/user/nextclade-cli.html | ||
""" | ||
# Nextclade dataset name, read from the "nextclade" section of the config. | ||
# It is used below both as the `--name` passed to `nextclade dataset get` and as | ||
# the base name of the downloaded dataset zip file. | ||
DATASET_NAME = config["nextclade"]["dataset_name"] | ||
|
||
|
||
rule get_nextclade_dataset: | ||
    """Fetch the configured Nextclade dataset and store it as a zip archive""" | ||
    params: | ||
        dataset_name=DATASET_NAME, | ||
    output: | ||
        dataset="data/nextclade_data/{}.zip".format(DATASET_NAME), | ||
    shell: | ||
        """ | ||
        nextclade dataset get \ | ||
            --name={params.dataset_name:q} \ | ||
            --output-zip={output.dataset} \ | ||
            --verbose | ||
        """ | ||
|
||
|
||
# Run Nextclade on the sequences FASTA using the downloaded dataset zip. | ||
# Outputs: the Nextclade TSV, the alignment FASTA, and a zip archive of the | ||
# translation FASTA files that Nextclade writes under results/translations. | ||
rule run_nextclade: | ||
    input: | ||
        dataset=f"data/nextclade_data/{DATASET_NAME}.zip", | ||
        sequences="results/sequences.fasta", | ||
    output: | ||
        nextclade="results/nextclade.tsv", | ||
        alignment="results/alignment.fasta", | ||
        translations="results/translations.zip", | ||
    params: | ||
        # The lambda is used to deactivate automatic wildcard expansion. | ||
        # https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000 | ||
        # The literal "{gene}" placeholder is therefore handed to Nextclade's | ||
        # --output-translations option unchanged. | ||
        translations=lambda w: "results/translations/{gene}.fasta", | ||
    # After Nextclade finishes, `zip -rj` archives the results/translations | ||
    # directory recursively (-r) while storing only file names, not paths (-j). | ||
    shell: | ||
        """ | ||
        nextclade run \ | ||
            {input.sequences} \ | ||
            --input-dataset {input.dataset} \ | ||
            --output-tsv {output.nextclade} \ | ||
            --output-fasta {output.alignment} \ | ||
            --output-translations {params.translations} | ||
        zip -rj {output.translations} results/translations | ||
        """ | ||
|
||
|
||
# Join selected, renamed Nextclade columns onto the subset metadata to produce | ||
# the final metadata TSV. | ||
# | ||
# Steps performed by the shell command: | ||
#   1. Build a comma-separated list of the Nextclade column names taken from the | ||
#      first column of the field map. Comment lines (starting with '#') are | ||
#      removed first and blank lines are skipped, so the list is correct | ||
#      whether or not the field map begins with a comment. | ||
#   2. Cut those columns from the Nextclade TSV and rename them using the field | ||
#      map as a key-value file. | ||
#   3. Append the renamed columns to the metadata records with tsv-join, | ||
#      writing "?" for metadata records that have no matching Nextclade record. | ||
#      The "?" is quoted so the shell cannot expand it as a glob. | ||
#   4. Drop the Nextclade ID column from the joined output. | ||
rule join_metadata_and_nextclade: | ||
    input: | ||
        nextclade="results/nextclade.tsv", | ||
        metadata="results/subset_metadata.tsv", | ||
        nextclade_field_map=config["nextclade"]["field_map"], | ||
    output: | ||
        metadata="results/metadata.tsv", | ||
    params: | ||
        metadata_id_field=config["curate"]["output_id_field"], | ||
        nextclade_id_field=config["nextclade"]["id_field"], | ||
    shell: | ||
        """ | ||
        export SUBSET_FIELDS=`grep -v '^#' {input.nextclade_field_map} | awk 'NF {{print $1}}' | tr '\n' ',' | sed 's/,$//g'` | ||
        csvtk -tl cut -f "$SUBSET_FIELDS" \ | ||
            {input.nextclade} \ | ||
        | csvtk -tl rename2 \ | ||
            -F \ | ||
            -f '*' \ | ||
            -p '(.+)' \ | ||
            -r '{{kv}}' \ | ||
            -k {input.nextclade_field_map} \ | ||
        | tsv-join -H \ | ||
            --filter-file - \ | ||
            --key-fields {params.nextclade_id_field} \ | ||
            --data-fields {params.metadata_id_field} \ | ||
            --append-fields '*' \ | ||
            --write-all '?' \ | ||
            {input.metadata} \ | ||
        | tsv-select -H --exclude {params.nextclade_id_field} \ | ||
            > {output.metadata} | ||
        """ |