diff --git a/workflow/Snakefile b/workflow/Snakefile
index 8f6c59f..8f52150 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -1,6 +1,6 @@
 # Python standard library
 from os.path import join
-import os, sys
+import os, sys, json
 
 # Local imports
 from scripts.common import (
@@ -66,14 +66,38 @@ rule all:
             join(workpath, "{name}", "fastqs", "{name}.fastq.gz"),
             name=samples
         ),
-        # Base-calling quality filtering,
-        # @imported from `rule nanofilt` in rules/trim.smk
+        # Adapter trimming step,
+        # @imported from `rule porechop` in rules/trim.smk
         expand(
-            join(workpath, "{name}", "fastqs", "{name}.filtered.fastq.gz"),
+            join(workpath, "{name}", "fastqs", "{name}.trimmed.fastq.gz"),
             name=samples
         ),
+        # Align reads against monkeypox reference,
+        # @imported from `rule minimap2` in rules/map.smk
+        expand(
+            join(workpath, "{name}", "bams", "{name}.sam"),
+            name=samples
+        ),
+        # Create a consensus sequence from alignments,
+        # @imported from `rule consensus` in rules/map.smk
+        expand(
+            join(workpath, "{name}", "consensus", "{name}_consensus_seqid.fa"),
+            name=samples
+        ),
+        # Create input file for MSA, concatenates the ref and
+        # each sample's consensus sequence.
+        # @imported from `rule concat` in rules/map.smk
+        join(workpath, "project", "consensus.fa"),
+        # Multiple sequence alignment (MSA),
+        # @imported from `rule mafft` in rules/map.smk
+        join(workpath, "project", "msa.fa"),
+        # Build a phylogenetic tree from MSA,
+        # @imported from `rule tree` in rules/tree.smk
+        join(workpath, "project", "mpox_phylogeny.raxml.bestTree"),
 
 
 # Import rules
 include: join("rules", "common.smk")
-include: join("rules", "trim.smk")
\ No newline at end of file
+include: join("rules", "trim.smk")
+include: join("rules", "map.smk")
+include: join("rules", "tree.smk")
\ No newline at end of file
diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk
new file mode 100644
index 0000000..911ab8f
--- /dev/null
+++ b/workflow/rules/map.smk
@@ -0,0 +1,125 @@
+# Data processing rules to map, collapse, and perform MSA
+rule minimap2:
+    """
+    Data-processing step to align reads against the NCBI monkeypox reference
+    sequence ("NC_003310.1"): https://www.ncbi.nlm.nih.gov/nuccore/NC_003310.1
+    @Input:
+        Trimmed FastQ file (scatter)
+    @Output:
+        SAM file
+    """
+    input:
+        fq = join(workpath, "{name}", "fastqs", "{name}.trimmed.fastq.gz"),
+    output:
+        sam = join(workpath, "{name}", "bams", "{name}.sam"),
+    params:
+        rname = 'minimap2',
+        ref_fa = config['references']['mpox_ref_genome'],
+    conda: depending(conda_yaml_or_named_env, use_conda)
+    container: depending(config['images']['mpox-seek'], use_singularity)
+    shell:
+        """
+        # Align against the NCBI monkeypox genome:
+        # https://www.ncbi.nlm.nih.gov/nuccore/NC_003310.1
+        minimap2 \\
+            -ax map-ont \\
+            --secondary=no \\
+            --sam-hit-only \\
+            {params.ref_fa} \\
+            {input.fq} \\
+        > {output.sam}
+        """
+
+
+rule consensus:
+    """
+    Data-processing step to collapse aligned reads into a consensus
+    sequence using viral_consensus.
+    @Input:
+        SAM file (scatter)
+    @Output:
+        Consensus FASTA file,
+        Consensus FASTA file with sample names as sequence identifiers
+    """
+    input:
+        sam = join(workpath, "{name}", "bams", "{name}.sam"),
+    output:
+        fa = join(workpath, "{name}", "consensus", "{name}_consensus.fa"),
+        fixed = join(workpath, "{name}", "consensus", "{name}_consensus_seqid.fa"),
+    params:
+        rname = 'consensus',
+        ref_fa = config['references']['mpox_ref_genome'],
+    conda: depending(conda_yaml_or_named_env, use_conda)
+    container: depending(config['images']['mpox-seek'], use_singularity)
+    shell:
+        """
+        # Create a consensus sequence of aligned reads
+        viral_consensus \\
+            -i {input.sam} \\
+            -r {params.ref_fa} \\
+            -o {output.fa}
+
+        # Rename the sequence identifiers in the FASTA
+        # file to contain only the sample name; by
+        # default, viral_consensus embeds the command
+        # that was run in the sequence identifier.
+        awk '{{split($0,a," "); n=split(a[4],b,"/"); gsub(/\\.sam$/,"",b[n]); if(a[2]) print ">"b[n]; else print; }}' \\
+            {output.fa} \\
+        > {output.fixed}
+        """
+
+
+rule concat:
+    """
+    Data-processing step to create an input FASTA file for mafft. This
+    FASTA file should contain the reference genome and the consensus
+    sequences of all samples.
+    @Input:
+        Consensus FASTA file with sample names as sequence identifiers (gather)
+    @Output:
+        FASTA file containing the ref and the consensus sequences of all samples
+    """
+    input:
+        fas = expand(join(workpath, "{name}", "consensus", "{name}_consensus_seqid.fa"), name=samples),
+    output:
+        fa = join(workpath, "project", "consensus.fa"),
+    params:
+        rname = 'premafft',
+        ref_fa = config['references']['mpox_ref_genome'],
+    conda: depending(conda_yaml_or_named_env, use_conda)
+    container: depending(config['images']['mpox-seek'], use_singularity)
+    shell:
+        """
+        # Create FASTA file with the reference genome
+        # and the consensus sequence of each sample
+        cat {params.ref_fa} \\
+            {input.fas} \\
+        > {output.fa}
+        """
+
+
+rule mafft:
+    """
+    Data-processing step to run multiple sequence alignment (MSA) of the
+    reference genome and the consensus sequence of each sample.
+    @Input:
+        FASTA file containing the ref and the consensus sequences of all samples (indirect-gather, singleton)
+    @Output:
+        Multiple sequence alignment (MSA) FASTA file from mafft.
+    """
+    input:
+        fa = join(workpath, "project", "consensus.fa"),
+    output:
+        msa = join(workpath, "project", "msa.fa"),
+    params:
+        rname = 'msa',
+        ref_fa = config['references']['mpox_ref_genome'],
+    conda: depending(conda_yaml_or_named_env, use_conda)
+    container: depending(config['images']['mpox-seek'], use_singularity)
+    shell:
+        """
+        # Run multiple sequence alignment (MSA) of the
+        # reference genome and each sample's consensus
+        # sequence using mafft
+        mafft --auto --thread 2 {input.fa} > {output.msa}
+        """
\ No newline at end of file
diff --git a/workflow/rules/tree.smk b/workflow/rules/tree.smk
new file mode 100644
index 0000000..a7e1637
--- /dev/null
+++ b/workflow/rules/tree.smk
@@ -0,0 +1,35 @@
+# Data processing rules to build a phylogenetic tree
+rule tree:
+    """
+    Data-processing step to build a phylogenetic tree of the multiple sequence
+    alignment (MSA) results using raxml-ng. This phylogenetic tree can be viewed
+    and explored interactively with tools like figtree, ete, taxonium, java
+    treeviewer, etc. These tools are open-source and free to use, and many of
+    them can be downloaded as desktop applications (run offline).
+    @Input:
+        Multiple sequence alignment (MSA) FASTA file from mafft.
+    @Output:
+        Phylogenetic tree (Newick format).
+    """
+    input:
+        msa = join(workpath, "project", "msa.fa"),
+    output:
+        nw = join(workpath, "project", "mpox_phylogeny.raxml.bestTree"),
+    params:
+        rname = 'tree',
+        ref_fa = config['references']['mpox_ref_genome'],
+        prefix = join(workpath, "project", "mpox_phylogeny"),
+    conda: depending(conda_yaml_or_named_env, use_conda)
+    container: depending(config['images']['mpox-seek'], use_singularity)
+    shell:
+        """
+        # Build a phylogenetic tree containing
+        # the reference genome and all samples
+        raxml-ng \\
+            --redo \\
+            --threads 2 \\
+            --msa {input.msa} \\
+            --model GTR+G \\
+            --msa-format FASTA \\
+            --prefix {params.prefix}
+        """
\ No newline at end of file
diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk
index ec16a91..2da9272 100644
--- a/workflow/rules/trim.smk
+++ b/workflow/rules/trim.smk
@@ -59,7 +59,7 @@ rule setup:
 
 rule nanofilt:
     """
-    Data-processing step to perform base quality filtering with Nanofilt.
+    Deprecated data-processing step to perform base quality filtering with Nanofilt.
     @Input:
         Setup FastQ file (scatter)
     @Output:
@@ -82,3 +82,31 @@ rule nanofilt:
         | gzip \\
         > {output.flt}
         """
+
+
+rule porechop:
+    """
+    Data-processing step to perform adapter trimming with porechop.
+    @Input:
+        Setup FastQ file (scatter)
+    @Output:
+        Trimmed FastQ file
+    """
+    input:
+        fq = join(workpath, "{name}", "fastqs", "{name}.fastq.gz"),
+    output:
+        fq = join(workpath, "{name}", "fastqs", "{name}.trimmed.fastq.gz"),
+    params:
+        rname='porechop',
+    conda: depending(conda_yaml_or_named_env, use_conda)
+    container: depending(config['images']['mpox-seek'], use_singularity)
+    shell:
+        """
+        # Trim adapter sequences with porechop
+        porechop \\
+            -i {input.fq} \\
+            -o {output.fq} \\
+            --format fastq.gz \\
+            --verbosity 1 \\
+            --threads 1
+        """
\ No newline at end of file
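
For reviewers, a quick sketch of the concrete files the updated `rule all` now requests, assuming a hypothetical `workpath` of "output" and two placeholder samples, sampleA and sampleB (neither value comes from this diff):

    # Sketch only: resolve the new `rule all` targets for a hypothetical run.
    workpath="output"              # placeholder working directory
    samples=(sampleA sampleB)      # placeholder sample names
    for name in "${samples[@]}"; do
        echo "${workpath}/${name}/fastqs/${name}.trimmed.fastq.gz"        # rule porechop
        echo "${workpath}/${name}/bams/${name}.sam"                       # rule minimap2
        echo "${workpath}/${name}/consensus/${name}_consensus_seqid.fa"   # rule consensus
    done
    echo "${workpath}/project/consensus.fa"                               # rule concat
    echo "${workpath}/project/msa.fa"                                     # rule mafft
    echo "${workpath}/project/mpox_phylogeny.raxml.bestTree"              # rule tree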
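
The header-renaming awk in `rule consensus` is the least obvious part of this change, so here is a minimal, self-contained sketch of its behavior. It assumes viral_consensus writes a FASTA header whose fourth whitespace-separated field is the input SAM path; that header layout is an assumption made for illustration, not something taken from this diff.

    # Sketch only: a hypothetical viral_consensus-style header where the 4th
    # whitespace-separated field is the input SAM path (assumed layout).
    cat > example_consensus.fa << 'EOF'
    > viral_consensus -i /data/sampleA/bams/sampleA.sam -r NC_003310.1.fa -o sampleA_consensus.fa
    ACGTACGTACGT
    EOF

    # Same awk as in `rule consensus`, with the Snakemake brace/backslash
    # escaping removed: take the basename of field 4, strip the ".sam" suffix,
    # and emit it as the new sequence identifier; sequence lines have no second
    # field, so a[2] is empty and they pass through unchanged.
    awk '{split($0,a," "); n=split(a[4],b,"/"); gsub(/\.sam$/,"",b[n]); if(a[2]) print ">"b[n]; else print; }' \
        example_consensus.fa
    # Prints:
    #   >sampleA
    #   ACGTACGTACGT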