Skip to content

Commit

Permalink
Feat: adding first iteration of the pipeline.
Browse files Browse the repository at this point in the history
  • Loading branch information
skchronicles committed Apr 2, 2024
1 parent d0b8bdf commit f1d698f
Show file tree
Hide file tree
Showing 4 changed files with 218 additions and 6 deletions.
34 changes: 29 additions & 5 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Python standard library
from os.path import join
import os, sys
import os, sys, json

# Local imports
from scripts.common import (
Expand Down Expand Up @@ -66,14 +66,38 @@ rule all:
join(workpath, "{name}", "fastqs", "{name}.fastq.gz"),
name=samples
),
# Base-calling quality filtering,
# @imported from `rule nanofilt` in rules/trim.smk
# Adapter trimming step,
# @imported from `rule porechop` in rules/trim.smk
expand(
join(workpath, "{name}", "fastqs", "{name}.filtered.fastq.gz"),
join(workpath, "{name}", "fastqs", "{name}.trimmed.fastq.gz"),
name=samples
),
# Align reads against monkeypox reference,
# @imported from `rule minimap2` in rules/map.smk
expand(
join(workpath, "{name}", "bams", "{name}.sam"),
name=samples
),
# Create a consensus sequence from alignments
# @imported from `rule consensus` in rules/map.smk
expand(
join(workpath, "{name}", "consensus", "{name}_consensus_seqid.fa"),
name=samples
),
# Create input file for MSA, concatenates the ref and
# each samples consequence sequence.
# @imported from `rule concat` in rules/map.smk
join(workpath, "project", "consensus.fa"),
# Mutiple sequence alignment (MSA),
# @imported from `rule mafft` in rules/map.smk
join(workpath, "project", "msa.fa"),
# Build a phylogentic tree from MSA,
# @imported from `rule tree` in rules/tree.smk
join(workpath, "project", "mpox_phylogeny.raxml.bestTree"),


# Import rules
include: join("rules", "common.smk")
include: join("rules", "trim.smk")
include: join("rules", "trim.smk")
include: join("rules", "map.smk")
include: join("rules", "tree.smk")
125 changes: 125 additions & 0 deletions workflow/rules/map.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
# Data processing rules to map, collapse, and perform msa
rule minimap2:
"""
Data-processing step to align reads against NCBI MonkeyPox reference
sequence ("NC_003310.1"): https://www.ncbi.nlm.nih.gov/nuccore/NC_003310.1
@Input:
Trimmed FastQ file (scatter)
@Output:
SAM file
"""
input:
fq = join(workpath, "{name}", "fastqs", "{name}.trimmed.fastq.gz"),
output:
sam = join(workpath, "{name}", "bams", "{name}.sam"),
params:
rname = 'minimap2',
ref_fa = config['references']['mpox_ref_genome'],
conda: depending(conda_yaml_or_named_env, use_conda)
container: depending(config['images']['mpox-seek'], use_singularity)
shell:
"""
# Align against NCBRI monkeypox genome:
# https://www.ncbi.nlm.nih.gov/nuccore/NC_003310.1
minimap2 \\
-ax map-ont \\
--secondary=no \\
--sam-hit-only \\
{params.ref_fa} \\
{input.fq} \\
> {output.sam}
"""


rule consensus:
"""
Data-processing step to collapse aligned reads into a consensus
sequence using viral_consensus.
@Input:
SAM file (scatter)
@Output:
Consensus FASTA file,
Consensus FASTA file with samples names for sequence identifers
"""
input:
sam = join(workpath, "{name}", "bams", "{name}.sam"),
output:
fa = join(workpath, "{name}", "consensus", "{name}_consensus.fa"),
fixed = join(workpath, "{name}", "consensus", "{name}_consensus_seqid.fa"),
params:
rname = 'consensus',
ref_fa = config['references']['mpox_ref_genome'],
conda: depending(conda_yaml_or_named_env, use_conda)
container: depending(config['images']['mpox-seek'], use_singularity)
shell:
"""
# Create a consensus sequence of aligned reads
viral_consensus \\
-i {input.sam} \\
-r {params.ref_fa} \\
-o {output.fa}
# Rename the sequence identifers in the FASTA
# file to contain only the sample name, by
# default viral_consensus contains the info
# related to the command that was run.
awk '{{split($0,a," "); n=split(a[4],b,"/"); gsub(/\\.sam$/,"",b[n]); if(a[2]) print ">"b[n]; else print; }}' \\
{output.fa} \\
> {output.fixed}
"""


rule concat:
"""
Data-processing step to create an input FASTA file for mafft. This
fasta file should contain the reference genome and the consensus
sequence all samples.
@Input:
Consensus FASTA file with samples names for sequence identifers (gather)
@Output:
FASTA file containing the ref and the consensus sequences of all samples
"""
input:
fas = expand(join(workpath, "{name}", "consensus", "{name}_consensus_seqid.fa"), name=samples),
output:
fa = join(workpath, "project", "consensus.fa"),
params:
rname = 'premafft',
ref_fa = config['references']['mpox_ref_genome'],
conda: depending(conda_yaml_or_named_env, use_conda)
container: depending(config['images']['mpox-seek'], use_singularity)
shell:
"""
# Create FASTA file with the reference genome
# and the consensus sequence of each sample
cat {params.ref_fa} \\
{input.fas} \\
> {output.fa}
"""


rule mafft:
"""
Data-processing step to run multiple sequence alignment (MSA) of the
reference genome and the consensus sequence of each sample.
@Input:
FASTA file containing the ref and the consensus sequences of all samples (indirect-gather, singleton)
@Output:
Multiple sequence alignment (MSA) FASTA file from mafft.
"""
input:
fa = join(workpath, "project", "consensus.fa"),
output:
msa = join(workpath, "project", "msa.fa"),
params:
rname = 'msa',
ref_fa = config['references']['mpox_ref_genome'],
conda: depending(conda_yaml_or_named_env, use_conda)
container: depending(config['images']['mpox-seek'], use_singularity)
shell:
"""
# Run multiple sequence alignment (MSA) of the
# reference genome and each samples consensus
# sequence using mafft
mafft --auto --thread 2 {input.fa} > {output.msa}
"""
35 changes: 35 additions & 0 deletions workflow/rules/tree.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Data processing rules to build a phylogentic tree
rule tree:
"""
Data-processing step to build a phylogentic tree of the multiple sequence
alignment (MSA) results using raxml-ng. This phylogentic tree can be viewed
and explored interactively with tools like figtree, ete, taxonium, java
treeviewer, etc. The tools are open-source and free to use, and many of
them can be downloaded as desktop applications (run offline).
@Input:
Multiple sequence alignment (MSA) FASTA file from mafft.
@Output:
Phylogenetic tree (Newick format).
"""
input:
msa = join(workpath, "project", "msa.fa"),
output:
nw = join(workpath, "project", "mpox_phylogeny.raxml.bestTree"),
params:
rname = 'tree',
ref_fa = config['references']['mpox_ref_genome'],
prefix = join(workpath, "project", "mpox_phylogeny"),
conda: depending(conda_yaml_or_named_env, use_conda)
container: depending(config['images']['mpox-seek'], use_singularity)
shell:
"""
# Build a phylogenetic tree of containing
# the reference genome and all samples
raxml-ng \\
--redo \\
--threads 2 \\
--msa {input.msa} \\
--model GTR+G \\
--msa-format FASTA \\
--prefix {params.prefix}
"""
30 changes: 29 additions & 1 deletion workflow/rules/trim.smk
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ rule setup:

rule nanofilt:
"""
Data-processing step to perform base quality filtering with Nanofilt.
Depreciated data-processing step to perform base quality filtering with Nanofilt.
@Input:
Setup FastQ file (scatter)
@Output:
Expand All @@ -82,3 +82,31 @@ rule nanofilt:
| gzip \\
> {output.flt}
"""


rule porechop:
"""
Data-processing step to perform adapter trimming with porechop.
@Input:
Setup FastQ file (scatter)
@Output:
Trimmed FastQ file
"""
input:
fq = join(workpath, "{name}", "fastqs", "{name}.fastq.gz"),
output:
fq = join(workpath, "{name}", "fastqs", "{name}.trimmed.fastq.gz"),
params:
rname='porechop',
conda: depending(conda_yaml_or_named_env, use_conda)
container: depending(config['images']['mpox-seek'], use_singularity)
shell:
"""
# Trim adapter sequences with porechop
porechop \\
-i {input.fq} \\
-o {output.fq} \\
--format fastq.gz \\
--verbosity 1 \\
--threads 1
"""

0 comments on commit f1d698f

Please sign in to comment.