Feat: adding first iteration of the pipeline.

OpenOmics · Apr 2, 2024 · f1d698f · f1d698f
1 parent d0b8bdf
commit f1d698f
Show file tree

Hide file tree

Showing 4 changed files with 218 additions and 6 deletions.
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -1,6 +1,6 @@
 # Python standard library
 from os.path import join
-import os, sys
+import os, sys, json
 
 # Local imports
 from scripts.common import (
@@ -66,14 +66,38 @@ rule all:
             join(workpath, "{name}", "fastqs", "{name}.fastq.gz"), 
             name=samples
         ),
-        # Base-calling quality filtering,
-        # @imported from `rule nanofilt` in rules/trim.smk 
+        # Adapter trimming step,
+        # @imported from `rule porechop` in rules/trim.smk 
         expand(
-            join(workpath, "{name}", "fastqs", "{name}.filtered.fastq.gz"),
+            join(workpath, "{name}", "fastqs", "{name}.trimmed.fastq.gz"),
             name=samples
         ),
+        # Align reads against monkeypox reference,
+        # @imported from `rule minimap2` in rules/map.smk
+        expand(
+            join(workpath, "{name}", "bams", "{name}.sam"),
+            name=samples
+        ),
+        # Create a consensus sequence from alignments
+        # @imported from `rule consensus` in rules/map.smk
+        expand(
+            join(workpath, "{name}", "consensus", "{name}_consensus_seqid.fa"),
+            name=samples
+        ),
+        # Create input file for MSA, concatenates the ref and 
+        # each samples consequence sequence.
+        # @imported from `rule concat` in rules/map.smk
+        join(workpath, "project", "consensus.fa"),
+        # Mutiple sequence alignment (MSA),
+        # @imported from `rule mafft` in rules/map.smk
+        join(workpath, "project", "msa.fa"),
+        # Build a phylogentic tree from MSA,
+        # @imported from `rule tree` in rules/tree.smk
+        join(workpath, "project", "mpox_phylogeny.raxml.bestTree"),
 
 
 # Import rules 
 include: join("rules", "common.smk")
-include: join("rules", "trim.smk")
+include: join("rules", "trim.smk")
+include: join("rules", "map.smk")
+include: join("rules", "tree.smk")
diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk
@@ -0,0 +1,125 @@
+# Data processing rules to map, collapse, and perform msa
+rule minimap2:
+    """
+    Data-processing step to align reads against NCBI MonkeyPox reference 
+    sequence ("NC_003310.1"): https://www.ncbi.nlm.nih.gov/nuccore/NC_003310.1
+    @Input:
+        Trimmed FastQ file (scatter)
+    @Output:
+        SAM file
+    """
+    input:
+        fq   = join(workpath, "{name}", "fastqs", "{name}.trimmed.fastq.gz"),
+    output:
+        sam  = join(workpath, "{name}", "bams", "{name}.sam"),
+    params:
+        rname  = 'minimap2',
+        ref_fa  = config['references']['mpox_ref_genome'],
+    conda: depending(conda_yaml_or_named_env, use_conda)
+    container: depending(config['images']['mpox-seek'], use_singularity)
+    shell: 
+        """
+        # Align against NCBRI monkeypox genome:
+        # https://www.ncbi.nlm.nih.gov/nuccore/NC_003310.1
+        minimap2 \\
+            -ax map-ont \\
+            --secondary=no \\
+            --sam-hit-only \\
+            {params.ref_fa} \\
+            {input.fq} \\
+        > {output.sam}
+        """
+
+
+rule consensus:
+    """
+    Data-processing step to collapse aligned reads into a consensus 
+    sequence using viral_consensus. 
+    @Input:
+        SAM file (scatter)
+    @Output:
+        Consensus FASTA file,
+        Consensus FASTA file with samples names for sequence identifers
+    """
+    input:
+        sam = join(workpath, "{name}", "bams", "{name}.sam"),
+    output:
+        fa    = join(workpath, "{name}", "consensus", "{name}_consensus.fa"),
+        fixed = join(workpath, "{name}", "consensus", "{name}_consensus_seqid.fa"),
+    params:
+        rname  = 'consensus',
+        ref_fa  = config['references']['mpox_ref_genome'],
+    conda: depending(conda_yaml_or_named_env, use_conda)
+    container: depending(config['images']['mpox-seek'], use_singularity)
+    shell: 
+        """
+        # Create a consensus sequence of aligned reads
+        viral_consensus \\
+            -i {input.sam} \\
+            -r {params.ref_fa} \\
+            -o {output.fa}
+        
+        # Rename the sequence identifers in the FASTA 
+        # file to contain only the sample name, by
+        # default viral_consensus contains the info
+        # related to the command that was run.
+        awk '{{split($0,a," "); n=split(a[4],b,"/"); gsub(/\\.sam$/,"",b[n]); if(a[2]) print ">"b[n]; else print; }}' \\
+            {output.fa} \\
+            > {output.fixed}
+        """
+
+
+rule concat:
+    """
+    Data-processing step to create an input FASTA file for mafft. This
+    fasta file should contain the reference genome and the consensus
+    sequence all samples.
+    @Input:
+        Consensus FASTA file with samples names for sequence identifers (gather)
+    @Output:
+        FASTA file containing the ref and the consensus sequences of all samples
+    """
+    input:
+        fas = expand(join(workpath, "{name}", "consensus", "{name}_consensus_seqid.fa"), name=samples),
+    output:
+        fa  = join(workpath, "project", "consensus.fa"),
+    params:
+        rname  = 'premafft',
+        ref_fa  = config['references']['mpox_ref_genome'],
+    conda: depending(conda_yaml_or_named_env, use_conda)
+    container: depending(config['images']['mpox-seek'], use_singularity)
+    shell: 
+        """
+        # Create FASTA file with the reference genome
+        # and the consensus sequence of each sample
+        cat {params.ref_fa} \\
+            {input.fas} \\
+            > {output.fa}
+        """
+
+
+rule mafft:
+    """
+    Data-processing step to run multiple sequence alignment (MSA) of the
+    reference genome and the consensus sequence of each sample.
+    @Input:
+        FASTA file containing the ref and the consensus sequences of all samples (indirect-gather, singleton)
+    @Output:
+        Multiple sequence alignment (MSA) FASTA file from mafft.
+    """
+    input:
+        fa  = join(workpath, "project", "consensus.fa"),
+    output:
+        msa = join(workpath, "project", "msa.fa"),
+    params:
+        rname  = 'msa',
+        ref_fa  = config['references']['mpox_ref_genome'],
+    conda: depending(conda_yaml_or_named_env, use_conda)
+    container: depending(config['images']['mpox-seek'], use_singularity)
+    shell: 
+        """
+        # Run multiple sequence alignment (MSA) of the 
+        # reference genome and each samples consensus 
+        # sequence using mafft
+        mafft --auto --thread 2 {input.fa} > {output.msa}
+        """
diff --git a/workflow/rules/tree.smk b/workflow/rules/tree.smk
@@ -0,0 +1,35 @@
+# Data processing rules to build a phylogentic tree
+rule tree:
+    """
+    Data-processing step to build a phylogentic tree of the multiple sequence 
+    alignment (MSA) results using raxml-ng. This phylogentic tree can be viewed
+    and explored interactively with tools like figtree, ete, taxonium, java 
+    treeviewer, etc. The tools are open-source and free to use, and many of 
+    them can be downloaded as desktop applications (run offline).
+    @Input:
+        Multiple sequence alignment (MSA) FASTA file from mafft.
+    @Output:
+        Phylogenetic tree (Newick format).
+    """
+    input:
+        msa  = join(workpath, "project", "msa.fa"),
+    output:
+        nw  = join(workpath, "project", "mpox_phylogeny.raxml.bestTree"),
+    params:
+        rname  = 'tree',
+        ref_fa = config['references']['mpox_ref_genome'],
+        prefix = join(workpath, "project", "mpox_phylogeny"),
+    conda: depending(conda_yaml_or_named_env, use_conda)
+    container: depending(config['images']['mpox-seek'], use_singularity)
+    shell: 
+        """
+        # Build a phylogenetic tree of containing 
+        # the reference genome and all samples
+        raxml-ng \\
+            --redo \\
+            --threads 2 \\
+            --msa {input.msa} \\
+            --model GTR+G \\
+            --msa-format FASTA \\
+            --prefix {params.prefix}
+        """
diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk
@@ -59,7 +59,7 @@ rule setup:
 
 rule nanofilt:
     """
-    Data-processing step to perform base quality filtering with Nanofilt. 
+    Depreciated data-processing step to perform base quality filtering with Nanofilt. 
     @Input:
         Setup FastQ file (scatter)
     @Output:
@@ -82,3 +82,31 @@ rule nanofilt:
             | gzip \\
         > {output.flt}
         """
+
+
+rule porechop:
+    """
+    Data-processing step to perform adapter trimming with porechop. 
+    @Input:
+        Setup FastQ file (scatter)
+    @Output:
+        Trimmed FastQ file
+    """
+    input:
+        fq = join(workpath, "{name}", "fastqs", "{name}.fastq.gz"),
+    output:
+        fq = join(workpath, "{name}", "fastqs", "{name}.trimmed.fastq.gz"),
+    params:
+        rname='porechop',
+    conda: depending(conda_yaml_or_named_env, use_conda)
+    container: depending(config['images']['mpox-seek'], use_singularity)
+    shell: 
+        """
+        # Trim adapter sequences with porechop
+        porechop \\
+            -i {input.fq} \\
+            -o {output.fq} \\
+            --format fastq.gz \\
+            --verbosity 1 \\
+            --threads 1
+        """