Snakefile_downstream_only
# Snakemake pipeline for kraken2 classification of metagenomic samples
# Downstream processing steps
# Developed by Ben Siranosian 2018-2023
# Originally developed in the Bhatt Lab, Stanford Genetics
# MIT Licensed. https://github.com/bhattlab/kraken2_classification/
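#
# A minimal example invocation (a sketch; config.yaml, the core count, and
# the use of singularity are assumptions, not part of this header):
#   snakemake -s Snakefile_downstream_only --configfile config.yaml \
#       --use-singularity -j 8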
from os.path import join, exists
import sys
import snakemake
import time
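# Rules listed in localrules run on the submitting host instead of being
# dispatched as cluster jobs; these steps (copying, plotting) are lightweight.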
localrules: downstream_processing_kraken, downstream_processing_bracken, copy_files_processing, create_taxonomy_array
# Include code from other files
# scripts/setup_downstream_only.smk interprets the config file and sets pipeline options
include: "scripts/functions.smk"
include: "scripts/setup_downstream_only.smk"
rule all:
    input:
        join(outdir, 'processed_results_kraken/plots/classified_taxonomy_barplot_species.pdf'),
        run_extra_all_outputs,
        join(outdir, "kraken2_processing_completed.txt")
rule create_taxonomy_array:
    input:
        join(config['database'], 'taxo.k2d')
    output:
        join(config['database'], 'taxonomy_array.tsv')
    params:
        db = config['database'],
        improve_taxonomy_script = join(workflow.basedir, 'scripts', 'improve_taxonomy.py')
    conda: "envs/anytree/anytree.yaml"
    container: "docker://bsiranosian/anytree:2.8"
    shell: """
        python {params.improve_taxonomy_script} {params.db}
        """
# Running the pipeline with singularity had a strange bug, where these files
# had to be present in the output directory. This rule accomplishes that.
rule copy_files_processing:
    input:
        tax_array = join(config['database'], 'taxonomy_array.tsv')
    output:
        join(outdir, 'taxonomy_array.tsv'),
        join(outdir, 'scripts/post_classification_workflow.R')
    params:
        scriptdir = join(workflow.basedir, 'scripts')
    shell: """
        cp -r {params.scriptdir} {outdir}
        cp {input.tax_array} {outdir}
        """
# Downstream processing with R
## Run for Kraken, and also Bracken if the tool was run
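# The script: directive hands the rule context to R: inside
# post_classification_workflow.R the params below are available as
# snakemake@params, and the inputs as snakemake@input.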
rule downstream_processing_kraken:
    input:
        downstream_processing_input_kraken,
        tax_array = join(outdir, 'taxonomy_array.tsv'),
        script_test = join(outdir, 'scripts/post_classification_workflow.R')
    params:
        sample_reads_file = config["sample_reads_file"],
        sample_reports_file = config["sample_reports_file"],
        sample_groups_file = config["sample_groups_file"],
        workflow_outdir = outdir,
        result_dir = join(outdir, 'processed_results_kraken'),
        use_bracken_report = False,
        remove_chordata = config['remove_chordata']
    singularity: "shub://bhattlab/kraken2_classification:kraken2_processing"
    output:
        join(outdir, 'processed_results_kraken/plots/classified_taxonomy_barplot_species.pdf')
    script:
        'scripts/post_classification_workflow.R'
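# Identical to the kraken rule except for the input reports, the output
# directory, and the use_bracken_report flag handed to the R workflow.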
rule downstream_processing_bracken:
    input:
        downstream_processing_input_bracken,
        tax_array = join(outdir, 'taxonomy_array.tsv'),
        script_test = join(outdir, 'scripts/post_classification_workflow.R')
    params:
        sample_reads_file = config["sample_reads_file"],
        sample_reports_file = config["sample_reports_file"],
        sample_groups_file = config["sample_groups_file"],
        workflow_outdir = outdir,
        result_dir = join(outdir, 'processed_results_bracken'),
        use_bracken_report = config['run_bracken'],
        remove_chordata = config['remove_chordata']
    singularity: "shub://bhattlab/kraken2_classification:kraken2_processing"
    output:
        join(outdir, 'processed_results_bracken/plots/classified_taxonomy_barplot_species.pdf')
    script:
        'scripts/post_classification_workflow.R'
# Remove files copied during setup
rule remove_files_processing:
    input:
        rules.downstream_processing_kraken.output,
        run_extra_all_outputs
    output:
        join(outdir, "kraken2_processing_completed.txt")
    params:
        workflow_outdir = outdir
    shell: """
        rm -rf {params.workflow_outdir}/scripts
        rm -f {params.workflow_outdir}/taxonomy_array.tsv
        touch {output}
        """