# Snakefile_fastq_multilane.alt_splicing

# Workflow configuration. Only the un-commented path is included; the
# commented-out paths appear to be configs for other datasets.
# NOTE(review): names used further down (OUT_DIR, RAWDATA_DIR, STAR_INDEX, GTF,
# ...) are not defined in this file -- presumably the included config sets them.
include:
    #'configs/config_HuR.human.Penalva_L_01182017.py'
    #'configs/config_HuR.mouse.py',
    'configs/config_Feb_02_2017_Radiation_GBM.py'
    #'configs/config_Penalva_L_01182017.human.py'
    #'configs/config_Musashi1_Penalva_L_12212016.human.py'

# Relative paths in the rules below are resolved against OUT_DIR.
workdir: OUT_DIR

from itertools import chain, combinations
from os.path import join
import glob
import re

"""
The idea behind this section is to get sample names and the corresponding lanes,
here I do it for filenames which look like this:
HepG2_Control_1_S7_L001_R1_001.fastq.gz
HepG2_Control_1_S7_L001_R2_001.fastq.gz
HepG2_Control_1_S7_L002_R1_001.fastq.gz
HepG2_Control_1_S7_L002_R2_001.fastq.gz
HepG2_Control_1_S7_L003_R1_001.fastq.gz
HepG2_Control_1_S7_L003_R2_001.fastq.gz
HepG2_Control_1_S7_L004_R1_001.fastq.gz
HepG2_Control_1_S7_L004_R2_001.fastq.gz

So the SAMPLE_NAME here is HepG2_Control_1_S7, which was sequenced (paired-end) in 4 lanes: L001-L004.

"""
# Discover the raw FASTQ files and derive the (sample, lane) pairs that drive
# every per-lane rule below.
#
# join() supplies the path separator, so '**' is a path component of its own
# and the search really recurses below RAWDATA_DIR (and only below it).
SAMPLES = glob.glob(join(RAWDATA_DIR, '**', '*.fastq.gz'), recursive=True)

# Prefix to strip so names are relative to RAWDATA_DIR, matching the
# RAWDATA_DIR+'/{sample_name}_{lane}_...' inputs used by the rules.
_RAW_PREFIX = RAWDATA_DIR.rstrip('/') + '/'

# One match yields both pieces: the sample name is everything before the first
# '_L###_' token and the lane is that very token, not just the first 'L###'
# found anywhere in the path.
_SAMPLE_LANE_PATTERN = re.compile(r'(.+?)_(L\d\d\d)_')

SAMPLE_LANE = set()
for sample in SAMPLES:
    if sample.startswith(_RAW_PREFIX):
        sample = sample[len(_RAW_PREFIX):]
    matched = _SAMPLE_LANE_PATTERN.match(sample)
    if matched is None:
        # No '<sample>_L###_' prefix: none of the rules below could consume
        # this file, so it is ignored instead of aborting the whole workflow.
        continue
    SAMPLE_LANE.add(matched.groups())

if not SAMPLE_LANE:
    raise ValueError(
        'No <sample>_L###_*.fastq.gz files found under {}'.format(RAWDATA_DIR))

# Sort on the full (sample, lane) tuple so the order never depends on set
# iteration order.
SAMPLE_LANE = sorted(SAMPLE_LANE)
# SAMPLE_NAMES and LANE_NAMES are index-aligned; a sample sequenced on several
# lanes therefore appears several times in SAMPLE_NAMES.
SAMPLE_NAMES, LANE_NAMES = zip(*SAMPLE_LANE)

# Each sample exactly once, in a stable order, for per-sample groupings.
_UNIQUE_SAMPLE_NAMES = sorted(set(SAMPLE_NAMES))

PLOT_PREFIXES = list(combinations(_UNIQUE_SAMPLE_NAMES, 2))
PLOT_PREFIXES_1 = [pair[0] for pair in PLOT_PREFIXES]
PLOT_PREFIXES_2 = [pair[1] for pair in PLOT_PREFIXES]

# Sample groups selected by a substring of the sample name. Built from the
# de-duplicated names so a merged BAM is not listed once per lane.
# NOTE(review): this is a plain substring test, so 'T1' would also match a
# name containing e.g. 'T10' -- confirm against the naming scheme in use.
T24_SAMPLES = [name for name in _UNIQUE_SAMPLE_NAMES if 'T24' in name]
T1_SAMPLES = [name for name in _UNIQUE_SAMPLE_NAMES if 'T1' in name]
T0_SAMPLES = [name for name in _UNIQUE_SAMPLE_NAMES if 'T0' in name]

# Default target: lists every file the workflow should produce. Commented-out
# entries are targets that are currently switched off.
rule all:
    input:
        STAR_INDEX,

        # --- per-lane targets (SAMPLE_NAMES and LANE_NAMES are paired via zip) ---
        expand('qc/{sample_name}_{lane}_R1_001_fastqc.html',
               zip, sample_name=SAMPLE_NAMES, lane=LANE_NAMES),
        expand('qc/{sample_name}_{lane}_R2_001_fastqc.html',
               zip, sample_name=SAMPLE_NAMES, lane=LANE_NAMES),
        expand('preprocessed/{sample_name}/{sample_name}_{lane}_R1_001_val_1.fq.gz',
               zip, sample_name=SAMPLE_NAMES, lane=LANE_NAMES),
        expand('preprocessed/{sample_name}/{sample_name}_{lane}_R2_001_val_2.fq.gz',
               zip, sample_name=SAMPLE_NAMES, lane=LANE_NAMES),
        expand('mapped/bams/{sample_name}/{sample_name}_{lane}.bam',
               zip, sample_name=SAMPLE_NAMES, lane=LANE_NAMES),

        # --- per-sample targets ---
        expand('mapped/merged_bams/{sample_name}.bam',
               sample_name=SAMPLE_NAMES),
        #expand('mapped/plots/tpm_scatter/{sample1}_VS_{sample2}.png', sample1=PLOT_PREFIXES_1, sample2=PLOT_PREFIXES_2),
        #expand('mapped/HTSeq-counts/{sample}.counts.tsv', sample=SAMPLE_NAMES),
        #expand('mapped/intergenic_reads/{sample}.intergenic.bam', sample=SAMPLE_NAMES),
        expand('inferred_experiment/{sample}.txt',
               sample=SAMPLE_NAMES),
        #expand('mapped/counts_DEXSeq/{sample}.DEXSeq.counts.tsv', sample=SAMPLE_NAMES),
        #expand('mapped/fpkm/{sample}.FPKM.xls', sample=SAMPLE_NAMES),
        #'mapped/plots/tpm_scatter/',
        #'mapped/tpm/HTSeq/masterTPM.tsv',
        #'mapped/tpm/featureCounts/masterTPM.tsv',
        #'mapped/DE_analysis/featureCounts/'+GENOME_BUILD+'.featureCounts.DESeq2.sig.tsv',
        #'mapped/DE_analysis/HTSeq/'+GENOME_BUILD+'.HTSeq.DESeq2.all.tsv',
        expand('mapped/HTSeq-counts/byCDS/{sample}.CDS.counts.tsv',
               sample=SAMPLE_NAMES),
        expand('mapped/HTSeq-counts/byExon/{sample}.exon.counts.tsv',
               sample=SAMPLE_NAMES),
        expand('mapped/dexseq/{sample}.dexseq.counts.tsv',
               sample=SAMPLE_NAMES),
        #expand('mapped/rsem_bams/{sample_name}.bam', sample_name=SAMPLE_NAMES),
        #expand('mapped/rsem-counts/{sample}.isoforms.results', sample=SAMPLE_NAMES),

        # --- whole-experiment targets ---
        'mapped/featureCounts/byCDS/fcounts.CDS.tsv',
        'mapped/featureCounts/byExon/fcounts.exon.tsv',
        'multiqc_report/multiqc_report.html',
        'alternative_splicing_rMATS/All_T1_vs_T0/MATS_output/AS_Event.MATS.JunctionCountOnly.txt',
        'alternative_splicing_rMATS/All_T24_vs_T0/MATS_output/AS_Event.MATS.JunctionCountOnly.txt',
        'alternative_splicing_rMATS/All_T24_vs_T1/MATS_output/AS_Event.MATS.JunctionCountOnly.txt'



# Run FastQC on both mates of one sample/lane; the HTML and zip reports are
# written into the qc/ directory.
rule perform_qc:
    input:
        R1 = RAWDATA_DIR+'/{sample_name}_{lane}_R1_001.fastq.gz',
        R2 = RAWDATA_DIR+'/{sample_name}_{lane}_R2_001.fastq.gz'
    output:
        'qc/{sample_name}_{lane}_R1_001_fastqc.html',
        'qc/{sample_name}_{lane}_R1_001_fastqc.zip',
        'qc/{sample_name}_{lane}_R2_001_fastqc.html',
        'qc/{sample_name}_{lane}_R2_001_fastqc.zip'
    params:
        out_dir = 'qc'
    shell:
        r'''
            fastqc -o {params.out_dir} -f fastq {input.R1} {input.R2}
        '''


## rMATS requires all reads to have the same final length. So we take a simple strategy: rather than trimming
## only reads with adapters, we trim 13 bp from the 3' end of every read (-13 and not 13).
## The other thing that needs to be done is to disable soft-clipping at the STAR mapping step with --alignEndsType EndToEnd.

# Trim both mates of one sample/lane with cutadapt (-u -13 for R1, -U -13 for
# R2). The params are not referenced by the cutadapt command; they are used by
# the trim_galore variant kept as a string right after this rule.
rule perfom_trimming:
    input:
        R1 = RAWDATA_DIR+'/{sample_name}_{lane}_R1_001.fastq.gz',
        R2 = RAWDATA_DIR+'/{sample_name}_{lane}_R2_001.fastq.gz'
    output:
        'preprocessed/{sample_name}/{sample_name}_{lane}_R1_001_val_1.fq.gz',
        'preprocessed/{sample_name}/{sample_name}_{lane}_R2_001_val_2.fq.gz'
    params:
        out_dir = 'preprocessed/{sample_name}',
        phred_cutoff = 5
    shell:
        r'''
        cutadapt -u -13 -U -13 -o {output[0]} -p {output[1]} {input.R1} {input.R2}
        '''

"""
shell:
    r'''
        trim_galore --paired -o {params.out_dir} -q {params.phred_cutoff} {input.R1} {input.R2}
    '''
"""

"""This is optimised for >150bp reads:
--outFilterMultimapScoreRange 20 : the score range below the maximum score for multimapping alignments
--outFilterScoreMinOverLread 0 : alignment will be output only if its score is higher than or equal to this value, normalised for read length
--outFilterMatchNminOverLread 0.66 : alignment will be output only if the number of matched bases is higher than or equal to this value, normalised for read length
--outFilterMismatchNmax 100 : increases the number of allowed mismatches to 100 - need to allow more mismatches for longer reads
--winAnchorMultimapNmax 200 : Maximum number of loci anchors are allowed to map
--seedPerReadNmax 100000   --seedPerWindowNmax 100 : increase the number of allowed seeds for each read and alignment window - need to store more seeds for longer reads

## Not used
--seedSearchStartLmax 30 : increases the number of seed search start position in the read - important for reads with high error rate
--seedSearchLmax 30 : similar to the above, limits the maximum length of the seeds. Presently, I do not recommend changing this parameter
--alignTranscriptsPerReadNmax 100000 --alignTranscriptsPerWindowNmax 10000 : increase the number of allowed alignments for each read and alignment window - need to store more putative alignments for longer reads

"""
# Align one trimmed sample/lane pair with STARlong and emit a
# coordinate-sorted BAM at mapped/bams/<sample>/<sample>_<lane>.bam.
rule map_starlong:
    input:
        R1='preprocessed/{sample_name}/{sample_name}_{lane}_R1_001_val_1.fq.gz',
        R2='preprocessed/{sample_name}/{sample_name}_{lane}_R2_001_val_2.fq.gz',
        index=STAR_INDEX
    output: 'mapped/bams/{sample_name}/{sample_name}_{lane}.bam'
    params:
        # STAR writes <prefix>Aligned.sortedByCoord.out.bam and <prefix>Log.*;
        # the shell command moves them to {output} and {params.starlogs}.
        prefix = 'mapped/bams/{sample_name}_{lane}',
        # NOTE(review): this path is passed to --outReadsUnmapped, which the
        # STAR manual documents as taking None or Fastx -- confirm the intent.
        unmapped = 'unmapped/fastq/{sample_name}_{lane}',
        starlogs = 'mapped/starlogs'
    threads: 16
    shell:
        r'''
        STARlong --runThreadN {threads}\
             --genomeDir {input.index}\
             --outFileNamePrefix {params.prefix} --readFilesIn {input.R1} {input.R2}\
             --outSAMtype BAM SortedByCoordinate\
             --readFilesCommand zcat\
             --outReadsUnmapped {params.unmapped}\
            --outFilterMultimapScoreRange 20\
            --outFilterScoreMinOverLread 0\
            --outFilterMatchNminOverLread 0.66\
            --outFilterMismatchNmax 100\
            --winAnchorMultimapNmax 200\
            --alignEndsType EndToEnd\
            --seedPerReadNmax 100000 --seedPerWindowNmax 100\
            && mv {params.prefix}Aligned.sortedByCoord.out.bam {output} &&\
            mkdir -p {params.starlogs} && mv {params.prefix}Log.final.out\
            {params.prefix}Log.out {params.prefix}Log.progress.out {params.starlogs}
        '''

## See: https://software.broadinstitute.org/gatk/guide/article?id=3060
## Merging should happen post alignment
# Merge the per-lane BAMs of one sample into mapped/merged_bams/<sample>.bam.
# Lanes are taken in sorted order so the merge command is identical on every
# run instead of following set iteration order.
# NOTE(review): every sample is expected to have a BAM for every lane seen in
# LANE_NAMES; a sample missing a lane would leave this rule without its input.
rule merge_bams:
    input:
        expand('mapped/bams/{{sample_name}}/{{sample_name}}_{lane}.bam',
               lane=sorted(set(LANE_NAMES)))
    output: 'mapped/merged_bams/{sample_name}.bam'
    run:
        # bamtools wants one '-in' per file; the first '-in' is in the template.
        cmd = ' -in '.join(input)
        shell(r'''bamtools merge -in {cmd} -out {output}''')

# Produce a read-name-sorted copy of the merged BAM for the counting rules
# that pair mates by adjacency (htseq-count --order=name, dexseq_count -r name).
rule sort_by_name:
    input: 'mapped/merged_bams/{sample}.bam'
    output: 'mapped/merged_bams/{sample}.sortedByName.bam'
    shell:
        # '-n' must be its own flag: samtools 1.x reads '-on' as '-o n'
        # (output file "n"), which the later '-o {output}' overrides, so the
        # name sort is never requested.
        r'''
            samtools sort -n -T /tmp/ -o {output} {input}
        '''

# Per-gene read counts over 'exon' features with htseq-count, taken from the
# name-sorted merged BAM.
rule count:
    input:
        'mapped/merged_bams/{sample}.sortedByName.bam'
    output:
        'mapped/HTSeq-counts/byExon/{sample}.exon.counts.tsv'
    params:
        annotation = GTF,
        phred_cutoff = 5
    shell:
        r'''
        source activate python2 && htseq-count --order=name --format=bam --mode=intersection-strict --stranded={HTSEQ_STRANDED} --minaqual={params.phred_cutoff} --type=exon --idattr=gene_id {input} {params.annotation} > {output}
        '''

# Per-gene read counts over 'CDS' features with htseq-count, taken from the
# name-sorted merged BAM.
rule count_byCDS:
    input:
        'mapped/merged_bams/{sample}.sortedByName.bam'
    output:
        'mapped/HTSeq-counts/byCDS/{sample}.CDS.counts.tsv'
    params:
        annotation = GTF,
        phred_cutoff = 5
    shell:
        r'''
        source activate python2 && htseq-count --order=name --format=bam --mode=intersection-strict --stranded={HTSEQ_STRANDED} --minaqual={params.phred_cutoff} --type=CDS --idattr=gene_id {input} {params.annotation} > {output}
        '''

# Remove the first '.<digits>' occurrence on every line of a counts table
# (presumably the version suffix of the gene identifiers).
rule format_counts:
    input:
        'mapped/HTSeq-counts/{sample}.counts.tsv'
    output:
        'mapped/HTSeq-counts/{sample}.counts.noversion.tsv'
    shell:
        r'''
        cat {input} | sed -E 's/\.[0-9]+//' > {output}

        '''

# Run infer_experiment.py on the merged BAM and store its stdout.
# NOTE(review): with '2>&1 > file' only stdout lands in the file; stderr still
# goes to the job's original stdout. Confirm this ordering is intended.
rule infer_experiment:
    input:
        'mapped/merged_bams/{sample}.bam'
    output:
        'inferred_experiment/{sample}.txt'
    shell:
        r'''
        source activate python2 && infer_experiment.py -r {GENE_BED_TSV} -i {input} 2>&1 > {output} 
        '''

# Differential expression via do_DE_analysis.R on the version-stripped HTSeq
# count tables; the script receives the count directory and file-name prefix
# rather than the individual input paths.
rule run_deseq:
    input:
        expand('mapped/HTSeq-counts/{sample}.counts.noversion.tsv',
               sample=SAMPLE_NAMES)
    output:
        'mapped/DE_analysis/HTSeq/'+GENOME_BUILD+'.HTSeq.DESeq2.all.tsv',
        'mapped/DE_analysis/HTSeq/'+GENOME_BUILD+'.HTSeq.DESeq2.sig.tsv'
    params:
        basedir = 'mapped/HTSeq-counts/',
        inprefix = 'counts.noversion',
        gene_annotations = GENE_NAMES,
        outprefix = 'mapped/DE_analysis/HTSeq/'+GENOME_BUILD+'.HTSeq'
    shell:
        r'''
        Rscript {SRC_DIR}/do_DE_analysis.R --basedir={params.basedir} \
            --gene_annotations={params.gene_annotations} \
            --design_file={DESIGN_FILE} \
            --outprefix={params.outprefix} \
            --inprefix={params.inprefix}

        '''


# One featureCounts table over 'exon' features covering every sample.
# The BAMs are passed in sorted sample order so the order of the files on the
# command line is the same on every run rather than following set iteration.
rule featurecounts:
    input:
        expand('mapped/merged_bams/{sample}.sortedByName.bam',
               sample=sorted(set(SAMPLE_NAMES)))
    params:
        annotation=GTF
    output: 'mapped/featureCounts/byExon/fcounts.exon.tsv'
    threads: 16
    shell:
        r'''featureCounts {FEATURECOUNTS_S} -a {params.annotation} -o {output} -t exon -g gene_id -Q 4 -T {threads} {input}'''

# One featureCounts table over 'CDS' features covering every sample.
# The BAMs are passed in sorted sample order so the order of the files on the
# command line is the same on every run rather than following set iteration.
rule featurecounts_cds:
    input:
        expand('mapped/merged_bams/{sample}.sortedByName.bam',
               sample=sorted(set(SAMPLE_NAMES)))
    params:
        annotation=GTF
    output: 'mapped/featureCounts/byCDS/fcounts.CDS.tsv'
    threads: 16
    shell:
        r'''featureCounts {FEATURECOUNTS_S} -a {params.annotation} -o {output} -t CDS -g gene_id -Q 4 -T {threads} {input}'''

# Remove the first '.<digits>' occurrence on every line of the featureCounts
# table (presumably the version suffix of the gene identifiers).
# The output is declared exactly once; a rule must not repeat a directive.
# NOTE(review): no rule visible in this file writes
# 'mapped/featureCounts/fcounts.tsv' (the featureCounts rules write into
# byExon/ and byCDS/) -- confirm which table this is meant to read.
rule format_fcounts:
    input: 'mapped/featureCounts/fcounts.tsv'
    output: 'mapped/featureCounts/fcounts.noversion.tsv'
    shell:
        r'''
        cat {input} | sed -E 's/\.[0-9]+//' > {output}

        '''

# Differential expression via do_DE_analysis_featureCounts.R on the
# version-stripped featureCounts table.
rule run_deseq_featureCounts:
    input:
        'mapped/featureCounts/fcounts.noversion.tsv'
    output:
        'mapped/DE_analysis/featureCounts/'+GENOME_BUILD+'.featureCounts.DESeq2.all.tsv',
        'mapped/DE_analysis/featureCounts/'+GENOME_BUILD+'.featureCounts.DESeq2.sig.tsv'
    params:
        gene_annotations = GENE_NAMES,
        outprefix = 'mapped/DE_analysis/featureCounts/'+GENOME_BUILD+'.featureCounts'
    shell:
        r'''
        Rscript {SRC_DIR}/do_DE_analysis_featureCounts.R --counts={input} \
            --gene_annotations={params.gene_annotations} \
            --design_file={DESIGN_FILE} \
            --outprefix={params.outprefix}

        '''

# Picard CollectInsertSizeMetrics on the merged BAM; the histogram PDF is
# written next to the metrics file as <output>.insertsize.pdf.
rule run_picardmetrics:
    input:
        'mapped/merged_bams/{sample}.bam'
    output:
        'mapped/bam_metrics/{sample}.metrics'
    shell:
        r'''
        picard CollectInsertSizeMetrics I={input} H={output}.insertsize.pdf O={output}

        '''

# Convert a Picard metrics file into an insert-size TSV using
# collect_picard_metrics.py from SRC_DIR.
rule create_insertsize_tsv:
    input:
        'mapped/bam_metrics/{sample}.metrics'
    output:
        'mapped/bam_metrics/{sample}.insertsizes.tsv'
    shell:
        r'''
        python {SRC_DIR}/collect_picard_metrics.py {input} {output}

        '''

# Convert the HTSeq count tables into one TPM table with counts_to_tpm.R.
# The script receives three comma-separated lists (counts, insert sizes,
# names). All three are built from the same sorted, de-duplicated sample list
# so that entry i of each list refers to the same sample.
# NOTE(review): no rule visible in this file writes
# 'mapped/HTSeq-counts/{sample}.counts.tsv', which format_counts needs to
# create the inputs used here -- confirm where those tables come from.
rule counts_to_tpm:
    input:
        count = expand('mapped/HTSeq-counts/{sample}.counts.noversion.tsv',
                       sample=sorted(set(SAMPLE_NAMES))),
        insert_size = expand('mapped/bam_metrics/{sample}.insertsizes.tsv',
                             sample=sorted(set(SAMPLE_NAMES))),
    output: 'mapped/tpm/HTSeq/masterTPM.tsv'
    params:
        gene_lengths=GENE_LENGTHS,
        name=sorted(set(SAMPLE_NAMES)),
        outprefix='mapped/tpm/HTSeq',
        gene_map=GENE_NAMES
    run:
        counts_input = ','.join(input.count)
        sizes_input = ','.join(input.insert_size)
        names = ','.join(params.name)
        shell('Rscript {SRC_DIR}/counts_to_tpm.R --counts={counts_input} --insert_sizes={sizes_input} --gene_lengths={params.gene_lengths} --inprefix={names} --gene_map={params.gene_map} --outprefix={params.outprefix}')


# Convert the featureCounts table into a TPM table with
# featurecounts_to_tpm.R. Insert-size files and sample names are passed as
# comma-separated lists built from the same sorted, de-duplicated sample list,
# so both lists are in the same, run-independent order.
rule featurecounts_to_tpm:
    input:
        insert_size = expand('mapped/bam_metrics/{sample}.insertsizes.tsv',
                             sample=sorted(set(SAMPLE_NAMES))),
        fcounts = 'mapped/featureCounts/fcounts.noversion.tsv'
    output: 'mapped/tpm/featureCounts/masterTPM.tsv'
    params:
        gene_lengths=GENE_LENGTHS,
        name=sorted(set(SAMPLE_NAMES)),
        outprefix='mapped/tpm/featureCounts',
        gene_map=GENE_NAMES
    run:
        sizes_input = ','.join(input.insert_size)
        names = ','.join(params.name)
        shell('Rscript {SRC_DIR}/featurecounts_to_tpm.R --counts={input.fcounts} --insert_sizes={sizes_input} --gene_lengths={params.gene_lengths} --inprefix={names} --gene_map={params.gene_map} --outprefix={params.outprefix}')


# Scatter plots from the HTSeq TPM table via plot_tpm_scatter.py.
# NOTE(review): the declared output is a directory path; newer Snakemake
# releases expect directory outputs to be wrapped in directory() -- confirm
# against the Snakemake version in use.
rule plot_tpm:
    input:
        'mapped/tpm/HTSeq/masterTPM.tsv'
    output:
        'mapped/plots/tpm_scatter/'
    params:
        prefix = 'mapped/plots/tpm_scatter/'
    shell:
        r'''
        export LC_ALL=en_US.UTF-8 && python {SRC_DIR}/plot_tpm_scatter.py --master {input} --outprefix  {params.prefix}
        '''

# Qualimap 'rnaseq' QC for one merged BAM; the report directory is
# mapped/post_mapping_qualimap/<sample>.
rule perform_qualimap_qc:
    input:
        'mapped/merged_bams/{sample}.bam',
    output:
        'mapped/post_mapping_qualimap/{sample}/qualimapReport.html',
    params:
        outdir = 'mapped/post_mapping_qualimap/{sample}',
        gtf = GTF
    shell:
        r'''
        qualimap rnaseq -bam {input} -gtf {params.gtf} --outdir {params.outdir} --java-mem-size=8G

        '''

# Run read_duplication.py on one merged BAM; the tool writes files starting
# with {params.outprefix}, of which the DupRate_plot.r script is tracked.
rule get_duplication_estimate:
    input:
        'mapped/merged_bams/{sample}.bam'
    output:
        'mapped/post_mapping_deduplication/{sample}/output.DupRate_plot.r'
    params:
        outprefix = 'mapped/post_mapping_deduplication/{sample}/output'
    shell:
        r'''
        source activate python2 && read_duplication.py -i {input} -o {params.outprefix}
        '''

# Aggregate report. MultiQC scans the whole working directory ('.'), so the
# inputs are not passed to the command; they only make sure the QC outputs
# exist before the report is built.
rule run_multiqc:
    input:
        expand('qc/{sample_name}_{lane}_R1_001_fastqc.html',
               zip, sample_name=SAMPLE_NAMES, lane=LANE_NAMES),
        expand('qc/{sample_name}_{lane}_R2_001_fastqc.html',
               zip, sample_name=SAMPLE_NAMES, lane=LANE_NAMES),
        expand('mapped/bam_metrics/{sample}.metrics',
               sample=SAMPLE_NAMES),
        expand('mapped/post_mapping_deduplication/{sample}/output.DupRate_plot.r',
               sample=SAMPLE_NAMES),
        expand('mapped/post_mapping_qualimap/{sample}/qualimapReport.html',
               sample=SAMPLE_NAMES),
        expand('mapped/merged_bams/{sample}.bam',
               sample=SAMPLE_NAMES),
        #'mapped/DE_analysis/HTSeq/'+GENOME_BUILD+'.HTSeq.DESeq2.sig.tsv',
        #'mapped/DE_analysis/featureCounts/'+GENOME_BUILD+'.featureCounts.DESeq2.sig.tsv',
    output:
        'multiqc_report/multiqc_report.html'
    shell:
        'export LC_ALL=en_US.UTF-8 && multiqc -f --outdir multiqc_report .'

# Extract the reads of one merged BAM that overlap the regions listed in
# INTERGENIC_BED. The BAM is indexed first (samtools index writes its index
# next to the input file).
rule extract_intergenic_reads:
    input:
        bam='mapped/merged_bams/{sample}.bam'
    output: 'mapped/intergenic_reads/{sample}.intergenic.bam'
    shell:
        # The BAM has to be named on the 'samtools view' command line; without
        # it samtools reads from stdin and never sees the sample.
        r'''
        samtools index {input.bam} && samtools view -b -h -L {INTERGENIC_BED} {input.bam} > {output}
        '''

# Exon-bin counts with the dexseq_count.py bundled under SRC_DIR/external,
# reading the name-sorted merged BAM (-r name) as paired-end data (-p yes).
rule count_dexseq:
    input:
        'mapped/merged_bams/{sample}.sortedByName.bam'
    output:
        'mapped/counts_DEXSeq/{sample}.DEXSeq.counts.tsv'
    shell:
        r'''
        source activate python2 && python {SRC_DIR}/external/DEXSeq/dexseq_count.py -p yes -s {HTSEQ_STRANDED} -r name -f bam {DEXSeq_GFF} {input} {output}
        '''

# Run FPKM_count.py on one merged BAM; the tool is given an output prefix and
# the <prefix>.FPKM.xls file is the tracked output.
rule calc_fpkm:
    input:
        'mapped/merged_bams/{sample}.bam'
    output:
        'mapped/fpkm/{sample}.FPKM.xls'
    params:
        prefix = 'mapped/fpkm/{sample}'
    shell:
        r'''
        source activate python2 && FPKM_count.py -i {input} -o {params.prefix} -r {GENE_BED}
        '''

# rMATS alternative-splicing comparison: T0 samples as -b1, T1 samples as -b2.
rule alt_t1t0:
    input: 
        t1 = expand('mapped/merged_bams/{sample}.bam', sample=T1_SAMPLES),
        t0 = expand('mapped/merged_bams/{sample}.bam', sample=T0_SAMPLES)
    params:
        outdir = 'alternative_splicing_rMATS/All_T1_vs_T0'
    output: 'alternative_splicing_rMATS/All_T1_vs_T0/MATS_output/AS_Event.MATS.JunctionCountOnly.txt'
    run:
        # Each group is one comma-separated string of BAM paths (a plain str,
        # not a tuple). sorted(set(...)) drops repeated paths: the sample
        # lists derive from SAMPLE_NAMES, which holds a name once per lane,
        # and a repeated BAM would be passed to rMATS as an extra replicate.
        b1 = ','.join(sorted(set(input.t0)))
        b2 = ','.join(sorted(set(input.t1)))
        gtf = GTF
        star_index = STAR_INDEX
        shell(r'''source activate python2 && python /home/cmb-panasas2/skchoudh/software_frozen/rMATS.3.2.5/RNASeq-MATS.py -b1 {b1} -b2 {b2} -t paired -len 138 -gtf {gtf} -bi {star_index} -o {params.outdir} 1>&2''')


# rMATS alternative-splicing comparison: T0 samples as -b1, T24 samples as -b2.
rule alt_t24t0:
    input: 
        t24 = expand('mapped/merged_bams/{sample}.bam', sample=T24_SAMPLES),
        t0 = expand('mapped/merged_bams/{sample}.bam', sample=T0_SAMPLES)
    params:
        outdir = 'alternative_splicing_rMATS/All_T24_vs_T0'
    output: 'alternative_splicing_rMATS/All_T24_vs_T0/MATS_output/AS_Event.MATS.JunctionCountOnly.txt'
    run:
        # Each group is one comma-separated string of BAM paths (a plain str,
        # not a tuple). sorted(set(...)) drops repeated paths: the sample
        # lists derive from SAMPLE_NAMES, which holds a name once per lane,
        # and a repeated BAM would be passed to rMATS as an extra replicate.
        b1 = ','.join(sorted(set(input.t0)))
        b2 = ','.join(sorted(set(input.t24)))
        gtf = GTF
        star_index = STAR_INDEX
        shell(r'''source activate python2 && python /home/cmb-panasas2/skchoudh/software_frozen/rMATS.3.2.5/RNASeq-MATS.py -b1 {b1} -b2 {b2} -t paired -len 138 -gtf {gtf} -bi {star_index} -o {params.outdir} 1>&2''')

# rMATS alternative-splicing comparison: T1 samples as -b1, T24 samples as -b2.
rule alt_t24t1:
    input:
        t1 = expand('mapped/merged_bams/{sample}.bam', sample=T1_SAMPLES),
        t24 = expand('mapped/merged_bams/{sample}.bam', sample=T24_SAMPLES)
    params:
        outdir = 'alternative_splicing_rMATS/All_T24_vs_T1'
    output: 'alternative_splicing_rMATS/All_T24_vs_T1/MATS_output/AS_Event.MATS.JunctionCountOnly.txt'
    run:
        # Each group is one comma-separated string of BAM paths (a plain str,
        # not a tuple). sorted(set(...)) drops repeated paths: the sample
        # lists derive from SAMPLE_NAMES, which holds a name once per lane,
        # and a repeated BAM would be passed to rMATS as an extra replicate.
        b1 = ','.join(sorted(set(input.t1)))
        b2 = ','.join(sorted(set(input.t24)))
        gtf = GTF
        star_index = STAR_INDEX
        shell(r'''source activate python2 && python /home/cmb-panasas2/skchoudh/software_frozen/rMATS.3.2.5/RNASeq-MATS.py -b1 {b1} -b2 {b2} -t paired -len 138 -gtf {gtf} -bi {star_index} -o {params.outdir} 1>&2''')

# Run convert-sam-for-rsem on one merged BAM; the tool is given the output
# path without its '.bam' extension.
rule rsem_bams:
    input:
        'mapped/merged_bams/{sample}.bam'
    output:
        'mapped/rsem_bams/{sample}.bam'
    params:
        prefix = 'mapped/rsem_bams/{sample}'
    shell:
        r'''convert-sam-for-rsem {input} {params.prefix}'''


# Quantify one RSEM-ready BAM with rsem-calculate-expression (paired-end)
# against RSEM_INDEX. params.annotation is declared but not referenced by the
# command.
rule rsem_counts:
    input:
        'mapped/rsem_bams/{sample}.bam'
    output:
        'mapped/rsem-counts/{sample}.isoforms.results'
    params:
        annotation = GTF,
        prefix = 'mapped/rsem-counts/{sample}'
    threads: 16
    shell:
        r'''
        rsem-calculate-expression --paired-end --bam -p {threads} {input} {RSEM_INDEX} {params.prefix}
        '''


# Build the DEXSeq GFF from the GTF with dexseq_prepare_annotation.py; the
# script is called through a hard-coded, site-specific path.
rule prepare_annotation_for_dexseq:
    input:
        GTF
    output:
        'mapped/dexseq/gencode.v25.annotation.DEXSeq.gff'
    shell:
        r'''source activate python2 && python /panfs/cmb-panasas2/skchoudh/software_frozen/anaconda27/envs/python2/lib/R/library/DEXSeq/python_scripts/dexseq_prepare_annotation.py {input} {output}
        '''

# Exon-bin counts for DEXSeq from the merged BAM, read as unstranded (-s no)
# paired-end (-p yes) data.
rule run_dexseq:
    input:
        bam='mapped/merged_bams/{sample}.bam',
        gff='mapped/dexseq/gencode.v25.annotation.DEXSeq.gff'
    output:
        'mapped/dexseq/{sample}.dexseq.counts.tsv'
    shell:
        # '-r pos': the merged BAM is assembled from lane BAMs that STAR wrote
        # with 'SortedByCoordinate', so mates are not on adjacent lines as
        # '-r name' would require.
        # NOTE(review): assumes bamtools merge keeps the coordinate order of
        # its inputs -- confirm.
        r'''source activate python2 && python /panfs/cmb-panasas2/skchoudh/software_frozen/anaconda27/envs/python2/lib/R/library/DEXSeq/python_scripts/dexseq_count.py -f bam -s no -p yes -r pos {input.gff} {input.bam} {output}
        '''