Snakefile.kallisto

from itertools import chain, combinations
from os.path import join
import glob
import re

CDNA_FA_GZ = '/home/gaurav/genomes/hg38/cdna/gencode.v27.transcripts.fa.gz'
CDNA_IDX = '/home/gaurav/genomes/hg38/cdna/gencode.v27.transcripts.idx'
RAWDATA_DIR = join('/home/gaurav/NU-RNASeq-fastq', config['dir_id'])
OUT_DIR = join('/home/gaurav/NU-RNASeq-out', config['dir_id'])

workdir: OUT_DIR


SAMPLES = glob.glob('{}**/*.fastq.gz'.format(RAWDATA_DIR), recursive=True)
print(SAMPLES)
SAMPLE_LANE = []
for sample in SAMPLES:
    sample = sample.replace('{}/'.format(RAWDATA_DIR),'')
    sample_name = re.split(r'_L\d\d\d_', sample)[0]
    lane_name = re.search(r'L\d\d\d', sample).group()
    SAMPLE_LANE.append((sample_name, lane_name))

SAMPLE_LANE = set(SAMPLE_LANE)
SAMPLE_LANE = sorted(SAMPLE_LANE, key=lambda tup: tup[0])
SAMPLE_NAMES, LANE_NAMES = zip(*SAMPLE_LANE)

print(SAMPLE_NAMES)
print(LANE_NAMES)

rule all:
    input:
        CDNA_IDX,
        expand('preprocessed/merged_fastq/{sample_name}_R1.fq.gz', sample_name=SAMPLE_NAMES),
        expand('preprocessed/merged_fastq/{sample_name}_R2.fq.gz', sample_name=SAMPLE_NAMES),
        expand('counts/{sample_name}/abundance.tsv', sample_name=SAMPLE_NAMES)

rule create_index:
    input: CDNA_FA_GZ
    output: CDNA_IDX
    shell:
        '''kallisto index -i {output} {input}'''

rule merge_fastq_R1:
    input: expand(RAWDATA_DIR+'/{{sample_name}}_{lane}_R1_001.fastq.gz', lane=set(LANE_NAMES))
    output: 'preprocessed/merged_fastq/{sample_name}_R1.fq.gz'
    run:
        inp = ' '.join(input)
        shell(r'''cat {inp} > {output}''')

rule merge_fastq_R2:
    input: expand(RAWDATA_DIR+'/{{sample_name}}_{lane}_R2_001.fastq.gz', lane=set(LANE_NAMES))
    output: 'preprocessed/merged_fastq/{sample_name}_R2.fq.gz'
    run:
        inp = ' '.join(input)
        shell(r'''cat {inp} > {output}''')

rule quantify:
    input:
        R1='preprocessed/merged_fastq/{sample_name}_R1.fq.gz',
        R2='preprocessed/merged_fastq/{sample_name}_R2.fq.gz'
    output:
        'counts/{sample_name}/abundance.tsv',
    params:
        index=CDNA_IDX,
        outdir='counts/{sample_name}'
    threads: 30 ## hardcoded below
    shell:
        r'''
        kallisto quant --index={params.index} --threads=30\
        --output-dir={params.outdir} -b 100 <(zcat {input.R1}) <(zcat {input.R2})'''