# MapAndMergeBams.snake

import os
import paramiko
import glob
import re
from collections import Counter
import fnmatch
import pandas as pd
from collections import defaultdict

from snakemake.remote.S3 import RemoteProvider as S3RemoteProvider
# S3 credentials come from the environment; an unset variable yields None,
# which is handed to the provider unchanged.
s3_key_id = os.getenv('AWS_ACCESS_KEY')
s3_access_key = os.getenv('AWS_SECRET_KEY')

# Remote provider for objects stored behind the s3.msi.umn.edu endpoint.
_s3_settings = {
    'endpoint_url': 'https://s3.msi.umn.edu',
    'access_key_id': s3_key_id,
    'secret_access_key': s3_access_key,
}
S3 = S3RemoteProvider(**_s3_settings)

from snakemake.remote.SFTP import RemoteProvider as SFTPRemoteProvider
# Authenticate over SFTP with the first identity held by the running SSH agent.
# get_keys() returns an empty sequence when no agent is reachable or no key has
# been added; indexing it directly would fail with an unexplained IndexError.
_agent_keys = paramiko.agent.Agent().get_keys()
if not _agent_keys:
    raise RuntimeError(
        'No identities available from the SSH agent; start ssh-agent and add '
        'a key with ssh-add before running this workflow.'
    )
key = _agent_keys[0]

# Remote provider used to read the raw FASTQ files over SFTP.
SFTP = SFTPRemoteProvider(username='cull0084', private_key=key)

# Load workflow settings from config.yaml; they are read further down through
# the `config` dictionary (e.g. config['ENSEMBL_GCA'], config['BUCKET']).
configfile: 'config.yaml' 

# Animals to process, written as <number><sex letter>.
SUBSET = [
    '1M', '12F', '17M', '20M', '35F', '35M', '36F', '37M', '39F', '39M',
    '40M', '49M', '51F', '52M', '61M', '65F', '65M', '66F', '66M', '67F',
    '67M', '69F', '82M', '86M', '87M', '90M',
]
#SUBSET = ['12F', '17M', '1M', '20M', '35F']

# Switch between the two sequencing runs; False selects the novaseq run.
hiseq = False

# Per-platform constants: a label used in output paths, the SFTP location of
# the raw reads, and the suffix of the FASTQ files ('' when not gzipped).
if hiseq:
    PLATFORM = 'hiseq'
    PROJECT_PATH = 'login.msi.umn.edu/panfs/roc/data_release/3/umgc/pre2018/2015-q4/mccuem/hiseq/151006_D00635_0082_BC7HAHANXX/Project_McCue_Project_022'
    ZIPPED = ''
else:
    PLATFORM = 'novaseq'
    PROJECT_PATH = 'login.msi.umn.edu/panfs/roc/data_release/3/umgc/mccuem/novaseq/181112_A00223_0050_AHCWF7DSXX/McCue_Project_032'
    ZIPPED = '.gz'

# Discover every sample name that has an R2 read file in the project directory.
ALL_SAMPLES, = SFTP.glob_wildcards(
    os.path.join(PROJECT_PATH, '{sample}_R2_001.fastq' + ZIPPED)
)

if hiseq:
    # hiseq names start with the animal ID as written in SUBSET (e.g. '35F_...').
    SAMPLES = [
        name
        for name in ALL_SAMPLES
        for wanted in SUBSET
        if wanted == name.split('_')[0]
    ]
else:
    # novaseq names separate number and sex with an underscore, so '35F' is
    # compared as '35_F' against the first two underscore-delimited fields.
    SAMPLES = [
        name
        for wanted in SUBSET
        for name in ALL_SAMPLES
        if wanted[:-1] + '_' + wanted[-1] == '_'.join(name.split('_', 2)[:2])
    ]


# Settings taken from the loaded configuration: the reference assembly to map
# against (currently the Ensembl one), the aligner label and the bucket prefix
# used in output paths.  A missing key raises KeyError.
#ASSEMBLY = config['NCBI_GCF']
ASSEMBLY, ALIGNER, BUCKET = (
    config[setting] for setting in ('ENSEMBL_GCA', 'ALIGNER', 'BUCKET')
)


# Top-level target (the first rule in this file, so presumably the default
# target): requests the list of merge commands, the same path that rule
# compose_merge declares as its output.
rule all:
    input:
        f'{BUCKET}/{ASSEMBLY}/paired_end/{PLATFORM}/merge_{ALIGNER}.list'


# ----------------------------------------------------------
#       Trimming
# ----------------------------------------------------------

# Trim one paired-end sample with AdapterRemoval.
#   input  - the R1/R2 FASTQ files of {sample}, fetched over SFTP from
#            PROJECT_PATH (ZIPPED carries the '.gz' suffix when applicable)
#   output - gzipped trimmed reads under
#            HorseGeneAnnotation/private/sequence/trimmed/<PLATFORM>/
# NOTE(review): no --basename/--settings/--discarded options are passed, so
# any auxiliary files AdapterRemoval writes presumably land in the working
# directory under its default names and could collide between concurrent
# jobs - confirm.
rule pe_trim_reads:
    input:
        R1 = SFTP.remote(f'{PROJECT_PATH}/{{sample}}_R1_001.fastq{ZIPPED}'),
        R2 = SFTP.remote(f'{PROJECT_PATH}/{{sample}}_R2_001.fastq{ZIPPED}')
    output:
        R1 = f'HorseGeneAnnotation/private/sequence/trimmed/{PLATFORM}/{{sample}}_trim1.fastq.gz',
        R2 = f'HorseGeneAnnotation/private/sequence/trimmed/{PLATFORM}/{{sample}}_trim2.fastq.gz'

    message:
        'AdapterRemoval - removing adapters and low quality bases on {wildcards.sample}'
    # The backslash-newline pairs are consumed by the Python string literal, so
    # the command reaches the shell as a single line; the trailing backslash
    # after the last option is therefore harmless.
    shell:
        '''
        AdapterRemoval \
        --file1 {input.R1} \
        --file2 {input.R2} \
        --output1 {output.R1} \
        --output2 {output.R2} \
        --gzip \
        --trimns \
        --trimqualities \
        '''

# ----------------------------------------------------------
#       STAR indices (NCBI and ENSEMBL)
# ----------------------------------------------------------

# Convenience target with no action of its own: asks for the download.done
# marker of both the NCBI and the Ensembl STAR index directories, i.e. the
# outputs of rules ncbi_STAR_index and ensembl_STAR_index.
rule download_STAR:
    input:
        expand('HorseGeneAnnotation/public/refgen/{GCF}/STAR_INDICES/download.done',GCF=config['NCBI_GCF']),
        expand('HorseGeneAnnotation/public/refgen/{GCA}/STAR_INDICES/download.done',GCA=config['ENSEMBL_GCA'])

# Files that make up a STAR index directory.  Both index rules stage exactly
# this set from S3, so it is listed once instead of once per rule.
STAR_INDEX_FILES = [
    'Genome',
    'SA',
    'SAindex',
    'chrLength.txt',
    'chrName.txt',
    'chrNameLength.txt',
    'chrStart.txt',
    'exonGeTrInfo.tab',
    'exonInfo.tab',
    'geneInfo.tab',
    'genomeParameters.txt',
    'sjdbInfo.txt',
    'sjdbList.fromGTF.out.tab',
    'sjdbList.out.tab',
    'transcriptInfo.tab',
]


def _star_index_paths(assembly):
    """Return the path of every STAR index file for *assembly*.

    *assembly* is the accession (or list of accessions) read from the
    configuration; ``expand`` combines it with each name in
    ``STAR_INDEX_FILES``.
    """
    return expand(
        'HorseGeneAnnotation/public/refgen/{assembly}/STAR_INDICES/{index_file}',
        assembly=assembly,
        index_file=STAR_INDEX_FILES,
    )


#DOES NOT STORE UNMAPPED READS (--outReadsUnmapped) nor log output
# NOTE(review): the remark above mentions a STAR mapping option, so it
# presumably belongs to the mapping rule rather than to this download - confirm.

# Stage the NCBI STAR index from S3 (keep_local=True asks Snakemake to keep the
# downloaded copies) and touch download.done as a marker once all are present.
rule ncbi_STAR_index:
    input:
        S3.remote(_star_index_paths(config['NCBI_GCF']), keep_local=True)
    output:
        touch(expand('HorseGeneAnnotation/public/refgen/{GCF}/STAR_INDICES/download.done',GCF=config['NCBI_GCF']))


# Same as ncbi_STAR_index, for the Ensembl assembly.
rule ensembl_STAR_index:
    input:
        S3.remote(_star_index_paths(config['ENSEMBL_GCA']), keep_local=True)
    output:
        touch(expand('HorseGeneAnnotation/public/refgen/{GCA}/STAR_INDICES/download.done',GCA=config['ENSEMBL_GCA']))

# ----------------------------------------------------------
#       STAR mapping (against the assembly selected by ASSEMBLY)
# ----------------------------------------------------------

# Map the trimmed read pair of one sample with STAR.
#   input  - trimmed R1/R2 FASTQ files plus the download.done marker of the
#            STAR index for ASSEMBLY
#   output - unsorted BAM, stored through the S3 provider
#   params - out_prefix: prefix STAR prepends to its output file names;
#            star_index: directory holding the STAR index
# NOTE(review): out_prefix is wrapped in S3.remote although it is a prefix, not
# a file; presumably only its string value matters here - confirm.
rule pe_STAR_mapping:
    input:
        R1 = f'HorseGeneAnnotation/private/sequence/trimmed/{PLATFORM}/{{sample}}_trim1.fastq.gz',
        R2 = f'HorseGeneAnnotation/private/sequence/trimmed/{PLATFORM}/{{sample}}_trim2.fastq.gz',
        star_dl = f'HorseGeneAnnotation/public/refgen/{ASSEMBLY}/STAR_INDICES/download.done'
    output:
        S3.remote(f'{BUCKET}/{ASSEMBLY}/paired_end/{PLATFORM}/{ALIGNER}/{{sample}}_Aligned.out.bam')
    params:
        out_prefix = S3.remote(f'{BUCKET}/{ASSEMBLY}/paired_end/{PLATFORM}/{ALIGNER}/{{sample}}_'),
        star_index = f'HorseGeneAnnotation/public/refgen/{ASSEMBLY}/STAR_INDICES'
    message:
        'STAR - Creating: {output} '
    run:
        # Double-check that the index download marker is present before
        # launching STAR.
        assert os.path.exists(input.star_dl)
        # Reads are gzipped, hence --readFilesCommand gunzip -c.  The BAM is
        # written unsorted; rule pe_sort_bam sorts it afterwards.
        # NOTE(review): --genomeLoad LoadAndKeep presumably leaves the genome in
        # shared memory after the job ends; nothing in this file unloads it -
        # confirm that is intended.
        shell('''
        STAR \
        --genomeDir {params.star_index} \
        --genomeLoad LoadAndKeep \
        --readFilesIn {input.R1} {input.R2} \
        --readFilesCommand gunzip -c \
        --outFileNamePrefix {params.out_prefix} \
        --outSAMtype BAM Unsorted \
        ''')


# ----------------------------------------------------------
#       Sort BAMs
# ----------------------------------------------------------

# Sort the STAR output of one sample.
#   input  - <sample>_Aligned.out.bam as written by rule pe_STAR_mapping
#   output - <sample>.sorted.bam next to it; both go through the S3 provider
# `samtools view -u` streams the BAM uncompressed into `samtools sort`, which
# reads it from stdin ('-') and writes the sorted file given with -o.
rule pe_sort_bam:
    input:
        bam = S3.remote(f'{BUCKET}/{ASSEMBLY}/paired_end/{PLATFORM}/{ALIGNER}/{{sample}}_Aligned.out.bam')
    output:
        sortedbam = S3.remote(f'{BUCKET}/{ASSEMBLY}/paired_end/{PLATFORM}/{ALIGNER}/{{sample}}.sorted.bam')
    shell:
        '''
        samtools view -u {input.bam} | samtools sort - -o {output.sortedbam}
        '''

# ----------------------------------------------------------
#       Merge BAMs
# ----------------------------------------------------------

def _animal_id(bam_path, n_fields):
    """Return the animal ID encoded at the start of a BAM file name.

    The ID is the first *n_fields* underscore-delimited fields of the base
    name, e.g. '35F' (one field) or '35_F' (two fields).
    """
    return '_'.join(os.path.basename(bam_path).split('_')[:n_fields])


def _merge_commands(bams, n_fields):
    """Build one ``samtools merge`` command line per animal.

    *bams* are sorted BAM paths; files whose names share the same animal ID
    (see ``_animal_id``) are merged together.  Groups and their members keep
    the order in which they first appear in *bams*.

    Output naming:
      * several files - the stem of the first file, followed by the last four
        characters of the stem of every further file, joined with '_', plus
        '.merged.sorted.bam', placed in the directory of the first file;
      * a single file - its own path with 'sorted.bam' replaced by
        'merged.sorted.bam'.

    Returns the command lines as a list of strings.
    """
    groups = defaultdict(list)
    for bam in bams:
        groups[_animal_id(bam, n_fields)].append(bam)

    commands = []
    for members in groups.values():
        first = members[0]
        if len(members) > 1:
            stem = os.path.basename(first).split('.')[0]
            tags = [os.path.basename(m).split('.')[0][-4:] for m in members[1:]]
            merged = os.path.join(
                os.path.dirname(first),
                '_'.join([stem] + tags) + '.merged.sorted.bam',
            )
        else:
            merged = first.replace('sorted.bam', 'merged.sorted.bam')
        commands.append(' '.join(['samtools merge', merged] + list(members)))
    return commands


# Write the list of `samtools merge` commands, one line per animal, that
# combine the sorted BAMs of all selected samples.
#   input  - sorted BAMs of every entry in SAMPLES (kept locally after download)
#   output - text file with one command per line
rule compose_merge:
    input:
        bams = S3.remote(expand(f'{BUCKET}/{ASSEMBLY}/paired_end/{PLATFORM}/{ALIGNER}/{{sample}}.sorted.bam',sample=SAMPLES),keep_local=True)
    output:
        txt = f'{BUCKET}/{ASSEMBLY}/paired_end/{PLATFORM}/merge_{ALIGNER}.list'
    run:
        # hiseq names carry the animal ID in their first field ('35F_...');
        # novaseq names spread it over two ('35_F_...').  Grouping novaseq files
        # on the first field alone would mix e.g. 35_F with 35_M.
        n_fields = 1 if PLATFORM == 'hiseq' else 2
        with open(output.txt, 'w') as out:
            for cmd in _merge_commands(input.bams, n_fields):
                print(cmd, file=out)

##rule se_sort_bam:
##    input:
##        bam = '/scratch/single_end_mapping/star_map_se/RNASEQ/bam/{sample}_se_Aligned.out.bam'
##    output:
##        sorted_bam = '/scratch/single_end_mapping/star_map_se/RNASEQ/bam/{sample}_se.sorted.bam'
##    shell:
##        '''
##        samtools view -u {input.bam} | samtools sort -o {output.sorted_bam}
##        '''