# Snakefile_config

import os
import pandas

# Root directory under which the DATA paths of this workflow are built.
BASE_PATH = '/lab/work/arushiv/regulatoryAnnotations_comparisons'

# Absolute path of the GREGOR.pl script.
GREGOR_SCRIPT_PATH = '/lab/sw/modules/GREGOR/1.2.1/script/GREGOR.pl'

# Remote source of every input, keyed by dataset name.  Placeholders such as
# {cell}, {region} and {chrom} are kept verbatim in the values.
LINKS = {
    # Annotations, chromatin states and reference files
    'annotations': "https://theparkerlab.med.umich.edu/data/regulatory_comparisons/annotations/{cell}.{region}.annotations.bed",
    'chromatinStates': "https://theparkerlab.med.umich.edu/data/papers/doi/10.1073/pnas.1621192114/chromatin_states/{cell}.chromatinStates.bed.gz",
    'workspace': "https://theparkerlab.med.umich.edu/data/regulatory_comparisons/reference/hg19_chromsizes.bed",
    'gencode_annotation': "ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz",

    # GTEx v7 tables
    'gtex_expression': "https://theparkerlab.med.umich.edu/data/regulatory_comparisons/gtex_v7/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_median_tpm.gct.gz",
    'gtex_lcleqtl': "https://theparkerlab.med.umich.edu/data/regulatory_comparisons/gtex_v7/Cells_EBV-transformed_lymphocytes.v7.egenes.txt.gz",
    'gtex_bloodeqtl': "https://theparkerlab.med.umich.edu/data/regulatory_comparisons/gtex_v7/Whole_Blood.v7.egenes.txt.gz",

    # QTL effect inputs
    'dsqtl': "http://eqtl.uchicago.edu/dsQTL_data/QTLs/GSE31388_dsQtlTable.txt.gz",
    'gm_allelic_bias': "https://theparkerlab.med.umich.edu/data/regulatory_comparisons/effectSize/GM12878.biasInfo.dat",
    'gm_expected_deviation': "https://theparkerlab.med.umich.edu/data/regulatory_comparisons/effectSize/GM12878.expectedRef.dat",
    'hg19_liftover': "http://hgdownload.cse.ucsc.edu/goldenpath/hg18/liftOver/hg18ToHg19.over.chain.gz",
    'dbsnp150': "ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b150_GRCh37p13/VCF/common_all_20170710.vcf.gz",

    # 1000 Genomes VCFs, their indexes and the sample table
    '1000g': "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr{chrom}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz",
    '1000g_index': "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr{chrom}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi",
    'vcf_sampleInfo': "https://theparkerlab.med.umich.edu/data/regulatory_comparisons/reference/igsr.tsv.gz",

    # Plotting scripts (git repository) and GWAS catalog
    'plot_information': "git@github.com:arushiv/SeabornCustomGrid.git",
    'nhgri_gwas': "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative",

    # Further reference and chromatin state model files
    'pc_gene_list': "https://theparkerlab.med.umich.edu/data/regulatory_comparisons/reference/tsslist_gencodev19_pc.bed",
    'hg19_lengths': "https://theparkerlab.med.umich.edu/data/regulatory_comparisons/reference/hg19_lengths.txt",
    'state13_model': "https://theparkerlab.med.umich.edu/data/papers/doi/10.1073/pnas.1621192114/chromatin_states/model_13.txt",
    'binarized': "https://theparkerlab.med.umich.edu/data/regulatory_comparisons/chromatinStates/bed_binarized/{cell}_chr{chrom}_binary.txt.gz",
}

# Local destination of every input below BASE_PATH.  The key set is the same
# as in LINKS; wildcard placeholders are kept verbatim in the paths.
DATA = {
    # Annotations, chromatin states and reference files
    'annotations': os.path.join(BASE_PATH, "data/annotations/{cell}.{region}.annotations.bed"),
    'chromatinStates': os.path.join(BASE_PATH, "data/chromatinStates/{cell}.{chromatinState}.bed"),
    'workspace': os.path.join(BASE_PATH, "data/reference/hg19_chromsizes.bed"),
    'gencode_annotation': os.path.join(BASE_PATH, "data/reference/gencode.v19.tssUniq.annotation.bed.gz"),
    'pc_gene_list': os.path.join(BASE_PATH, "data/reference/tsslist_gencodev19_pc.bed"),

    # GTEx v7 tables
    'gtex_expression': os.path.join(BASE_PATH, "data/gtex_v7/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_median_tpm.gct.gz"),
    'gtex_lcleqtl': os.path.join(BASE_PATH, "data/gtex_v7/Cells_EBV-transformed_lymphocytes.v7.egenes.txt.gz"),
    'gtex_bloodeqtl': os.path.join(BASE_PATH, "data/gtex_v7/Whole_Blood.v7.egenes.txt.gz"),

    # QTL effect inputs
    'dsqtl': os.path.join(BASE_PATH, "data/qtlEffects/dsQtlTable.txt.gz"),
    'gm_allelic_bias': os.path.join(BASE_PATH, "data/qtlEffects/gm.allelic.bias.dat.gz"),
    'gm_expected_deviation': os.path.join(BASE_PATH, "data/qtlEffects/gm.expectedFracRef.dat.gz"),
    'hg19_liftover': os.path.join(BASE_PATH, "data/qtlEffects/hg18ToHg19.over.chain.gz"),
    'dbsnp150': os.path.join(BASE_PATH, "data/qtlEffects/common_all_20170710.vcf.gz"),

    # Disabled entries, kept for reference:
    #   'gm_allelic_bias_dsp' : os.path.join(BASE_PATH, "data/qtlEffects/GM12878.snps.downsampled_{dcoverage}.bed.gz"),
    #   'gm_dsp_expected_deviation' : os.path.join(BASE_PATH,  "data/qtlEffects/GM12878.post_mpileup.err.gz"),
    #   '1000g' : "/lab/data/genomes/human/hg19/1000GenomesDownloads/ALL.chr{chrom}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz",
    #   '1000g_index' : "/lab/data/genomes/human/hg19/1000GenomesDownloads/ALL.chr{chrom}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi",

    # 1000 Genomes VCFs, their indexes and the sample table
    '1000g': os.path.join(BASE_PATH, "data/1000g/ALL.chr{chrom}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"),
    '1000g_index': os.path.join(BASE_PATH, "data/1000g/ALL.chr{chrom}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi"),
    'vcf_sampleInfo': os.path.join(BASE_PATH, "data/1000g/igsr.tsv.gz"),

    # A list of two script paths (plot_utils.py, faceted_jointplots.py)
    'plot_information': expand(
        os.path.join(BASE_PATH, "histone_info_content/scripts/SeabornCustomGrid", "{script}.py"),
        script=["plot_utils", "faceted_jointplots"],
    ),

    # GWAS catalog, genome lengths and chromatin state model files
    'nhgri_gwas': os.path.join(BASE_PATH, "data/gwas", "gwas_catalog_v1.0.2-associations_e92_r2018-05-29.tsv"),
    'hg19_lengths': os.path.join(BASE_PATH, "data/reference/hg19_lengths.txt"),
    'state13_model': os.path.join(BASE_PATH, "data/chromatinStates/model_13.txt"),
    'binarized': os.path.join(BASE_PATH, "data/chromatinStates/bedBinarized/", "{cell}_chr{chrom}_binary.txt.gz"),
}

# Directory names keyed by role; every key maps to the identically spelled
# directory name.
DIRECTORIES = {
    name: name
    for name in ('data', 'intermediateFiles', 'scripts', 'figures', 'logs')
}

# Values available for the {region} placeholder.
REGIONS = [
    'broadDomains',
    'hotRegions',
    'stretchEnhancer',
    'superEnhancers',
    'typicalEnhancers',
]

# Values available for the {cell} placeholder.
CELLS = [
    'GM12878',
    'H1',
    'HepG2',
    'K562',
]

# Chromatin state names; download_chromatinStates declares one output file
# per entry.
CHROMATINSTATES = [
    '10_Active_enhancer_2',
    '11_Weak_enhancer',
    '14_Bivalent_poised_TSS',
    '16_Repressed_polycomb',
    '17_Weak_repressed_polycomb',
    '18_Quiescent_low_signal',
    '1_Active_TSS',
    '2_Weak_TSS',
    '3_Flanking_TSS',
    '5_Strong_transcription',
    '6_Weak_transcription',
    '8_Genic_enhancer',
    '9_Active_enhancer_1',
]

# Chromosome numbers 1 through 22.
CHROM = range(1, 23)
# Directory of the binarized files below BASE_PATH; this is the same directory
# that DATA['binarized'] places its files in.
BINARIZEDIR = os.path.join(BASE_PATH,  "data/chromatinStates/bedBinarized/")
   
rule download_annotations:
    """
    Fetch the annotation BED file of one {cell}/{region} combination.

    The download is stored under a ``.bed.gz`` name and then gunzipped in
    place, because GREGOR requires unzipped files later.

    NOTE(review): LINKS['annotations'] ends in ".bed" while the downloaded
    file is handed to gunzip - confirm that the server delivers gzip data.
    """
    params:
        outname = os.path.join(BASE_PATH, "data/annotations/{cell}.{region}.annotations.bed.gz"),
        link = LINKS['annotations'],
    output:
        protected(DATA['annotations']),
    shell:
        """
        wget --no-check-certificate {params.link} -O {params.outname} ;
        gunzip {params.outname}
        """

rule download_chromatinStates:
    """
    Download the chromatin state segmentation of one cell type and write the
    chrom/start/end columns of each state to a separate BED file.
    """
    params:
        link = LINKS['chromatinStates']
    output:
        # The downloaded archive is flagged temp(); the per-state files are
        # declared for every entry of CHROMATINSTATES.
        # (disabled alternative for the second output: protected(DATA['chromatinStates']))
        gzip = temp(os.path.join(BASE_PATH,  "data/chromatinStates/{cell}.chromatinStates.bed.gz")),
        separate = expand(os.path.join(BASE_PATH,  "data/chromatinStates/{{cell}}.{chromatinState}.bed"), chromatinState = CHROMATINSTATES),
    run:
        shell(r"""
        wget --no-check-certificate {params.link} -O {output.gzip}
        """)
        column_names = ['chrom','start','end','chromatinState','colorCode']
        segmentation = pandas.read_csv(output.gzip, sep='\t', header=None, names=column_names)
        path_template = "data/chromatinStates/{cell}.{chromatinState}.bed"
        for state, intervals in segmentation.groupby('chromatinState'):
            # '/' is replaced by '_', presumably so the state name can be used as a file name
            relative_path = path_template.format(cell=wildcards.cell, chromatinState=state.replace('/', '_'))
            destination = os.path.join(BASE_PATH,  relative_path)
            intervals[['chrom','start','end']].to_csv(destination, sep='\t', header=False, index=False)
        
rule download_workspace:
    """Download the hg19 chromosome sizes BED file to DATA['workspace']."""
    params:
        link = LINKS['workspace'],
    output:
        DATA['workspace'],
    shell:
        """wget --no-check-certificate {params.link} -O {output} """

rule download_gencodeAnnotation:
    """
    Download the GENCODE v19 GTF and reduce it to a unique, bgzipped BED table.

    Outputs:
        tempAnnot: the downloaded GTF archive (flagged temp()).
        annot:     the reduced table, DATA['gencode_annotation'].

    The shell pipeline drops the "##" header lines, keeps GTF columns 1, 4, 5,
    18, 6, 7, 10 and 14, strips the double quotes, and prints one 1-bp interval
    per row followed by the id, name and type fields: (start, start+1) for
    rows on the "+" strand and (end-1, end) for all other rows.  The rows are
    then sorted and de-duplicated.  (The output file name suggests these
    intervals are TSS positions - confirm against the downstream use.)
    """
    output:
        tempAnnot = temp(os.path.join(BASE_PATH, "data/reference/gencode.v19.annotation.gtf.gz")),
        annot = DATA['gencode_annotation']
    params:
        link = LINKS['gencode_annotation'],
    run:
        # Fix: rows that are not on the "+" strand now print chrom/start/end as
        # well.  The perl script already computed $start and $end for them but
        # printed only id/name/type, which left rows without coordinates in
        # the BED output.
        shell(r"""
        wget {params.link} -O {output.tempAnnot} ;
        zcat {output.tempAnnot} | grep -v -P ^## | awk '{{print $1"\t"$4"\t"$5"\t"$18"\t"$6"\t"$7"\t"$10"\t"$14}}' | sed 's/"//g' | sed 's/;//' | perl -e 'while (<>) {{ chomp; ($c, $ts, $te, $n, $score, $s, $ensg, $biotype) = split /\s+/; ($id,@junk) = split (/[;.]/,$ensg); ($type,@junk) = split (/[;.]/,$biotype); if ($s eq "+") {{ $end = $ts+1; print "$c\t$ts\t$end\t$id\t$n\t$type\n";}}  else {{ $start = $te-1; $end = $start + 1; print "$c\t$start\t$end\t$id\t$n\t$type\n";   }}    }}' | sort | uniq | bgzip -c > {output.annot}
        """)
        
rule download_1000g:
    """Download the 1000 Genomes VCF of one chromosome together with its .tbi index."""
    params:
        vcfLink = LINKS['1000g'],
        indexLink = LINKS['1000g_index'],
    output:
        vcf = DATA['1000g'],
        index = DATA['1000g_index'],
    shell:
        """
        wget {params.vcfLink} -O {output.vcf} ;
        wget {params.indexLink} -O {output.index} ;       
        """

rule download_100g_sampleInfo:
    """
    Download the sample information table (igsr.tsv.gz) that is stored next to
    the 1000 Genomes VCFs.

    NOTE(review): the rule name says "100g"; presumably "1000g" was meant.  The
    name is left as is because it may be referenced elsewhere.
    """
    params:
        vcf_sampleInfo = LINKS['vcf_sampleInfo'],
    output:
        vcf_sampleInfo = DATA['vcf_sampleInfo'],
    shell:
        """
        wget {params.vcf_sampleInfo} -O {output.vcf_sampleInfo}
        """
        
rule download_gtexEqtl_lcl:
    """Download the GTEx v7 eGenes table for EBV-transformed lymphocytes."""
    params:
        lcl = LINKS['gtex_lcleqtl'],
    output:
        lcl = DATA['gtex_lcleqtl'],
    shell:
        """
        wget --no-check-certificate {params.lcl} -O {output.lcl} ;
        """

rule download_gtexEqtl_blood:
    """Download the GTEx v7 eGenes table for whole blood."""
    params:
        blood = LINKS['gtex_bloodeqtl'],
    output:
        blood = DATA['gtex_bloodeqtl'],
    shell:
        """
        wget --no-check-certificate {params.blood} -O {output.blood}
        """

rule download_gtexExpression:
    """Download the GTEx v7 gene median TPM table to DATA['gtex_expression']."""
    params:
        link = LINKS['gtex_expression'],
    output:
        DATA['gtex_expression'],
    shell:
        """wget --no-check-certificate {params.link} -O {output}"""
        
rule download_dsqtl:
    """Download the dsQTL table and the hg18ToHg19 liftOver chain file."""
    params:
        link_dsqtl = LINKS['dsqtl'],
        link_hg19_liftover = LINKS['hg19_liftover'],
    output:
        dsqtl = DATA['dsqtl'],
        hg19_liftover = DATA['hg19_liftover'],
    shell:
        """
        wget {params.link_dsqtl} -O {output.dsqtl}
        wget {params.link_hg19_liftover} -O {output.hg19_liftover}
        """
        
rule download_dbsnp150:
    """Download the dbSNP build 150 VCF (common_all_20170710.vcf.gz)."""
    params:
        link = LINKS['dbsnp150'],
    output:
        DATA['dbsnp150'],
    shell:
        """  wget {params.link} -O {output} """
    
rule download_gmAllelicBias:
    """
    Download the GM12878 allelic bias file and the GM12878 expected reference
    fraction file.

    NOTE(review): both links end in ".dat" while the output names end in
    ".dat.gz" - confirm whether the downloaded data is actually gzip data.
    """
    params:
        gm_allelic_bias = LINKS['gm_allelic_bias'],
        gm_expected_deviation = LINKS['gm_expected_deviation'],
    output:
        gm_allelic_bias = DATA['gm_allelic_bias'],
        gm_expected_deviation = DATA['gm_expected_deviation'],
    shell:
        """
        wget --no-check-certificate {params.gm_allelic_bias} -O {output.gm_allelic_bias} ;
        wget --no-check-certificate {params.gm_expected_deviation} -O {output.gm_expected_deviation} ;
        """

rule get_scripts_to_plot_information_content:
    """
    Clone the SeabornCustomGrid repository into the directory that holds the
    script paths listed in DATA['plot_information'].
    """
    params:
        plot_information = LINKS['plot_information'],
        # All listed scripts share one directory, so the first entry is used to derive it.
        directory = os.path.dirname(DATA['plot_information'][0]),
    output:
        plot_information = DATA['plot_information'],
    shell:
        """
        git clone {params.plot_information} {params.directory}
        """

rule get_nhgri_gwas:
    """Download the GWAS catalog associations file to DATA['nhgri_gwas']."""
    params:
        nhgri_gwas = LINKS['nhgri_gwas'],
    output:
        nhgri_gwas = DATA['nhgri_gwas'],
    shell:
        """
        wget {params.nhgri_gwas} -O {output.nhgri_gwas}
        """
        
rule download_tss_pc_gene_list:
    """Fetch the TSS list of protein coding genes (tsslist_gencodev19_pc.bed)."""
    params:
        pc_gene_list = LINKS['pc_gene_list'],
    output:
        pc_gene_list = DATA['pc_gene_list'],
    shell:
        """
        wget --no-check-certificate {params.pc_gene_list} -O {output.pc_gene_list}
        """

rule download_hg19_lengths:
    """Fetch the hg19 lengths file (hg19_lengths.txt)."""
    output:
        hg19_lengths = DATA['hg19_lengths'],
    params:
        hg19_lengths = LINKS['hg19_lengths'],
    shell:
        """
        wget --no-check-certificate {params.hg19_lengths} -O {output.hg19_lengths} 
        """

rule download_state13model:
    """Fetch the model file of the 13 chromatin state model (model_13.txt)."""
    output:
        state13_model = DATA['state13_model'],
    params:
        state13_model = LINKS['state13_model'],
    shell:
        """
        wget --no-check-certificate {params.state13_model} -O {output.state13_model} 
        """

rule download_bedBinarized:
    """Fetch the binarized histone ChIP-seq file of one {cell}/{chrom} combination."""
    output:
        binarized = DATA['binarized'],
    params:
        binarized = LINKS['binarized'],
    shell:
        """
        wget --no-check-certificate {params.binarized} -O {output.binarized} 
        """