forked from perslab/CELLECT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcellect-genes.snakefile
121 lines (93 loc) · 5.58 KB
/
cellect-genes.snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# -*- coding: utf-8 -*-
# Some overlapping functionality
include: "rules/common_func1.smk"
########################################################################################
################################### FUNCTIONS ##########################################
########################################################################################
# see the *.smk files
########################################################################################
################################### VARIABLES ##########################################
########################################################################################
# Where all the output will be saved
BASE_OUTPUT_DIR = os.path.join(config['BASE_OUTPUT_DIR'],"CELLECT-MAGMA") # note the overwriting of this variable, and that we use it to run CELLECT-MAGMA and the below variable for CELLECT-GENES workflow!
CELLECT_GENES_OUTPUT_DIR = os.path.join(config['BASE_OUTPUT_DIR'],"CELLECT-GENES")
# More overlapping functionality
include: "rules/common_func2.smk"
wildcard_constraints:
BASE_OUTPUT_DIR = BASE_OUTPUT_DIR,
CELLECT_GENES_OUTPUT_DIR = CELLECT_GENES_OUTPUT_DIR,
run_prefix = r"|".join(set(SPECIFICITY_INPUT.keys())),
annotations = r"|".join(set(ANNOTATIONS_DICT)),
gwas = r"|".join(set(GWAS_SUMSTATS.keys()))
########################################################################################
################################### Target files ##########################################
########################################################################################
# see also the *.smk files
# put this here so it is possible to run other cellect workflows without having to set config['ANALYSIS_TYPE']['effector_genes'] to False
list_target_files = []
analysis_types_performed = [] # this is for parsing and compiling the results files
tmp = "{CELLECT_GENES_OUTPUT_DIR}/results/effector_genes.csv".format(CELLECT_GENES_OUTPUT_DIR = CELLECT_GENES_OUTPUT_DIR)
list_target_files.extend([tmp])
tmp = expand("{CELLECT_GENES_OUTPUT_DIR}/out/{run_prefix}__{gwas}.effector_genes.txt",
CELLECT_GENES_OUTPUT_DIR = CELLECT_GENES_OUTPUT_DIR,
run_prefix = list(SPECIFICITY_INPUT.keys()),
gwas = list(GWAS_SUMSTATS.keys()))
list_target_files.extend(tmp)
analysis_types_performed.extend(['effector_genes'])
########################################################################################
############################# Pre-check of inputs ######################################
########################################################################################
if not config['N_GENES_MAGMA']>0:
raise Exception("config.yml: N_GENES_MAGMA must be a positive integer.")
if config['PERCENTILE_CUTOFF_ESMU']<0 or config['PERCENTILE_CUTOFF_ESMU']>100:
raise Exception("config.yml: PERCENTILE_CUTOFF_ESMU must be an integer between 0 and 100.")
########################################################################################
################################### MODULES ############################################
########################################################################################
import pandas as pd
########################################################################################
################################### PIPELINE ###########################################
########################################################################################
rule all:
'''
Defines the final target files to be generated.
'''
input:
list_target_files #expand("{CELLECT_GENES_OUTPUT_DIR}/out/{run_prefix}__{gwas}.effector_genes.txt", CELLECT_GENES_OUTPUT_DIR=CELLECT_GENES_OUTPUT_DIR, run_prefix=list(SPECIFICITY_INPUT.keys()), gwas=list(GWAS_SUMSTATS.keys()))
rule parse_results:
"""
Generates {CELLECT_GENES_OUTPUT_DIR}/results/<analysis_type>.csv by parsing ALL output files in {CELLECT_GENES_OUTPUT_DIR}/out/.
"""
input:
filter(lambda s: '.csv' not in s, list_target_files) #Not sure if strictly necessary, just to make sure that the .csv files are generated AFTER the analysis
output:
expand("{{CELLECT_GENES_OUTPUT_DIR}}/results/{analysis_type}.csv", analysis_type=analysis_types_performed)
shell:
"python3 scripts/parse_results.py --base_output_dir {CELLECT_GENES_OUTPUT_DIR}"
subworkflow cellect_magma:
snakefile:
"cellect-magma.snakefile"
rule get_effector_genes:
'''
Extract top percentile of ESmu > 0.0 genes
Extract top n MAGMA genes
Take intersection
'''
input:
cellect_magma(expand("{BASE_OUTPUT_DIR}/precomputation/{gwas}/{gwas}.resid_correct_all.gsa.genes.pvals.out", BASE_OUTPUT_DIR=BASE_OUTPUT_DIR,gwas=list(GWAS_SUMSTATS.keys())))
output:
"{CELLECT_GENES_OUTPUT_DIR}/out/{run_prefix}__{gwas}.effector_genes.txt"
conda:
"envs/cellectpy3.yml"
log:
"{CELLECT_GENES_OUTPUT_DIR}/logs/log.get_effector_genes_snake.{run_prefix}.{gwas}.txt"
params:
magma_output_dir = BASE_OUTPUT_DIR,
cellect_genes_output_dir = CELLECT_GENES_OUTPUT_DIR,
specificity_matrix_file = lambda wildcards: SPECIFICITY_INPUT[wildcards.run_prefix]['path'],
specificity_matrix_name = "{run_prefix}",
gwas_name = lambda wildcards: GWAS_SUMSTATS[wildcards.gwas]['id'],
n_genes_magma = config['N_GENES_MAGMA'],
percentile_cutoff_esmu = config['PERCENTILE_CUTOFF_ESMU']
script:
"scripts/get_effector_genes.py"