-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnakefile
executable file
·684 lines (628 loc) · 31.1 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
# QTL mapping pipeline
# Jack Humphrey
import glob
import pandas as pd
import os
mode = config["mode"]
dataCode = config["dataCode"]
print(" * QTL-mapping pipeline *")
print(" Jack Humphrey 2019-2020 ")
print(" * Data code is : %s " % dataCode)
print(" * Mode selected is: %s" % mode)
nPerm = 10000 # number of permutations of the permutation pass
R_VERSION = "R/4.0.3"
prepare_editing_R_VERSION = "R/4.2.0"
shell.prefix('ml anaconda3; CONDA_BASE=$(conda info --base); source $CONDA_BASE/etc/profile.d/conda.sh; ml purge; conda activate QTL-pipeline; ml {R_VERSION};')
# Interaction QTLs
# interaction mode - set default to False
if "interaction" not in config.keys():
config["interaction"] = False
interaction = bool(config["interaction"])
# requires an interaction_file and an interaction_name in the config.yaml
if( interaction is True ):
print(" * interaction mode selected")
if "interaction_name" not in config.keys():
sys.exit("config.yaml does not contain interaction_name value")
if( interaction is True ):
if "interaction_file" not in config.keys():
sys.exit("config.yaml does not contain interaction_file value")
if(interaction is True):
interaction_name = config["interaction_name"]
interaction_file = config["interaction_file"]
interaction_file_list = [str(i) for i in interaction_file]
interaction_dict = dict(zip(interaction_name, interaction_file_list))
# Trans QTLs
# set default to False
if "trans" not in config.keys():
config["trans"] = False
trans = bool(config["trans"])
# setting value for "extract_str" to determine which snps to use for trans analysis
if "snp_list" not in config.keys():
config["snp_list"] = "all"
snps = config["snp_list"]
if snps == "all":
extract_str = ""
else:
extract_str = "--extract " + snps
print("EXTRACT STRING = ", extract_str)
# Conditional eQTL
# set default to False
if "conditional_qtls" not in config.keys():
config["conditional_qtls"] = False
conditional_qtls = bool(config["conditional_qtls"])
if( trans is True ):
print(" * trans-eQTL mode selected")
if( conditional_qtls is True ):
print(" * conditinal-eQTL mode selected")
## Nominal mapping is the slow step - turn off when optimising numbers of PEER factors
if "no_nominal" not in config.keys():
config["no_nominal"] = False
no_nominal = bool(config["no_nominal"])
## MAF - minor allele frequency - default = 0.01
MAF=0.01
if "MAF" not in config.keys():
config["MAF"] = 0.01
no_nominal = config["no_nominal"]
# Common config variables - all modes require these
leafcutter_dir = "/sc/arion/projects/ad-omics/data/software/leafcutter/"
GTF = config["GTF"]
GTFexons = GTF + ".exons.txt.gz"
VCF = config["VCF"]
VCFstem = VCF.split(".vcf.gz")[0]
sample_key = ""
genotypePCs = ""
covariateFile = ""
PEER_values = ""
countMatrixRData = ""
junctionFileList = ""
phenotype_matrix = ""
bamFolder = ""
bamSuffix = ""
BAM_SAMPLES = []
group_string = ""
tpms = ""
meta = ""
inDir = ""
if "chr_type" not in config.keys():
config["chr_type"] = "chr1"
print(" * assuming chromosome names include \"chr\" in your VCF - add chr_type: \"1\" to your config.yaml if this is not the case" )
if config["chr_type"] == "chr1":
CHROM = ["chr" + str(i) for i in range(1,23)]
else:
CHROM= [i for i in range(1,22)]
sample_key = config["sampleKey"]
genotypePCs = config["genotypePCs"]
covariateFile = config["covariateFile"]
PEER_values = config["PEER_values"]
# MODE SELECTION - expression QTLs
if(mode == "eQTL"):
group_by_values = ["gene"]
PEER_values = config["PEER_values"]
dataCode = dataCode + "_expression"
qtl_window = int(1e6)
outFolder = "results/" + dataCode + "/"
prefix = outFolder + dataCode
phenotype_matrix = prefix + ".expression.bed.gz"
phenotype_tensorQTL_matrix = prefix + ".phenotype.tensorQTL.bed.gz"
final_output = [ expand(outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}_{group_by}.cis_qtl.txt.gz", PEER_N = PEER_values, group_by = group_by_values) ]
if( no_nominal == False ):
final_output.append( expand( outFolder + "peer{PEER_N}/" + dataCode +"_peer{PEER_N}_{group_by}.cis_qtl_nominal_tabixed.tsv.gz", PEER_N = PEER_values, group_by = "gene" ) )
if( conditional_qtls == True ):
final_output.append( expand(outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}_{group_by}.cis_independent_qtl.txt.gz", PEER_N = PEER_values, group_by = group_by_values) )
if( interaction == True ):
final_output.append( expand( outFolder + "peer{PEER_N}/" + dataCode + "_interaction_{interaction_id}_peer{PEER_N}_{group_by}.cis_qtl_nominal_tabixed.tsv.gz", PEER_N = PEER_values, group_by = "gene", interaction_id = interaction_name ) )
if( trans == True ):
final_output.append( expand( outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}_{group_by}.trans_qtl_pairs.txt.gz", PEER_N = PEER_values, group_by = "gene" ) )
if( trans == True & interaction == True ):
final_output.append( expand( outFolder + "peer{PEER_N}/" + dataCode + "_interaction_{interaction_id}_peer{PEER_N}_{group_by}.trans_qtl_pairs.txt.gz", PEER_N = PEER_values, group_by = "gene", interaction_id = interaction_name ) )
countMatrixRData = config["countMatrixRData"]
# MODE SELECTION - splicing QTLs
if(mode == "sQTL"):
group_by_values = ["cluster"] # should be either 'gene' or 'cluster'
qtl_window = int(1e5) # splicing window is now 100kb either side of junction middle - maximum junction length is 100kb so will cover all
PEER_values = config["PEER_values"]
dataCode = dataCode + "_splicing"
outFolder = "results/" + dataCode + "/"
prefix = outFolder + dataCode
junctionFileList = config["junctionFileList"]
phenotype_matrix = prefix + ".leafcutter.bed.gz"
phenotype_tensorQTL_matrix = prefix + ".phenotype.tensorQTL.bed.gz"
final_output = [ expand(outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}_{group_by}.cis_qtl.txt.gz", PEER_N = PEER_values, group_by = group_by_values) ]
if( no_nominal == False ):
final_output.append(expand( outFolder + "peer{PEER_N}/" + dataCode +"_peer{PEER_N}_{group_by}.cis_qtl_nominal_tabixed.tsv.gz", PEER_N = PEER_values, group_by = "gene" ) )
group_file = prefix + ".group.tensorQTL.{group_by}.bed.gz"
group_string = " --phenotype_groups " + group_file
if( conditional_qtls == True ):
final_output.append( expand(outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}_{group_by}.cis_independent_qtl.txt.gz", PEER_N = PEER_values, group_by = group_by_values) )
if( interaction == True ):
final_output.append( expand( outFolder + "peer{PEER_N}/" + dataCode +"_peer{PEER_N}_{group_by}_interaction_{interaction_id}.cis_qtl_nominal_tabixed.tsv.gz", PEER_N = PEER_values, group_by = group_by_values, interaction_id = interaction_name ) )
if( trans == True ):
final_output.append( expand( outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}_{group_by}.trans_qtl_pairs.txt.gz", PEER_N = PEER_values, group_by = group_by_values ) )
if( trans == True & interaction == True ):
final_output.append( expand( outFolder + "peer{PEER_N}/" + dataCode + "_interaction_{interaction_id}_peer{PEER_N}_{group_by}.trans_qtl_pairs.txt.gz", PEER_N = PEER_values, group_by = group_by_values, interaction_id = interaction_name ) )
# MODE SELECTION - editing QTLs
if(mode == "edQTL"):
inDir = config["jacusaDir"]
group_by_values = ["gene"]
PEER_values = config["PEER_values"]
dataCode = dataCode + "_editing"
qtl_window = int(1e5)
outFolder = "results/" + dataCode + "/"
prefix = outFolder + dataCode
phenotype_matrix = prefix + ".editing.bed.gz"
phenotype_tensorQTL_matrix = prefix + ".phenotype.tensorQTL.bed.gz"
final_output = [ expand(outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}_{group_by}.cis_qtl.txt.gz", PEER_N = PEER_values, group_by = group_by_values) ]
if( no_nominal == False ):
final_output.append(expand( outFolder + "peer{PEER_N}/" + dataCode +"_peer{PEER_N}_{group_by}.cis_qtl_nominal_tabixed.tsv.gz", PEER_N = PEER_values, group_by = "gene" ) )
group_file = prefix + ".group.tensorQTL.{group_by}.bed.gz"
group_string = " --phenotype_groups " + group_file
if( conditional_qtls == True ):
final_output.append( expand(outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}_{group_by}.cis_independent_qtl.txt.gz", PEER_N = PEER_values, group_by = group_by_values) )
if( interaction == True ):
final_output.append( expand( outFolder + "peer{PEER_N}/" + dataCode +"_peer{PEER_N}_{group_by}_interaction_{interaction_id}.cis_qtl_nominal_tabixed.tsv.gz", PEER_N = PEER_values, group_by = group_by_values, interaction_id = interaction_name ) )
if( trans == True ):
final_output.append( expand( outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}_{group_by}.trans_qtl_pairs.txt.gz", PEER_N = PEER_values, group_by = group_by_values ) )
if( trans == True & interaction == True ):
final_output.append( expand( outFolder + "peer{PEER_N}/" + dataCode + "_interaction_{interaction_id}_peer{PEER_N}_{group_by}.trans_qtl_pairs.txt.gz", PEER_N = PEER_values, group_by = group_by_values, interaction_id = interaction_name ) )
# MODE SELECTION - transcript expression QTLs
if(mode == "teQTL"):
outFolder = "results/" + dataCode + "/"
group_by_values = ["transcript_expression"]
PEER_values = config["PEER_values"]
dataCode = dataCode + "_transcript_expression"
qtl_window = int(1e6)
outFolder = "results/" + dataCode + "/"
prefix = outFolder + dataCode
phenotype_matrix = prefix + ".phenotype.te.bed.gz"
phenotype_tensorQTL_matrix = prefix + ".phenotype.tensorQTL.bed.gz"
tpms = config["transcriptTPM"]
meta = config["combinedTranscriptReferenceMeta"]
final_output = [ expand(outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}_{group_by}.cis_qtl.txt.gz", PEER_N = PEER_values, group_by = group_by_values) ]
if( no_nominal == False ):
final_output.append( expand( outFolder + "peer{PEER_N}/" + dataCode +"_peer{PEER_N}_{group_by}.cis_qtl_nominal_tabixed.tsv.gz", PEER_N = PEER_values, group_by = "gene" ) )
if( conditional_qtls == True ):
final_output.append( expand(outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}_{group_by}.cis_independent_qtl.txt.gz", PEER_N = PEER_values, group_by = group_by_values) )
if( interaction == True ):
final_output.append( expand( outFolder + "peer{PEER_N}/" + dataCode + "_interaction_{interaction_id}_peer{PEER_N}_{group_by}.cis_qtl_nominal_tabixed.tsv.gz", PEER_N = PEER_values, group_by = "gene", interaction_id = interaction_name ) )
if( trans == True ):
final_output.append( expand( outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}_{group_by}.trans_qtl_pairs.txt.gz", PEER_N = PEER_values, group_by = "gene" ) )
if( trans == True & interaction == True ):
final_output.append( expand( outFolder + "peer{PEER_N}/" + dataCode + "_interaction_{interaction_id}_peer{PEER_N}_{group_by}.trans_qtl_pairs.txt.gz", PEER_N = PEER_values, group_by = "gene", interaction_id = interaction_name ) )
countMatrixRData = config["countMatrixRData"]
## NEW MODES TO GO HERE - EDITING QTLs, PROTEIN QTLs etc.
rule all:
input:
#expand(outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}.{group_by}.combined_covariates.txt", PEER_N = PEER_values, group_by = group_by_values)
final_output
rule collapseGTF:
input:
GTF
output:
prefix + "_collapsed.gtf"
params:
script = "scripts/collapse_annotation.py"
shell:
"python {params.script} {input} {output} "
rule getExonsFromGTF:
input:
GTF
params:
out_folder = os.path.dirname(GTF),
script = "scripts/get_exon_table.R"
output:
GTF + ".exons.txt.gz"
shell:
"ml {R_VERSION}; "
"Rscript {params.script} "
" --gtf {input} "
" --outFolder {params.out_folder} "
rule createGCTFiles:
input:
counts = countMatrixRData,
key = sample_key,
gtf = prefix + "_collapsed.gtf"
output:
counts_gct_file = prefix + "_counts.gct",
tpm_gct_file = prefix + "_tpm.gct"
shell:
"ml {R_VERSION}; "
"Rscript scripts/create_GCT_files.R "
" --counts {input.counts} "
" --key {input.key} "
" --gtf {input.gtf} "
" --outFileCounts {output.counts_gct_file} "
" --outFileTPM {output.tpm_gct_file} "
rule VCF_chr_list:
input:
VCF = VCFstem + ".vcf.gz",
index = VCFstem + ".vcf.gz.tbi"
output:
prefix + "_vcf_chr_list.txt"
shell:
"tabix -l {input.VCF} > {output}"
# script from GTEX pipeline
# performs leafcutter clustering
# filtering of junctions by missingness and variability
# runs leafcutter_prepare_phenotype.py
rule prepareSplicing:
input:
# expecting gzipped junction files with extension {sample}.junc.gz
sample_key = sample_key,
junction_file_list = junctionFileList, # from config - a file listing full paths to each junction file
exon_list = GTFexons, # hg38 exons from gencode v30 with gene_id and gene_name
genes_gtf = GTF # full GTF or just gene starts and ends?
output:
#counts = prefix + "_perind.counts.gz",
#counts_numers = prefix + "_perind_numers.counts.gz",
#clusters_pooled = prefix + "_pooled.gz",
#clusters_refined = prefix + "_refined.gz",
#clusters_to_genes = prefix + ".leafcutter.clusters_to_genes.txt",
phenotype_groups = prefix + ".leafcutter.phenotype_groups.txt",
leafcutter_bed = prefix + ".leafcutter.bed.gz",
leafcutter_bed_index = prefix + ".leafcutter.bed.gz.tbi",
leafcutter_pcs = prefix + ".leafcutter.PCs.txt"
params:
leafcutter_dir = os.getcwd() + "/scripts/leafcutter/", # all leafcutter scripts hosted in a folder - some had to be converted py2 -> py3
script = os.getcwd() + "/scripts/sqtl_prepare_splicing.py",
min_clu_reads = 30,
min_clu_ratio = 0.001,
max_intron_len = 100000, # cut down to 100k to reduce SNP testing distance
num_pcs = 10, # must be at least the number of samples!
coord_mode = "cluster_middle"
#coord_mode = "cluster_middle" # set coordinates to either "TSS" or "cluster_middle"
shell:
"ml {R_VERSION};"
"ml tabix;"
#"cd {outFolder};"
"python {params.script} "
" {input.junction_file_list} "
" {input.exon_list} "
" {input.genes_gtf} "
" {dataCode} "
" {input.sample_key} "
" --min_clu_reads {params.min_clu_reads} "
" --min_clu_ratio {params.min_clu_ratio} "
" --max_intron_len {params.max_intron_len} "
" --coord_mode {params.coord_mode} "
" --num_pcs {params.num_pcs} "
" --leafcutter_dir {params.leafcutter_dir}; "
" mv -f {dataCode}* {outFolder} "
#"rm *sorted.gz; " # clean up directory
#"rm {prefix}_perind.counts.filtered.gz.phen_* "
# from Francois Auguet at GTEX
# eQTLs - first prepare expression from RSEM count matrix
# then run PEER
# then add PEER factors to known covariates
rule prepareExpression:
input:
vcf_chr_list = prefix + "_vcf_chr_list.txt",
tpm_gct = prefix + "_tpm.gct",
counts_gct = prefix + "_counts.gct",
gtf = prefix + "_collapsed.gtf",
sample_key = sample_key
output:
prefix + ".expression.bed.gz"
params:
script = "scripts/eqtl_prepare_expression.py"
shell:
#"module load python/3.7.3;"
" python {params.script} {input.tpm_gct} {input.counts_gct} {input.gtf} "
" {input.sample_key} {input.vcf_chr_list} {prefix} "
" --tpm_threshold 0.1 "
" --count_threshold 6 "
" --sample_frac_threshold 0.2 "
" --normalization_method tmm "
rule prepareTranscriptExpression:
input:
sk = sample_key,
tpms = tpms,
meta = meta,
output:
phenotype_matrix
params:
stem = prefix,
script = "scripts/prepare_transcript_expression.R"
shell:
"ml {R_VERSION};"
"Rscript {params.script} --prefix {params.stem} --key {input.sk} --pheno_matrix {input.tpms} --pheno_meta {input.meta}"
rule prepareEditing:
input:
vcf_chr_list = prefix + "_vcf_chr_list.txt",
ratios = inDir + "all_sites_pileup_editing.tsv.gz",
anno = inDir + "all_sites_pileup_annotation.tsv.gz",
sample_key = sample_key
output:
bed = prefix + ".bed",
sorted = prefix + ".bed.sorted",
editing_bed_gz = prefix + ".editing.bed.gz",
editing_bed_index = prefix + ".editing.bed.gz.tbi"
params:
script = "scripts/edqtl_prepare_editing.R"
shell:
" ml {prepare_editing_R_VERSION}; "
" Rscript {params.script} --vcf_chr_list {input.vcf_chr_list} "
" --rat {input.ratios} --anno {input.anno} --keyIn {input.sample_key} "
" --pheno {output.bed}; "
" cat {output.bed} | (sed -u 1q; sort -k 1,1V -k2,2n) > {output.sorted}; "
" ml tabix; "
" bgzip < {output.sorted} > {output.editing_bed_gz}; "
" tabix -f -p bed {output.editing_bed_gz}; "
rule runPEER:
input:
phenotype_matrix # either expression or splicing counts
#prefix + ".expression.bed.gz"
params:
script = "scripts/run_PEER.R",
num_peer = "{PEER_N}"
output:
outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}.PEER_covariates.txt"
run:
if int(wildcards.PEER_N) > 0:
shell("ml {R_VERSION}; ")
shell("Rscript {params.script} {input} {outFolder}peer{params.num_peer}/{dataCode}_peer{params.num_peer} {params.num_peer}")
else:
shell("touch {output}")
rule combineCovariates:
input:
pheno = prefix + ".phenotype.tensorQTL.{group_by}.bed.gz",
geno = genotypePCs,
peer = outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}.PEER_covariates.txt",
covariates = covariateFile
output:
cov_df = outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}.{group_by}.combined_covariates.txt"
params:
num_peer = "{PEER_N}",
script = "scripts/combine_covariates.py",
logNomFolder = outFolder + "peer{PEER_N}/logNomFolder",
logPerFolder = outFolder + "peer{PEER_N}/logPerFolder"
run:
if int(wildcards.PEER_N) > 0:
peerFile = "--add_covariates " + input.peer
else:
peerFile = ""
shell("python {params.script} {peerFile} \
--genotype_pcs {input.geno} {input.covariates} \
{outFolder}peer{params.num_peer}/{dataCode}_peer{params.num_peer}.{wildcards.group_by}")
# make sure combined covariate file has column names in same order as phenotype file
phenotype_df = pd.read_csv(input.pheno, sep='\t', dtype={'#chr':str, '#Chr':str})
covariate_df = pd.read_csv(output.cov_df, sep = "\t" )
covariate_df = covariate_df[ ["ID"] + list(phenotype_df.columns[4:]) ]
# write out
covariate_df.to_csv(output.cov_df, header = True, index = False, sep = "\t")
## TENSORQTL -----------------------------------------------------------------------
# tensorQTL expects fastQTL formatted phenotype files
# these don't include group info
# therefore take the phenotype file from QTLtools and strip out group info into a separate file
# this group info table should be two columns - phenotype and group
rule preparePhenotypeForTensorQTL:
input:
pheno = phenotype_matrix
output:
pheno = prefix + ".phenotype.tensorQTL.{group_by}.bed.gz",
group = prefix + ".group.tensorQTL.{group_by}.bed.gz"
run:
# pandas - read in expression bed
# drop group column
# write out table
# create separate table with just phenotype and group ID - look into
phenotype_df = pd.read_csv(input.pheno, sep='\t', dtype={'#chr':str, '#Chr':str})
gene_id = phenotype_df.columns[3]
# this should be the "geneid" for eQTls
# for splicing QTLs either gene or cluster
group_id = phenotype_df.columns[4]
# sort by group ID
if( wildcards.group_by == "cluster" ):
# split phenotype_id by ":"
split_df = phenotype_df[gene_id].str.split(":", n = 4, expand = True)
# create new group ID - gene:clusterID
phenotype_df[group_id] = split_df[4] + ":" + split_df[3]
phenotype_df = phenotype_df.sort_values(by = group_id)
# use column numbers not names - leafcutter table doesn't use same column names but positions are the same
# create group and ID table - phenotype group (used for splicing)
phenotype_group_df = phenotype_df[[gene_id, group_id]]
if( wildcards.group_by == "transcript_expression" ):
phenotype_group_df = phenotype_df[[gene_id, gene_id]]
# drop group and strand from phenotype table
phenotype_df.drop(['strand', group_id], axis=1, inplace=True)
phenotype_df.to_csv(output.pheno, index = False, sep = "\t")
# no header!
phenotype_group_df.to_csv(output.group, header = False, index = False, sep = "\t")
# extract the needed samples from the VCF
rule getParticipants:
input:
txt = sample_key
output:
txt = prefix + "_participants.txt"
run:
sk = pd.read_csv(input.txt, sep = "\t")
participants = sk[["participant_id"]]
participants.to_csv(output.txt, index = False, header = False, sep = "\t")
# tensorQTL requires genotypes in PLINK format
# convert using plink2
# test VCF has multiallelic SNPs, hence the forcing with max-alleles
# in real data we've previously excluded multiallelics
# should go back to at some point
rule VCFtoPLINK:
input:
vcf = VCFstem + ".vcf.gz",
participants = prefix + "_participants.txt"
output:
prefix + "_genotypes.fam"
params:
stem = prefix + "_genotypes"
shell:
"ml plink2/2.3; "
"plink2 --make-bed "
"--output-chr chrM "
"--max-alleles 2 "
"--keep {input.participants} "
"--maf {MAF} "
"--allow-extra-chr "
"--max-maf 0.9975 "
"--vcf {input.vcf} "
"--out {params.stem} "
"{extract_str} "
#"if snp_list"
#plink2 --bfile {params.stem} --extract <snp list> --make-bed --out {params.stem}"
# cis eQTL mapping with permutations (i.e. top variant per phenotype group)
rule tensorQTL_cis:
input:
genotypes = prefix + "_genotypes.fam",
phenotypes = prefix + ".phenotype.tensorQTL.{group_by}.bed.gz",
covariates = outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}.{group_by}.combined_covariates.txt",
groups = prefix + ".group.tensorQTL.{group_by}.bed.gz"
output:
outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}_{group_by}.cis_qtl.txt.gz"
params:
stem = prefix + "_genotypes",
num_peer = "{PEER_N}",
group = "{group_by}",
group_string = group_string,
script = "scripts/interaction_qvalue.R"
run:
shell( "conda deactivate; conda activate tensorqtl; module purge; ml {R_VERSION}; ml cuda/11.1; python3 -m tensorqtl {params.stem} {input.phenotypes} \
{outFolder}peer{params.num_peer}/{dataCode}_peer{params.num_peer}_{params.group} \
--phenotype_groups {input.groups} \
--covariates {input.covariates} \
--window {qtl_window} \
--load_split \
--mode cis ")
# cis-QTL mapping: summary statistics for all variant-phenotype pairs
rule tensorQTL_cis_nominal:
input:
#perm_res = outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}.cis_qtl.txt.gz",
genotypes = prefix + "_genotypes.fam",
phenotypes = prefix + ".phenotype.tensorQTL.{group_by}.bed.gz",
covariates = outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}.{group_by}.combined_covariates.txt"
output:
expand( outFolder + "peer{PEER_N}/" + dataCode +"_peer{PEER_N}_{group_by}.cis_qtl_pairs.{CHROM}.parquet", CHROM = CHROM, allow_missing=True )
params:
group = "{group_by}",
stem = prefix + "_genotypes",
num_peer = "{PEER_N}"
run:
shell( "conda deactivate; conda activate tensorqtl; module purge; ml {R_VERSION}; ml cuda/11.1; python3 -m tensorqtl {params.stem} {input.phenotypes} \
{outFolder}peer{params.num_peer}/{dataCode}_peer{params.num_peer}_{params.group} \
--covariates {input.covariates} \
--window {qtl_window} \
--load_split \
--mode cis_nominal ")
# cis-QTL mapping: interactions
# Instead of mapping the standard linear model (p ~ g), this mode includes an interaction term (p ~ g + i + gi) and returns full summary statistics for the model.
# The interaction term is a tab-delimited text file or pd.Series mapping sample ID to interaction value.
# With the run_eigenmt=True option, eigenMT-adjusted p-values are computed.
rule tensorQTL_cis_interaction:
input:
genotypes = prefix + "_genotypes.fam",
phenotypes = prefix + ".phenotype.tensorQTL.{group_by}.bed.gz",
covariates = outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}.{group_by}.combined_covariates.txt"
output:
expand( outFolder + "peer{PEER_N}/" + dataCode +"_interaction_{interaction_id}_peer{PEER_N}_{group_by}.cis_qtl_pairs.{CHROM}.parquet", CHROM = CHROM, allow_missing=True )
params:
group = "{group_by}",
stem = prefix + "_genotypes",
num_peer = "{PEER_N}",
interaction = lambda wcs: interaction_dict[wcs.interaction_id]
shell:
"conda deactivate; conda activate tensorqtl; module purge; ml {R_VERSION}; ml cuda/11.1; \
python3 -m tensorqtl {params.stem} {input.phenotypes} \
{outFolder}peer{params.num_peer}/{dataCode}_interaction_{wildcards.interaction_id}_peer{params.num_peer}_{params.group} \
--covariates {input.covariates} \
--window {qtl_window} \
--mode cis_nominal \
--interaction {params.interaction} \
--maf_threshold_interaction 0.05"
# cis-QTL mapping: conditionally independent QTLs
# This mode maps conditionally independent cis-QTLs using the stepwise regression procedure described in GTEx Consortium, 2017.
# The output from the permutation step (see map_cis above) is required.
rule tensorQTL_cis_independent:
input:
cis_result = outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}_{group_by}.cis_qtl.txt.gz",
genotypes = prefix + "_genotypes.fam",
phenotypes = prefix + ".phenotype.tensorQTL.{group_by}.bed.gz",
covariates = outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}.{group_by}.combined_covariates.txt",
groups = prefix + ".group.tensorQTL.{group_by}.bed.gz"
output:
outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}_{group_by}.cis_independent_qtl.txt.gz"
params:
stem = prefix + "_genotypes",
num_peer = "{PEER_N}",
group = "{group_by}",
group_string = group_string
run:
shell( "conda deactivate; conda activate tensorqtl; ml purge; ml {R_VERSION}; ml cuda/11.1; python3 -m tensorqtl {params.stem} {input.phenotypes} \
{outFolder}peer{params.num_peer}/{dataCode}_peer{params.num_peer}_{params.group} \
--covariates {input.covariates} \
--cis_output {input.cis_result} \
--window {qtl_window} \
--mode cis_independent")
# trans-eQTL mapping
# This mode computes nominal associations between all phenotypes and genotypes.
# tensorQTL generates sparse output by default (associations with p-value < 1e-5)
# The output is in parquet format, with four columns: phenotype_id, variant_id, pval, maf
rule tensorQTL_trans:
input:
genotypes = prefix + "_genotypes.fam",
phenotypes = prefix + ".phenotype.tensorQTL.{group_by}.bed.gz",
covariates = outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}.{group_by}.combined_covariates.txt"
output:
outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}_{group_by}.trans_qtl_pairs.txt.gz"
params:
stem = prefix + "_genotypes",
num_peer = "{PEER_N}",
group = "{group_by}"
run:
shell( "conda deactivate; conda activate tensorqtl; ml purge; ml {R_VERSION} cuda/11.1; \
python3 -m tensorqtl {params.stem} \
{input.phenotypes} \
{outFolder}peer{params.num_peer}/{dataCode}_peer{params.num_peer}_{params.group} \
--covariates {input.covariates} \
--output_text \
--pval_threshold 1e-4 \
--mode trans")
# trans-eQTL mapping with interaction term
rule tensorQTL_insteraction_trans:
input:
genotypes = prefix + "_genotypes.fam",
phenotypes = prefix + ".phenotype.tensorQTL.{group_by}.bed.gz",
covariates = outFolder + "peer{PEER_N}/" + dataCode + "_peer{PEER_N}.{group_by}.combined_covariates.txt"
output:
outFolder + "peer{PEER_N}/" + dataCode + "_interaction_{interaction_id}_peer{PEER_N}_{group_by}.trans_qtl_pairs.txt.gz"
params:
stem = prefix + "_genotypes",
num_peer = "{PEER_N}",
group = "{group_by}",
interaction = lambda wcs: interaction_dict[wcs.interaction_id]
run:
shell( "conda deactivate; conda activate tensorqtl; module purge; ml {R_VERSION}; ml cuda/11.1; \
python3 -m tensorqtl {params.stem} {input.phenotypes} \
{outFolder}peer{params.num_peer}/{dataCode}_interaction_{wildcards.interaction_id}_peer{params.num_peer}_{params.group} \
--covariates {input.covariates} \
--mode trans \
--output_text \
--pval_threshold 1e-4 \
--interaction {params.interaction}")
rule mergeNominalResult:
input:
expand( outFolder + "peer{PEER_N}/" + dataCode +"_peer{PEER_N}_{group_by}.cis_qtl_pairs.{CHROM}.parquet", CHROM = CHROM, allow_missing=True )
output:
outFolder + "peer{PEER_N}/" + dataCode +"_peer{PEER_N}_{group_by}.cis_qtl_nominal_tabixed.tsv.gz",
outFolder + "peer{PEER_N}/" + dataCode +"_peer{PEER_N}_{group_by}.cis_qtl_nominal_tabixed.tsv.gz.tbi",
params:
prefix = "peer{PEER_N}/" + dataCode +"_peer{PEER_N}_{group_by}",
script = "scripts/merge_nominal_results.R"
shell:
" ml {R_VERSION}; ml snappy;"
" Rscript {params.script} --vcf {VCF} --out_folder {outFolder} --prefix {params.prefix} "
rule mergeNominalResult_interaction:
input:
expand( outFolder + "peer{PEER_N}/" + dataCode +"_interaction_{interaction_name}_peer{PEER_N}_{group_by}.cis_qtl_pairs.{CHROM}.parquet", CHROM = CHROM, allow_missing=True )
output:
outFolder + "peer{PEER_N}/" + dataCode +"_interaction_{interaction_name}_peer{PEER_N}_{group_by}.cis_qtl_nominal_tabixed.tsv.gz",
outFolder + "peer{PEER_N}/" + dataCode +"_interaction_{interaction_name}_peer{PEER_N}_{group_by}.cis_qtl_nominal_tabixed.tsv.gz.tbi",
params:
prefix = "peer{PEER_N}/" + dataCode +"_interaction_{interaction_name}_peer{PEER_N}_{group_by}",
script = "scripts/merge_nominal_results.R"
shell:
" ml {R_VERSION}; ml snappy;"
" Rscript {params.script} --vcf {VCF} --out_folder {outFolder} --prefix {params.prefix} "