version development
## Copyright (c) 2021-2024 Giulio Genovese
##
## Version 2024-09-27
##
## Contact Giulio Genovese <[email protected]>
##
## This WDL workflow computes polygenic risk scores
##
## Cromwell version support
## - Successfully tested on v86
##
## Distributed under terms of the MIT License
struct Reference {
File fasta_fai
Int min_chr_len
}
workflow score {
input {
String sample_set_id
String sample_header = "sample_id"
String? region
String? tag # GP, AP, HDS, DS, GT, AS
String ext_string = "scores"
File? colheaders_tsv_file
String? summary_path
Array[File] summary_files
Array[File]? summary_idxs
Array[Float]? q_score_thr
File? covar_tsv_file
String ref_name = "GRCh38"
String? ref_path
String? ref_fasta_fai
Int? min_chr_len
File impute_tsv_file # batch_id path chr1_imp_vcf chr1_imp_vcf_index chr2_imp_vcf chr2_imp_vcf_index ...
String? impute_data_path
File? samples_file
String? exclude_str
String? include_str
String basic_bash_docker = "debian:stable-slim"
String docker_repository = "us.gcr.io/mccarroll-mocha"
String bcftools_docker = "bcftools:1.20-20240927"
String r_mocha_docker = "r_mocha:1.20-20240927"
}
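# append a trailing "/" to the optional path prefixes only when they are non-empty and do not already end in "/"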
String docker_repository_with_sep = docker_repository + if docker_repository != "" && docker_repository == sub(docker_repository, "/$", "") then "/" else ""
String summary_path_with_sep = select_first([summary_path, ""]) + if defined(summary_path) && select_first([summary_path]) == sub(select_first([summary_path]), "/$", "") then "/" else ""
String ref_path_with_sep = select_first([ref_path, ""]) + if defined(ref_path) && select_first([ref_path]) == sub(select_first([ref_path]), "/$", "") then "/" else ""
Reference ref = object {
fasta_fai: ref_path_with_sep + select_first([ref_fasta_fai, if ref_name == "GRCh38" then "GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai" else if ref_name == "GRCh37" then "human_g1k_v37.fasta.fai" else None]),
min_chr_len: select_first([min_chr_len, 2000000]),
}
# extract the relevant chromosome from the region string, if provided
String? chr_string = if defined(region) then sub(sub(select_first([region]), ":.*$", ""), "^chr", "") else None
# read table with batches information (scatter could be avoided if there was a tail() function)
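# (scatter index idx selects row idx+1 of the TSV, dropping the header row)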
Array[Array[String]] impute_tsv = read_tsv(impute_tsv_file)
Int n_batches = length(impute_tsv)-1
scatter (idx in range(n_batches)) { Array[String] impute_tsv_rows = impute_tsv[(idx+1)] }
Map[String, Array[String]] impute_tbl = as_map(zip(impute_tsv[0], transpose(impute_tsv_rows)))
# check if path is in impute table (see http://github.com/openwdl/wdl/issues/305)
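# (append "path" to the header keys and group with collect_by_key(): the "path" group has length > 1 only if the header already contains a "path" column)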
Boolean is_path_in_impute_tbl = length(collect_by_key(zip(flatten([keys(impute_tbl),["path"]]),range(length(keys(impute_tbl))+1)))["path"])>1
# compute data paths for each batch
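# (optional global impute_data_path prefix followed by the optional per-batch "path" column, each with a trailing "/" enforced)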
scatter (idx in range(n_batches)) {
String impute_data_paths_with_sep = (if defined(impute_data_path) then sub(select_first([impute_data_path]), "/$", "") + "/" else "") +
(if is_path_in_impute_tbl then sub(impute_tbl["path"][idx], "/$", "") + "/" else "")
}
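# keep reference contigs longer than min_chr_len, excluding chrY, and strip any leading "chr"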
Array[Array[String]] ref_fasta_fai_tbl = transpose(read_tsv(ref.fasta_fai))
scatter (idx in range(length(ref_fasta_fai_tbl[0]))) {
Int fai_len = ref_fasta_fai_tbl[1][idx]
if (fai_len > ref.min_chr_len && ref_fasta_fai_tbl[0][idx] != "Y" && ref_fasta_fai_tbl[0][idx] != "chrY") {
String chr_strings = sub(ref_fasta_fai_tbl[0][idx], "^chr", "")
}
}
scatter (p in cross(range(n_batches), range(if defined(region) then 1 else length(select_all(chr_strings))))) {
Int batch_idx = p.left
String hdr = "chr" + (if defined(region) then select_first([chr_string]) else select_all(chr_strings)[p.right]) + "_imp_vcf"
call vcf_score {
input:
vcf_file = impute_data_paths_with_sep[p.left] + impute_tbl[hdr][p.left],
vcf_idx = impute_data_paths_with_sep[p.left] + impute_tbl[(hdr + "_index")][p.left],
samples_file = samples_file,
q_score_thr = q_score_thr,
colheaders_tsv_file = colheaders_tsv_file,
summary_files = prefix(summary_path_with_sep, summary_files),
summary_idxs = if defined(summary_idxs) then prefix(summary_path_with_sep, select_first([summary_idxs])) else None,
sample_header = sample_header,
region = region,
tag = tag,
exclude_str = exclude_str,
include_str = include_str,
filebase = basename(basename(impute_data_paths_with_sep[p.left] + impute_tbl[hdr][p.left], ".bcf"), ".vcf.gz") + "." + ext_string,
docker = docker_repository_with_sep + bcftools_docker
}
}
call score_summary {
input:
score_files = vcf_score.file,
covar_tsv_file = covar_tsv_file,
sample_header = sample_header,
filebase = sample_set_id,
ext_string = ext_string,
docker = if defined(covar_tsv_file) then docker_repository_with_sep + r_mocha_docker else basic_bash_docker
}
output {
File score_tsv_file = score_summary.file
File? adj_score_tsv_file = score_summary.adj_file
}
meta {
author: "Giulio Genovese"
email: "[email protected]"
description: "See the [MoChA](http://github.com/freeseek/mocha) website for more information"
}
}
task vcf_score {
input {
File vcf_file
File vcf_idx
File? samples_file
File? colheaders_tsv_file
Array[File]+ summary_files
Array[File]? summary_idxs
Array[Float]? q_score_thr
String? tag
String? sample_header
String? region
String? exclude_str
String? include_str
String filebase
String docker
Int cpu = 1
Int? disk_size_override
Float? memory_override
Int preemptible = 1
Int maxRetries = 0
}
Float vcf_size = size(vcf_file, "GiB")
Float summary_size = size(summary_files, "GiB")
Int disk_size = select_first([disk_size_override, ceil(10.0 + vcf_size + summary_size)])
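# without summary indexes, budget extra memory since the summary files are presumably loaded whole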
Float memory = select_first([memory_override, 3.5 + if defined(summary_idxs) then 0 else summary_size])
command <<<
set -euo pipefail
summary_files=~{write_lines(summary_files)}
summary_idxs=~{if defined(summary_idxs) then write_lines(select_first([summary_idxs])) else ""}
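# move the localized inputs into the working directory so bcftools can be invoked on bare filenames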
echo "~{sep("\n", select_all([vcf_file, vcf_idx, samples_file, colheaders_tsv_file]))}" | \
cat - $summary_files ~{if defined(summary_idxs) then "$summary_idxs" else ""} | tr '\n' '\0' | xargs -0 mv -t .
sed -i 's/^.*\///' $summary_files
bcftools +score \
--summaries $summary_files \
~{if defined(tag) then "--use \"" + tag + "\"" else ""} \
~{if defined(q_score_thr) then "--q-score-thr " + sep(",", select_first([q_score_thr])) else ""} \
--output "~{filebase}.tsv" \
~{if defined(sample_header) then "--sample-header \"" + sample_header + "\"" else ""} \
~{if defined(exclude_str) then "--exclude '" + exclude_str + "'" else ""} \
~{if defined(include_str) then "--include '" + include_str + "'" else ""} \
~{if defined(region) then "--regions \"" + select_first([region]) + "\"" else ""} \
~{if defined(samples_file) then "--samples-file \"" + basename(select_first([samples_file])) + "\" \\\n" +
" --force-samples" else ""} \
~{if defined(colheaders_tsv_file) then "--columns-file \"" + basename(select_first([colheaders_tsv_file])) + "\"" else ""} \
"~{basename(vcf_file)}"
echo "~{sep("\n", select_all([vcf_file, vcf_idx, samples_file, colheaders_tsv_file]))}" | \
cat - $summary_files ~{if defined(summary_idxs) then "$summary_idxs" else ""} | sed 's/^.*\///' | tr '\n' '\0' | xargs -0 rm
>>>
output {
File file = filebase + ".tsv"
}
runtime {
memory: memory + " GiB"
disks: "local-disk " + disk_size + " HDD"
cpu: cpu
docker: docker
preemptible: preemptible
maxRetries: maxRetries
}
}
task score_summary {
input {
Array[File]+ score_files
File? covar_tsv_file
String sample_header
String filebase
String ext_string
String docker
Int cpu = 1
Int? disk_size_override
Float? memory_override
Int preemptible = 1
Int maxRetries = 0
}
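# resource heuristics: disk must hold every per-batch score table plus the merged output; memory scales with the size of the first table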
Float score_size = size(score_files[0], "GiB")
Int disk_size = select_first([disk_size_override, ceil(10.0 + (length(score_files) + 1) * score_size)])
Float memory = select_first([memory_override, 3.5 + 4.0 * score_size])
command <<<
set -euo pipefail
score_files=~{write_lines(score_files)}
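# move the localized score tables (and the covariates file, if any) into the working directory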
cat $score_files | tr '\n' '\0' | xargs -0 mv -t .
~{if defined(covar_tsv_file) then "mv \"" + select_first([covar_tsv_file]) + "\" ." else ""}
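# concatenate the per-batch score tables and sum each score column per sample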
cat $score_files | sed 's/^.*\///' | tr '\n' '\0' | xargs -0 cat | \
awk 'NR==1 {sample_header=$1}
{if ($1==sample_header) {
for (i=2; i<=NF; i++) {
if (!($i in score_id)) {
col_count++;
cols[col_count]=$i;
score_id[$i]++;
}
f[i] = $i;
}
} else {
if (!($1 in sample_id)) {
row_count++;
rows[row_count]=$1;
sample_id[$1]++;
}
for (i=2; i<=NF; i++)
v[f[i]"~"$1]+=$i;
}}
END {printf "%s",sample_header;
for (i=1; i<=col_count; i++)
printf "\t%s",cols[i];
printf "\n";
for(j=1; j<=row_count; j++) {
printf "%s",rows[j];
for (i=1; i<=col_count; i++)
printf "\t%f",v[cols[i]"~"rows[j]];
printf "\n";
}
}' > "~{filebase}.~{ext_string}.tsv"
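# when covariates are provided, regress each score on them and output residuals scaled to unit standard deviation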
~{if defined(covar_tsv_file) then
"R --vanilla <<CODE\n" +
"library(data.table)\n" +
"df_scores <- fread('" + filebase + "." + ext_string + ".tsv', sep = \"\\t\", header = TRUE, data.table = FALSE)\n" +
"df_covars <- fread('" + basename(select_first([covar_tsv_file])) + "', sep = \"\\t\", header = TRUE, data.table = FALSE)\n" +
"df <- merge(df_scores, df_covars, by = '" + sample_header + "')\n" +
"df_adj <- data.frame(" + sample_header + " = df[, '" + sample_header + "'])\n" +
"scores <- names(df_scores)\n" +
"scores <- scores[scores != '" + sample_header + "']\n" +
"covars <- names(df_covars)\n" +
"covars <- covars[covars != '" + sample_header + "']\n" +
"bt <- rawToChar(as.raw(96))\n" +
"for (score in scores) {\n" +
" formula <- paste0(bt, score, bt, ' ~ ', bt, paste(covars, collapse = paste(bt, '+', bt)), bt)\n" +
" fit <- lm(formula, df)\n" +
" df_adj[, paste0('adj_', score)] <- df[, score] - predict(fit, df)\n" +
" df_adj[, paste0('adj_', score)] <- df_adj[, paste0('adj_', score)] / sd(df_adj[, paste0('adj_', score)])\n" +
"}\n" +
"write.table(df_adj, '" + filebase + ".adj_" + ext_string + ".tsv', sep = '\t', quote = FALSE, row.names = FALSE)\n" +
"CODE" else ""}
cat $score_files | sed 's/^.*\///' | tr '\n' '\0' | xargs -0 rm
~{if defined(covar_tsv_file) then "rm \"" + basename(select_first([covar_tsv_file])) + "\"" else ""}
>>>
output {
File file = filebase + "." + ext_string + ".tsv"
File? adj_file = if defined(covar_tsv_file) then filebase + ".adj_" + ext_string + ".tsv" else None
}
runtime {
docker: docker
cpu: cpu
disks: "local-disk " + disk_size + " HDD"
memory: memory + " GiB"
preemptible: preemptible
maxRetries: maxRetries
}
}