-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_probbase_template.R
335 lines (289 loc) · 13.7 KB
/
create_probbase_template.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
#------------------------------------------------------------------------------#
#
# create_probbase_template.R
#
# Identify symptoms added & dropped in the new WHO 2022 VA instrument.
#
#------------------------------------------------------------------------------#
library(readxl)
library(dplyr)
#------------------------------------------------------------------------------#
# Download input files (probbase, mapping, and questionnaires) #
#------------------------------------------------------------------------------#
## probbase and mapping
probbase_url <- "https://github.com/verbal-autopsy-software/probbase/raw/main/probbase.xls"
download.file(probbase_url, destfile = "probbase.xls")
probbaseV5 <- read_excel("probbase.xls")
probbaseV5 <- as.matrix(probbaseV5)
probbaseV5[2, 3]
pva151_url <- "https://raw.githubusercontent.com/verbal-autopsy-software/pyCrossVA/master/pycrossva/resources/mapping_configuration_files/2016WHOv151_to_InterVA5.csv"
pva151_mapping_name <- "2016WHOv151_to_InterVA5.csv"
download.file(pva151_url, destfile = pva151_mapping_name)
pva151 <- read.csv(pva151_mapping_name)
pva151 <- lapply(pva151, function(x) if(is.character(x)){gsub("\n", " ", x)} else{x})
pva151 <- as.data.frame(pva151)
names(pva151)
## 2016 WHO VA Questionnaire v1.5.1
who2016_url <- "https://github.com/SwissTPH/WHO-VA/releases/download/1.5%2C1/WHO_VA_2016_V1.5.1.zip"
who2016_zip_name <- "WHO_VA_2016_V1.5.1.zip"
download.file(who2016_url, destfile = who2016_zip_name)
unzip(who2016_zip_name)
who2016_form_name <- "WHOVA2016_v1_5_1_XLS_form_for_ODK.xlsx"
who2016 <- read_excel(who2016_form_name, sheet = "survey")
who2016 <- lapply(who2016, function(x) if(is.character(x)){gsub("\n", " ", x)} else{x})
who2016 <- as.data.frame(who2016)
names(who2016)
table(who2016$type, useNA = "always")
exclude_type <- c("begin group", "end group", "start",
"today", "note", "text", "end", NA)
clean_who2016 <- subset(who2016, !(type %in% exclude_type))
dim(clean_who2016)
output_who2016 <- clean_who2016[, c("name", "label..English")]
# 2022 WHO VA Questionnaire v2022033001
who2022_url <- "https://github.com/SwissTPH/WHO-VA/releases/download/2022.1/WHOVA2022_XLS_form_for_ODK.xlsx"
who2022_form_name <- "WHOVA2022_XLS_form_for_ODK.xlsx"
download.file(who2022_url, destfile = who2022_form_name)
who2022 <- read_excel(who2022_form_name, sheet = "survey")
who2022 <- lapply(who2022, function(x) if(is.character(x)){gsub("\n", " ", x)} else{x})
who2022 <- as.data.frame(who2022)
names(who2022)
table(who2022$type, useNA = "always")
exclude_type <- c("audio", "audit", "begin group", "end", "end group",
"note", "start", "text", "today", "trigger")
clean_who2022 <- subset(who2022, !(type %in% exclude_type))
dim(clean_who2022)
output_who2022 <- clean_who2022[, c("name", "label..English..en.")]
#------------------------------------------------------------------------------#
# Build new probbase template (create new rows & columns) #
#------------------------------------------------------------------------------#
new_probbase <- probbaseV5
## add column to identify probbase rows that lost support in 2022 instrument
orig_col_names <- colnames(new_probbase)
orig_col_names
new_probbase <- cbind(new_probbase[, 1:3], "", new_probbase[, 4:ncol(new_probbase)])
colnames(new_probbase) <- c(orig_col_names[1:3],
"status_in_2022",
orig_col_names[4:length(orig_col_names)])
no_pb2020 <- !(tolower(pva151$Source.Column.ID) %in% tolower(output_who2022$name))
cbind(pva151[, c(1, 3)], no_pb2020)
table(new_probbase[, 1] %in% pva151[no_pb2020, 1])
dropped <- new_probbase[, 1] %in% pva151[no_pb2020, 1]
table(dropped)
length(dropped) == nrow(new_probbase)
colnames(new_probbase)
colnames(new_probbase)[4]
new_probbase[dropped, 4] <- "dropped"
## rename column who_2016 <- who_2022
colnames(new_probbase)[(colnames(new_probbase) == "who_2016")] <- "who_2022"
## add in COVID-19 column
index_col_covid <- which(colnames(new_probbase) == "b_0112")
n_rows_fill <- nrow(new_probbase) - 1
new_probbase <- cbind(new_probbase[, 1:index_col_covid],
c("01.13 Coronavirus disease (COVID-19)", rep("", n_rows_fill)),
new_probbase[, (index_col_covid + 1):ncol(new_probbase)])
colnames(new_probbase)[colnames(new_probbase) == ""] = "b_0113"
## Add new probbase rows for new 2022 instrument questions
table(output_who2022$name %in% output_who2016$name)
#### create new indic qdesc sdesc and who
new_pb_items <- output_who2022[!(output_who2022$name %in% output_who2016$name), ]
new_pb_items
new_pb_items <- new_pb_items %>%
filter(grepl("^Id", name)) %>%
filter(!grepl("check", name))
#### remove non-symptoms
## 1 Id10010a
## 2 Id10010b
## 3 Id10007a
## 4 Id10007b
new_pb_items[, "label..English..en."]
new_pb_items <- new_pb_items %>%
slice(5:n())
colnames(new_probbase)
new_pb_indic <- sub("_1", "a", new_pb_items$name)
new_pb_indic <- sub("_", "", new_pb_indic)
new_pb_indic[nchar(new_pb_indic) == 7] <- paste0(new_pb_indic[nchar(new_pb_indic) == 7], "o")
new_pb_indic <- substr(new_pb_indic, nchar(new_pb_indic) - 3, nchar(new_pb_indic))
new_pb_indic <- paste0("i", new_pb_indic)
## new_pb_qdesc <- sub("^\\(.*\\) ", "", new_pb_items[, "label..English..en."])
new_pb_qdesc <- new_pb_items[, "label..English..en."]
new_pb_who <- paste0("W610",
substr(new_pb_indic, 2, 4),
"-",
substr(new_pb_indic, 5, 5))
new_rows <- cbind(new_pb_indic, trimws(new_pb_qdesc),
"", "added", # sdesc & status_in_2022
new_pb_who,
matrix("", nrow=length(new_pb_indic), ncol = ncol(new_probbase) - 5))
colnames(new_rows) <- colnames(new_probbase)
dim(new_rows)
dim(new_probbase)
new_probbase <- rbind(new_probbase, new_rows)
## Convert new 2022 questions into probbase indicators
#### COVID
###### old: "(Id10483) Did s(h)e have a recent test for COVID-19?"
###### new: "(Id10483) Did s(h)e have a recent postive test result for COVID-19?"
###### remove: "(Id10484) What was the result?"
grep("\\(Id10483\\) Did s\\(h\\)e have a recent test for COVID\\-19\\?",
new_probbase[, 2])
tmp_index <- grep("\\(Id10483\\) Did s\\(h\\)e have a recent test for COVID\\-19\\?",
new_probbase[, 2])
new_probbase[tmp_index, 2]
new_probbase[tmp_index, 2] <- "(Id10483) Did s(h)e have a recent positive test for COVID-19?"
grep("\\(Id10484\\) What was the result\\?", new_probbase[, 2])
tmp_index <- grep("\\(Id10484\\) What was the result\\?", new_probbase[, 2])
new_probbase[tmp_index, 2]
dim(new_probbase)
new_probbase <- new_probbase[-tmp_index, ]
dim(new_probbase)
#### old: "(Id10077_a) How long after the injury or accident did s/he die?"
#### new: "(Id10077_a) Did s(h)e die within 7 days of the injury or accident?"
grep("\\(Id10077_a\\) How long after the injury or accident did s/he die\\?",
new_probbase[, 2])
tmp_index <- grep("\\(Id10077_a\\) How long after the injury or accident did s/he die\\?",
new_probbase[, 2])
new_probbase[tmp_index, 2]
new_probbase[tmp_index, 2] <- "(Id10077_a) Did s(h)e die within 7 days of the injury or accident?"
new_probbase[tmp_index, 2]
#### Smoke Tobacco
##### convert duration questions into binary inidicators
##### remove "How many (months/years)"
grep("\\(Id10413_a\\) For how long did s/he smoke tobacco\\?",
new_probbase[, 2])
tmp_index <- grep("\\(Id10413_a\\) For how long did s/he smoke tobacco\\?",
new_probbase[, 2])
new_probbase[tmp_index, 2] <- "(Id10413_a) Did s(h)e smoke tobacco for less than X years?"
new_probbase[tmp_index, 4] <- "create categories"
new_row_1 <- new_row_2 <- new_probbase[tmp_index,]
new_row_1[2] <- "(Id10413_b) Did s(h)e smoke tobacco for more than X years but less than Y years?"
new_row_1[4] <- "create categories"
new_row_2[2] <- "(Id10413_c) Did s(h)e smoke tobacco for at least Y years?"
new_row_2[4] <- "create categories"
dim(new_probbase)
new_probbase <- rbind(new_probbase[1:tmp_index,],
new_row_1, new_row_2,
new_probbase[(tmp_index + 1):nrow(new_probbase), ])
dim(new_probbase)
grep("\\(Id10413_d\\) How many \\(months/years\\)",
new_probbase[, 2])
tmp_index <- grep("\\(Id10413_d\\) How many \\(months/years\\)",
new_probbase[, 2])
dim(new_probbase)
new_probbase <- new_probbase[-tmp_index, ]
dim(new_probbase)
#### Chew/Sniff Tobacco
##### convert duration questions into binary inidicators
##### remove "How many (months/years)"
grep("\\(Id10414_a\\) For how long did s/he chew and/or sniff tobacco?",
new_probbase[, 2])
tmp_index <- grep("\\(Id10414_a\\) For how long did s/he chew and/or sniff tobacco?",
new_probbase[, 2])
new_probbase[tmp_index, 2] <- "(Id10414_a) Did s(h)e chew and/or sniff tobacco for less than X years?"
new_probbase[tmp_index, 4] <- "create categories"
new_row_1 <- new_row_2 <- new_probbase[tmp_index,]
new_row_1[2] <- "(Id10414_b) Did s(h)e chew and/or sniff tobacco for more than X years but less than Y years?"
new_row_1[4] <- "create categories"
new_row_2[2] <- "(Id10414_c) Did s(h)e chew and/or sniff tobacco for at least Y years?"
new_row_2[4] <- "create categories"
dim(new_probbase)
new_probbase <- rbind(new_probbase[1:tmp_index,],
new_row_1, new_row_2,
new_probbase[(tmp_index + 1):nrow(new_probbase), ])
dim(new_probbase)
grep("\\(Id10414_d\\) How many \\(months/years\\)",
new_probbase[, 2])
tmp_index <- grep("\\(Id10414_d\\) How many \\(months/years\\)",
new_probbase[, 2])
dim(new_probbase)
new_probbase <- new_probbase[-tmp_index, ]
dim(new_probbase)
#------------------------------------------------------------------------------#
# Identify age- & sex-specific indicators #
#------------------------------------------------------------------------------#
orig_col_names <- colnames(new_probbase)
orig_col_names
dim(new_probbase)
new_probbase <- cbind(new_probbase[, 1:8],
"", "", # sex and age indicators
new_probbase[, 9:ncol(new_probbase)])
dim(new_probbase)
colnames(new_probbase) <- c(orig_col_names[1:8],
"sex", "age",
orig_col_names[9:length(orig_col_names)])
## sex
dont_ask_cols <- grep("dontask", colnames(new_probbase))
for (i in 1:nrow(new_probbase)) {
male <- grep("i019bY", new_probbase[i, dont_ask_cols])
female <- grep("i019aY", new_probbase[i, dont_ask_cols])
if (length(male) == 1) {
new_probbase[i, "sex"] <- "male"
}
if (length(female) == 1) {
new_probbase[i, "sex"] <- "female"
}
}
table(new_probbase[, "sex"])
## age
age_indic <- paste0("i022", letters[1:7], "Y")
age_indic
index_age_labels <- grep("i022[a-g]", new_probbase[, "indic"])
age_labels <- new_probbase[index_age_labels, "sdesc"]
#### neonates
tmp_index <- new_probbase[, "nnonly"] == "i022gY"
table(tmp_index, useNA="ifany")
new_probbase[tmp_index, "age"] <- "neonate"
for (i in 1:nrow(new_probbase)) {
if (new_probbase[i, "age"] == "neonate") next
d_ask_age <- grep("i022[a-g]", new_probbase[i, dont_ask_cols])
if (length(d_ask_age) > 0) {
tmp_row <- new_probbase[i, dont_ask_cols][d_ask_age]
age_only <- age_labels[!(age_indic %in% tmp_row)]
if (length(age_only) > 0){
new_probbase[i, "age"] <- paste(age_only, collapse = " & ")
}
}
}
#------------------------------------------------------------------------------#
# Identify possible, impossible, irrelevant, and split symptom/cause pairs #
#------------------------------------------------------------------------------#
## fill in values for cause/symptom pairs: (beware of SUBST==N!)
## (1) possible
## (2) impossible b/c demographics
## (3) impossible - probbase
colnames(new_probbase)
index_cause_cols <- grep("b_", colnames(new_probbase))
cause_col_labels <- colnames(new_probbase)[index_cause_cols]
orig <- new_probbase
for (i in 2:nrow(new_probbase)) {
for (j in index_cause_cols) {
new_probbase[i, j] <- ifelse(orig[i, j] == "N", 3, 1)
}
}
## impossible b/c of demographics
#### men & children -> maternal mortality
tmp_index <- which(new_probbase[, "indic"] == "i019a")
index_maternal_cols <- grep("b_09", colnames(new_probbase))
new_probbase[tmp_index, index_maternal_cols] <- 2
young_index <- new_probbase[, "indic"] != "" & new_probbase[, "age"] != "" &
!grepl("5 to 14", new_probbase[, "age"]) &
!grepl("15 to 49", new_probbase[, "age"]) &
!grepl("50 to 64", new_probbase[, "age"]) &
!grepl("65+", new_probbase[, "age"])
new_probbase[young_index, index_maternal_cols] <- 2
## non-neonates: 10.02 Birth asphyxia, 10.03 Neonatal pneumonia, 10.04 Neonatal sepsis,
## "10.99 Other and unspecified neonatal CoD", "11.01 Fresh stillbirth",
## "11.02 Macerated stillbirth"
neonate_cod <- c("b_1002", "b_1003", "b_1004", "b_1099", "b_1101", "b_1102")
index_neonate_cols <- which(colnames(new_probbase) %in% neonate_cod)
non_neonate_ind <- new_probbase[, "indic"] != "" &
new_probbase[, "age"] != "" & !grepl("neonate", new_probbase[, "age"])
table(non_neonate_ind)
new_probbase[non_neonate_ind, index_neonate_cols] <- 2
#------------------------------------------------------------------------------#
# Write template to CSV #
#------------------------------------------------------------------------------#
keep_cols <- c("indic", "qdesc", "status_in_2022", "who_22", "subst", "sex", "age",
cause_col_labels)
index_keep_cols <- which(colnames(new_probbase) %in% keep_cols)
final_probbase <- new_probbase[, index_keep_cols]
final_probbase <- final_probbase[-2, ] ## remove prior
write.csv(final_probbase, file = "template_probbase2020.csv", row.names=FALSE, na="")