Skip to content

Commit

Permalink
Add first version of cdi utilities (percentile benchmark, cleanup, re…
Browse files Browse the repository at this point in the history
…lative scores)
  • Loading branch information
adriansteffan committed Dec 23, 2024
1 parent e774122 commit ae7164d
Show file tree
Hide file tree
Showing 17 changed files with 533 additions and 1 deletion.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: peekbankr
Type: Package
Title: Accessing the Peekbank Database and working with Peekbank data
Version: 0.2.1.0
Version: 0.2.3.1
Authors@R: c(
person("Mika", "Braginsky", email = "[email protected]", role = c("aut", "cre")),
person("Kyle", "MacDonald", email = "[email protected]", role = "aut"),
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(append_relative_cdi_scores)
export(cleanup_cdi_data)
export(connect_to_peekbank)
export(ds.add_aois)
export(ds.get_json_fields)
Expand Down Expand Up @@ -30,6 +32,7 @@ export(get_trial_types)
export(get_trials)
export(get_xy_timepoints)
export(list_peekbank_tables)
export(populate_cdi_percentiles)
export(unpack_aux_data)
importFrom(dplyr,"%>%")
importFrom(glue,glue)
Expand Down
228 changes: 228 additions & 0 deletions R/cdi_processing.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
library(dplyr)
library(tidyr)
library(here)
library(purrr)

#' Checks cdi data for inconsistencies, warns about them, and fixes them
#'
#' @param cdi_data a subjects table with unnested cdi data, needs columns "subject_id", "language", "instrument_type", "age", "sex", "measure", "rawscore"
#'
#' @return a cleaned up version of the cdi data
#' @export
#'
#' @examples
#' \donttest{
#' clean_cdi_data <- all_subjects %>%
#' unnest(subject_aux_data) %>%
#' filter(!is.na(cdi_responses)) %>%
#' unnest(cdi_responses) %>%
#' peekbankr::cleanup_cdi_data()
#' }
cleanup_cdi_data <- function(cdi_data) {
# TODO: what else do we need to check here? Ask the others about this

required_columns <- c(
"subject_id",
"language",
"instrument_type",
"age",
"sex",
"measure",
"rawscore"
)

missing_cols <- setdiff(required_columns, colnames(cdi_data))
if (length(missing_cols) > 0) {
stop("Missing required columns: ", paste(missing_cols, collapse = ", "))
}

# Check if there are multiple scores for the same cdi administration within a single participant
duplicate_removed_rows <- cdi_data %>%
dplyr::group_by(subject_id, instrument_type, measure, age, language) %>%
dplyr::filter(rawscore != max(rawscore, na.rm = TRUE))

if (nrow(duplicate_removed_rows) > 0) {
print("Warning: there are some duplicate cdi values in your data. These were removed, but you should check the input data.")
print(duplicate_removed_rows)

# fix the duplicates for analysis until the input data is fixed
cdi_data <- cdi_data %>%
dplyr::group_by(subject_id, instrument_type, measure, age, language) %>%
dplyr::filter(rawscore == max(rawscore, na.rm = TRUE))
}

return(cdi_data)
}

#' Populate the provided cdi data with percentile values for that specific age, instrument_type, measure and language. Loosely based on the work from this repo https://github.com/kachergis/cdi-percentiles/tree/main by George Kachergis and Jess Mankewitz with advice from Virginia Marchman.
#'
#' @param subjects_table a subjects table with unnested cdi data, needs columns "subject_id", "language", "instrument_type", "age", "sex", "measure", "rawscore"
#'
#' @return the input table with added columns containing the reference age used, the reference year used, and both gender specific and general percentile values for the cdi score
#' @export
#'
#' @examples
#' \donttest{
#' full_cdi_data <- all_subjects %>%
#' unnest(subject_aux_data) %>%
#' filter(!is.na(cdi_responses)) %>%
#' unnest(cdi_responses) %>%
#' peekbankr::cleanup_cdi_data() %>%
#' peekbankr::populate_cdi_percentiles()
#' }
populate_cdi_percentiles <- function(subjects_table) {
required_columns <- c(
"subject_id",
"language",
"instrument_type",
"age",
"sex",
"measure",
"rawscore"
)

output_columns <- c(
"reference_age",
"reference_year",
"percentile_all",
"percentile_sex",
"norm_score_all",
"norm_score_sex"
)

missing_cols <- setdiff(required_columns, colnames(subjects_table))
if (length(missing_cols) > 0) {
stop("Missing required columns: ", paste(missing_cols, collapse = ", "))
}

existing_output_cols <- intersect(output_columns, colnames(subjects_table))
if (length(existing_output_cols) > 0) {
stop("Output columns already exist: ", paste(existing_output_cols, collapse = ", "))
}

norms_tables <- readRDS("data/cdi_benchmarks_2022/norms_tables.rds")

cdi_norms_long <- norms_tables %>%
purrr::imap(\(table, name){
table %>%
as_tibble() %>%
tidyr::pivot_longer(cols = c(-age), names_to = "head", values_to = "score") %>%
dplyr::rename(norm_percentile = age, reference_age = head) %>%
dplyr::mutate(name = gsub(".csv", "", name, fixed = T)) %>%
tidyr::separate(name,
into = c("language", "instrument_type", "measure", "norm_sex"),
sep = "_"
) %>%
# Add zero rows in one step
dplyr::bind_rows(., dplyr::distinct(., reference_age, language, instrument_type, measure, norm_sex) %>%
dplyr::mutate(norm_percentile = 1, score = 0))
}) %>%
dplyr::bind_rows() %>%
# TODO: create a wordbank/iso lookup for all languages
dplyr::mutate(
language = ifelse(language == "eng", "English (American)", NA),
reference_age = as.numeric(reference_age)
)

# find reference age for each participants entry
subject_table_with_ref_age <- subjects_table %>%
dplyr::inner_join(
cdi_norms_long %>% dplyr::distinct(reference_age, instrument_type, measure, language),
by = c(
"instrument_type",
"measure", "language"
),
relationship = "many-to-many"
) %>%
dplyr::mutate(age_diff = abs(age - reference_age)) %>%
dplyr::group_by(across(!c(age_diff, reference_age))) %>%
dplyr::slice_min(abs(age_diff), n = 1, with_ties = FALSE) %>%
dplyr::ungroup() %>%
dplyr::select(-age_diff)

subject_table_with_cdi_percentiles <- subject_table_with_ref_age %>%
dplyr::inner_join(
cdi_norms_long,
by = c("instrument_type", "measure", "reference_age", "language"),
relationship = "many-to-many"
) %>%
dplyr::filter(score < rawscore) %>%
dplyr::group_by(across(!c(score, norm_percentile))) %>%
dplyr::slice_max(score, n = 1, with_ties = FALSE) %>%
dplyr::ungroup() %>%
tidyr::pivot_wider(names_from = "norm_sex", values_from = c("norm_percentile", "score")) %>%
dplyr::mutate(
percentile_all = norm_percentile_both,
percentile_sex = case_when(
sex == "male" ~ norm_percentile_m,
sex == "female" ~ norm_percentile_f,
T ~ NA
),
norm_score_all = score_both,
norm_score_sex = case_when(
sex == "male" ~ score_m,
sex == "female" ~ score_f,
T ~ NA
)
) %>%
dplyr::select(subject_id, instrument_type, measure, age, language, reference_age, percentile_all, percentile_sex, norm_score_all, norm_score_sex) %>%
dplyr::mutate(reference_year = "2022")

return(subjects_table %>% dplyr::left_join(
subject_table_with_cdi_percentiles,
by = c("subject_id", "instrument_type", "measure", "age", "language")
))
}


#' Adds a relative cdi score indicating the percentage of total achievable points the subject got on each given measure
#'
#' @param subjects_table a subjects table with unnested cdi data, needs columns "subject_id", "language", "instrument_type", "measure", "rawscore"
#'
#' @return the input table with an added "cdi_relative" column that contains the percentage of total points gained in the given administrations
#' @export
#'
#' @examples
#' \donttest{
#' cdi_data <- all_subjects %>%
#' unnest(subject_aux_data) %>%
#' filter(!is.na(cdi_responses)) %>%
#' unnest(cdi_responses) %>%
#' append_relative_cdi_scores()
#' }
append_relative_cdi_scores <- function(subjects_table) {
required_columns <- c(
"subject_id",
"instrument_type",
"language",
"measure",
"rawscore"
)

output_columns <- c(
"cdi_relative"
)

missing_cols <- setdiff(required_columns, colnames(subjects_table))
if (length(missing_cols) > 0) {
stop("Missing required columns: ", paste(missing_cols, collapse = ", "))
}

existing_output_cols <- intersect(output_columns, colnames(subjects_table))
if (length(existing_output_cols) > 0) {
stop("Output columns already exist: ", paste(existing_output_cols, collapse = ", "))
}

# TODO: find instrument_length values for all languages
subjects_table %>%
dplyr::mutate(
instrument_length = case_when(instrument_type == "ws" ~ 680,
instrument_type == "wg" & language == "English (American)" ~ 396,
instrument_type == "wsshort" ~ 100,
instrument_type == "wg" & language == "Spanish (Mexican)" ~ 428, # TODO: double-check Spanish WG length..
.default = NA
),
CDI_percent = rawscore / instrument_length
) %>%
dplyr::select(-instrument_length)
}
22 changes: 22 additions & 0 deletions data/cdi_benchmarks_2022/eng_wg_comp_both.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11"
"age",8,9,10,11,12,13,14,15,16,17,18
"5",1,1,2,3,4,6,9,14,20,30,41
"10",2,3,5,7,10,14,20,29,41,55,73
"15",4,6,8,12,17,24,33,45,61,80,101
"20",6,9,13,18,25,34,46,62,82,104,128
"25",8,12,17,24,33,45,60,80,102,126,152
"30",11,16,23,31,42,57,75,97,122,149,176
"35",14,20,28,39,52,70,91,115,142,170,198
"40",18,25,35,47,63,83,107,134,162,191,220
"45",22,30,42,56,75,98,124,153,182,212,240
"50",26,36,49,66,87,113,142,172,202,232,259
"55",31,43,58,77,101,129,160,192,222,251,278
"60",37,50,68,89,116,146,179,212,242,270,296
"65",43,59,78,102,132,164,199,232,262,289,312
"70",51,68,90,117,149,184,220,253,282,307,328
"75",60,80,104,134,169,206,242,274,301,324,343
"80",71,93,121,154,191,229,265,296,321,341,357
"85",85,110,141,177,216,255,290,319,340,357,369
"90",103,132,167,206,247,286,318,342,359,371,380
"95",134,168,206,248,289,324,350,367,378,385,390
"99",194,233,275,314,347,369,382,389,393,394,395
22 changes: 22 additions & 0 deletions data/cdi_benchmarks_2022/eng_wg_comp_f.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11"
"age",8,9,10,11,12,13,14,15,16,17,18
"5",1,1,2,4,5,8,12,18,26,36,49
"10",2,4,6,9,12,18,25,36,49,65,84
"15",4,6,10,14,20,28,39,53,71,91,114
"20",6,10,14,20,28,39,53,71,92,116,142
"25",9,13,19,27,37,51,67,88,112,139,168
"30",12,18,25,35,47,63,82,105,132,161,192
"35",16,22,31,42,57,75,97,123,152,183,214
"40",19,27,38,51,68,88,113,141,171,203,235
"45",24,33,45,60,79,102,129,159,191,223,255
"50",28,39,53,70,91,117,146,177,210,243,274
"55",34,46,62,81,105,132,163,196,229,261,292
"60",40,54,71,93,119,148,181,214,248,279,308
"65",47,63,82,106,134,166,199,234,266,297,324
"70",55,73,94,121,151,184,219,253,285,314,338
"75",65,84,108,137,170,204,240,273,304,330,352
"80",76,98,125,156,191,227,262,294,322,345,364
"85",91,116,145,179,215,252,286,316,340,360,375
"90",111,139,171,207,245,281,312,339,359,374,384
"95",143,175,210,248,285,318,344,364,377,386,391
"99",205,241,278,312,342,364,379,387,392,395,396
22 changes: 22 additions & 0 deletions data/cdi_benchmarks_2022/eng_wg_comp_m.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11"
"age",8,9,10,11,12,13,14,15,16,17,18
"5",1,1,1,2,3,5,7,11,16,23,31
"10",2,3,4,6,8,12,17,24,34,46,61
"15",3,5,7,10,14,20,28,39,53,69,88
"20",5,8,11,15,22,30,41,55,72,92,115
"25",8,11,16,22,30,40,54,71,92,115,141
"30",11,15,21,28,38,52,68,89,112,138,165
"35",14,19,26,36,48,64,83,106,132,160,189
"40",17,24,33,44,59,77,99,125,153,182,212
"45",22,30,40,53,70,91,116,144,173,204,234
"50",26,36,48,63,83,106,133,163,194,225,255
"55",32,43,57,74,96,122,152,183,215,246,275
"60",38,50,66,86,111,139,171,204,236,266,294
"65",45,59,77,100,127,158,191,225,256,286,312
"70",53,70,90,115,145,178,212,246,277,305,329
"75",63,82,105,132,165,200,235,268,298,323,344
"80",75,96,122,153,187,224,259,291,318,341,359
"85",90,114,143,177,213,250,285,315,339,357,371
"90",111,139,171,207,245,282,314,340,359,373,382
"95",144,176,212,251,288,321,347,366,378,386,391
"99",209,246,283,318,347,368,381,389,393,395,396
22 changes: 22 additions & 0 deletions data/cdi_benchmarks_2022/eng_wg_prod_both.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11"
"age",8,9,10,11,12,13,14,15,16,17,18
"5",0,0,0,0,0,0,0,0,0,1,2
"10",0,0,0,0,0,0,0,1,2,3,5
"15",0,0,0,0,0,1,1,2,4,6,10
"20",0,0,0,1,1,1,2,4,6,10,15
"25",0,0,1,1,2,2,4,6,9,14,21
"30",1,1,1,2,2,4,5,8,13,20,29
"35",1,1,2,2,3,5,8,12,17,26,37
"40",1,2,2,3,5,7,10,15,23,33,46
"45",2,2,3,5,7,10,14,20,29,40,56
"50",2,3,4,6,9,12,18,25,36,49,67
"55",3,4,6,8,11,16,22,31,43,59,79
"60",4,6,8,11,15,20,28,39,53,70,92
"65",5,7,10,13,18,25,35,47,63,83,107
"70",7,9,12,17,23,32,43,57,75,98,124
"75",9,12,16,21,29,39,52,69,90,114,143
"80",11,15,20,27,37,49,64,84,107,134,165
"85",15,20,26,35,47,62,80,103,129,158,191
"90",20,26,35,47,61,80,102,128,158,190,224
"95",29,38,51,67,86,110,138,169,201,235,269
"99",51,67,88,112,141,174,209,244,277,307,334
22 changes: 22 additions & 0 deletions data/cdi_benchmarks_2022/eng_wg_prod_f.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11"
"age",8,9,10,11,12,13,14,15,16,17,18
"5",0,0,0,0,0,0,0,1,1,2,3
"10",0,0,0,0,0,1,1,2,3,5,8
"15",0,0,0,0,1,1,2,3,5,9,13
"20",0,0,0,1,1,2,3,5,8,13,19
"25",0,0,1,1,2,3,5,8,12,18,26
"30",0,1,1,2,3,5,7,11,16,23,33
"35",1,1,2,3,4,6,9,14,20,29,41
"40",1,2,3,4,6,8,12,17,25,36,50
"45",2,2,4,5,7,11,15,22,31,43,59
"50",2,3,5,7,10,13,19,26,37,51,69
"55",3,4,6,9,12,17,23,32,44,60,80
"60",4,6,8,11,15,20,28,38,52,70,92
"65",5,7,10,14,18,25,33,45,61,81,105
"70",7,9,13,17,22,30,40,53,71,93,120
"75",9,12,16,21,28,36,48,63,83,107,137
"80",12,15,20,26,34,44,58,75,97,124,156
"85",16,20,26,33,43,55,71,91,115,145,179
"90",21,27,34,43,55,70,88,112,140,172,208
"95",32,40,49,61,76,94,117,145,177,212,250
"99",57,69,84,101,122,147,177,210,245,281,315
22 changes: 22 additions & 0 deletions data/cdi_benchmarks_2022/eng_wg_prod_m.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11"
"age",8,9,10,11,12,13,14,15,16,17,18
"5",0,0,0,0,0,0,0,0,0,0,1
"10",0,0,0,0,0,0,0,1,1,2,3
"15",0,0,0,0,0,0,1,1,2,4,7
"20",0,0,0,0,1,1,2,3,4,7,11
"25",0,0,1,1,1,2,3,4,7,11,18
"30",1,1,1,1,2,3,4,7,11,17,25
"35",1,1,2,2,3,4,6,10,15,23,33
"40",1,2,2,3,4,6,9,14,21,30,43
"45",2,2,3,4,6,8,12,18,27,39,55
"50",2,3,4,6,8,11,16,24,34,49,68
"55",3,4,5,7,10,15,21,31,43,60,82
"60",4,5,7,10,14,19,27,38,54,73,99
"65",5,7,9,12,17,25,34,48,66,89,117
"70",6,8,12,16,22,31,43,59,80,106,138
"75",8,11,15,20,28,39,54,73,97,126,161
"80",10,14,19,26,36,50,68,90,117,150,188
"85",13,18,25,34,47,64,85,112,143,179,219
"90",17,24,33,46,63,84,110,141,176,215,256
"95",24,34,48,66,89,118,151,187,226,265,303
"99",43,60,83,113,148,187,229,269,305,337,362
Loading

0 comments on commit ae7164d

Please sign in to comment.