Add first version of cdi utilities (percentile benchmark, cleanup, re…

…lative scores)
peekbank · Dec 23, 2024 · ae7164d · ae7164d
1 parent e774122
commit ae7164d
Show file tree

Hide file tree

Showing 17 changed files with 533 additions and 1 deletion.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: peekbankr
 Type: Package
 Title: Accessing the Peekbank Database and working with Peekbank data
-Version: 0.2.1.0
+Version: 0.2.3.1
 Authors@R: c(
     person("Mika", "Braginsky", email = "[email protected]", role = c("aut", "cre")),
     person("Kyle", "MacDonald", email = "[email protected]", role = "aut"),

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
+export(append_relative_cdi_scores)
+export(cleanup_cdi_data)
 export(connect_to_peekbank)
 export(ds.add_aois)
 export(ds.get_json_fields)
@@ -30,6 +32,7 @@ export(get_trial_types)
 export(get_trials)
 export(get_xy_timepoints)
 export(list_peekbank_tables)
+export(populate_cdi_percentiles)
 export(unpack_aux_data)
 importFrom(dplyr,"%>%")
 importFrom(glue,glue)

diff --git a/R/cdi_processing.R b/R/cdi_processing.R
@@ -0,0 +1,228 @@
+library(dplyr)
+library(tidyr)
+library(here)
+library(purrr)
+
+#' Checks cdi data for inconsistencies, warns about them, and fixes them
+#'
+#' @param cdi_data a subjects table with unnested cdi data, needs columns "subject_id", "language", "instrument_type", "age", "sex", "measure", "rawscore"
+#'
+#' @return a cleaned up version of the cdi data
+#' @export
+#'
+#' @examples
+#' \donttest{
+#' clean_cdi_data <- all_subjects %>%
+#'   unnest(subject_aux_data) %>%
+#'   filter(!is.na(cdi_responses)) %>%
+#'   unnest(cdi_responses) %>%
+#'   peekbankr::cleanup_cdi_data()
+#' }
+cleanup_cdi_data <- function(cdi_data) {
+  # TODO: what else do we need to check here? Ask the others about this
+
+  required_columns <- c(
+    "subject_id",
+    "language",
+    "instrument_type",
+    "age",
+    "sex",
+    "measure",
+    "rawscore"
+  )
+
+  missing_cols <- setdiff(required_columns, colnames(cdi_data))
+  if (length(missing_cols) > 0) {
+    stop("Missing required columns: ", paste(missing_cols, collapse = ", "))
+  }
+
+  # Check if there are multiple scores for the same cdi administration within a single participant
+  duplicate_removed_rows <- cdi_data %>%
+    dplyr::group_by(subject_id, instrument_type, measure, age, language) %>%
+    dplyr::filter(rawscore != max(rawscore, na.rm = TRUE))
+
+  if (nrow(duplicate_removed_rows) > 0) {
+    print("Warning: there are some duplicate cdi values in your data. These were removed, but you should check the input data.")
+    print(duplicate_removed_rows)
+
+    # fix the duplicates for analysis until the input data is fixed
+    cdi_data <- cdi_data %>%
+      dplyr::group_by(subject_id, instrument_type, measure, age, language) %>%
+      dplyr::filter(rawscore == max(rawscore, na.rm = TRUE))
+  }
+
+  return(cdi_data)
+}
+
+#' Populate the provided cdi data with percentile values for that specific age, instrument_type, measure and language. Loosely based on the work from this repo https://github.com/kachergis/cdi-percentiles/tree/main by George Kachergis and Jess Mankewitz with advice from Virginia Marchman.
+#'
+#' @param subjects_table a subjects table with unnested cdi data, needs columns "subject_id", "language", "instrument_type", "age", "sex", "measure", "rawscore"
+#'
+#' @return the input table with added columns containing the reference age used, the reference year used, and both gender specific and general percentile values for the cdi score
+#' @export
+#'
+#' @examples
+#' \donttest{
+#' full_cdi_data <- all_subjects %>%
+#'   unnest(subject_aux_data) %>%
+#'   filter(!is.na(cdi_responses)) %>%
+#'   unnest(cdi_responses) %>%
+#'   peekbankr::cleanup_cdi_data() %>%
+#'   peekbankr::populate_cdi_percentiles()
+#' }
+populate_cdi_percentiles <- function(subjects_table) {
+  required_columns <- c(
+    "subject_id",
+    "language",
+    "instrument_type",
+    "age",
+    "sex",
+    "measure",
+    "rawscore"
+  )
+
+  output_columns <- c(
+    "reference_age",
+    "reference_year",
+    "percentile_all",
+    "percentile_sex",
+    "norm_score_all",
+    "norm_score_sex"
+  )
+
+  missing_cols <- setdiff(required_columns, colnames(subjects_table))
+  if (length(missing_cols) > 0) {
+    stop("Missing required columns: ", paste(missing_cols, collapse = ", "))
+  }
+
+  existing_output_cols <- intersect(output_columns, colnames(subjects_table))
+  if (length(existing_output_cols) > 0) {
+    stop("Output columns already exist: ", paste(existing_output_cols, collapse = ", "))
+  }
+
+  norms_tables <- readRDS("data/cdi_benchmarks_2022/norms_tables.rds")
+
+  cdi_norms_long <- norms_tables %>%
+    purrr::imap(\(table, name){
+      table %>%
+        as_tibble() %>%
+        tidyr::pivot_longer(cols = c(-age), names_to = "head", values_to = "score") %>%
+        dplyr::rename(norm_percentile = age, reference_age = head) %>%
+        dplyr::mutate(name = gsub(".csv", "", name, fixed = T)) %>%
+        tidyr::separate(name,
+          into = c("language", "instrument_type", "measure", "norm_sex"),
+          sep = "_"
+        ) %>%
+        # Add zero rows in one step
+        dplyr::bind_rows(., dplyr::distinct(., reference_age, language, instrument_type, measure, norm_sex) %>%
+          dplyr::mutate(norm_percentile = 1, score = 0))
+    }) %>%
+    dplyr::bind_rows() %>%
+    # TODO: create a wordbank/iso lookup for all languages
+    dplyr::mutate(
+      language = ifelse(language == "eng", "English (American)", NA),
+      reference_age = as.numeric(reference_age)
+    )
+
+  # find reference age for each participants entry
+  subject_table_with_ref_age <- subjects_table %>%
+    dplyr::inner_join(
+      cdi_norms_long %>% dplyr::distinct(reference_age, instrument_type, measure, language),
+      by = c(
+        "instrument_type",
+        "measure", "language"
+      ),
+      relationship = "many-to-many"
+    ) %>%
+    dplyr::mutate(age_diff = abs(age - reference_age)) %>%
+    dplyr::group_by(across(!c(age_diff, reference_age))) %>%
+    dplyr::slice_min(abs(age_diff), n = 1, with_ties = FALSE) %>%
+    dplyr::ungroup() %>%
+    dplyr::select(-age_diff)
+
+  subject_table_with_cdi_percentiles <- subject_table_with_ref_age %>%
+    dplyr::inner_join(
+      cdi_norms_long,
+      by = c("instrument_type", "measure", "reference_age", "language"),
+      relationship = "many-to-many"
+    ) %>%
+    dplyr::filter(score < rawscore) %>%
+    dplyr::group_by(across(!c(score, norm_percentile))) %>%
+    dplyr::slice_max(score, n = 1, with_ties = FALSE) %>%
+    dplyr::ungroup() %>%
+    tidyr::pivot_wider(names_from = "norm_sex", values_from = c("norm_percentile", "score")) %>%
+    dplyr::mutate(
+      percentile_all = norm_percentile_both,
+      percentile_sex = case_when(
+        sex == "male" ~ norm_percentile_m,
+        sex == "female" ~ norm_percentile_f,
+        T ~ NA
+      ),
+      norm_score_all = score_both,
+      norm_score_sex = case_when(
+        sex == "male" ~ score_m,
+        sex == "female" ~ score_f,
+        T ~ NA
+      )
+    ) %>%
+    dplyr::select(subject_id, instrument_type, measure, age, language, reference_age, percentile_all, percentile_sex, norm_score_all, norm_score_sex) %>%
+    dplyr::mutate(reference_year = "2022")
+
+  return(subjects_table %>% dplyr::left_join(
+    subject_table_with_cdi_percentiles,
+    by = c("subject_id", "instrument_type", "measure", "age", "language")
+  ))
+}
+
+
+#' Adds a relative cdi score indicating the percentage of total achievable points the subject got on each given measure
+#'
+#' @param subjects_table a subjects table with unnested cdi data, needs columns "subject_id", "language", "instrument_type", "measure", "rawscore"
+#'
+#' @return the input table with an added "cdi_relative" column that contains the percentage of total points gained in the given administrations
+#' @export
+#'
+#' @examples
+#' \donttest{
+#' cdi_data <- all_subjects %>%
+#'   unnest(subject_aux_data) %>%
+#'   filter(!is.na(cdi_responses)) %>%
+#'   unnest(cdi_responses) %>%
+#'   append_relative_cdi_scores()
+#' }
+append_relative_cdi_scores <- function(subjects_table) {
+  required_columns <- c(
+    "subject_id",
+    "instrument_type",
+    "language",
+    "measure",
+    "rawscore"
+  )
+
+  output_columns <- c(
+    "cdi_relative"
+  )
+
+  missing_cols <- setdiff(required_columns, colnames(subjects_table))
+  if (length(missing_cols) > 0) {
+    stop("Missing required columns: ", paste(missing_cols, collapse = ", "))
+  }
+
+  existing_output_cols <- intersect(output_columns, colnames(subjects_table))
+  if (length(existing_output_cols) > 0) {
+    stop("Output columns already exist: ", paste(existing_output_cols, collapse = ", "))
+  }
+
+  # TODO: find instrument_length values for all languages
+  subjects_table %>%
+    dplyr::mutate(
+      instrument_length = case_when(instrument_type == "ws" ~ 680,
+        instrument_type == "wg" & language == "English (American)" ~ 396,
+        instrument_type == "wsshort" ~ 100,
+        instrument_type == "wg" & language == "Spanish (Mexican)" ~ 428, # TODO: double-check Spanish WG length..
+        .default = NA
+      ),
+      CDI_percent = rawscore / instrument_length
+    ) %>%
+    dplyr::select(-instrument_length)
+}
diff --git a/data/cdi_benchmarks_2022/eng_wg_comp_both.csv b/data/cdi_benchmarks_2022/eng_wg_comp_both.csv
@@ -0,0 +1,22 @@
+"","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11"
+"age",8,9,10,11,12,13,14,15,16,17,18
+"5",1,1,2,3,4,6,9,14,20,30,41
+"10",2,3,5,7,10,14,20,29,41,55,73
+"15",4,6,8,12,17,24,33,45,61,80,101
+"20",6,9,13,18,25,34,46,62,82,104,128
+"25",8,12,17,24,33,45,60,80,102,126,152
+"30",11,16,23,31,42,57,75,97,122,149,176
+"35",14,20,28,39,52,70,91,115,142,170,198
+"40",18,25,35,47,63,83,107,134,162,191,220
+"45",22,30,42,56,75,98,124,153,182,212,240
+"50",26,36,49,66,87,113,142,172,202,232,259
+"55",31,43,58,77,101,129,160,192,222,251,278
+"60",37,50,68,89,116,146,179,212,242,270,296
+"65",43,59,78,102,132,164,199,232,262,289,312
+"70",51,68,90,117,149,184,220,253,282,307,328
+"75",60,80,104,134,169,206,242,274,301,324,343
+"80",71,93,121,154,191,229,265,296,321,341,357
+"85",85,110,141,177,216,255,290,319,340,357,369
+"90",103,132,167,206,247,286,318,342,359,371,380
+"95",134,168,206,248,289,324,350,367,378,385,390
+"99",194,233,275,314,347,369,382,389,393,394,395
diff --git a/data/cdi_benchmarks_2022/eng_wg_comp_f.csv b/data/cdi_benchmarks_2022/eng_wg_comp_f.csv
@@ -0,0 +1,22 @@
+"","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11"
+"age",8,9,10,11,12,13,14,15,16,17,18
+"5",1,1,2,4,5,8,12,18,26,36,49
+"10",2,4,6,9,12,18,25,36,49,65,84
+"15",4,6,10,14,20,28,39,53,71,91,114
+"20",6,10,14,20,28,39,53,71,92,116,142
+"25",9,13,19,27,37,51,67,88,112,139,168
+"30",12,18,25,35,47,63,82,105,132,161,192
+"35",16,22,31,42,57,75,97,123,152,183,214
+"40",19,27,38,51,68,88,113,141,171,203,235
+"45",24,33,45,60,79,102,129,159,191,223,255
+"50",28,39,53,70,91,117,146,177,210,243,274
+"55",34,46,62,81,105,132,163,196,229,261,292
+"60",40,54,71,93,119,148,181,214,248,279,308
+"65",47,63,82,106,134,166,199,234,266,297,324
+"70",55,73,94,121,151,184,219,253,285,314,338
+"75",65,84,108,137,170,204,240,273,304,330,352
+"80",76,98,125,156,191,227,262,294,322,345,364
+"85",91,116,145,179,215,252,286,316,340,360,375
+"90",111,139,171,207,245,281,312,339,359,374,384
+"95",143,175,210,248,285,318,344,364,377,386,391
+"99",205,241,278,312,342,364,379,387,392,395,396
diff --git a/data/cdi_benchmarks_2022/eng_wg_comp_m.csv b/data/cdi_benchmarks_2022/eng_wg_comp_m.csv
@@ -0,0 +1,22 @@
+"","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11"
+"age",8,9,10,11,12,13,14,15,16,17,18
+"5",1,1,1,2,3,5,7,11,16,23,31
+"10",2,3,4,6,8,12,17,24,34,46,61
+"15",3,5,7,10,14,20,28,39,53,69,88
+"20",5,8,11,15,22,30,41,55,72,92,115
+"25",8,11,16,22,30,40,54,71,92,115,141
+"30",11,15,21,28,38,52,68,89,112,138,165
+"35",14,19,26,36,48,64,83,106,132,160,189
+"40",17,24,33,44,59,77,99,125,153,182,212
+"45",22,30,40,53,70,91,116,144,173,204,234
+"50",26,36,48,63,83,106,133,163,194,225,255
+"55",32,43,57,74,96,122,152,183,215,246,275
+"60",38,50,66,86,111,139,171,204,236,266,294
+"65",45,59,77,100,127,158,191,225,256,286,312
+"70",53,70,90,115,145,178,212,246,277,305,329
+"75",63,82,105,132,165,200,235,268,298,323,344
+"80",75,96,122,153,187,224,259,291,318,341,359
+"85",90,114,143,177,213,250,285,315,339,357,371
+"90",111,139,171,207,245,282,314,340,359,373,382
+"95",144,176,212,251,288,321,347,366,378,386,391
+"99",209,246,283,318,347,368,381,389,393,395,396
diff --git a/data/cdi_benchmarks_2022/eng_wg_prod_both.csv b/data/cdi_benchmarks_2022/eng_wg_prod_both.csv
@@ -0,0 +1,22 @@
+"","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11"
+"age",8,9,10,11,12,13,14,15,16,17,18
+"5",0,0,0,0,0,0,0,0,0,1,2
+"10",0,0,0,0,0,0,0,1,2,3,5
+"15",0,0,0,0,0,1,1,2,4,6,10
+"20",0,0,0,1,1,1,2,4,6,10,15
+"25",0,0,1,1,2,2,4,6,9,14,21
+"30",1,1,1,2,2,4,5,8,13,20,29
+"35",1,1,2,2,3,5,8,12,17,26,37
+"40",1,2,2,3,5,7,10,15,23,33,46
+"45",2,2,3,5,7,10,14,20,29,40,56
+"50",2,3,4,6,9,12,18,25,36,49,67
+"55",3,4,6,8,11,16,22,31,43,59,79
+"60",4,6,8,11,15,20,28,39,53,70,92
+"65",5,7,10,13,18,25,35,47,63,83,107
+"70",7,9,12,17,23,32,43,57,75,98,124
+"75",9,12,16,21,29,39,52,69,90,114,143
+"80",11,15,20,27,37,49,64,84,107,134,165
+"85",15,20,26,35,47,62,80,103,129,158,191
+"90",20,26,35,47,61,80,102,128,158,190,224
+"95",29,38,51,67,86,110,138,169,201,235,269
+"99",51,67,88,112,141,174,209,244,277,307,334
diff --git a/data/cdi_benchmarks_2022/eng_wg_prod_f.csv b/data/cdi_benchmarks_2022/eng_wg_prod_f.csv
@@ -0,0 +1,22 @@
+"","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11"
+"age",8,9,10,11,12,13,14,15,16,17,18
+"5",0,0,0,0,0,0,0,1,1,2,3
+"10",0,0,0,0,0,1,1,2,3,5,8
+"15",0,0,0,0,1,1,2,3,5,9,13
+"20",0,0,0,1,1,2,3,5,8,13,19
+"25",0,0,1,1,2,3,5,8,12,18,26
+"30",0,1,1,2,3,5,7,11,16,23,33
+"35",1,1,2,3,4,6,9,14,20,29,41
+"40",1,2,3,4,6,8,12,17,25,36,50
+"45",2,2,4,5,7,11,15,22,31,43,59
+"50",2,3,5,7,10,13,19,26,37,51,69
+"55",3,4,6,9,12,17,23,32,44,60,80
+"60",4,6,8,11,15,20,28,38,52,70,92
+"65",5,7,10,14,18,25,33,45,61,81,105
+"70",7,9,13,17,22,30,40,53,71,93,120
+"75",9,12,16,21,28,36,48,63,83,107,137
+"80",12,15,20,26,34,44,58,75,97,124,156
+"85",16,20,26,33,43,55,71,91,115,145,179
+"90",21,27,34,43,55,70,88,112,140,172,208
+"95",32,40,49,61,76,94,117,145,177,212,250
+"99",57,69,84,101,122,147,177,210,245,281,315
diff --git a/data/cdi_benchmarks_2022/eng_wg_prod_m.csv b/data/cdi_benchmarks_2022/eng_wg_prod_m.csv
@@ -0,0 +1,22 @@
+"","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11"
+"age",8,9,10,11,12,13,14,15,16,17,18
+"5",0,0,0,0,0,0,0,0,0,0,1
+"10",0,0,0,0,0,0,0,1,1,2,3
+"15",0,0,0,0,0,0,1,1,2,4,7
+"20",0,0,0,0,1,1,2,3,4,7,11
+"25",0,0,1,1,1,2,3,4,7,11,18
+"30",1,1,1,1,2,3,4,7,11,17,25
+"35",1,1,2,2,3,4,6,10,15,23,33
+"40",1,2,2,3,4,6,9,14,21,30,43
+"45",2,2,3,4,6,8,12,18,27,39,55
+"50",2,3,4,6,8,11,16,24,34,49,68
+"55",3,4,5,7,10,15,21,31,43,60,82
+"60",4,5,7,10,14,19,27,38,54,73,99
+"65",5,7,9,12,17,25,34,48,66,89,117
+"70",6,8,12,16,22,31,43,59,80,106,138
+"75",8,11,15,20,28,39,54,73,97,126,161
+"80",10,14,19,26,36,50,68,90,117,150,188
+"85",13,18,25,34,47,64,85,112,143,179,219
+"90",17,24,33,46,63,84,110,141,176,215,256
+"95",24,34,48,66,89,118,151,187,226,265,303
+"99",43,60,83,113,148,187,229,269,305,337,362