From f627949b76836c044859652e3655418a6506ccad Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 1 Jun 2023 17:43:13 -0400 Subject: [PATCH 01/33] Preliminary analysis prep: collapse across passages --- code/prepReadAloudBeta.R | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 code/prepReadAloudBeta.R diff --git a/code/prepReadAloudBeta.R b/code/prepReadAloudBeta.R new file mode 100644 index 0000000..eacff00 --- /dev/null +++ b/code/prepReadAloudBeta.R @@ -0,0 +1,46 @@ +# Reading in CSVs of preprocessed error data and participant information, +# writing a new CSV including the preprocessed data as well as comprehension, +# demographic, reading, and passage information +# +# Luc Sahar and Jessica M. Alexander -- NDCLab, Florida International University +# last updated 5/31/23 + +# NB passages "sun" and "broccoli" as coded contain errors. Namely, broccoli had +# "iodized _table_ counteracts" instead of the intended "table salt", and sun +# showed participants "_empower_ individuals" whereas it was coded as "enable" + +library(readxl) # read_xlsx +library(stringr) # str_extract +library(dplyr) # most things +library(purrr) # map, map_df; generally good to have +library(lubridate) # now +library(readr) # write_csv + +ext_default = 'csv' +tz_default = "America/New_York" +date_format_default = "%Y%m%d_%I%M%P" + + +build_output_filename <- function(label, ext = ext_default, timezone = tz_default, date_format = date_format_default) { + # `label` may include the destination directory, if different from the working directory when the script is run + current_datetime <- now(timezone) %>% format(date_format) + paste(label, '_', current_datetime, '.', ext, sep = "") +} + +collapse_by_participant <- function(filename_in, filename_out) { + by_participant <- read_csv(filename_in) %>% + unique %>% # dedup + group_by(id) %>% summarize(across(misprod:total_uncorrected_errors, sum)) # summarize by participant, across all passages + # TODO change the columns selected, once more have been added to the output of the preproc script + + write_csv(by_participant, filename_out) + return(filename_out) +} + + +# base = "~/Documents/ndclab/analysis-sandbox/github-structure-mirror/readAloud-valence-dataset/derivatives/preprocessed" +base = "/home/data/NDClab/datasets/readAloud-valence-dataset/derivatives/preprocessed" +preprocessed_summary_filename = "TODO" +collapsed_filename = build_output_filename(label = paste(base, "disfluencies_subject", sep='/')) + +collapse_by_participant(preprocessed_summary_file, collapsed_filename) \ No newline at end of file From 6f9897c2052091003dc0c4839ded02206fe45ffb Mon Sep 17 00:00:00 2001 From: JMA <77893711+jessb0t@users.noreply.github.com> Date: Wed, 14 Jun 2023 11:22:52 -0500 Subject: [PATCH 02/33] Initial prep and analysis scripts --- code/analysisReadAloudBeta.R | 159 ++++++++++++++++++++++ code/prepReadAloudBeta.R | 248 +++++++++++++++++++++++++++++++---- 2 files changed, 379 insertions(+), 28 deletions(-) create mode 100644 code/analysisReadAloudBeta.R diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R new file mode 100644 index 0000000..2f662c3 --- /dev/null +++ b/code/analysisReadAloudBeta.R @@ -0,0 +1,159 @@ +# readAloud-valence-beta Reading Task Analyses +# Authors: Luc Sahar, Jessica M. 
Alexander +# Last Updated: 2023-06-14 + +# INPUTS +# data/df: behavioral data, for each participant on each passage, with relevant participant information and trial-level stimulus information + +# OUTPUTS +# TBD + +# NOTES TO DO +# drop 150086 as only completed 12 of 20 passages and low accuracy + +### SECTION 1: SETTING UP +library(dplyr) +library(lme4) +library(lmerTest) +library(interactions) + +#visualization tools +library(ggplot2) +library(gridExtra) +library(grid) +library(cowplot) +library(colorspace) +library(colorblindr) + +#set up date for output file naming +today <- Sys.Date() +today <- format(today, "%Y%m%d") + +#set up directories for input/output data +data <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/readAloudBetaData_20230614.csv' +out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' + +#read in data +df <- read.csv(data) + +#organize participant demographic variables +df$sex <- as.factor(df$sex) +df$pronouns <- as.factor(df$pronouns) +df$ethnic <- as.factor(df$ethnic) +df$socclass <- as.factor(df$socclass) + +#extract demo stats +summary(df$age) +sd(df$age) +summary(df$sex)/18 +summary(df$sex)/18 / (nrow(df)/18) +summary(df$pronouns)/18 +summary(df$pronouns)/18 / (nrow(df)/18) +summary(df$ethnic)/18 +summary(df$ethnic)/18 / (nrow(df)/18) +summary(df$socclass)/18 +summary(df$socclass)/18 / (nrow(df)/18) + +#remove participants whose challenge question accuracy was below 50% (chance = 25%) +dfTrim <- df +dfTrim <- dfTrim %>% + group_by(id) %>% + mutate(challengeAvgSub = mean(challengeACC)) %>% + ungroup + +dfTrim <- subset(dfTrim, challengeAvgSub>0.5) +length(unique(df$id)) - length(unique(dfTrim$id)) #number of participants removed + +#calculate average accuracy +mean(dfTrim$challengeAvgSub) +sd(dfTrim$challengeAvgSub) + + +### SECTION 2: INITIAL DATA TRIMMING +passage_no_before_trimming <- nrow(dfTrim) + +#insert passage-level trimming here + +passage_no_after_trim1 <- nrow(dfTrim) +passage_no_before_trimming - passage_no_after_trim1 #number of passages trimmed +(passage_no_before_trimming - passage_no_after_trim1) / passage_no_before_trimming #percentage of passages trimmed + + +### SECTION 3: TRANSITION DATA TO LONG FORMAT +errorDat <- data.frame(matrix(ncol=17, nrow=0)) +colnames(errorDat) <- c("passage", "id", + "sex", "pronouns", "age", "ethnic", "socclass", + "bfne", "phq8", "scaaredTotal", "scaaredGA", "scaaredSoc", "sps", + "lenSyll", "lenWord", "avgSyllPerWord", + "errors") +for(j in 1:nrow(dfTrim)){ + passage <- dfTrim$passage[j] + id <- dfTrim$id[j] + sex <- as.character(dfTrim$sex[j]) + pronouns <- as.character(dfTrim$pronouns[j]) + age <- dfTrim$age[j] + ethnic <- as.character(dfTrim$ethnic[j]) + socclass <- as.character(dfTrim$socclass[j]) + bfne <- dfTrim$bfne[j] + phq8 <- dfTrim$phq8[j] + scaaredTotal <- dfTrim$scaaredTotal[j] + scaaredGA <- dfTrim$scaaredGA[j] + scaaredSoc <- dfTrim$scaaredSoc[j] + sps <- dfTrim$sps[j] + lenSyll <- dfTrim$lenSyll[j] + lenWord <- dfTrim$lenWord[j] + avgSyllPerWord <- dfTrim$avgSyllPerWord[j] + errors <- dfTrim$errors[j] + errorDat[nrow(errorDat) + 1,] <-c(passage, id, sex, pronouns, age, ethnic, socclass, bfne, phq8, scaaredTotal, scaaredGA, scaaredSoc, sps, lenSyll, lenWord, avgSyllPerWord, errors) +} + +#organize data types +errorDat$sex <- as.factor(errorDat$sex) +errorDat$pronouns <- as.factor(errorDat$pronouns) +errorDat$age <- as.numeric(errorDat$age) +errorDat$ethnic <- as.factor(errorDat$ethnic) +errorDat$socclass <- as.factor(errorDat$socclass) +errorDat$bfne <- 
as.numeric(errorDat$bfne) +errorDat$phq8 <- as.numeric(errorDat$phq8) +errorDat$scaaredTotal <- as.numeric(errorDat$scaaredTotal) +errorDat$scaaredGA <- as.numeric(errorDat$scaaredGA) +errorDat$scaaredSoc <- as.numeric(errorDat$scaaredSoc) +errorDat$sps <- as.numeric(errorDat$sps) +errorDat$lenSyll <- as.numeric(errorDat$lenSyll) +errorDat$lenWord <- as.numeric(errorDat$lenWord) +errorDat$avgSyllPerWord <- as.numeric(errorDat$avgSyllPerWord) +errorDat$errors <- as.numeric(errorDat$errors) + +#modify contrasts for categorical predictors +contrasts(errorDat$sex) <- contr.sum(2) #male: -1, female: +1 + +#center continuous predictors +errorDat$age_gmc <- errorDat$age - mean(errorDat$age) +errorDat$bfne_gmc <- errorDat$bfne - mean(errorDat$bfne) +errorDat$phq8_gmc <- errorDat$phq8 - mean(errorDat$phq8) +errorDat$scaaredTotal_gmc <- errorDat$scaaredTotal - mean(errorDat$scaaredTotal) +errorDat$scaaredGA_gmc <- errorDat$scaaredGA - mean(errorDat$scaaredGA) +errorDat$scaaredSoc_gmc <- errorDat$scaaredSoc - mean(errorDat$scaaredSoc) +errorDat$sps_gmc <- errorDat$sps - mean(errorDat$sps) +errorDat$lenSyll_gmc <- errorDat$lenSyll - mean(errorDat$lenSyll) +errorDat$lenWord_gmc <- errorDat$lenWord - mean(errorDat$lenWord) +errorDat$avgSyllPerWord_gmc <- errorDat$avgSyllPerWord - mean(errorDat$avgSyllPerWord) +errorDat$errors_gmc <- errorDat$errors - mean(errorDat$errors) + +#extract demo stats +errorDatStats <- subset(errorDat, !duplicated(errorDat$id)) +summary(errorDatStats$age) +sd(errorDatStats$age) +summary(errorDatStats$sex) +summary(errorDatStats$sex) / length(unique(errorDatStats$id)) +summary(errorDatStats$pronouns) +summary(errorDatStats$pronouns) / length(unique(errorDatStats$id)) +summary(errorDatStats$ethnic) +summary(errorDatStats$ethnic) / length(unique(errorDatStats$id)) +summary(errorDatStats$socclass) +summary(errorDatStats$socclass) / length(unique(errorDatStats$id)) + + +### SECTION 4: MODEL RESULTS +modelErr <- lmerTest::lmer(errors ~ scaaredTotal_gmc * lenSyll_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(modelErr) \ No newline at end of file diff --git a/code/prepReadAloudBeta.R b/code/prepReadAloudBeta.R index eacff00..d7a3c67 100644 --- a/code/prepReadAloudBeta.R +++ b/code/prepReadAloudBeta.R @@ -1,14 +1,27 @@ -# Reading in CSVs of preprocessed error data and participant information, -# writing a new CSV including the preprocessed data as well as comprehension, -# demographic, reading, and passage information -# -# Luc Sahar and Jessica M. Alexander -- NDCLab, Florida International University -# last updated 5/31/23 +# readAloud-valence-beta Analysis Preparation +# Authors: Luc Sahar and Jessica M. Alexander -- NDCLab, Florida International University +# Last updated: 2023-06-14 -# NB passages "sun" and "broccoli" as coded contain errors. Namely, broccoli had -# "iodized _table_ counteracts" instead of the intended "table salt", and sun -# showed participants "_empower_ individuals" whereas it was coded as "enable" +# INPUTS +# data/df: behavioral (error-related) data, for each participant on each passage +# accDat: comprehension question accuracy (0/1) for each participant for each passage +# readDat: stimuli characteristics (by passage half) +# redcap: participant data, incl. 
demographics and responses + scored factors for questionnaires: + # bfne (brief fear of negative evaluation): bfne_b_scrdTotal (fear of negative evaluation total) + # phq8 (patient health questionnaire): phq8_scrdTotal (depression scale total) + # scaared, total (screen for adult anxiety disorders): scaared_b_scrdTotal (total anxiety) + # scaared, social (screen for adult anxiety disorders): scaared_b_scrdSoc (social phobias) + # scaared, general (screen for adult anxiety disorders): scaared_b_scrdGA (general anxiety) + # sps (social phobia scale): sias6sps6_b_scrdSPS +# OUTPUTS +# dfTrim: for each passage, for each participant, details on: + # participant behavior: reading errors made, comprehension question accuracy + # passage characteristics: length (syllable and word), average syllables per word + # participant data: demographics, language history, mood and mood disorder scores + + +### SECTION 1: SETTING UP library(readxl) # read_xlsx library(stringr) # str_extract library(dplyr) # most things @@ -16,31 +29,210 @@ library(purrr) # map, map_df; generally good to have library(lubridate) # now library(readr) # write_csv -ext_default = 'csv' -tz_default = "America/New_York" -date_format_default = "%Y%m%d_%I%M%P" +#set up defaults for output file naming +# ext_default = 'csv' +# tz_default = "America/New_York" +# date_format_default = "%Y%m%d_%I%M%P" +# +# build_output_filename <- function(label, ext = ext_default, timezone = tz_default, date_format = date_format_default) { +# # `label` may include the destination directory, if different from the working directory when the script is run +# current_datetime <- now(timezone) %>% format(date_format) +# paste(label, '_', current_datetime, '.', ext, sep = "") +# } +today <- Sys.Date() +today <- format(today, "%Y%m%d") + +#set up directories for input/output +main_dataset <- '/Users/jalexand/github/readAloud-valence-dataset/' +main_analyses <- '/Users/jalexand/github/readAloud-valence-beta/' +out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' + +#load input files +data <- paste(main_dataset, 'derivatives/preprocessed/disfluencies_subject-x-passage_20230613_0450pm.csv', sep="", collapse=NULL) +accDat_path <- paste(main_dataset,'derivatives/preprocessed/readAloud_passage-level_summary_20220812.csv', sep="", collapse=NULL) +readDat_path <- paste(main_dataset, 'derivatives/analysisStimuli_readDat_20230614.csv', sep="", collapse=NULL) +redcap_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019.csv', sep="", collapse=NULL) +agedat_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019_ageonly.csv', sep="", collapse=NULL) + +df <- read.csv(data) +redcap <- read.csv(redcap_path, na.strings='NA') #participant questionnaire responses +agedat <- read.csv(agedat_path, na.strings='NA') #participant age information +readDat <- read.csv(readDat_path, na.strings='N') #passage-level characteristics from analysisStimuli.R +accDat <- read.csv(accDat_path, na.strings='NA', check.names=FALSE) #passage level accuracy for each subject +accDat$passage <- c("dams", "flying", "bats", "broccoli", "realty", "bees", "dogshow", "dolphins", "icefishing", + "cars", "vegas", "sun", "caramel", "congo", "antarctica", "depression", "skunkowl", "grizzly", + "mantis", "dentist") #rename passages with short-name + +#add missing passages for 150086 so that nrow is divisible by 20 +passages_read <- df$passage[which(df$id=="150086")] +all_passages <- unique(df$passage) +tempdf <- 
data.frame(matrix(nrow=0, ncol=ncol(df))) +colnames(tempdf) <- colnames(df) +for(passage in 1:length(all_passages)){ + if(all_passages[passage] %in% passages_read){next}else{ + tempdf[nrow(tempdf) + 1,] <- c("150086", all_passages[passage], rep(NA, 41)) + } +} +df <- rbind(df, tempdf) + +### SECTION 2: BUILD DEMOGRAPHIC DATA DF +demoDat <- redcap[,c(1,5)] +#biological sex: replace numerical values with text description +for(a in 1:nrow(redcap)){ + if(is.na(redcap$demo_b_sex_s1_r1_e1[a])){demoDat$sex[a] <- 'undisclosed'} + else if(redcap$demo_b_sex_s1_r1_e1[a]==1){demoDat$sex[a] <- 'male'} + else if(redcap$demo_b_sex_s1_r1_e1[a]==2){demoDat$sex[a] <- 'female'} + else if(redcap$demo_b_sex_s1_r1_e1[a]==3){demoDat$sex[a] <- 'intersex'} + else if(redcap$demo_b_sex_s1_r1_e1[a]==4){demoDat$sex[a] <- 'other'} + else if(redcap$demo_b_sex_s1_r1_e1[a]==5){demoDat$sex[a] <- 'unknown'} + else{demoDat$sex[a] <- 'undisclosed'} +} + +#preferred pronouns: replace numerical values with text description +for(b in 1:nrow(redcap)){ + if(is.na(redcap$demo_b_pronouns_s1_r1_e1[b])){demoDat$pron[b] <- 'undisclosed'} + else if(redcap$demo_b_pronouns_s1_r1_e1[b]==1){demoDat$pron[b] <- 'she/her'} + else if(redcap$demo_b_pronouns_s1_r1_e1[b]==2){demoDat$pron[b] <- 'he/him'} + else if(redcap$demo_b_pronouns_s1_r1_e1[b]==3){demoDat$pron[b] <- 'they/them'} + else if(redcap$demo_b_pronouns_s1_r1_e1[b]==5){demoDat$pron[b] <- 'other'} + else{demoDat$pron[b] <- 'undisclosed'} +} + +#ethnicity affiliation: map to text description +for(c in 1:nrow(redcap)){ + if(redcap$demo_b_ethnic_s1_r1_e1___1[c]==1){demoDat$ethnic[c] <- 'AI'} #american indian/alaska native + else if(redcap$demo_b_ethnic_s1_r1_e1___2[c]==1){demoDat$ethnic[c] <- 'A'} #asian + else if(redcap$demo_b_ethnic_s1_r1_e1___3[c]==1){demoDat$ethnic[c] <- 'AA'} #african american + else if(redcap$demo_b_ethnic_s1_r1_e1___4[c]==1){demoDat$ethnic[c] <- 'LX'} #hispanic/latinx + else if(redcap$demo_b_ethnic_s1_r1_e1___5[c]==1){demoDat$ethnic[c] <- 'ME'} #middle eastern + else if(redcap$demo_b_ethnic_s1_r1_e1___6[c]==1){demoDat$ethnic[c] <- 'PI'} #pacific islander + else if(redcap$demo_b_ethnic_s1_r1_e1___7[c]==1){demoDat$ethnic[c] <- 'W'} #white + else if(redcap$demo_b_ethnic_s1_r1_e1___8[c]==1){demoDat$ethnic[c] <- 'O'} #other + else{demoDat$ethnic[c] <- 'UND'} #undisclosed +} + +#social class affiliation: replace numerical values with text description +for(d in 1:nrow(redcap)){ + if(is.na(redcap$demo_b_socclass_s1_r1_e1[d])){demoDat$socclass[d] <- 'undisclosed'} + else if(redcap$demo_b_socclass_s1_r1_e1[d]==1){demoDat$socclass[d] <- 'poor'} + else if(redcap$demo_b_socclass_s1_r1_e1[d]==2){demoDat$socclass[d] <- 'working'} + else if(redcap$demo_b_socclass_s1_r1_e1[d]==3){demoDat$socclass[d] <- 'middle'} + else if(redcap$demo_b_socclass_s1_r1_e1[d]==4){demoDat$socclass[d] <- 'affluent'} + else{demoDat$socclass[d] <- 'undisclosed'} +} + +#communication disorders diagnoses: sum across childhood, adolescence, and adulthood +for(e in 1:nrow(redcap)){ + demoDat$commdis[e] <- sum(redcap$demo_b_comdiskid_s1_r1_e1[e], + redcap$demo_b_comdisteen_s1_r1_e1[e], + redcap$demo_b_comdisad_s1_r1_e[e]) +} +#language history: transfer directly +for(f in 1:nrow(redcap)){ + demoDat$eng[f] <- redcap$demo_b_eng_s1_r1_e1[match(demoDat$record_id[f], redcap$record_id)] #participant monolingualism + demoDat$langhis[f] <- redcap$demo_b_langhis_s1_r1_e1[match(demoDat$record_id[f], redcap$record_id)] #participant language history + demoDat$ageen[f] <- 
redcap$demo_b_ageen_s1_r1_e1[match(demoDat$record_id[f], redcap$record_id)] #participant age of English acquisition + demoDat$profen[f] <- redcap$demo_b_profen_s1_r1_e1[match(demoDat$record_id[f], redcap$record_id)] #participant English proficiency +} + +#mood and mood disorders: transfer directly +for(g in 1:nrow(redcap)){ + demoDat$bfne[g] <- redcap$bfne_b_scrdTotal[match(demoDat$record_id[g], redcap$record_id)] #bfne total score + demoDat$phq8[g] <- redcap$phq8_scrdTotal[match(demoDat$record_id[g], redcap$record_id)] #phq8 depression scale + demoDat$scaaredTotal[g] <- redcap$scaared_b_scrdTotal[match(demoDat$record_id[g], redcap$record_id)] #scaared total anxiety + demoDat$scaaredGA[g] <- redcap$scaared_b_scrdGA[match(demoDat$record_id[g], redcap$record_id)] #scaared general anxiety + demoDat$scaaredSoc[g] <- redcap$scaared_b_scrdSoc[match(demoDat$record_id[g], redcap$record_id)] #scaared social phobias + demoDat$sps[g] <- redcap$sias6sps6_b_scrdSPS[match(demoDat$record_id[g], redcap$record_id)] #sps social phobia scale +} -build_output_filename <- function(label, ext = ext_default, timezone = tz_default, date_format = date_format_default) { - # `label` may include the destination directory, if different from the working directory when the script is run - current_datetime <- now(timezone) %>% format(date_format) - paste(label, '_', current_datetime, '.', ext, sep = "") +#age: pull from separate file +for(h in 1:nrow(demoDat)){ + demoDat$age[h] <- agedat$info_age_s1_r1_e1[match(demoDat$record_id[h], agedat$record_id)] } -collapse_by_participant <- function(filename_in, filename_out) { - by_participant <- read_csv(filename_in) %>% - unique %>% # dedup - group_by(id) %>% summarize(across(misprod:total_uncorrected_errors, sum)) # summarize by participant, across all passages - # TODO change the columns selected, once more have been added to the output of the preproc script - write_csv(by_participant, filename_out) - return(filename_out) +### SECTION 3: BUILD TRIAL-LEVEL DF (ADD DEMODAT, READDAT, and ACCDAT to DF) +for(i in 1:nrow(df)){ + subject <- df$id[i] #extract subject number for matching + passage <- df$passage[i] #extract passage name for matching + errors <- df$errors[i] #extract total errors + + #extract passage characteristics from readDat + df$lenSyll[i] <- sum(readDat$lengthSyll[which(readDat$passage==passage)]) #length of passage in syllables + df$lenWord[i] <- sum(readDat$lengthWord[which(readDat$passage==passage)]) #length of passage in words + df$avgSyllPerWord[i] <- df$lenSyll[i]/df$lenWord[i] + + #extract participant accuracy from accDat + df$challengeACC[i] <- accDat[match(passage, accDat$passage), as.character(subject)] #passage-specific challenge question accuracy for subject + + #extract participant demographics from demoDat + df$sex[i] <- demoDat$sex[match(df$id[i], demoDat$record_id)] #participant biological sex + df$pronouns[i] <- demoDat$pron[match(df$id[i], demoDat$record_id)] #participant preferred pronouns + df$age[i] <- demoDat$age[match(df$id[i], demoDat$record_id)] #participant age + df$ethnic[i] <- demoDat$ethnic[match(df$id[i], demoDat$record_id)] #participant ethnic group affiliation + df$socclass[i] <- demoDat$socclass[match(df$id[i], demoDat$record_id)] #participant social class identification + df$eng[i] <- demoDat$eng[match(df$id[i], demoDat$record_id)] #participant multilingualism (0=EN only, 1=EN+another) + df$langhis[i] <- demoDat$langhis[match(df$id[i], demoDat$record_id)] #participant language learning history (1=EN first, 2=other first, 
3=EN+other same, 4=something else) + df$ageen[i] <- demoDat$ageen[match(df$id[i], demoDat$record_id)] #participant age of English acquisition (if not L1) + df$profen[i] <- demoDat$profen[match(df$id[i], demoDat$record_id)] #participant English proficiency (1=Native, 2=Advanced, 3=Intermediate, 4=Elementary, 5=Not proficient) + df$commdis[i] <- demoDat$commdis[match(df$id[i], demoDat$record_id)] #participant communication disorder history (0=none, 1+=diagnoses to review) + df$bfne[i] <- demoDat$bfne[match(df$id[i], demoDat$record_id)] #participant fear of negative evaluation + df$phq8[i] <- demoDat$phq8[match(df$id[i], demoDat$record_id)] #participant depression + df$scaaredTotal[i] <- demoDat$scaaredTotal[match(df$id[i], demoDat$record_id)] #participant overall anxiety + df$scaaredGA[i] <- demoDat$scaaredGA[match(df$id[i], demoDat$record_id)] #participant general anxiety + df$scaaredSoc[i] <- demoDat$scaaredSoc[match(df$id[i], demoDat$record_id)] #participant social phobias (scaared) + df$sps[i] <- demoDat$sps[match(df$id[i], demoDat$record_id)] #participant social phobias (sias6sps6) } +#organize participant demographic variables +df$sex <- as.factor(df$sex) +df$pronouns <- as.factor(df$pronouns) +df$ethnic <- as.factor(df$ethnic) +df$socclass <- as.factor(df$socclass) + + +### SECTION 3: CROSS-CHECK ALL PARTICIPANTS MET INCLUSION CRITERIA +#note: given the time required to annotated errors, only participants who met inclusion criteria were annotated +#sum(df$eng==1 & df$langhis %in% c(2,4) & df$ageen>6) #confirm all subjects monolingual English OR natively bilingual OR learned English before age 6 +#sum(df$commdis>0) #confirm no subject diagnosed with any communication disorder +#sum(df$profen>3, na.rm=TRUE)/20 #one remaining subject (150060) rates own English proficiency as not "elementary" or "not proficient", but reads fluidly and achieved 80% accuracy on challenge questions, so not excluded + +#extract age and sex stats +summary(df$age) #age range and mean +sd(df$age) #age standard deviation +summary(df$sex)/20 #number of participants by sex +summary(df$sex)/20 / (nrow(df)/20) #percentage of participants by sex +summary(df$pronouns)/20 #number of participants by preferred pronoun +summary(df$pronouns)/20 / (nrow(df)/20) #percentage of participants by preferred pronoun +summary(df$ethnic)/20 #number of participants by ethnic affiliation +summary(df$ethnic)/20 / (nrow(df)/20) #percentage of participants by ethnic affiliation +summary(df$socclass)/20 #number of participants by social class affiliation +summary(df$socclass)/20 / (nrow(df)/20) #percentage of participants by social class affiliation + + +### SECTION 4: TRIM PASSAGES DUE TO EXPERIMENTER ERROR +dfTrim <- subset(df, !(df$passage=='broccoli')) #remove broccoli passage due to typo in the last sentence as presented on-screen to participants +dfTrim <- subset(dfTrim, !(dfTrim$passage=='sun')) #remove sun passage due to error in coding Excel + -# base = "~/Documents/ndclab/analysis-sandbox/github-structure-mirror/readAloud-valence-dataset/derivatives/preprocessed" -base = "/home/data/NDClab/datasets/readAloud-valence-dataset/derivatives/preprocessed" -preprocessed_summary_filename = "TODO" -collapsed_filename = build_output_filename(label = paste(base, "disfluencies_subject", sep='/')) +### SECTION 5: OUTPUT DATAFRAME +write.csv(dfTrim, paste(out_path, "readAloudBetaData_", today, ".csv", sep="", collapse=NULL)) -collapse_by_participant(preprocessed_summary_file, collapsed_filename) \ No newline at end of file +# 
collapse_by_participant <- function(filename_in, filename_out) { +# by_participant <- read_csv(filename_in) %>% +# unique %>% # dedup +# group_by(id) %>% summarize(across(misprod:total_uncorrected_errors, sum)) # summarize by participant, across all passages +# # TODO change the columns selected, once more have been added to the output of the preproc script +# +# write_csv(by_participant, filename_out) +# return(filename_out) +# } +# +# +# # base = "~/Documents/ndclab/analysis-sandbox/github-structure-mirror/readAloud-valence-dataset/derivatives/preprocessed" +# base = "/home/data/NDClab/datasets/readAloud-valence-dataset/derivatives/preprocessed" +# preprocessed_summary_filename = "TODO" +# collapsed_filename = build_output_filename(label = paste(base, "disfluencies_subject", sep='/')) +# +# collapse_by_participant(preprocessed_summary_file, collapsed_filename) \ No newline at end of file From 9d0b8c2f9bb7824b5d93d94e51a10c22f931bcd2 Mon Sep 17 00:00:00 2001 From: JMA <77893711+jessb0t@users.noreply.github.com> Date: Fri, 30 Jun 2023 15:01:35 -0500 Subject: [PATCH 03/33] Update scripts with infrastructure to match -alpha --- code/analysisReadAloudBeta.R | 108 ++++++++++++++++++----------------- code/prepReadAloudBeta.R | 20 +++++-- 2 files changed, 73 insertions(+), 55 deletions(-) diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index 2f662c3..6fd2724 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Reading Task Analyses # Authors: Luc Sahar, Jessica M. Alexander -# Last Updated: 2023-06-14 +# Last Updated: 2023-06-30 # INPUTS # data/df: behavioral data, for each participant on each passage, with relevant participant information and trial-level stimulus information @@ -30,7 +30,7 @@ today <- Sys.Date() today <- format(today, "%Y%m%d") #set up directories for input/output data -data <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/readAloudBetaData_20230614.csv' +data <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/readAloudBetaData_20230630.csv' out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' #read in data @@ -54,6 +54,9 @@ summary(df$ethnic)/18 / (nrow(df)/18) summary(df$socclass)/18 summary(df$socclass)/18 / (nrow(df)/18) +#remove participants who were not engaged in the task +#TBD, ex. 
150222 + #remove participants whose challenge question accuracy was below 50% (chance = 25%) dfTrim <- df dfTrim <- dfTrim %>% @@ -69,60 +72,19 @@ mean(dfTrim$challengeAvgSub) sd(dfTrim$challengeAvgSub) -### SECTION 2: INITIAL DATA TRIMMING +### SECTION 2: PASSAGE-LEVEL TRIMMING passage_no_before_trimming <- nrow(dfTrim) -#insert passage-level trimming here +#remove passages with high omissions (participant did not complete reading) +##vegas 150013 passage_no_after_trim1 <- nrow(dfTrim) passage_no_before_trimming - passage_no_after_trim1 #number of passages trimmed (passage_no_before_trimming - passage_no_after_trim1) / passage_no_before_trimming #percentage of passages trimmed -### SECTION 3: TRANSITION DATA TO LONG FORMAT -errorDat <- data.frame(matrix(ncol=17, nrow=0)) -colnames(errorDat) <- c("passage", "id", - "sex", "pronouns", "age", "ethnic", "socclass", - "bfne", "phq8", "scaaredTotal", "scaaredGA", "scaaredSoc", "sps", - "lenSyll", "lenWord", "avgSyllPerWord", - "errors") -for(j in 1:nrow(dfTrim)){ - passage <- dfTrim$passage[j] - id <- dfTrim$id[j] - sex <- as.character(dfTrim$sex[j]) - pronouns <- as.character(dfTrim$pronouns[j]) - age <- dfTrim$age[j] - ethnic <- as.character(dfTrim$ethnic[j]) - socclass <- as.character(dfTrim$socclass[j]) - bfne <- dfTrim$bfne[j] - phq8 <- dfTrim$phq8[j] - scaaredTotal <- dfTrim$scaaredTotal[j] - scaaredGA <- dfTrim$scaaredGA[j] - scaaredSoc <- dfTrim$scaaredSoc[j] - sps <- dfTrim$sps[j] - lenSyll <- dfTrim$lenSyll[j] - lenWord <- dfTrim$lenWord[j] - avgSyllPerWord <- dfTrim$avgSyllPerWord[j] - errors <- dfTrim$errors[j] - errorDat[nrow(errorDat) + 1,] <-c(passage, id, sex, pronouns, age, ethnic, socclass, bfne, phq8, scaaredTotal, scaaredGA, scaaredSoc, sps, lenSyll, lenWord, avgSyllPerWord, errors) -} - -#organize data types -errorDat$sex <- as.factor(errorDat$sex) -errorDat$pronouns <- as.factor(errorDat$pronouns) -errorDat$age <- as.numeric(errorDat$age) -errorDat$ethnic <- as.factor(errorDat$ethnic) -errorDat$socclass <- as.factor(errorDat$socclass) -errorDat$bfne <- as.numeric(errorDat$bfne) -errorDat$phq8 <- as.numeric(errorDat$phq8) -errorDat$scaaredTotal <- as.numeric(errorDat$scaaredTotal) -errorDat$scaaredGA <- as.numeric(errorDat$scaaredGA) -errorDat$scaaredSoc <- as.numeric(errorDat$scaaredSoc) -errorDat$sps <- as.numeric(errorDat$sps) -errorDat$lenSyll <- as.numeric(errorDat$lenSyll) -errorDat$lenWord <- as.numeric(errorDat$lenWord) -errorDat$avgSyllPerWord <- as.numeric(errorDat$avgSyllPerWord) -errorDat$errors <- as.numeric(errorDat$errors) +### SECTION 3: ORGANIZE DATA FOR MODELING +errorDat <- dfTrim #modify contrasts for categorical predictors contrasts(errorDat$sex) <- contr.sum(2) #male: -1, female: +1 @@ -138,7 +100,6 @@ errorDat$sps_gmc <- errorDat$sps - mean(errorDat$sps) errorDat$lenSyll_gmc <- errorDat$lenSyll - mean(errorDat$lenSyll) errorDat$lenWord_gmc <- errorDat$lenWord - mean(errorDat$lenWord) errorDat$avgSyllPerWord_gmc <- errorDat$avgSyllPerWord - mean(errorDat$avgSyllPerWord) -errorDat$errors_gmc <- errorDat$errors - mean(errorDat$errors) #extract demo stats errorDatStats <- subset(errorDat, !duplicated(errorDat$id)) @@ -155,5 +116,50 @@ summary(errorDatStats$socclass) / length(unique(errorDatStats$id)) ### SECTION 4: MODEL RESULTS -modelErr <- lmerTest::lmer(errors ~ scaaredTotal_gmc * lenSyll_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) -summary(modelErr) \ No newline at end of file +#misprod_rate x bfne +model1 <- lmerTest::lmer(misprod_rate ~ bfne_gmc + (1|id) + (1|passage), data=errorDat, 
REML=TRUE) +summary(model1) + +#misprod_rate x scaaredSoc +model2 <- lmerTest::lmer(misprod_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model2) + +#misprod_rate x sps +model3 <- lmerTest::lmer(misprod_rate ~ sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model3) + +#hesitation_rate x bfne +model4 <- lmerTest::lmer(hesitation_rate ~ bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model4) + +#hesitation_rate x scaaredSoc +model5 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model5) + +#hesitation_rate x sps +model6 <- lmerTest::lmer(hesitation_rate ~ sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model6) + +#words_with_misprod_rate x bfne +model7 <- lmerTest::lmer(words_with_misprod_rate ~ bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model7) + +#words_with_misprod_rate x scaaredSoc +model8 <- lmerTest::lmer(words_with_misprod_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model8) + +#words_with_misprod_rate x sps +model9 <- lmerTest::lmer(words_with_misprod_rate ~ sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model9) + +#words_with_hes_rate x bfne +model10 <- lmerTest::lmer(words_with_hes_rate ~ bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model10) + +#words_with_hes_rate x scaaredSoc +model11 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model11) + +#words_with_hes_rate x sps +model12 <- lmerTest::lmer(words_with_hes_rate ~ sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model12) \ No newline at end of file diff --git a/code/prepReadAloudBeta.R b/code/prepReadAloudBeta.R index d7a3c67..e062d58 100644 --- a/code/prepReadAloudBeta.R +++ b/code/prepReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Analysis Preparation # Authors: Luc Sahar and Jessica M. 
Alexander -- NDCLab, Florida International University -# Last updated: 2023-06-14 +# Last updated: 2023-06-30 # INPUTS # data/df: behavioral (error-related) data, for each participant on each passage @@ -48,7 +48,7 @@ main_analyses <- '/Users/jalexand/github/readAloud-valence-beta/' out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' #load input files -data <- paste(main_dataset, 'derivatives/preprocessed/disfluencies_subject-x-passage_20230613_0450pm.csv', sep="", collapse=NULL) +data <- paste(main_dataset, 'derivatives/preprocessed/disfluencies_subject-x-passage_20230616_1229pm.csv', sep="", collapse=NULL) accDat_path <- paste(main_dataset,'derivatives/preprocessed/readAloud_passage-level_summary_20220812.csv', sep="", collapse=NULL) readDat_path <- paste(main_dataset, 'derivatives/analysisStimuli_readDat_20230614.csv', sep="", collapse=NULL) redcap_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019.csv', sep="", collapse=NULL) @@ -63,6 +63,9 @@ accDat$passage <- c("dams", "flying", "bats", "broccoli", "realty", "bees", "dog "cars", "vegas", "sun", "caramel", "congo", "antarctica", "depression", "skunkowl", "grizzly", "mantis", "dentist") #rename passages with short-name +#organize data types +df[,3:30] <- sapply(df[,3:30],as.numeric) + #add missing passages for 150086 so that nrow is divisible by 20 passages_read <- df$passage[which(df$id=="150086")] all_passages <- unique(df$passage) @@ -70,7 +73,7 @@ tempdf <- data.frame(matrix(nrow=0, ncol=ncol(df))) colnames(tempdf) <- colnames(df) for(passage in 1:length(all_passages)){ if(all_passages[passage] %in% passages_read){next}else{ - tempdf[nrow(tempdf) + 1,] <- c("150086", all_passages[passage], rep(NA, 41)) + tempdf[nrow(tempdf) + 1,] <- c("150086", all_passages[passage], rep(NA, 28)) } } df <- rbind(df, tempdf) @@ -156,7 +159,16 @@ for(h in 1:nrow(demoDat)){ for(i in 1:nrow(df)){ subject <- df$id[i] #extract subject number for matching passage <- df$passage[i] #extract passage name for matching - errors <- df$errors[i] #extract total errors + + #production errors of interest + # misprod = raw misproduction errors + # hesitation = raw hesitations + # words_with_misprod = distinct words with misproduction errors + # words_with_hes = distinct words with pre-word or word-internal hesitation + # misprod_rate = rate of raw misproduction errors + # hesitation_rate = rate of raw hesitations + # words_with_misprod_rate = rate of word-level misproduction errors + # words_with_hes_rate = rate of word-level hesitations #extract passage characteristics from readDat df$lenSyll[i] <- sum(readDat$lengthSyll[which(readDat$passage==passage)]) #length of passage in syllables From a5c853ac2f7a48fde4db6ff2929fc222614d08d7 Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 10 Aug 2023 20:24:17 -0400 Subject: [PATCH 04/33] MWE of looking for outliers --- code/analysisReadAloudBeta.R | 43 ++++++++++++++++---- code/prepReadAloudBeta.R | 42 +++++++++++++------- input/passages-to-omit_20230810.csv | 61 +++++++++++++++++++++++++++++ 3 files changed, 124 insertions(+), 22 deletions(-) create mode 100644 input/passages-to-omit_20230810.csv diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index 6fd2724..51b49ab 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Reading Task Analyses # Authors: Luc Sahar, Jessica M. 
Alexander -# Last Updated: 2023-06-30 +# Last Updated: 2023-08-10 # INPUTS # data/df: behavioral data, for each participant on each passage, with relevant participant information and trial-level stimulus information @@ -25,16 +25,29 @@ library(cowplot) library(colorspace) library(colorblindr) +# ``` +# Warning in install.packages : +# package ‘colorblindr’ is not available for this version of R +# +# A version of this package for your version of R might be available elsewhere, +# see the ideas at +# https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages +# ``` + #set up date for output file naming today <- Sys.Date() today <- format(today, "%Y%m%d") #set up directories for input/output data -data <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/readAloudBetaData_20230630.csv' -out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' +# data <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/readAloudBetaData_20230630.csv' +data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230810.csv' +to_omit <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/input/passages-to-omit_20230810.csv' +# out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' +out_path <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/' #read in data -df <- read.csv(data) +df <- read.csv(data, row.names = NULL) +passage_omissions_df <- read.csv(to_omit, row.names = NULL) #organize participant demographic variables df$sex <- as.factor(df$sex) @@ -43,6 +56,9 @@ df$ethnic <- as.factor(df$ethnic) df$socclass <- as.factor(df$socclass) #extract demo stats + +# all these values are just in case they're useful - not needed per se for later +# steps of the logic in this script summary(df$age) sd(df$age) summary(df$sex)/18 @@ -55,10 +71,17 @@ summary(df$socclass)/18 summary(df$socclass)/18 / (nrow(df)/18) #remove participants who were not engaged in the task -#TBD, ex. 150222 +dfTrim <- df +# wait, are my removals supposed to be in dfTrim or df? dfTrim, right? 
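+# Quick throwaway sketch (toy values, not part of the pipeline) to sanity-check that
+# the anti_join()/join_by() call used for passage-level trimming below drops only the
+# rows whose id x passage pair appears in the omission list; names here are made up:
+#   toy  <- data.frame(id = c(1, 1, 2), passage = c("sun", "bats", "sun"))
+#   omit <- data.frame(participant = 1, passage = "sun")
+#   dplyr::anti_join(toy, omit, by = dplyr::join_by(id == participant, passage == passage))
+#   # expected: rows (1, "bats") and (2, "sun") remain; only the exact id x passage match is removed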
+ +# removal based on coder notes of audible distractions, others in the room, etc.: + +# 150015 +# 150208 + +dfTrim <- subset(dfTrim, !(id %in% c(150015, 150208))) #remove participants whose challenge question accuracy was below 50% (chance = 25%) -dfTrim <- df dfTrim <- dfTrim %>% group_by(id) %>% mutate(challengeAvgSub = mean(challengeACC)) %>% @@ -77,6 +100,12 @@ passage_no_before_trimming <- nrow(dfTrim) #remove passages with high omissions (participant did not complete reading) ##vegas 150013 +# TODO +# my attempt/idea, idk yet if it works +dfTrim <- anti_join(dfTrim, + passage_omissions_df, + by = join_by(id == participant, passage == passage)) + passage_no_after_trim1 <- nrow(dfTrim) passage_no_before_trimming - passage_no_after_trim1 #number of passages trimmed @@ -162,4 +191,4 @@ summary(model11) #words_with_hes_rate x sps model12 <- lmerTest::lmer(words_with_hes_rate ~ sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) -summary(model12) \ No newline at end of file +summary(model12) diff --git a/code/prepReadAloudBeta.R b/code/prepReadAloudBeta.R index e062d58..35b9808 100644 --- a/code/prepReadAloudBeta.R +++ b/code/prepReadAloudBeta.R @@ -33,7 +33,7 @@ library(readr) # write_csv # ext_default = 'csv' # tz_default = "America/New_York" # date_format_default = "%Y%m%d_%I%M%P" -# +# # build_output_filename <- function(label, ext = ext_default, timezone = tz_default, date_format = date_format_default) { # # `label` may include the destination directory, if different from the working directory when the script is run # current_datetime <- now(timezone) %>% format(date_format) @@ -43,17 +43,26 @@ today <- Sys.Date() today <- format(today, "%Y%m%d") #set up directories for input/output -main_dataset <- '/Users/jalexand/github/readAloud-valence-dataset/' -main_analyses <- '/Users/jalexand/github/readAloud-valence-beta/' -out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' +# main_dataset <- '/Users/jalexand/github/readAloud-valence-dataset/' +# main_analyses <- '/Users/jalexand/github/readAloud-valence-beta/' +# out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' +main_dataset <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-dataset/' +main_analyses <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/' +out_path <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/' + #load input files -data <- paste(main_dataset, 'derivatives/preprocessed/disfluencies_subject-x-passage_20230616_1229pm.csv', sep="", collapse=NULL) +# data <- paste(main_dataset, 'derivatives/preprocessed/disfluencies_subject-x-passage_20230616_1229pm.csv', sep="", collapse=NULL) +data <- "/home/luc/Documents/ndclab/analysis-sandbox/output-csvs/disfluencies_subject-x-passage_20230616_1229pm.csv" accDat_path <- paste(main_dataset,'derivatives/preprocessed/readAloud_passage-level_summary_20220812.csv', sep="", collapse=NULL) readDat_path <- paste(main_dataset, 'derivatives/analysisStimuli_readDat_20230614.csv', sep="", collapse=NULL) redcap_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019.csv', sep="", collapse=NULL) agedat_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019_ageonly.csv', sep="", collapse=NULL) +# c(data, accDat_path, readDat_path, redcap_path, agedat_path) %>% fs::as_fs_path() %>% fs::is_file() +# ✅: all TRUE + + df <- read.csv(data) redcap <- read.csv(redcap_path, na.strings='NA') #participant questionnaire responses agedat <- 
read.csv(agedat_path, na.strings='NA') #participant age information @@ -159,7 +168,7 @@ for(h in 1:nrow(demoDat)){ for(i in 1:nrow(df)){ subject <- df$id[i] #extract subject number for matching passage <- df$passage[i] #extract passage name for matching - + #production errors of interest # misprod = raw misproduction errors # hesitation = raw hesitations @@ -169,15 +178,15 @@ for(i in 1:nrow(df)){ # hesitation_rate = rate of raw hesitations # words_with_misprod_rate = rate of word-level misproduction errors # words_with_hes_rate = rate of word-level hesitations - + #extract passage characteristics from readDat df$lenSyll[i] <- sum(readDat$lengthSyll[which(readDat$passage==passage)]) #length of passage in syllables df$lenWord[i] <- sum(readDat$lengthWord[which(readDat$passage==passage)]) #length of passage in words df$avgSyllPerWord[i] <- df$lenSyll[i]/df$lenWord[i] - + #extract participant accuracy from accDat df$challengeACC[i] <- accDat[match(passage, accDat$passage), as.character(subject)] #passage-specific challenge question accuracy for subject - + #extract participant demographics from demoDat df$sex[i] <- demoDat$sex[match(df$id[i], demoDat$record_id)] #participant biological sex df$pronouns[i] <- demoDat$pron[match(df$id[i], demoDat$record_id)] #participant preferred pronouns @@ -211,6 +220,9 @@ df$socclass <- as.factor(df$socclass) #sum(df$profen>3, na.rm=TRUE)/20 #one remaining subject (150060) rates own English proficiency as not "elementary" or "not proficient", but reads fluidly and achieved 80% accuracy on challenge questions, so not excluded #extract age and sex stats + +# all these values are just in case they're useful - not needed per se for later +# steps of the logic in this script summary(df$age) #age range and mean sd(df$age) #age standard deviation summary(df$sex)/20 #number of participants by sex @@ -229,22 +241,22 @@ dfTrim <- subset(dfTrim, !(dfTrim$passage=='sun')) #remove sun passage due to er ### SECTION 5: OUTPUT DATAFRAME -write.csv(dfTrim, paste(out_path, "readAloudBetaData_", today, ".csv", sep="", collapse=NULL)) +write.csv(dfTrim, paste(out_path, "readAloudBetaData_", today, ".csv", sep="", collapse=NULL), row.names = FALSE) # collapse_by_participant <- function(filename_in, filename_out) { # by_participant <- read_csv(filename_in) %>% # unique %>% # dedup # group_by(id) %>% summarize(across(misprod:total_uncorrected_errors, sum)) # summarize by participant, across all passages # # TODO change the columns selected, once more have been added to the output of the preproc script -# +# # write_csv(by_participant, filename_out) # return(filename_out) # } -# -# +# +# # # base = "~/Documents/ndclab/analysis-sandbox/github-structure-mirror/readAloud-valence-dataset/derivatives/preprocessed" # base = "/home/data/NDClab/datasets/readAloud-valence-dataset/derivatives/preprocessed" # preprocessed_summary_filename = "TODO" # collapsed_filename = build_output_filename(label = paste(base, "disfluencies_subject", sep='/')) -# -# collapse_by_participant(preprocessed_summary_file, collapsed_filename) \ No newline at end of file +# +# collapse_by_participant(preprocessed_summary_file, collapsed_filename) diff --git a/input/passages-to-omit_20230810.csv b/input/passages-to-omit_20230810.csv new file mode 100644 index 0000000..07827be --- /dev/null +++ b/input/passages-to-omit_20230810.csv @@ -0,0 +1,61 @@ +participant,passage +150005,vegas +150013,vegas +150022,depression +150026,dams +150030,grizzly +150034,realty +150034,skunkowl +150036,caramel +150036,dams +150056,sun 
+150079,caramel +150079,depression +150079,realty +150079,mantis +150083,bees +150083,cars +150083,skunkowl +150083,broccoli +150086,dams +150086,antarctica +150086,dentist +150086,dogshow +150086,dolphins +150086,flying +150086,grizzly +150086,mantis +150088,grizzly +150088,dams +150091,caramel +150146,caramel +150146,mantis +150146,dogshow +150196,dams +150196,cars +150196,dentist +150196,broccoli +150196,depression +150208,dams +150216,bees +150222,dogshow +150222,sun +150222,mantis +150222,bats +150242,bats +150242,dentist +150245,icefishing +150245,dentist +150245,congo +150245,cars +150245,skunkowl +150245,broccoli +150245,antarctica +150245,grizzly +150245,dogshow +150245,sun +150252,bees +150252,cars +150252,congo +150274,flying +150299,vegas From 95c112f3ee628de2407a8bf3a8b54f4f0b20eb0c Mon Sep 17 00:00:00 2001 From: l-acs Date: Sat, 12 Aug 2023 10:23:37 -0400 Subject: [PATCH 05/33] Add some models for follow-up analyses --- code/analysisReadAloudBeta.R | 252 ++++++++++++++++++++++++++++++++--- code/prepReadAloudBeta.R | 1 + 2 files changed, 236 insertions(+), 17 deletions(-) diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index 51b49ab..905adbb 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Reading Task Analyses # Authors: Luc Sahar, Jessica M. Alexander -# Last Updated: 2023-08-10 +# Last Updated: 2023-08-11 # INPUTS # data/df: behavioral data, for each participant on each passage, with relevant participant information and trial-level stimulus information @@ -78,8 +78,11 @@ dfTrim <- df # 150015 # 150208 +# 150245 had many passages that were entirely or near-entirely inaudible; the +# rest were dropped too under the assumption that the audible ones too +# would be too faint to identify errors in -dfTrim <- subset(dfTrim, !(id %in% c(150015, 150208))) +dfTrim <- subset(dfTrim, !(id %in% c(150015, 150208, 150245))) #remove participants whose challenge question accuracy was below 50% (chance = 25%) dfTrim <- dfTrim %>% @@ -99,9 +102,7 @@ sd(dfTrim$challengeAvgSub) passage_no_before_trimming <- nrow(dfTrim) #remove passages with high omissions (participant did not complete reading) -##vegas 150013 -# TODO -# my attempt/idea, idk yet if it works +# e.g. 
vegas 150013 dfTrim <- anti_join(dfTrim, passage_omissions_df, by = join_by(id == participant, passage == passage)) @@ -130,6 +131,11 @@ errorDat$lenSyll_gmc <- errorDat$lenSyll - mean(errorDat$lenSyll) errorDat$lenWord_gmc <- errorDat$lenWord - mean(errorDat$lenWord) errorDat$avgSyllPerWord_gmc <- errorDat$avgSyllPerWord - mean(errorDat$avgSyllPerWord) +# LS additions 8/11/23 +errorDat$corrections <- errorDat$corrections - mean(errorDat$corrections) +errorDat$errors <- errorDat$errors - mean(errorDat$errors) + + #extract demo stats errorDatStats <- subset(errorDat, !duplicated(errorDat$id)) summary(errorDatStats$age) @@ -146,49 +152,261 @@ summary(errorDatStats$socclass) / length(unique(errorDatStats$id)) ### SECTION 4: MODEL RESULTS #misprod_rate x bfne -model1 <- lmerTest::lmer(misprod_rate ~ bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model1 <- lmerTest::lmer(misprod_rate ~ bfne_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model1) #misprod_rate x scaaredSoc -model2 <- lmerTest::lmer(misprod_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model2 <- lmerTest::lmer(misprod_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model2) #misprod_rate x sps -model3 <- lmerTest::lmer(misprod_rate ~ sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model3 <- lmerTest::lmer(misprod_rate ~ sps_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model3) #hesitation_rate x bfne -model4 <- lmerTest::lmer(hesitation_rate ~ bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model4 <- lmerTest::lmer(hesitation_rate ~ bfne_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model4) #hesitation_rate x scaaredSoc -model5 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model5 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model5) #hesitation_rate x sps -model6 <- lmerTest::lmer(hesitation_rate ~ sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model6 <- lmerTest::lmer(hesitation_rate ~ sps_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model6) #words_with_misprod_rate x bfne -model7 <- lmerTest::lmer(words_with_misprod_rate ~ bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model7 <- lmerTest::lmer(words_with_misprod_rate ~ bfne_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model7) #words_with_misprod_rate x scaaredSoc -model8 <- lmerTest::lmer(words_with_misprod_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model8 <- lmerTest::lmer(words_with_misprod_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model8) #words_with_misprod_rate x sps -model9 <- lmerTest::lmer(words_with_misprod_rate ~ sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model9 <- lmerTest::lmer(words_with_misprod_rate ~ sps_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model9) #words_with_hes_rate x bfne -model10 <- lmerTest::lmer(words_with_hes_rate ~ bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model10 <- lmerTest::lmer(words_with_hes_rate ~ bfne_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model10) #words_with_hes_rate x scaaredSoc -model11 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model11 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + (1|id) + 
(1|passage), + data=errorDat, REML=TRUE) summary(model11) #words_with_hes_rate x sps -model12 <- lmerTest::lmer(words_with_hes_rate ~ sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model12 <- lmerTest::lmer(words_with_hes_rate ~ sps_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model12) + + +#### supplemental analyses +# see notes + + + + + + +# glmer(accuracy ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, family="binomial") +# "f_" : follow-up + +# Accuracy/comprehension as explained by social anxiety: scaaredSoc +f_model1 <- glmer(challengeACC ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model1) + +# Accuracy/comprehension as explained by social anxiety: bfne +f_model2 <- glmer(challengeACC ~ bfne_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model2) + + +# Accuracy/comprehension as explained by social anxiety: sps +f_model3 <- glmer(challengeACC ~ sps_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model3) + + +# Accuracy/comprehension as explained by disfluencies: hesitations per syllable +f_model4 <- glmer(challengeACC ~ hesitation_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model4) + +# Accuracy/comprehension as explained by disfluencies: hesitations per word +f_model5 <- glmer(challengeACC ~ words_with_hes_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model5) + + +# Accuracy/comprehension as explained by errors: misproductions per syllable +f_model6 <- glmer(challengeACC ~ misprod_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model6) + +# Accuracy/comprehension as explained by errors: misproductions per word +f_model7 <- glmer(challengeACC ~ words_with_misprod_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model7) + + + + +# Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per syllable with scaared +f_model8 <- glmer(challengeACC ~ hesitation_rate * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model8) + +# Accuracy/comprehension as explained by disfluencies: hesitations per word with scaared +f_model9 <- glmer(challengeACC ~ words_with_hes_rate * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model9) + + +# Accuracy/comprehension as explained by errors: misproductions per syllable with scaared +f_model10 <- glmer(challengeACC ~ misprod_rate * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model10) + +# Accuracy/comprehension as explained by errors: misproductions per word with scaared +f_model11 <- glmer(challengeACC ~ words_with_misprod_rate * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model11) + + + +# Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per syllable with bfne +f_model12 <- glmer(challengeACC ~ hesitation_rate * bfne_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model12) + +# Accuracy/comprehension as explained by disfluencies: hesitations per word with bfne +f_model13 <- glmer(challengeACC ~ words_with_hes_rate * bfne_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model13) + + +# Accuracy/comprehension as explained by errors: misproductions per syllable with bfne +f_model14 <- glmer(challengeACC ~ misprod_rate * bfne_gmc + 
(1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model14) + +# Accuracy/comprehension as explained by errors: misproductions per word with bfne +f_model15 <- glmer(challengeACC ~ words_with_misprod_rate * bfne_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model15) + + + +# Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per syllable with sps +f_model16 <- glmer(challengeACC ~ hesitation_rate * sps_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model16) + +# Accuracy/comprehension as explained by disfluencies: hesitations per word with sps +f_model17 <- glmer(challengeACC ~ words_with_hes_rate * sps_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model17) + + +# Accuracy/comprehension as explained by errors: misproductions per syllable with sps +f_model18 <- glmer(challengeACC ~ misprod_rate * sps_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model18) + +# Accuracy/comprehension as explained by errors: misproductions per word with sps +f_model19 <- glmer(challengeACC ~ words_with_misprod_rate * sps_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model19) + + + +# Now, misproduction-hesitation interactions + +# Errors as explained by disfluency: rate of misproduced syllables from rate of hesitated syllables +f_model20 <- lmerTest::lmer(misprod_rate ~ hesitation_rate + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model20) + +# Errors as explained by disfluency: rate of misproduced words from rate of hesitated words +f_model21 <- lmerTest::lmer(words_with_misprod_rate ~ words_with_hes_rate + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model21) + + +# Errors as explained by disfluency: rate of misproduced words from rate of hesitated syllables +f_model22 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model22) + + + +# Now, misproduction-hesitation interactions with social anxiety + +# Errors as explained by disfluency and SA: rate of misproduced syllables from rate of hesitated syllables and scaared +f_model23 <- lmerTest::lmer(misprod_rate ~ hesitation_rate * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model23) + +# Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated words and scaared +f_model24 <- lmerTest::lmer(words_with_misprod_rate ~ words_with_hes_rate * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model24) + + +# Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated syllables and scaared +f_model25 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model25) + + +# Errors as explained by disfluency and SA: rate of misproduced syllables from rate of hesitated syllables and bfne +f_model26 <- lmerTest::lmer(misprod_rate ~ hesitation_rate * bfne_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model26) + +# Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated words and bfne +f_model27 <- lmerTest::lmer(words_with_misprod_rate ~ words_with_hes_rate * bfne_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model27) + + +# Errors as explained by disfluency and SA: rate of misproduced words from 
rate of hesitated syllables and bfne +f_model28 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate * bfne_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model28) + + +# Errors as explained by disfluency and SA: rate of misproduced syllables from rate of hesitated syllables and sps +f_model29 <- lmerTest::lmer(misprod_rate ~ hesitation_rate * sps_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model29) + +# Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated words and sps +f_model30 <- lmerTest::lmer(words_with_misprod_rate ~ words_with_hes_rate * sps_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model30) + + +# Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated syllables and sps +f_model31 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate * sps_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model31) + + + + + + diff --git a/code/prepReadAloudBeta.R b/code/prepReadAloudBeta.R index 35b9808..407fcd5 100644 --- a/code/prepReadAloudBeta.R +++ b/code/prepReadAloudBeta.R @@ -223,6 +223,7 @@ df$socclass <- as.factor(df$socclass) # all these values are just in case they're useful - not needed per se for later # steps of the logic in this script + summary(df$age) #age range and mean sd(df$age) #age standard deviation summary(df$sex)/20 #number of participants by sex From a6656cb4dfd8c4006f130db2e08c72c214a2e922 Mon Sep 17 00:00:00 2001 From: l-acs Date: Sat, 12 Aug 2023 23:45:19 -0400 Subject: [PATCH 06/33] Comment some things and add additional follow-up models --- code/analysisReadAloudBeta.R | 126 +++++++++++++++++++++++++++++------ 1 file changed, 106 insertions(+), 20 deletions(-) diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index 905adbb..dd159c0 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Reading Task Analyses # Authors: Luc Sahar, Jessica M. Alexander -# Last Updated: 2023-08-11 +# Last Updated: 2023-08-12 # INPUTS # data/df: behavioral data, for each participant on each passage, with relevant participant information and trial-level stimulus information @@ -46,8 +46,8 @@ to_omit <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/input/passa out_path <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/' #read in data -df <- read.csv(data, row.names = NULL) -passage_omissions_df <- read.csv(to_omit, row.names = NULL) +df <- read.csv(data, row.names = NULL) # output of prep script +passage_omissions_df <- read.csv(to_omit, row.names = NULL) # hand-crafted list of participant x passage entries to exclude, based on coder comments #organize participant demographic variables df$sex <- as.factor(df$sex) @@ -72,16 +72,13 @@ summary(df$socclass)/18 / (nrow(df)/18) #remove participants who were not engaged in the task dfTrim <- df -# wait, are my removals supposed to be in dfTrim or df? dfTrim, right? 
# removal based on coder notes of audible distractions, others in the room, etc.: - # 150015 # 150208 # 150245 had many passages that were entirely or near-entirely inaudible; the # rest were dropped too under the assumption that the audible ones too # would be too faint to identify errors in - dfTrim <- subset(dfTrim, !(id %in% c(150015, 150208, 150245))) #remove participants whose challenge question accuracy was below 50% (chance = 25%) @@ -91,7 +88,7 @@ dfTrim <- dfTrim %>% ungroup dfTrim <- subset(dfTrim, challengeAvgSub>0.5) -length(unique(df$id)) - length(unique(dfTrim$id)) #number of participants removed +length(unique(df$id)) - length(unique(dfTrim$id)) #number of participants removed due to distraction or low accuracy #calculate average accuracy mean(dfTrim$challengeAvgSub) @@ -101,7 +98,7 @@ sd(dfTrim$challengeAvgSub) ### SECTION 2: PASSAGE-LEVEL TRIMMING passage_no_before_trimming <- nrow(dfTrim) -#remove passages with high omissions (participant did not complete reading) +#remove passages with high omissions (participant did not complete reading) or other problems (someone else is in the room, etc.) # e.g. vegas 150013 dfTrim <- anti_join(dfTrim, passage_omissions_df, @@ -132,8 +129,10 @@ errorDat$lenWord_gmc <- errorDat$lenWord - mean(errorDat$lenWord) errorDat$avgSyllPerWord_gmc <- errorDat$avgSyllPerWord - mean(errorDat$avgSyllPerWord) # LS additions 8/11/23 -errorDat$corrections <- errorDat$corrections - mean(errorDat$corrections) -errorDat$errors <- errorDat$errors - mean(errorDat$errors) +# errorDat$errors <- errorDat$errors - mean(errorDat$errors) +# errorDat$correction <- errorDat$corrections - mean(errorDat$corrections) +errorDat$error_rate <- errorDat$errors / errorDat$lenSyll +errorDat$correction_rate <- errorDat$corrections / errorDat$lenSyll #extract demo stats @@ -292,18 +291,18 @@ f_model12 <- glmer(challengeACC ~ hesitation_rate * bfne_gmc + (1|id) + (1|passa data=errorDat, family = "binomial") summary(f_model12) -# Accuracy/comprehension as explained by disfluencies: hesitations per word with bfne +# Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per word with bfne f_model13 <- glmer(challengeACC ~ words_with_hes_rate * bfne_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model13) -# Accuracy/comprehension as explained by errors: misproductions per syllable with bfne +# Accuracy/comprehension as explained by errors *and* SA: misproductions per syllable with bfne f_model14 <- glmer(challengeACC ~ misprod_rate * bfne_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model14) -# Accuracy/comprehension as explained by errors: misproductions per word with bfne +# Accuracy/comprehension as explained by errors *and* SA: misproductions per word with bfne f_model15 <- glmer(challengeACC ~ words_with_misprod_rate * bfne_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model15) @@ -315,41 +314,41 @@ f_model16 <- glmer(challengeACC ~ hesitation_rate * sps_gmc + (1|id) + (1|passag data=errorDat, family = "binomial") summary(f_model16) -# Accuracy/comprehension as explained by disfluencies: hesitations per word with sps +# Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per word with sps f_model17 <- glmer(challengeACC ~ words_with_hes_rate * sps_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model17) -# Accuracy/comprehension as explained by errors: misproductions per syllable with sps +# Accuracy/comprehension as 
explained by errors *and* SA: misproductions per syllable with sps f_model18 <- glmer(challengeACC ~ misprod_rate * sps_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model18) -# Accuracy/comprehension as explained by errors: misproductions per word with sps +# Accuracy/comprehension as explained by errors *and* SA: misproductions per word with sps f_model19 <- glmer(challengeACC ~ words_with_misprod_rate * sps_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model19) -# Now, misproduction-hesitation interactions +# Now, misproduction-hesitation relationships # Errors as explained by disfluency: rate of misproduced syllables from rate of hesitated syllables f_model20 <- lmerTest::lmer(misprod_rate ~ hesitation_rate + (1|id) + (1|passage), data=errorDat, REML=TRUE) -summary(f_model20) +summary(f_model20) # *** # Errors as explained by disfluency: rate of misproduced words from rate of hesitated words f_model21 <- lmerTest::lmer(words_with_misprod_rate ~ words_with_hes_rate + (1|id) + (1|passage), data=errorDat, REML=TRUE) -summary(f_model21) +summary(f_model21) # *** # Errors as explained by disfluency: rate of misproduced words from rate of hesitated syllables f_model22 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate + (1|id) + (1|passage), data=errorDat, REML=TRUE) -summary(f_model22) +summary(f_model22) # *** @@ -406,7 +405,94 @@ f_model31 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate * sps_gmc summary(f_model31) +# LS ideas: +# error rate ~ SA +# correction rate ~ error rate * SA + +# comprehension ~ correction rate +# comprehension ~ SA * correction rate +# comprehension ~ error rate +# comprehension ~ SA * error rate +# comprehension ~ error rate * correction rate +# comprehension ~ error rate * correction rate * SA + +# comprehension ~ hes rate * error rate +# comprehension ~ hes rate * correction rate + +# comprehension ~ hes rate * error rate * SA +# comprehension ~ hes rate * correction rate * SA +# comprehension ~ hes rate * error rate * correction rate * SA + +# error rate ~ hes rate * SA +# correction rate ~ hes rate * SA, control for error rate + +# corr_ : corrections rate +# err_ : errors rate + + +# error rate as explained by social anxiety +fls_model_err_scaared <- lmer(error_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(fls_model_err_scaared) + +# correction rate as explained by social anxiety +fls_model_corr_scaared <- lmer(correction_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(fls_model_corr_scaared) + +# correction rate as explained by social anxiety and errors +fls_model_corr_scaared_err <- lmer(correction_rate ~ scaaredSoc_gmc * error_rate + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(fls_model_corr_scaared_err) + +# comprehension ~ correction rate +fls_model_comp_corr <- glmer(challengeACC ~ correction_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(fls_model_comp_corr) + +# comprehension ~ SA * correction rate +fls_model_comp_scaared_corr <- glmer(challengeACC ~ scaaredSoc_gmc * correction_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(fls_model_comp_scaared_corr) # * + +# comprehension ~ error rate +fls_model_comp_err <- glmer(challengeACC ~ error_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(fls_model_comp_err) + +# comprehension ~ SA * error rate +fls_model_comp_scaared_err <- glmer(challengeACC ~ 
scaaredSoc_gmc * error_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(fls_model_comp_scaared_err) + +# comprehension ~ error rate * correction rate +fls_model_comp_err_corr <- glmer(challengeACC ~ error_rate * correction_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(fls_model_comp_err_corr) + +# comprehension ~ hes rate * error rate +fls_model_comp_hes_err <- glmer(challengeACC ~ hesitation_rate * error_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(fls_model_comp_hes_err) + +# comprehension ~ hes rate * correction rate +fls_model_comp_hes_corr <- glmer(challengeACC ~ hesitation_rate * correction_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(fls_model_comp_hes_corr) # *** + + +# Check the main one of interest, comprehension as explained by SA and +# corrections (see fls_model_comp_scaared_corr above), with all three tests +f_model_comp_scaared_corr <- glmer(challengeACC ~ scaaredSoc_gmc * correction_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model_comp_scaared_corr) # * +f_model_comp_bfne_corr <- glmer(challengeACC ~ bfne_gmc * correction_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model_comp_bfne_corr) # N.S. +f_model_comp_sps_corr <- glmer(challengeACC ~ sps_gmc * correction_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model_comp_sps_corr) # N.S., but p for interaction is at 0.0698 From 7d5efa67db6c78fce5e1279084beb41721ae5b5b Mon Sep 17 00:00:00 2001 From: l-acs Date: Wed, 16 Aug 2023 19:37:28 -0400 Subject: [PATCH 07/33] WIP: Reading speed analysis --- code/analysisReadAloudBeta.R | 109 +++++++++++++++++++++++++++++++++-- code/prepReadAloudBeta.R | 26 ++++++--- 2 files changed, 124 insertions(+), 11 deletions(-) diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index dd159c0..7b71881 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Reading Task Analyses # Authors: Luc Sahar, Jessica M. 
Alexander -# Last Updated: 2023-08-12 +# Last Updated: 2023-08-16 # INPUTS # data/df: behavioral data, for each participant on each passage, with relevant participant information and trial-level stimulus information @@ -40,7 +40,9 @@ today <- format(today, "%Y%m%d") #set up directories for input/output data # data <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/readAloudBetaData_20230630.csv' -data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230810.csv' +# data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230810.csv' +# data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230815.csv' +data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230816.csv' to_omit <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/input/passages-to-omit_20230810.csv' # out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' out_path <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/' @@ -55,6 +57,9 @@ df$pronouns <- as.factor(df$pronouns) df$ethnic <- as.factor(df$ethnic) df$socclass <- as.factor(df$socclass) + +# FIXME: stats on df and dfTrim for reading speed are NA + #extract demo stats # all these values are just in case they're useful - not needed per se for later @@ -77,8 +82,8 @@ dfTrim <- df # 150015 # 150208 # 150245 had many passages that were entirely or near-entirely inaudible; the -# rest were dropped too under the assumption that the audible ones too -# would be too faint to identify errors in +# rest were dropped too under the assumption that the audible ones too would be +# too faint to identify errors in dfTrim <- subset(dfTrim, !(id %in% c(150015, 150208, 150245))) #remove participants whose challenge question accuracy was below 50% (chance = 25%) @@ -94,6 +99,12 @@ length(unique(df$id)) - length(unique(dfTrim$id)) #number of participants remove mean(dfTrim$challengeAvgSub) sd(dfTrim$challengeAvgSub) +# calculate average speed +mean(dfTrim$timePerSyllable, na.rm=TRUE) +sd(dfTrim$timePerSyllable, na.rm=TRUE) +mean(dfTrim$timePerWord, na.rm=TRUE) +sd(dfTrim$timePerWord, na.rm=TRUE) + ### SECTION 2: PASSAGE-LEVEL TRIMMING passage_no_before_trimming <- nrow(dfTrim) @@ -110,6 +121,34 @@ passage_no_before_trimming - passage_no_after_trim1 #number of passages trimmed (passage_no_before_trimming - passage_no_after_trim1) / passage_no_before_trimming #percentage of passages trimmed +# band-aid fix: remove passages without reading speed data so that we can run +# our analyses on them nonetheless +# todo + +# these are the only four passages without reading time data... +# and incidentally? well, see their comments here... 
+c(150013, "vegas") # N.B.: 161 omitted syllables of 318 total in passage +c(150022, "depression") # N.B.: 160 omitted syllables of 362 total in passage +c(150083, "caramel") # N.B.: only one of four passages to have >= 5% of syllables omitted +c(150083, "cars") # N.B.: only one of four passages to have >= 5% of syllables omitted + + +dfTrim <- filter(dfTrim, !is.na(timePerSyllable)) +# itself, but without ones for which we have no reading data + + + +"TODO" + +passage_no_after_trim2 <- nrow(dfTrim) +passage_no_after_trim1 - passage_no_after_trim2 #number of passages trimmed +(passage_no_after_trim1 - passage_no_after_trim2) / passage_no_after_trim1 #percentage of passages trimmed of last bunch +(passage_no_after_trim1 - passage_no_after_trim2) / passage_no_before_trimming #percentage of passages trimmed of whole + + + + + ### SECTION 3: ORGANIZE DATA FOR MODELING errorDat <- dfTrim @@ -134,6 +173,11 @@ errorDat$avgSyllPerWord_gmc <- errorDat$avgSyllPerWord - mean(errorDat$avgSyllPe errorDat$error_rate <- errorDat$errors / errorDat$lenSyll errorDat$correction_rate <- errorDat$corrections / errorDat$lenSyll +errorDat$timePerSyllable_gmc <- errorDat$timePerSyllable - mean(errorDat$timePerSyllable) +errorDat$timePerWord_gmc <- errorDat$timePerWord - mean(errorDat$timePerWord) + +# todo center avg reading speed, probably + #extract demo stats errorDatStats <- subset(errorDat, !duplicated(errorDat$id)) @@ -148,6 +192,12 @@ summary(errorDatStats$ethnic) / length(unique(errorDatStats$id)) summary(errorDatStats$socclass) summary(errorDatStats$socclass) / length(unique(errorDatStats$id)) +# Reading speed stats (ls additions 8/16/23) +summary(errorDatStats$timePerSyllable) +summary(errorDatStats$timePerWord) + +summary(errorDatStats$timePerSyllable_gmc) +summary(errorDatStats$timePerWord_gmc) ### SECTION 4: MODEL RESULTS #misprod_rate x bfne @@ -405,6 +455,57 @@ f_model31 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate * sps_gmc summary(f_model31) + +# Now: see if reading speed plays into it + +# Does scaaredSoc predict reading speed? +# syllable level +rs_model_1 <- lmerTest::lmer(timePerSyllable_gmc ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_1) + +# word level +rs_model_2 <- lmerTest::lmer(timePerWord_gmc ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_2) + + +rs_model_1_bfne <- lmerTest::lmer(timePerSyllable_gmc ~ bfne_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_1_bfne) + +# word level +rs_model_2_bfne <- lmerTest::lmer(timePerWord_gmc ~ bfne_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_2_bfne) + +rs_model_1_sps <- lmerTest::lmer(timePerSyllable_gmc ~ sps_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_1_sps) + +# word level +rs_model_2_bfne <- lmerTest::lmer(timePerWord_gmc ~ sps_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_2_sps) + + + +# And now ->> check out work +# Does our hesitation ~ scaaredSoc finding hold with reading speed controlled for? 
+# syllable level +rs_model_3 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + timePerSyllable_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_3) + +# word level +rs_model_4 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + timePerWord_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_4) + + +# todo: check models 1 and 2 when hesitation rate is held still + + # LS ideas: # error rate ~ SA # correction rate ~ error rate * SA diff --git a/code/prepReadAloudBeta.R b/code/prepReadAloudBeta.R index 407fcd5..878e824 100644 --- a/code/prepReadAloudBeta.R +++ b/code/prepReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Analysis Preparation # Authors: Luc Sahar and Jessica M. Alexander -- NDCLab, Florida International University -# Last updated: 2023-06-30 +# Last updated: 2023-08-15 # INPUTS # data/df: behavioral (error-related) data, for each participant on each passage @@ -58,12 +58,13 @@ accDat_path <- paste(main_dataset,'derivatives/preprocessed/readAloud_passage-le readDat_path <- paste(main_dataset, 'derivatives/analysisStimuli_readDat_20230614.csv', sep="", collapse=NULL) redcap_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019.csv', sep="", collapse=NULL) agedat_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019_ageonly.csv', sep="", collapse=NULL) +speedDat_path <- paste(main_dataset, "derivatives/preprocessed/valence-timing/timingpitch_subject-by-passage_2022-09-09.csv", sep="", collapse=NULL) -# c(data, accDat_path, readDat_path, redcap_path, agedat_path) %>% fs::as_fs_path() %>% fs::is_file() +# c(data, accDat_path, readDat_path, redcap_path, agedat_path, speedDat_path) %>% fs::as_fs_path() %>% fs::is_file() # ✅: all TRUE -df <- read.csv(data) +df <- read.csv(data, na.strings='NA') redcap <- read.csv(redcap_path, na.strings='NA') #participant questionnaire responses agedat <- read.csv(agedat_path, na.strings='NA') #participant age information readDat <- read.csv(readDat_path, na.strings='N') #passage-level characteristics from analysisStimuli.R @@ -71,6 +72,7 @@ accDat <- read.csv(accDat_path, na.strings='NA', check.names=FALSE) #passage lev accDat$passage <- c("dams", "flying", "bats", "broccoli", "realty", "bees", "dogshow", "dolphins", "icefishing", "cars", "vegas", "sun", "caramel", "congo", "antarctica", "depression", "skunkowl", "grizzly", "mantis", "dentist") #rename passages with short-name +speedDat <- read.csv(speedDat_path, na.strings='NA') #organize data types df[,3:30] <- sapply(df[,3:30],as.numeric) @@ -164,7 +166,13 @@ for(h in 1:nrow(demoDat)){ } -### SECTION 3: BUILD TRIAL-LEVEL DF (ADD DEMODAT, READDAT, and ACCDAT to DF) +### SECTION 3: SET UP DERIVED FIELDS FOR SPEED ANALYSES +speedDat$readingTime <- speedDat$readEnd - speedDat$readStart +speedDat$id <- as.character(speedDat$id) # so we can join and it doesn't complain about type comparison +df <- left_join(df, speedDat, by = c("id", "passage")) # now reading timestamps and duration are looped into df + + +### SECTION 4: BUILD TRIAL-LEVEL DF (ADD DEMODAT, READDAT, and ACCDAT to DF) for(i in 1:nrow(df)){ subject <- df$id[i] #extract subject number for matching passage <- df$passage[i] #extract passage name for matching @@ -212,8 +220,12 @@ df$pronouns <- as.factor(df$pronouns) df$ethnic <- as.factor(df$ethnic) df$socclass <- as.factor(df$socclass) +# compute speed +df$timePerSyllable <- df$readingTime / df$lenSyll +df$timePerWord <- df$readingTime / df$lenWord + 
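# A minimal toy sketch of how these derived speed fields behave (editorial
# illustration; values invented, column names as in this script). Readings
# whose (id, passage) pair is absent from speedDat leave the left_join with
# readingTime = NA, so both speed fields are NA as well; that is why the
# analysis script later drops such rows with filter(!is.na(timePerSyllable)).
toy <- data.frame(readingTime = c(95.2, NA), lenSyll = c(280, 318), lenWord = c(170, 190))
toy$timePerSyllable <- toy$readingTime / toy$lenSyll # 0.34, NA
toy$timePerWord     <- toy$readingTime / toy$lenWord # 0.56, NA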
-### SECTION 3: CROSS-CHECK ALL PARTICIPANTS MET INCLUSION CRITERIA +### SECTION 5: CROSS-CHECK ALL PARTICIPANTS MET INCLUSION CRITERIA #note: given the time required to annotated errors, only participants who met inclusion criteria were annotated #sum(df$eng==1 & df$langhis %in% c(2,4) & df$ageen>6) #confirm all subjects monolingual English OR natively bilingual OR learned English before age 6 #sum(df$commdis>0) #confirm no subject diagnosed with any communication disorder @@ -236,12 +248,12 @@ summary(df$socclass)/20 #number of participants by social class affiliation summary(df$socclass)/20 / (nrow(df)/20) #percentage of participants by social class affiliation -### SECTION 4: TRIM PASSAGES DUE TO EXPERIMENTER ERROR +### SECTION 6: TRIM PASSAGES DUE TO EXPERIMENTER ERROR dfTrim <- subset(df, !(df$passage=='broccoli')) #remove broccoli passage due to typo in the last sentence as presented on-screen to participants dfTrim <- subset(dfTrim, !(dfTrim$passage=='sun')) #remove sun passage due to error in coding Excel -### SECTION 5: OUTPUT DATAFRAME +### SECTION 7: OUTPUT DATAFRAME write.csv(dfTrim, paste(out_path, "readAloudBetaData_", today, ".csv", sep="", collapse=NULL), row.names = FALSE) # collapse_by_participant <- function(filename_in, filename_out) { From 9d77a5a109e96b6f32973f1d6d6a2d8d00757a54 Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 17 Aug 2023 09:17:08 -0400 Subject: [PATCH 08/33] WIP: Control for age --- code/analysisReadAloudBeta.R | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index 7b71881..39d9b20 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -490,6 +490,18 @@ summary(rs_model_2_sps) +# What happens when we control for age? +#hesitation_rate x scaaredSoc +age_model1 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + age_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(age_model1) + +#words_with_hes_rate x scaaredSoc +age_model2 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + age_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(age_model2) + + # And now ->> check out work # Does our hesitation ~ scaaredSoc finding hold with reading speed controlled for? # syllable level @@ -506,6 +518,9 @@ summary(rs_model_4) # todo: check models 1 and 2 when hesitation rate is held still + + + # LS ideas: # error rate ~ SA # correction rate ~ error rate * SA From 9f953416ceafa0c7f181313231f7514b95bdf0d6 Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 17 Aug 2023 09:37:07 -0400 Subject: [PATCH 09/33] Check if reading speed mean-centering changes things (it doesn't) --- code/analysisReadAloudBeta.R | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index 39d9b20..2e93828 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -135,6 +135,8 @@ c(150083, "cars") # N.B.: only one of four passages to have >= 5% of sylla dfTrim <- filter(dfTrim, !is.na(timePerSyllable)) # itself, but without ones for which we have no reading data +# this ends up only dropping 0083, caramel - the other three already end up +# getting dropped based on other criteria @@ -490,6 +492,22 @@ summary(rs_model_2_sps) +# Does scaaredSoc predict reading speed? 
+# syllable level +rs_model_3 <- lmerTest::lmer(timePerSyllable ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_3) + +# word level +rs_model_4 <- lmerTest::lmer(timePerWord ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_4) + + + + + + # What happens when we control for age? #hesitation_rate x scaaredSoc age_model1 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + age_gmc + (1|id) + (1|passage), From ab8787b9087c94f83b5c7a1ed4e440c22bae2d5d Mon Sep 17 00:00:00 2001 From: l-acs Date: Tue, 22 Aug 2023 21:46:15 -0400 Subject: [PATCH 10/33] WIP: Misproduction-hesitation sequences --- code/analysisReadAloudBeta.R | 163 +++++++++++++---------------------- code/prepReadAloudBeta.R | 9 +- 2 files changed, 65 insertions(+), 107 deletions(-) diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index 2e93828..44a3deb 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Reading Task Analyses # Authors: Luc Sahar, Jessica M. Alexander -# Last Updated: 2023-08-16 +# Last Updated: 2023-08-24 # INPUTS # data/df: behavioral data, for each participant on each passage, with relevant participant information and trial-level stimulus information @@ -42,7 +42,8 @@ today <- format(today, "%Y%m%d") # data <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/readAloudBetaData_20230630.csv' # data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230810.csv' # data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230815.csv' -data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230816.csv' +# data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230816.csv' +data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230821.csv' to_omit <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/input/passages-to-omit_20230810.csv' # out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' out_path <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/' @@ -57,9 +58,6 @@ df$pronouns <- as.factor(df$pronouns) df$ethnic <- as.factor(df$ethnic) df$socclass <- as.factor(df$socclass) - -# FIXME: stats on df and dfTrim for reading speed are NA - #extract demo stats # all these values are just in case they're useful - not needed per se for later @@ -123,7 +121,6 @@ passage_no_before_trimming - passage_no_after_trim1 #number of passages trimmed # band-aid fix: remove passages without reading speed data so that we can run # our analyses on them nonetheless -# todo # these are the only four passages without reading time data... # and incidentally? well, see their comments here... 
@@ -138,10 +135,6 @@ dfTrim <- filter(dfTrim, !is.na(timePerSyllable)) # this ends up only dropping 0083, caramel - the other three already end up # getting dropped based on other criteria - - -"TODO" - passage_no_after_trim2 <- nrow(dfTrim) passage_no_after_trim1 - passage_no_after_trim2 #number of passages trimmed (passage_no_after_trim1 - passage_no_after_trim2) / passage_no_after_trim1 #percentage of passages trimmed of last bunch @@ -178,7 +171,7 @@ errorDat$correction_rate <- errorDat$corrections / errorDat$lenSyll errorDat$timePerSyllable_gmc <- errorDat$timePerSyllable - mean(errorDat$timePerSyllable) errorDat$timePerWord_gmc <- errorDat$timePerWord - mean(errorDat$timePerWord) -# todo center avg reading speed, probably + #extract demo stats @@ -201,6 +194,45 @@ summary(errorDatStats$timePerWord) summary(errorDatStats$timePerSyllable_gmc) summary(errorDatStats$timePerWord_gmc) + +### SECTION 3.5: preparing for misprod-hes sequential analyses + +# ignore the misprod-hes columns for now +errorDatMisprodHes <- select(errorDat, !contains("_syllables")) + +# First: look at a given misproduction and check for nearby hesitations +justMisprodWithHesBefore <- cbind(errorDatMisprodHes, + hes_position = 0, # "before", + misprod_tally = errorDat$misprod_with_hes_in_previous_syllables) + +justMisprodWithHesAfter <- cbind(errorDatMisprodHes, + hes_position = 1, # "after", + misprod_tally = errorDat$misprod_with_hes_in_next_syllables) + + +# stack the ones before and the ones after as rows of a single df (my attempt at long form) +errorDatLongMisprodWithRelHes <- rbind(justMisprodWithHesBefore, justMisprodWithHesAfter) + +# track the binary relative position as a factor +errorDatLongMisprodWithRelHes$hes_position <- as.factor(errorDatLongMisprodWithRelHes$hes_position) + +# Then: look at a given hesitation and check for nearby misproductions +justHesWithMisprodBefore <- cbind(errorDatMisprodHes, + misprod_position = 0, # "before", + hes_tally = errorDat$hes_with_misprod_in_previous_syllables) + +justHesWithMisprodAfter <- cbind(errorDatMisprodHes, + misprod_position = 1, # "after", + hes_tally = errorDat$hes_with_misprod_in_next_syllables) + +# stack the ones before and the ones after as rows of a single df (my attempt at long form) +errorDatLongHesWithRelMisprod <- rbind(justHesWithMisprodBefore, justHesWithMisprodAfter) + +# track the binary relative position as a factor +errorDatLongHesWithRelMisprod$misprod_position <- as.factor(errorDatLongHesWithRelMisprod$misprod_position) + + + ### SECTION 4: MODEL RESULTS #misprod_rate x bfne model1 <- lmerTest::lmer(misprod_rate ~ bfne_gmc + (1|id) + (1|passage), @@ -506,8 +538,6 @@ summary(rs_model_4) - - # What happens when we control for age? #hesitation_rate x scaaredSoc age_model1 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + age_gmc + (1|id) + (1|passage), @@ -520,7 +550,7 @@ age_model2 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + age_gmc + (1 summary(age_model2) -# And now ->> check out work +# And now ->> check our work # Does our hesitation ~ scaaredSoc finding hold with reading speed controlled for? 
# syllable level rs_model_3 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + timePerSyllable_gmc + (1|id) + (1|passage), @@ -533,100 +563,25 @@ rs_model_4 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + timePerWord_ summary(rs_model_4) -# todo: check models 1 and 2 when hesitation rate is held still - - - - - -# LS ideas: -# error rate ~ SA -# correction rate ~ error rate * SA - -# comprehension ~ correction rate -# comprehension ~ SA * correction rate -# comprehension ~ error rate -# comprehension ~ SA * error rate -# comprehension ~ error rate * correction rate -# comprehension ~ error rate * correction rate * SA - -# comprehension ~ hes rate * error rate -# comprehension ~ hes rate * correction rate - -# comprehension ~ hes rate * error rate * SA -# comprehension ~ hes rate * correction rate * SA -# comprehension ~ hes rate * error rate * correction rate * SA - -# error rate ~ hes rate * SA -# correction rate ~ hes rate * SA, control for error rate - -# corr_ : corrections rate -# err_ : errors rate - - -# error rate as explained by social anxiety -fls_model_err_scaared <- lmer(error_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(fls_model_err_scaared) - -# correction rate as explained by social anxiety -fls_model_corr_scaared <- lmer(correction_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(fls_model_corr_scaared) - -# correction rate as explained by social anxiety and errors -fls_model_corr_scaared_err <- lmer(correction_rate ~ scaaredSoc_gmc * error_rate + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(fls_model_corr_scaared_err) - -# comprehension ~ correction rate -fls_model_comp_corr <- glmer(challengeACC ~ correction_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(fls_model_comp_corr) - -# comprehension ~ SA * correction rate -fls_model_comp_scaared_corr <- glmer(challengeACC ~ scaaredSoc_gmc * correction_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(fls_model_comp_scaared_corr) # * - -# comprehension ~ error rate -fls_model_comp_err <- glmer(challengeACC ~ error_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(fls_model_comp_err) - -# comprehension ~ SA * error rate -fls_model_comp_scaared_err <- glmer(challengeACC ~ scaaredSoc_gmc * error_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(fls_model_comp_scaared_err) - -# comprehension ~ error rate * correction rate -fls_model_comp_err_corr <- glmer(challengeACC ~ error_rate * correction_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(fls_model_comp_err_corr) - -# comprehension ~ hes rate * error rate -fls_model_comp_hes_err <- glmer(challengeACC ~ hesitation_rate * error_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(fls_model_comp_hes_err) +# misprod-hes ordering -# comprehension ~ hes rate * correction rate -fls_model_comp_hes_corr <- glmer(challengeACC ~ hesitation_rate * correction_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(fls_model_comp_hes_corr) # *** +# Is the number of hesitations adjacent to misproductions in a particular +# reading predicted by the +# Does the position of misproductions relative to hesitations -# Check the main one of interest, comprehension as explained by SA and -# corrections (see fls_model_comp_scaared_corr above), with all three tests -f_model_comp_scaared_corr <- glmer(challengeACC ~ scaaredSoc_gmc * 
correction_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model_comp_scaared_corr) # * -f_model_comp_bfne_corr <- glmer(challengeACC ~ bfne_gmc * correction_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model_comp_bfne_corr) # N.S. +# we have a number of occurrences of a misproduction in a particular position +# relative to a passage's hesitations. does knowing the position (before/after) +# predict the number of these sequences we have? -f_model_comp_sps_corr <- glmer(challengeACC ~ sps_gmc * correction_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model_comp_sps_corr) # N.S., but p for interaction is at 0.0698 +# does misproduction location relative to a hesitation predict how many +# instances we get in a particular reading? +hes_with_rel_misprod_model_1 <- lmerTest::lmer(hes_tally ~ misprod_position + (1|id) + (1|passage), + data=errorDatLongHesWithRelMisprod, REML=TRUE) +summary(hes_with_rel_misprod_model_1) +misprod_with_rel_hes_model_1 <- lmerTest::lmer(misprod_tally ~ hes_position + (1|id) + (1|passage), + data=errorDatLongMisprodWithRelHes, REML=TRUE) +summary(misprod_with_rel_hes_model_1) diff --git a/code/prepReadAloudBeta.R b/code/prepReadAloudBeta.R index 878e824..04f5476 100644 --- a/code/prepReadAloudBeta.R +++ b/code/prepReadAloudBeta.R @@ -53,7 +53,8 @@ out_path <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivative #load input files # data <- paste(main_dataset, 'derivatives/preprocessed/disfluencies_subject-x-passage_20230616_1229pm.csv', sep="", collapse=NULL) -data <- "/home/luc/Documents/ndclab/analysis-sandbox/output-csvs/disfluencies_subject-x-passage_20230616_1229pm.csv" +# data <- "/home/luc/Documents/ndclab/analysis-sandbox/output-csvs/disfluencies_subject-x-passage_20230616_1229pm.csv" +data <- "/home/luc/Documents/ndclab/analysis-sandbox/output-csvs/disfluencies_subject-x-passage_20230818_1042pm.csv" accDat_path <- paste(main_dataset,'derivatives/preprocessed/readAloud_passage-level_summary_20220812.csv', sep="", collapse=NULL) readDat_path <- paste(main_dataset, 'derivatives/analysisStimuli_readDat_20230614.csv', sep="", collapse=NULL) redcap_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019.csv', sep="", collapse=NULL) @@ -84,7 +85,7 @@ tempdf <- data.frame(matrix(nrow=0, ncol=ncol(df))) colnames(tempdf) <- colnames(df) for(passage in 1:length(all_passages)){ if(all_passages[passage] %in% passages_read){next}else{ - tempdf[nrow(tempdf) + 1,] <- c("150086", all_passages[passage], rep(NA, 28)) + tempdf[nrow(tempdf) + 1,] <- c("150086", all_passages[passage], rep(NA, 30)) } } df <- rbind(df, tempdf) @@ -254,7 +255,9 @@ dfTrim <- subset(dfTrim, !(dfTrim$passage=='sun')) #remove sun passage due to er ### SECTION 7: OUTPUT DATAFRAME -write.csv(dfTrim, paste(out_path, "readAloudBetaData_", today, ".csv", sep="", collapse=NULL), row.names = FALSE) +out_filename <- paste(out_path, "readAloudBetaData_", today, ".csv", sep="", collapse=NULL) +write.csv(dfTrim, out_filename, row.names = FALSE) +out_filename # collapse_by_participant <- function(filename_in, filename_out) { # by_participant <- read_csv(filename_in) %>% From a66e53669c453dc7a5035fb1a4929e86c5102ec1 Mon Sep 17 00:00:00 2001 From: l-acs Date: Fri, 25 Aug 2023 11:12:17 -0400 Subject: [PATCH 11/33] Add prep script for word freq analysis, after `dataset`'s analysisStimuli.R --- code/prepWordFreqReadAloudBeta.R | 138 +++++++++++++++++++++++++++++++ 1 file 
changed, 138 insertions(+) create mode 100644 code/prepWordFreqReadAloudBeta.R diff --git a/code/prepWordFreqReadAloudBeta.R b/code/prepWordFreqReadAloudBeta.R new file mode 100644 index 0000000..e1fd57a --- /dev/null +++ b/code/prepWordFreqReadAloudBeta.R @@ -0,0 +1,138 @@ +# readAloud-valence-beta Analysis Preparation for word frequency +# Authors: Jessica M. Alexander and Luc Sahar -- NDCLab, Florida International University +# Last updated: 2023-08-25 + +# Input: +# - SUBTLEXus corpus +# - Scaffolds xlsx, used only as the simplest way to enumerate the passages +# - Stimulus characteristics for our passages + +# Output: +# - a CSV, with one row per passage, showing the average word frequency (in +# SUBTLEXus) for the words in that passage + +### SECTION 1: SETTING UP +library(readxl) #load excel files +library(textstem) #lemmatize_words function + +#set up date for output file naming +today <- Sys.Date() +today <- format(today, "%Y%m%d") + +#set up directories for input/output +# main_dataset <- '/Users/jalexand/github/readAloud-valence-dataset/' +main_dataset <- '~/Documents/ndclab/analysis-sandbox/rwe-dataset/' +main_analysis <- '~/Documents/ndclab/analysis-sandbox/rwe-analysis/' +# out_path_readDat <- '/Users/jalexand/github/readAloud-valence-dataset/derivatives/' +out_path_readDat <- paste(main_analysis, 'derivatives/', sep = '') + +#load input files +scaffolds <- paste(main_dataset, 'code/scaffolds.xlsx', sep="", collapse=NULL) +readAloudStimChar <- paste(main_dataset, 'materials/readAloud-ldt/stimuli/readAloud/readAloud-stimuli_characteristics.xlsx', sep="", collapse=NULL) +SUBList <- paste(main_dataset, 'materials/readAloud-ldt/stimuli/resources/SUBTLEXus74286wordstextversion.txt', sep="", collapse=NULL) #downloaded from https://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexus on 06/13/2022 + +# maybe include passDat? it has something about word data, and seems to have flesch too? +subtlexus <- read.table(SUBList, header=TRUE) +subtlexus$Word <- tolower(subtlexus$Word) #make all entries in SUBTLEXUS lower-case +passages <-excel_sheets(scaffolds) # antarctica ... 
vegas + +# Jess' suggestions +# L72-124 from analysisStimuli.R + +### SECTION 2: SET UP PASSAGE LIST AND PREPARE LEMMAS FOR ACCESSING FREQUENCY INFO +#create manual mapping of words to SUBTLEXUS corpus when lemma doesn't automatically match +manualLemma <- data.frame(matrix(ncol=2, nrow=0)) #initialize a table to hold words without a frequency match +colnames(manualLemma) <- c("stimWord", + "lemma") +for(i in 1:length(passages)){ + passage <- passages[i] + passageDat <- read_xlsx(readAloudStimChar, sheet=passage, skip=1, na="#") #read in passage data + passWords <- passageDat[,1:2] #pull word list + for(a in 1:length(passWords$stimWord)){ #correct apostrophes (curly to straight) + string <- passWords$stimWord[a] + passWords$stimWord[a] <- gsub("’", "'",string) + } + passWords$stimWord <- tolower(passWords$stimWord) #shift word list to lowercase to match SUBTLEXUS + passWords$lemma <- lemmatize_words(passWords$stimWord) #lemmatize word list + passWords$freq <- rep(0, nrow(passWords)) #initialize new column to hold word frequency data + for(f in 1:nrow(passWords)){ #add log word frequency from SUBTLEXUS corpus for each word lemma in the passage + passWords$freq[f] <- subtlexus$Lg10WF[match(passWords$lemma[f], subtlexus$Word)] + } + noFreqTable <- subset(passWords, is.na(passWords$freq))[,2:3] #extract words that did not get a frequency match + manualLemma <- rbind(manualLemma, noFreqTable) #bind to running table of words without a frequency match +} +manualLemma <- manualLemma[!duplicated(manualLemma$stimWord),] #remove duplicate rows +manualLemma <- manualLemma[order(manualLemma$stimWord),] #alphabetize +manualLemma$newLemma <- c(rep(0,nrow(manualLemma))) #initialize column to hold manual lemmas extracted by researcher +for(lemma in 1:nrow(manualLemma)){ #manually adjust possessives by dropping "'s" (except for "it's") + string <- manualLemma$lemma[lemma] + if(substr(string, nchar(string)-1, nchar(string))=="'s" & string!="it's"){ + manualLemma$newLemma[lemma] <- substr(string, 0, nchar(string)-2) + } +} +#manually adjust plurals +manualLemma[match("brittles", manualLemma$stimWord),3] <- "brittle" +#manually adjust adjectives +manualLemma[match("club-like", manualLemma$stimWord),3] <- "club" +manualLemma[match("in-flight", manualLemma$stimWord),3] <- "flight" +manualLemma[match("mid-", manualLemma$stimWord),3] <- "middle" +#no manual adjustment of the following categories: +#compound words with no obvious "primary" lemma: ccc, don't, it's, long-term +#proper nouns: delano, nissan +#words simply not in the SUBTLEXUS database: ecotourism, hydropower, jetsetter, megabat, microbats, photoreceptor, plumicorn, powertrain, spearer, trinocular +#ordinal numbers: nineteeth, second, twentieth +manualLemma <- subset(manualLemma, manualLemma$newLemma!=0) #drop words without a manual mapping +manualLemma$freq <- rep(0, nrow(manualLemma)) #add log word frequency from SUBTLEXUS corpus +for(f in 1:nrow(manualLemma)){ + manualLemma$freq[f] <- subtlexus$Lg10WF[match(manualLemma$newLemma[f], subtlexus$Word)] +} + +# Jess' suggestions +# L170-189 from analysisStimuli.R + +### SECTION 3: BUILD AND OUTPUT READDAT MATRIX +readDat <- data.frame(matrix(ncol=2, nrow=0)) +colnames(readDat) <- c("passage", "avgFreq") + +#calculate characteristics per passage half +for(j in 1:length(passages)){ + passage <- passages[j] + passageDat <- read_xlsx(readAloudStimChar, sheet=passage, skip=1, na="#") #read in passage word list + + #extract passage word list + passWords <- passageDat[,1:2] #pull word list + for(a in 
1:length(passWords$stimWord)){ #correct apostrophes (curly to straight) + string <- passWords$stimWord[a] + passWords$stimWord[a] <- gsub("’", "'",string) + } + passWords$stimWord <- tolower(passWords$stimWord) #shift word list to lowercase to match SUBTLEXUS + passWords$lemma <- lemmatize_words(passWords$stimWord) #lemmatize word list + + #add frequency data + passWords$freq <- rep(0, nrow(passWords)) #initialize column to hold frequency data + for(f in 1:nrow(passWords)){ + if(!is.na(subtlexus$Lg10WF[match(passWords$stimWord[f], subtlexus$Word)])){ #automatic matching of full word to SUBTLEXUS corpus + passWords$freq[f] <- subtlexus$Lg10WF[match(passWords$stimWord[f], subtlexus$Word)] + } else if(!is.na(subtlexus$Lg10WF[match(passWords$lemma[f], subtlexus$Word)])){ #automatic matching of lemma to SUBTLEXUS corpus + passWords$freq[f] <- subtlexus$Lg10WF[match(passWords$lemma[f], subtlexus$Word)] + } else if (passWords$lemma[f] %in% manualLemma$lemma){ #check manual table if no auto-match + passWords$freq[f] <- manualLemma$freq[match(passWords$lemma[f], manualLemma$lemma)] + } else {passWords$freq[f] <- median(subtlexus$Lg10WF)} #impute median value for non-matches + } + + + #calculate average frequency + freqAvg <- mean(passWords$freq) #calculate mean frequency for whole passage + + #create vectors for each passage and add to readDat + vectorWholePassage <- c(passage, freqAvg) # instead of vectorPre and vectorPost, as in analysisStimuli.R + readDat[nrow(readDat) + 1,] <- c(vectorWholePassage) +} + +#organize data types +readDat$avgFreq <- as.numeric(readDat$avgFreq) + +#output readDat +out_path_readDat_csv <- paste(out_path_readDat, 'prepWordFreq_readDat', today, '.csv', sep="", collapse=NULL) +write.csv(readDat, out_path_readDat_csv) +out_path_readDat_csv # print out to terminal so it's easy to open! From e2af6ce1653154b22f81bd3bff9580aa5ff96b0d Mon Sep 17 00:00:00 2001 From: l-acs Date: Mon, 28 Aug 2023 09:31:53 -0400 Subject: [PATCH 12/33] WIP: word freq prep/analysis --- code/analysisReadAloudBeta.R | 24 ++++++++++++++++++++++-- code/prepReadAloudBeta.R | 8 ++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index 44a3deb..4ed68fd 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Reading Task Analyses # Authors: Luc Sahar, Jessica M. 
Alexander -# Last Updated: 2023-08-24 +# Last Updated: 2023-08-25 # INPUTS # data/df: behavioral data, for each participant on each passage, with relevant participant information and trial-level stimulus information @@ -43,7 +43,7 @@ today <- format(today, "%Y%m%d") # data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230810.csv' # data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230815.csv' # data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230816.csv' -data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230821.csv' +data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230825.csv' to_omit <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/input/passages-to-omit_20230810.csv' # out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' out_path <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/' @@ -585,3 +585,23 @@ summary(hes_with_rel_misprod_model_1) misprod_with_rel_hes_model_1 <- lmerTest::lmer(misprod_tally ~ hes_position + (1|id) + (1|passage), data=errorDatLongMisprodWithRelHes, REML=TRUE) summary(misprod_with_rel_hes_model_1) + + + +# Word frequency analysis +# Does a passage's average word frequency predict participants' hesitation rate or misproduction rate? +wordfreq_model_1 <- lmerTest::lmer(hesitation_rate ~ avgWordFreq + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(wordfreq_model_1) +wordfreq_model_2 <- lmerTest::lmer(misprod_rate ~ avgWordFreq + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(wordfreq_model_2) + +# Do social anxiety and frequency interact to predict hesitation rate or misproduction rate? 
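# The two frequency-by-social-anxiety models just below test this. As an
# editorial follow-up sketch (assuming the interactions package is available
# and that one of the interaction terms below proves reliable), a helper to
# probe simple slopes of word frequency at -1 SD / mean / +1 SD of scaaredSoc
# can be defined here and called once the models are fit, e.g.
# probe_freq_by_sa(wordfreq_model_3). The helper name is illustrative only.
probe_freq_by_sa <- function(m) {
  # simple slopes of avgWordFreq at -1 SD, mean, and +1 SD of scaaredSoc_gmc
  print(interactions::sim_slopes(m, pred = avgWordFreq, modx = scaaredSoc_gmc))
  # companion plot of the same interaction, with confidence bands
  interactions::interact_plot(m, pred = avgWordFreq, modx = scaaredSoc_gmc, interval = TRUE)
}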
+wordfreq_model_3 <- lmerTest::lmer(hesitation_rate ~ avgWordFreq * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(wordfreq_model_3) + +wordfreq_model_4 <- lmerTest::lmer(misprod_rate ~ avgWordFreq * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(wordfreq_model_4) diff --git a/code/prepReadAloudBeta.R b/code/prepReadAloudBeta.R index 04f5476..afb78d8 100644 --- a/code/prepReadAloudBeta.R +++ b/code/prepReadAloudBeta.R @@ -60,7 +60,7 @@ readDat_path <- paste(main_dataset, 'derivatives/analysisStimuli_readDat_2023061 redcap_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019.csv', sep="", collapse=NULL) agedat_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019_ageonly.csv', sep="", collapse=NULL) speedDat_path <- paste(main_dataset, "derivatives/preprocessed/valence-timing/timingpitch_subject-by-passage_2022-09-09.csv", sep="", collapse=NULL) - +freqDat_path <- paste(main_analyses, "derivatives/prepWordFreq_readDat20230825.csv", sep="") # c(data, accDat_path, readDat_path, redcap_path, agedat_path, speedDat_path) %>% fs::as_fs_path() %>% fs::is_file() # ✅: all TRUE @@ -74,6 +74,7 @@ accDat$passage <- c("dams", "flying", "bats", "broccoli", "realty", "bees", "dog "cars", "vegas", "sun", "caramel", "congo", "antarctica", "depression", "skunkowl", "grizzly", "mantis", "dentist") #rename passages with short-name speedDat <- read.csv(speedDat_path, na.strings='NA') +freqDat <- read.csv(freqDat_path, na.strings = 'NA') #organize data types df[,3:30] <- sapply(df[,3:30],as.numeric) @@ -173,7 +174,7 @@ speedDat$id <- as.character(speedDat$id) # so we can join and it doesn't complai df <- left_join(df, speedDat, by = c("id", "passage")) # now reading timestamps and duration are looped into df -### SECTION 4: BUILD TRIAL-LEVEL DF (ADD DEMODAT, READDAT, and ACCDAT to DF) +### SECTION 4: BUILD TRIAL-LEVEL DF (ADD DEMODAT, READDAT, ACCDAT, AND FREQDAT to DF) for(i in 1:nrow(df)){ subject <- df$id[i] #extract subject number for matching passage <- df$passage[i] #extract passage name for matching @@ -193,6 +194,9 @@ for(i in 1:nrow(df)){ df$lenWord[i] <- sum(readDat$lengthWord[which(readDat$passage==passage)]) #length of passage in words df$avgSyllPerWord[i] <- df$lenSyll[i]/df$lenWord[i] + #extract passage characteristics from freqDat + df$avgWordFreq[i] <- freqDat$avgFreq[which(freqDat$passage==passage)] + #extract participant accuracy from accDat df$challengeACC[i] <- accDat[match(passage, accDat$passage), as.character(subject)] #passage-specific challenge question accuracy for subject From 1b21dac7138121eb7d2421a7b5c2c508918745b8 Mon Sep 17 00:00:00 2001 From: l-acs Date: Tue, 30 Jan 2024 10:42:52 -0500 Subject: [PATCH 13/33] Add data dict and a few more possible models --- code/analysisReadAloudBeta.R | 112 +++++++++++++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 6 deletions(-) diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index 4ed68fd..2509334 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -11,6 +11,103 @@ # NOTES TO DO # drop 150086 as only completed 12 of 20 passages and low accuracy +# Data dict + +# errorDatMisprodHes: +# +# our errorDat dataframe, just without the misprod-sequencing columns (which +# we'll add in piecemeal by different names later) + +# First, look at a given misproduction and check for nearby hesitations +# +# hes_position: +# +# for long-form dataframes 
counting misproductions, this indicates whether the +# relevant count is the number of hesitations before (0) or after (1) those +# misproductions being counted in that row +# +# +# misprod_tally: +# +# conversely, in long-form dataframes counting misproductions, this column +# actually tracks how many misproductions there are in that reading +# (participant x passage) that have a hesitation in the relevant relative +# position + + +# justMisprodWithHesBefore: +# +# this is the dataframe with every (participant x passage) reading, counting +# the number of misproductions with a nearby preceding hesitation +# +# i.e., for each reading, it counts the number of times (misprod_tally) that a +# hesitation comes before a misproduction -- so for every entry, hes_position = 0 +# +# +# justMisprodWithHesAfter +# +# similarly, this is the dataframe with every (participant x passage) reading, +# counting the number of misproductions with a nearby following hesitation +# +# i.e., for each reading, it counts the number of times (misprod_tally) that a +# hesitation comes after a misproduction -- so for every entry, hes_position = 1 +# +# +# errorDatLongMisprodWithRelHes: +# +# this is the long-form dataframe, with two rows per reading (participant x +# passage): one for each position for a relative hesitation. i.e. this stacks +# the two dataframes that respectively have (1) every passage, with a count of +# misproductions for hes_position = 0, and (2) every passage, with a count of +# misproductions for hes_position = 1 + + + +# Then, look at a given hesitation and check for nearby misproductions + +# misprod_position: +# +# for long-form dataframes counting hesitations, this indicates whether the +# relevant count is the number of misproductions before (0) or after (1) those +# hesitations being counted in that row +# +# +# hes_tally: +# +# conversely, in long-form dataframes counting hesitations, this column +# actually tracks how many hesitations there are in that reading (participant +# x passage) that have a misproduction in the relevant relative position + + +# justHesWithMisprodBefore: +# +# this is the dataframe with every (participant x passage) reading, counting +# the number of hesitations with a nearby preceding misproduction +# +# i.e., for each reading, it counts the number of times (hes_tally) that a +# misproduction comes before a hesitation -- so for every entry, +# misprod_position = 0 +# +# +# justHesWithMisprodAfter +# +# similarly, this is the dataframe with every (participant x passage) reading, +# counting the number of hesitations with a nearby following misproduction +# +# i.e., for each reading, it counts the number of times (hes_tally) that a +# misproduction comes after a hesitation -- so for every entry, +# misprod_position = 1 +# +# +# errorDatLongHesWithRelMisprod: +# +# this is the long-form dataframe, with two rows per reading (participant x +# passage): one for each position for a relative misproduction, i.e. 
this +# stacks the two dataframes that respectively have (1) every passage, with a +# count of hesitations for misprod_position = 0, and (2) every passage, with a +# count of hesitations for misprod_position = 1 + + ### SECTION 1: SETTING UP library(dplyr) library(lme4) @@ -119,9 +216,6 @@ passage_no_before_trimming - passage_no_after_trim1 #number of passages trimmed (passage_no_before_trimming - passage_no_after_trim1) / passage_no_before_trimming #percentage of passages trimmed -# band-aid fix: remove passages without reading speed data so that we can run -# our analyses on them nonetheless - # these are the only four passages without reading time data... # and incidentally? well, see their comments here... c(150013, "vegas") # N.B.: 161 omitted syllables of 318 total in passage @@ -300,9 +394,6 @@ summary(model12) - - - # glmer(accuracy ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, family="binomial") # "f_" : follow-up @@ -586,6 +677,15 @@ misprod_with_rel_hes_model_1 <- lmerTest::lmer(misprod_tally ~ hes_position + (1 data=errorDatLongMisprodWithRelHes, REML=TRUE) summary(misprod_with_rel_hes_model_1) +## does it interact with SA? +hes_with_rel_misprod_model_3 <- lmerTest::lmer(hes_tally ~ misprod_position * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDatLongHesWithRelMisprod, REML=TRUE) +# summary(hes_with_rel_misprod_model_3) + +misprod_with_rel_hes_model_4 <- lmerTest::lmer(misprod_tally ~ hes_position * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDatLongMisprodWithRelHes, REML=TRUE) +# summary(misprod_with_rel_hes_model_4) + # Word frequency analysis From b6fc56e9ea727c339087fc0ffcbb3fe908e39524 Mon Sep 17 00:00:00 2001 From: l-acs Date: Tue, 30 Jan 2024 16:23:43 -0500 Subject: [PATCH 14/33] Keep code/prepReadAloudBeta.R --- ...prepReadAloudBeta.R-move-to-code__prepWordLevelReadAloudBeta.R | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename code/prepReadAloudBeta.R => code__prepReadAloudBeta.R-move-to-code__prepWordLevelReadAloudBeta.R (100%) diff --git a/code/prepReadAloudBeta.R b/code__prepReadAloudBeta.R-move-to-code__prepWordLevelReadAloudBeta.R similarity index 100% rename from code/prepReadAloudBeta.R rename to code__prepReadAloudBeta.R-move-to-code__prepWordLevelReadAloudBeta.R From fd71d50d0f3942ab14ddab925158e19fbf136f4b Mon Sep 17 00:00:00 2001 From: l-acs Date: Tue, 30 Jan 2024 16:23:43 -0500 Subject: [PATCH 15/33] Copy code/prepReadAloudBeta.R into code/prepWordLevelReadAloudBeta.R --- code/{prepReadAloudBeta.R => prepWordLevelReadAloudBeta.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename code/{prepReadAloudBeta.R => prepWordLevelReadAloudBeta.R} (100%) diff --git a/code/prepReadAloudBeta.R b/code/prepWordLevelReadAloudBeta.R similarity index 100% rename from code/prepReadAloudBeta.R rename to code/prepWordLevelReadAloudBeta.R From 80ab7a810e0856ddcd9401880f62f5b69f43defb Mon Sep 17 00:00:00 2001 From: l-acs Date: Tue, 30 Jan 2024 16:23:43 -0500 Subject: [PATCH 16/33] Revert prepReadAloudBeta.R, keeping history --- .../prepReadAloudBeta.R | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename code__prepReadAloudBeta.R-move-to-code__prepWordLevelReadAloudBeta.R => code/prepReadAloudBeta.R (100%) diff --git a/code__prepReadAloudBeta.R-move-to-code__prepWordLevelReadAloudBeta.R b/code/prepReadAloudBeta.R similarity index 100% rename from code__prepReadAloudBeta.R-move-to-code__prepWordLevelReadAloudBeta.R rename to code/prepReadAloudBeta.R From 6e629e8f4032b10e2e9b8d906f7bedc143e568f8 Mon Sep 17 00:00:00 2001 From: l-acs 
Date: Tue, 30 Jan 2024 12:11:00 -0500 Subject: [PATCH 17/33] Add initial prep script for word level analysis, a copy of the passage-level script --- code/prepWordLevelReadAloudBeta.R | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/code/prepWordLevelReadAloudBeta.R b/code/prepWordLevelReadAloudBeta.R index afb78d8..3ce7172 100644 --- a/code/prepWordLevelReadAloudBeta.R +++ b/code/prepWordLevelReadAloudBeta.R @@ -1,11 +1,13 @@ # readAloud-valence-beta Analysis Preparation # Authors: Luc Sahar and Jessica M. Alexander -- NDCLab, Florida International University -# Last updated: 2023-08-15 +# Last updated: 2024-01-30 # INPUTS -# data/df: behavioral (error-related) data, for each participant on each passage +# data/df: behavioral (error-related) data, for each participant for each word # accDat: comprehension question accuracy (0/1) for each participant for each passage -# readDat: stimuli characteristics (by passage half) +# as of 1/30/24 this is alphabetical by *passage* now (more easily +# automated, less stateful, less hard coding) +# readDat: stimuli characteristics (by passage half) - 2024-01-30 - now NA? # redcap: participant data, incl. demographics and responses + scored factors for questionnaires: # bfne (brief fear of negative evaluation): bfne_b_scrdTotal (fear of negative evaluation total) # phq8 (patient health questionnaire): phq8_scrdTotal (depression scale total) @@ -39,29 +41,28 @@ library(readr) # write_csv # current_datetime <- now(timezone) %>% format(date_format) # paste(label, '_', current_datetime, '.', ext, sep = "") # } -today <- Sys.Date() -today <- format(today, "%Y%m%d") +today <- Sys.Date() %>% format("%Y%m%d") #set up directories for input/output # main_dataset <- '/Users/jalexand/github/readAloud-valence-dataset/' # main_analyses <- '/Users/jalexand/github/readAloud-valence-beta/' # out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' -main_dataset <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-dataset/' -main_analyses <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/' -out_path <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/' +main_dataset <- '~/Documents/ndclab/rwe-analysis-sandbox/rwe-dataset/' +main_analyses <- '~/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/' +out_path <- '~/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/derivatives/' #load input files # data <- paste(main_dataset, 'derivatives/preprocessed/disfluencies_subject-x-passage_20230616_1229pm.csv', sep="", collapse=NULL) -# data <- "/home/luc/Documents/ndclab/analysis-sandbox/output-csvs/disfluencies_subject-x-passage_20230616_1229pm.csv" -data <- "/home/luc/Documents/ndclab/analysis-sandbox/output-csvs/disfluencies_subject-x-passage_20230818_1042pm.csv" +data <- "~/Documents/ndclab/rwe-analysis-sandbox/rwe-dataset/derivatives/disfluencies_subject-x-passage-x-word_20240126_0253pm.csv" accDat_path <- paste(main_dataset,'derivatives/preprocessed/readAloud_passage-level_summary_20220812.csv', sep="", collapse=NULL) readDat_path <- paste(main_dataset, 'derivatives/analysisStimuli_readDat_20230614.csv', sep="", collapse=NULL) redcap_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019.csv', sep="", collapse=NULL) agedat_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019_ageonly.csv', sep="", collapse=NULL) speedDat_path <- paste(main_dataset, 
"derivatives/preprocessed/valence-timing/timingpitch_subject-by-passage_2022-09-09.csv", sep="", collapse=NULL) freqDat_path <- paste(main_analyses, "derivatives/prepWordFreq_readDat20230825.csv", sep="") -# c(data, accDat_path, readDat_path, redcap_path, agedat_path, speedDat_path) %>% fs::as_fs_path() %>% fs::is_file() +scaffolds_path <- paste(main_dataset, 'code/scaffolds.xlsx', sep="", collapse=NULL) +# c(data, accDat_path, readDat_path, redcap_path, agedat_path, speedDat_path, scaffolds_path) %>% fs::as_fs_path() %>% fs::is_file() # ✅: all TRUE @@ -73,6 +74,7 @@ accDat <- read.csv(accDat_path, na.strings='NA', check.names=FALSE) #passage lev accDat$passage <- c("dams", "flying", "bats", "broccoli", "realty", "bees", "dogshow", "dolphins", "icefishing", "cars", "vegas", "sun", "caramel", "congo", "antarctica", "depression", "skunkowl", "grizzly", "mantis", "dentist") #rename passages with short-name + # can't we use scaffolds for this? speedDat <- read.csv(speedDat_path, na.strings='NA') freqDat <- read.csv(freqDat_path, na.strings = 'NA') From c0b1ae48f25f503aa1d5da1c433d9be20d99f23a Mon Sep 17 00:00:00 2001 From: l-acs Date: Tue, 30 Jan 2024 13:33:04 -0500 Subject: [PATCH 18/33] Limit block comments to RStudio's prescribed 80 character width --- code/prepWordLevelReadAloudBeta.R | 40 +++++++++++++++++++------------ 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/code/prepWordLevelReadAloudBeta.R b/code/prepWordLevelReadAloudBeta.R index 3ce7172..1d9d2bc 100644 --- a/code/prepWordLevelReadAloudBeta.R +++ b/code/prepWordLevelReadAloudBeta.R @@ -1,26 +1,35 @@ # readAloud-valence-beta Analysis Preparation -# Authors: Luc Sahar and Jessica M. Alexander -- NDCLab, Florida International University +# Authors: Luc Sahar and Jessica M. Alexander +# NDCLab, Florida International University # Last updated: 2024-01-30 # INPUTS # data/df: behavioral (error-related) data, for each participant for each word -# accDat: comprehension question accuracy (0/1) for each participant for each passage -# as of 1/30/24 this is alphabetical by *passage* now (more easily -# automated, less stateful, less hard coding) +# accDat: comprehension question accuracy (0/1) for each participant for each +# passage +# as of 1/30/24 this is alphabetical by *passage* now (more easily +# automated, less stateful, less hard coding) # readDat: stimuli characteristics (by passage half) - 2024-01-30 - now NA? -# redcap: participant data, incl. demographics and responses + scored factors for questionnaires: - # bfne (brief fear of negative evaluation): bfne_b_scrdTotal (fear of negative evaluation total) - # phq8 (patient health questionnaire): phq8_scrdTotal (depression scale total) - # scaared, total (screen for adult anxiety disorders): scaared_b_scrdTotal (total anxiety) - # scaared, social (screen for adult anxiety disorders): scaared_b_scrdSoc (social phobias) - # scaared, general (screen for adult anxiety disorders): scaared_b_scrdGA (general anxiety) - # sps (social phobia scale): sias6sps6_b_scrdSPS +# redcap: participant data, incl. 
demographics and responses + scored factors +# for questionnaires: +# bfne (brief fear of negative evaluation): bfne_b_scrdTotal (fear of negative +# evaluation total) +# phq8 (patient health questionnaire): phq8_scrdTotal (depression scale total) +# scaared, total (screen for adult anxiety disorders): scaared_b_scrdTotal +# (total anxiety) +# scaared, social (screen for adult anxiety disorders): scaared_b_scrdSoc +# (social phobias) +# scaared, general (screen for adult anxiety disorders): scaared_b_scrdGA +# (general anxiety) +# sps (social phobia scale): sias6sps6_b_scrdSPS # OUTPUTS # dfTrim: for each passage, for each participant, details on: - # participant behavior: reading errors made, comprehension question accuracy - # passage characteristics: length (syllable and word), average syllables per word - # participant data: demographics, language history, mood and mood disorder scores +# participant behavior: reading errors made, comprehension question accuracy +# passage characteristics: length (syllable and word), average syllables per +# word +# participant data: demographics, language history, mood and mood disorder +# scores ### SECTION 1: SETTING UP @@ -233,7 +242,8 @@ df$timePerWord <- df$readingTime / df$lenWord ### SECTION 5: CROSS-CHECK ALL PARTICIPANTS MET INCLUSION CRITERIA -#note: given the time required to annotated errors, only participants who met inclusion criteria were annotated +# note: given the time required to annotated errors, only participants who met +# inclusion criteria were annotated #sum(df$eng==1 & df$langhis %in% c(2,4) & df$ageen>6) #confirm all subjects monolingual English OR natively bilingual OR learned English before age 6 #sum(df$commdis>0) #confirm no subject diagnosed with any communication disorder #sum(df$profen>3, na.rm=TRUE)/20 #one remaining subject (150060) rates own English proficiency as not "elementary" or "not proficient", but reads fluidly and achieved 80% accuracy on challenge questions, so not excluded From 8165183beed30500775abadfe59231e6ea935967 Mon Sep 17 00:00:00 2001 From: l-acs Date: Tue, 30 Jan 2024 13:45:38 -0500 Subject: [PATCH 19/33] Declutter --- code/prepWordLevelReadAloudBeta.R | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/code/prepWordLevelReadAloudBeta.R b/code/prepWordLevelReadAloudBeta.R index 1d9d2bc..b0ca2ee 100644 --- a/code/prepWordLevelReadAloudBeta.R +++ b/code/prepWordLevelReadAloudBeta.R @@ -41,15 +41,6 @@ library(lubridate) # now library(readr) # write_csv #set up defaults for output file naming -# ext_default = 'csv' -# tz_default = "America/New_York" -# date_format_default = "%Y%m%d_%I%M%P" -# -# build_output_filename <- function(label, ext = ext_default, timezone = tz_default, date_format = date_format_default) { -# # `label` may include the destination directory, if different from the working directory when the script is run -# current_datetime <- now(timezone) %>% format(date_format) -# paste(label, '_', current_datetime, '.', ext, sep = "") -# } today <- Sys.Date() %>% format("%Y%m%d") #set up directories for input/output @@ -80,10 +71,7 @@ redcap <- read.csv(redcap_path, na.strings='NA') #participant questionnaire resp agedat <- read.csv(agedat_path, na.strings='NA') #participant age information readDat <- read.csv(readDat_path, na.strings='N') #passage-level characteristics from analysisStimuli.R accDat <- read.csv(accDat_path, na.strings='NA', check.names=FALSE) #passage level accuracy for each subject -accDat$passage <- c("dams", "flying", "bats", "broccoli", 
"realty", "bees", "dogshow", "dolphins", "icefishing", - "cars", "vegas", "sun", "caramel", "congo", "antarctica", "depression", "skunkowl", "grizzly", - "mantis", "dentist") #rename passages with short-name - # can't we use scaffolds for this? +accDat$passage <- excel_sheets(scaffolds_path) #rename passages with short-name speedDat <- read.csv(speedDat_path, na.strings='NA') freqDat <- read.csv(freqDat_path, na.strings = 'NA') @@ -289,6 +277,6 @@ out_filename # # base = "~/Documents/ndclab/analysis-sandbox/github-structure-mirror/readAloud-valence-dataset/derivatives/preprocessed" # base = "/home/data/NDClab/datasets/readAloud-valence-dataset/derivatives/preprocessed" # preprocessed_summary_filename = "TODO" -# collapsed_filename = build_output_filename(label = paste(base, "disfluencies_subject", sep='/')) +# collapsed_filename = "TODO" # # collapse_by_participant(preprocessed_summary_file, collapsed_filename) From c1b3048f28bd21dc7aca5aab73a7a794aefa77c9 Mon Sep 17 00:00:00 2001 From: l-acs Date: Wed, 31 Jan 2024 12:54:04 -0500 Subject: [PATCH 20/33] Simplify data wrangling --- code/prepWordLevelReadAloudBeta.R | 134 ++++++++++++++---------------- 1 file changed, 62 insertions(+), 72 deletions(-) diff --git a/code/prepWordLevelReadAloudBeta.R b/code/prepWordLevelReadAloudBeta.R index b0ca2ee..c249858 100644 --- a/code/prepWordLevelReadAloudBeta.R +++ b/code/prepWordLevelReadAloudBeta.R @@ -65,111 +65,99 @@ scaffolds_path <- paste(main_dataset, 'code/scaffolds.xlsx', sep="", collapse=NU # c(data, accDat_path, readDat_path, redcap_path, agedat_path, speedDat_path, scaffolds_path) %>% fs::as_fs_path() %>% fs::is_file() # ✅: all TRUE - +all_passages <- excel_sheets(scaffolds_path) df <- read.csv(data, na.strings='NA') redcap <- read.csv(redcap_path, na.strings='NA') #participant questionnaire responses agedat <- read.csv(agedat_path, na.strings='NA') #participant age information readDat <- read.csv(readDat_path, na.strings='N') #passage-level characteristics from analysisStimuli.R accDat <- read.csv(accDat_path, na.strings='NA', check.names=FALSE) #passage level accuracy for each subject -accDat$passage <- excel_sheets(scaffolds_path) #rename passages with short-name +accDat$passage <- all_passages #rename passages with short-name speedDat <- read.csv(speedDat_path, na.strings='NA') freqDat <- read.csv(freqDat_path, na.strings = 'NA') #organize data types -df[,3:30] <- sapply(df[,3:30],as.numeric) +# df[,3:30] <- sapply(df[,3:30],as.numeric) +# not applicable for now, we'll see #add missing passages for 150086 so that nrow is divisible by 20 -passages_read <- df$passage[which(df$id=="150086")] -all_passages <- unique(df$passage) -tempdf <- data.frame(matrix(nrow=0, ncol=ncol(df))) -colnames(tempdf) <- colnames(df) -for(passage in 1:length(all_passages)){ - if(all_passages[passage] %in% passages_read){next}else{ - tempdf[nrow(tempdf) + 1,] <- c("150086", all_passages[passage], rep(NA, 30)) - } -} -df <- rbind(df, tempdf) +# passages_read <- df$passage[which(df$id=="150086")] +# tempdf <- data.frame(matrix(nrow=0, ncol=ncol(df))) +# colnames(tempdf) <- colnames(df) +# for(passage in 1:length(all_passages)){ +# if(all_passages[passage] %in% passages_read){next}else{ +# tempdf[nrow(tempdf) + 1,] <- c("150086", all_passages[passage], rep(NA, 30)) +# } +# } +# df <- rbind(df, tempdf) ### SECTION 2: BUILD DEMOGRAPHIC DATA DF demoDat <- redcap[,c(1,5)] +# demoDat_imperative <- redcap[,c(1,5)]; demoDat_dplyr <- demoDat_imperative +# as we refactor, our test case is: all.equal(demoDat_imperative, 
demoDat_dplyr) +# this has been confirmed to work for with every new column added + #biological sex: replace numerical values with text description -for(a in 1:nrow(redcap)){ - if(is.na(redcap$demo_b_sex_s1_r1_e1[a])){demoDat$sex[a] <- 'undisclosed'} - else if(redcap$demo_b_sex_s1_r1_e1[a]==1){demoDat$sex[a] <- 'male'} - else if(redcap$demo_b_sex_s1_r1_e1[a]==2){demoDat$sex[a] <- 'female'} - else if(redcap$demo_b_sex_s1_r1_e1[a]==3){demoDat$sex[a] <- 'intersex'} - else if(redcap$demo_b_sex_s1_r1_e1[a]==4){demoDat$sex[a] <- 'other'} - else if(redcap$demo_b_sex_s1_r1_e1[a]==5){demoDat$sex[a] <- 'unknown'} - else{demoDat$sex[a] <- 'undisclosed'} -} +demoDat$sex <- case_match(redcap$demo_b_sex_s1_r1_e1, + 1 ~ 'male', 2 ~ 'female', 3 ~ 'intersex', + 4 ~ 'other', 5 ~ 'unknown', .default = 'undisclosed') #preferred pronouns: replace numerical values with text description -for(b in 1:nrow(redcap)){ - if(is.na(redcap$demo_b_pronouns_s1_r1_e1[b])){demoDat$pron[b] <- 'undisclosed'} - else if(redcap$demo_b_pronouns_s1_r1_e1[b]==1){demoDat$pron[b] <- 'she/her'} - else if(redcap$demo_b_pronouns_s1_r1_e1[b]==2){demoDat$pron[b] <- 'he/him'} - else if(redcap$demo_b_pronouns_s1_r1_e1[b]==3){demoDat$pron[b] <- 'they/them'} - else if(redcap$demo_b_pronouns_s1_r1_e1[b]==5){demoDat$pron[b] <- 'other'} - else{demoDat$pron[b] <- 'undisclosed'} -} +# for(b in 1:nrow(redcap)){ ... } +# 8 lines, 466 chars; boilerplatey + +# try rewriting as case_when +demoDat$pron <- case_match (redcap$demo_b_pronouns_s1_r1_e1, + 1 ~ "she/her", 2 ~ "he/him", 3 ~ "they/them", + 5 ~ "other", .default = "undisclosed") +# `.default` catches both NA and everything else + #ethnicity affiliation: map to text description -for(c in 1:nrow(redcap)){ - if(redcap$demo_b_ethnic_s1_r1_e1___1[c]==1){demoDat$ethnic[c] <- 'AI'} #american indian/alaska native - else if(redcap$demo_b_ethnic_s1_r1_e1___2[c]==1){demoDat$ethnic[c] <- 'A'} #asian - else if(redcap$demo_b_ethnic_s1_r1_e1___3[c]==1){demoDat$ethnic[c] <- 'AA'} #african american - else if(redcap$demo_b_ethnic_s1_r1_e1___4[c]==1){demoDat$ethnic[c] <- 'LX'} #hispanic/latinx - else if(redcap$demo_b_ethnic_s1_r1_e1___5[c]==1){demoDat$ethnic[c] <- 'ME'} #middle eastern - else if(redcap$demo_b_ethnic_s1_r1_e1___6[c]==1){demoDat$ethnic[c] <- 'PI'} #pacific islander - else if(redcap$demo_b_ethnic_s1_r1_e1___7[c]==1){demoDat$ethnic[c] <- 'W'} #white - else if(redcap$demo_b_ethnic_s1_r1_e1___8[c]==1){demoDat$ethnic[c] <- 'O'} #other - else{demoDat$ethnic[c] <- 'UND'} #undisclosed -} +# wait, I don't think this lets people be multiple races +demoDat$ethnic <- case_when( + redcap$demo_b_ethnic_s1_r1_e1___1 == 1 ~ 'AI', #american indian/alaska native + redcap$demo_b_ethnic_s1_r1_e1___2 == 1 ~ 'A', #asian + redcap$demo_b_ethnic_s1_r1_e1___3 == 1 ~ 'AA', #african american + redcap$demo_b_ethnic_s1_r1_e1___4 == 1 ~ 'LX', #hispanic/latinx + redcap$demo_b_ethnic_s1_r1_e1___5 == 1 ~ 'ME', #middle eastern + redcap$demo_b_ethnic_s1_r1_e1___6 == 1 ~ 'PI', #pacific islander + redcap$demo_b_ethnic_s1_r1_e1___7 == 1 ~ 'W', #white + redcap$demo_b_ethnic_s1_r1_e1___8 == 1 ~ 'O', #other + .default = 'UND' #undisclosed +) #social class affiliation: replace numerical values with text description -for(d in 1:nrow(redcap)){ - if(is.na(redcap$demo_b_socclass_s1_r1_e1[d])){demoDat$socclass[d] <- 'undisclosed'} - else if(redcap$demo_b_socclass_s1_r1_e1[d]==1){demoDat$socclass[d] <- 'poor'} - else if(redcap$demo_b_socclass_s1_r1_e1[d]==2){demoDat$socclass[d] <- 'working'} - else 
if(redcap$demo_b_socclass_s1_r1_e1[d]==3){demoDat$socclass[d] <- 'middle'} - else if(redcap$demo_b_socclass_s1_r1_e1[d]==4){demoDat$socclass[d] <- 'affluent'} - else{demoDat$socclass[d] <- 'undisclosed'} -} +demoDat$socclass <- case_match(redcap$demo_b_socclass_s1_r1_e1, + 1 ~ "poor", 2 ~ "working", 3 ~ "middle", + 4 ~ "affluent", .default = "undisclosed") #communication disorders diagnoses: sum across childhood, adolescence, and adulthood -for(e in 1:nrow(redcap)){ - demoDat$commdis[e] <- sum(redcap$demo_b_comdiskid_s1_r1_e1[e], - redcap$demo_b_comdisteen_s1_r1_e1[e], - redcap$demo_b_comdisad_s1_r1_e[e]) -} +# nb. there was a typo in the old version: no adult diagnoses were being checked +# because the column name did not exist and `sum` with the df$col syntax did not +# catch that +demoDat$commdis <- select(redcap, matches("demo_b_comdis.*e1")) %>% rowSums #language history: transfer directly -for(f in 1:nrow(redcap)){ - demoDat$eng[f] <- redcap$demo_b_eng_s1_r1_e1[match(demoDat$record_id[f], redcap$record_id)] #participant monolingualism - demoDat$langhis[f] <- redcap$demo_b_langhis_s1_r1_e1[match(demoDat$record_id[f], redcap$record_id)] #participant language history - demoDat$ageen[f] <- redcap$demo_b_ageen_s1_r1_e1[match(demoDat$record_id[f], redcap$record_id)] #participant age of English acquisition - demoDat$profen[f] <- redcap$demo_b_profen_s1_r1_e1[match(demoDat$record_id[f], redcap$record_id)] #participant English proficiency -} +demoDat$eng <- redcap$demo_b_eng_s1_r1_e1 #participant monolingualism +demoDat$langhis <- redcap$demo_b_langhis_s1_r1_e1 #participant language history +demoDat$ageen <- redcap$demo_b_ageen_s1_r1_e1 #participant age of English acquisition +demoDat$profen <- redcap$demo_b_profen_s1_r1_e1 #participant English proficiency #mood and mood disorders: transfer directly -for(g in 1:nrow(redcap)){ - demoDat$bfne[g] <- redcap$bfne_b_scrdTotal[match(demoDat$record_id[g], redcap$record_id)] #bfne total score - demoDat$phq8[g] <- redcap$phq8_scrdTotal[match(demoDat$record_id[g], redcap$record_id)] #phq8 depression scale - demoDat$scaaredTotal[g] <- redcap$scaared_b_scrdTotal[match(demoDat$record_id[g], redcap$record_id)] #scaared total anxiety - demoDat$scaaredGA[g] <- redcap$scaared_b_scrdGA[match(demoDat$record_id[g], redcap$record_id)] #scaared general anxiety - demoDat$scaaredSoc[g] <- redcap$scaared_b_scrdSoc[match(demoDat$record_id[g], redcap$record_id)] #scaared social phobias - demoDat$sps[g] <- redcap$sias6sps6_b_scrdSPS[match(demoDat$record_id[g], redcap$record_id)] #sps social phobia scale -} +demoDat$bfne <- redcap$bfne_b_scrdTotal +demoDat$phq8 <- redcap$phq8_scrdTotal #phq8 depression scale +demoDat$scaaredTotal <- redcap$scaared_b_scrdTotal #scaared total anxiety +demoDat$scaaredGA <- redcap$scaared_b_scrdGA #scaared general anxiety +demoDat$scaaredSoc <- redcap$scaared_b_scrdSoc #scaared social phobias +demoDat$sps <- redcap$sias6sps6_b_scrdSPS #sps social phobia scale #age: pull from separate file -for(h in 1:nrow(demoDat)){ - demoDat$age[h] <- agedat$info_age_s1_r1_e1[match(demoDat$record_id[h], agedat$record_id)] -} +demoDat <- left_join(demoDat, # can't just assign: matching matters given new df + select(agedat, record_id, age = info_age_s1_r1_e1)) ### SECTION 3: SET UP DERIVED FIELDS FOR SPEED ANALYSES speedDat$readingTime <- speedDat$readEnd - speedDat$readStart -speedDat$id <- as.character(speedDat$id) # so we can join and it doesn't complain about type comparison df <- left_join(df, speedDat, by = c("id", "passage")) # now reading timestamps 
and duration are looped into df @@ -217,6 +205,8 @@ for(i in 1:nrow(df)){ df$scaaredSoc[i] <- demoDat$scaaredSoc[match(df$id[i], demoDat$record_id)] #participant social phobias (scaared) df$sps[i] <- demoDat$sps[match(df$id[i], demoDat$record_id)] #participant social phobias (sias6sps6) } +# first, test it- then we can delete the old versions +# expect it to fail tho #organize participant demographic variables df$sex <- as.factor(df$sex) From 6a5360bb7601fc63dc3caf2b3fee33de72550b1b Mon Sep 17 00:00:00 2001 From: l-acs Date: Wed, 31 Jan 2024 15:59:13 -0500 Subject: [PATCH 21/33] Revise for word level: inclusion checks, summaries --- code/prepWordLevelReadAloudBeta.R | 36 +++++++++++++++---------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/code/prepWordLevelReadAloudBeta.R b/code/prepWordLevelReadAloudBeta.R index c249858..1e30014 100644 --- a/code/prepWordLevelReadAloudBeta.R +++ b/code/prepWordLevelReadAloudBeta.R @@ -113,8 +113,8 @@ demoDat$pron <- case_match (redcap$demo_b_pronouns_s1_r1_e1, #ethnicity affiliation: map to text description -# wait, I don't think this lets people be multiple races -demoDat$ethnic <- case_when( +demoDat$ethnic <- case_when( # first, check for self-identifying 2+ ethnicities + redcap %>% select(matches("demo_b_ethnic_*")) %>% rowSums >= 2 ~ 'M', # multi redcap$demo_b_ethnic_s1_r1_e1___1 == 1 ~ 'AI', #american indian/alaska native redcap$demo_b_ethnic_s1_r1_e1___2 == 1 ~ 'A', #asian redcap$demo_b_ethnic_s1_r1_e1___3 == 1 ~ 'AA', #african american @@ -162,6 +162,8 @@ df <- left_join(df, speedDat, by = c("id", "passage")) # now reading timestamps ### SECTION 4: BUILD TRIAL-LEVEL DF (ADD DEMODAT, READDAT, ACCDAT, AND FREQDAT to DF) +# this takes over an hour and a half to run - todo: reimplement for efficiency +# (worthwhile given it's 200,000 rows to loop over) for(i in 1:nrow(df)){ subject <- df$id[i] #extract subject number for matching passage <- df$passage[i] #extract passage name for matching @@ -205,8 +207,7 @@ for(i in 1:nrow(df)){ df$scaaredSoc[i] <- demoDat$scaaredSoc[match(df$id[i], demoDat$record_id)] #participant social phobias (scaared) df$sps[i] <- demoDat$sps[match(df$id[i], demoDat$record_id)] #participant social phobias (sias6sps6) } -# first, test it- then we can delete the old versions -# expect it to fail tho +# succeeded given changes above it; to be revised #organize participant demographic variables df$sex <- as.factor(df$sex) @@ -215,6 +216,7 @@ df$ethnic <- as.factor(df$ethnic) df$socclass <- as.factor(df$socclass) # compute speed +# nb these are passage level data but are tracked by word df$timePerSyllable <- df$readingTime / df$lenSyll df$timePerWord <- df$readingTime / df$lenWord @@ -222,25 +224,21 @@ df$timePerWord <- df$readingTime / df$lenWord ### SECTION 5: CROSS-CHECK ALL PARTICIPANTS MET INCLUSION CRITERIA # note: given the time required to annotated errors, only participants who met # inclusion criteria were annotated -#sum(df$eng==1 & df$langhis %in% c(2,4) & df$ageen>6) #confirm all subjects monolingual English OR natively bilingual OR learned English before age 6 -#sum(df$commdis>0) #confirm no subject diagnosed with any communication disorder -#sum(df$profen>3, na.rm=TRUE)/20 #one remaining subject (150060) rates own English proficiency as not "elementary" or "not proficient", but reads fluidly and achieved 80% accuracy on challenge questions, so not excluded +sum(df$eng==1 & df$langhis %in% c(2,4) & df$ageen>6) #confirm all subjects monolingual English OR natively bilingual OR learned English 
before age 6 +sum(df$commdis>0) #confirm no subject diagnosed with any communication disorder +filter(df, profen > 3) %>% select(id) %>% unique %>% nrow #one remaining subject (150060) rates own English proficiency as not "elementary" or "not proficient", but reads fluidly and achieved 80% accuracy on challenge questions, so not excluded #extract age and sex stats - # all these values are just in case they're useful - not needed per se for later # steps of the logic in this script +summary_unique <- function(df, key, column, f = summary) { + unique(select(df, column, key))[[column]] %>% f +} + +c("age", "sex", "pronouns", "ethnic", "socclass") %>% + map(\(col) summary_unique(df, "id", col)) # `map` not `for`: return, not print -summary(df$age) #age range and mean -sd(df$age) #age standard deviation -summary(df$sex)/20 #number of participants by sex -summary(df$sex)/20 / (nrow(df)/20) #percentage of participants by sex -summary(df$pronouns)/20 #number of participants by preferred pronoun -summary(df$pronouns)/20 / (nrow(df)/20) #percentage of participants by preferred pronoun -summary(df$ethnic)/20 #number of participants by ethnic affiliation -summary(df$ethnic)/20 / (nrow(df)/20) #percentage of participants by ethnic affiliation -summary(df$socclass)/20 #number of participants by social class affiliation -summary(df$socclass)/20 / (nrow(df)/20) #percentage of participants by social class affiliation +summary_unique(df, "id", "age", f = sd) # also do stdev for age ### SECTION 6: TRIM PASSAGES DUE TO EXPERIMENTER ERROR @@ -249,7 +247,7 @@ dfTrim <- subset(dfTrim, !(dfTrim$passage=='sun')) #remove sun passage due to er ### SECTION 7: OUTPUT DATAFRAME -out_filename <- paste(out_path, "readAloudBetaData_", today, ".csv", sep="", collapse=NULL) +out_filename <- paste(out_path, "readAloudBetaData-wordLevel_", today, ".csv", sep="", collapse=NULL) write.csv(dfTrim, out_filename, row.names = FALSE) out_filename From 4e4d66069bb627a82ed38220d92ae4743bd27aa3 Mon Sep 17 00:00:00 2001 From: l-acs Date: Wed, 31 Jan 2024 21:00:55 -0500 Subject: [PATCH 22/33] Also summarize demographic stats by percent --- code/prepWordLevelReadAloudBeta.R | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/code/prepWordLevelReadAloudBeta.R b/code/prepWordLevelReadAloudBeta.R index 1e30014..aca099c 100644 --- a/code/prepWordLevelReadAloudBeta.R +++ b/code/prepWordLevelReadAloudBeta.R @@ -1,7 +1,7 @@ # readAloud-valence-beta Analysis Preparation # Authors: Luc Sahar and Jessica M. 
Alexander # NDCLab, Florida International University -# Last updated: 2024-01-30 +# Last updated: 2024-01-31 # INPUTS # data/df: behavioral (error-related) data, for each participant for each word @@ -235,9 +235,15 @@ summary_unique <- function(df, key, column, f = summary) { unique(select(df, column, key))[[column]] %>% f } -c("age", "sex", "pronouns", "ethnic", "socclass") %>% +demo_cols <- c("age", "sex", "pronouns", "ethnic", "socclass") +sample_size <- select(df, "id") %>% unique %>% nrow + +demo_cols %>% # as totals, for each of our demographic columns map(\(col) summary_unique(df, "id", col)) # `map` not `for`: return, not print +demo_cols %>% # as a percent + map(\(col) summary_unique(df, "id", col) / sample_size) + summary_unique(df, "id", "age", f = sd) # also do stdev for age @@ -250,21 +256,3 @@ dfTrim <- subset(dfTrim, !(dfTrim$passage=='sun')) #remove sun passage due to er out_filename <- paste(out_path, "readAloudBetaData-wordLevel_", today, ".csv", sep="", collapse=NULL) write.csv(dfTrim, out_filename, row.names = FALSE) out_filename - -# collapse_by_participant <- function(filename_in, filename_out) { -# by_participant <- read_csv(filename_in) %>% -# unique %>% # dedup -# group_by(id) %>% summarize(across(misprod:total_uncorrected_errors, sum)) # summarize by participant, across all passages -# # TODO change the columns selected, once more have been added to the output of the preproc script -# -# write_csv(by_participant, filename_out) -# return(filename_out) -# } -# -# -# # base = "~/Documents/ndclab/analysis-sandbox/github-structure-mirror/readAloud-valence-dataset/derivatives/preprocessed" -# base = "/home/data/NDClab/datasets/readAloud-valence-dataset/derivatives/preprocessed" -# preprocessed_summary_filename = "TODO" -# collapsed_filename = "TODO" -# -# collapse_by_participant(preprocessed_summary_file, collapsed_filename) From 086753ada89cbca0187f18d99146a6fe61911500 Mon Sep 17 00:00:00 2001 From: l-acs Date: Wed, 31 Jan 2024 21:09:20 -0500 Subject: [PATCH 23/33] Clean up unused material & out-of-date notes --- code/prepWordLevelReadAloudBeta.R | 37 +++++-------------------------- 1 file changed, 6 insertions(+), 31 deletions(-) diff --git a/code/prepWordLevelReadAloudBeta.R b/code/prepWordLevelReadAloudBeta.R index aca099c..1129de2 100644 --- a/code/prepWordLevelReadAloudBeta.R +++ b/code/prepWordLevelReadAloudBeta.R @@ -76,25 +76,12 @@ speedDat <- read.csv(speedDat_path, na.strings='NA') freqDat <- read.csv(freqDat_path, na.strings = 'NA') #organize data types -# df[,3:30] <- sapply(df[,3:30],as.numeric) -# not applicable for now, we'll see - -#add missing passages for 150086 so that nrow is divisible by 20 -# passages_read <- df$passage[which(df$id=="150086")] -# tempdf <- data.frame(matrix(nrow=0, ncol=ncol(df))) -# colnames(tempdf) <- colnames(df) -# for(passage in 1:length(all_passages)){ -# if(all_passages[passage] %in% passages_read){next}else{ -# tempdf[nrow(tempdf) + 1,] <- c("150086", all_passages[passage], rep(NA, 30)) -# } -# } -# df <- rbind(df, tempdf) + ### SECTION 2: BUILD DEMOGRAPHIC DATA DF demoDat <- redcap[,c(1,5)] -# demoDat_imperative <- redcap[,c(1,5)]; demoDat_dplyr <- demoDat_imperative -# as we refactor, our test case is: all.equal(demoDat_imperative, demoDat_dplyr) -# this has been confirmed to work for with every new column added +# while refactoring, the test case: all.equal(demoDat_imperative, demoDat_dplyr) +# has been confirmed to work with every new column added #biological sex: replace numerical values with text description 
demoDat$sex <- case_match(redcap$demo_b_sex_s1_r1_e1, @@ -102,10 +89,6 @@ demoDat$sex <- case_match(redcap$demo_b_sex_s1_r1_e1, 4 ~ 'other', 5 ~ 'unknown', .default = 'undisclosed') #preferred pronouns: replace numerical values with text description -# for(b in 1:nrow(redcap)){ ... } -# 8 lines, 466 chars; boilerplatey - -# try rewriting as case_when demoDat$pron <- case_match (redcap$demo_b_pronouns_s1_r1_e1, 1 ~ "she/her", 2 ~ "he/him", 3 ~ "they/them", 5 ~ "other", .default = "undisclosed") @@ -132,9 +115,6 @@ demoDat$socclass <- case_match(redcap$demo_b_socclass_s1_r1_e1, 4 ~ "affluent", .default = "undisclosed") #communication disorders diagnoses: sum across childhood, adolescence, and adulthood -# nb. there was a typo in the old version: no adult diagnoses were being checked -# because the column name did not exist and `sum` with the df$col syntax did not -# catch that demoDat$commdis <- select(redcap, matches("demo_b_comdis.*e1")) %>% rowSums #language history: transfer directly @@ -168,15 +148,10 @@ for(i in 1:nrow(df)){ subject <- df$id[i] #extract subject number for matching passage <- df$passage[i] #extract passage name for matching - #production errors of interest + # production errors of interest are already included # misprod = raw misproduction errors # hesitation = raw hesitations - # words_with_misprod = distinct words with misproduction errors - # words_with_hes = distinct words with pre-word or word-internal hesitation - # misprod_rate = rate of raw misproduction errors - # hesitation_rate = rate of raw hesitations - # words_with_misprod_rate = rate of word-level misproduction errors - # words_with_hes_rate = rate of word-level hesitations + # misprod-hesitation linear sequences and vice versa #extract passage characteristics from readDat df$lenSyll[i] <- sum(readDat$lengthSyll[which(readDat$passage==passage)]) #length of passage in syllables @@ -216,7 +191,7 @@ df$ethnic <- as.factor(df$ethnic) df$socclass <- as.factor(df$socclass) # compute speed -# nb these are passage level data but are tracked by word +# nb these are passage-level data but are tracked in every row, i.e. 
word level df$timePerSyllable <- df$readingTime / df$lenSyll df$timePerWord <- df$readingTime / df$lenWord From e16cd3e93ee019dbc18be72ea0a8f85a1f8c2b91 Mon Sep 17 00:00:00 2001 From: l-acs Date: Wed, 31 Jan 2024 21:21:10 -0500 Subject: [PATCH 24/33] Keep code/analysisReadAloudBeta.R --- ...ReadAloudBeta.R-move-to-code__analysisWordLevelReadAloudBeta.R | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename code/analysisReadAloudBeta.R => code__analysisReadAloudBeta.R-move-to-code__analysisWordLevelReadAloudBeta.R (100%) diff --git a/code/analysisReadAloudBeta.R b/code__analysisReadAloudBeta.R-move-to-code__analysisWordLevelReadAloudBeta.R similarity index 100% rename from code/analysisReadAloudBeta.R rename to code__analysisReadAloudBeta.R-move-to-code__analysisWordLevelReadAloudBeta.R From 0e41693addbbce5c420a70c85649de45a270b267 Mon Sep 17 00:00:00 2001 From: l-acs Date: Wed, 31 Jan 2024 21:21:10 -0500 Subject: [PATCH 25/33] Copy code/analysisReadAloudBeta.R into code/analysisWordLevelReadAloudBeta.R --- .../{analysisReadAloudBeta.R => analysisWordLevelReadAloudBeta.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename code/{analysisReadAloudBeta.R => analysisWordLevelReadAloudBeta.R} (100%) diff --git a/code/analysisReadAloudBeta.R b/code/analysisWordLevelReadAloudBeta.R similarity index 100% rename from code/analysisReadAloudBeta.R rename to code/analysisWordLevelReadAloudBeta.R From f27304c4f3fa5346808d77151546640f83be8bb2 Mon Sep 17 00:00:00 2001 From: l-acs Date: Wed, 31 Jan 2024 21:21:10 -0500 Subject: [PATCH 26/33] Revert analysisReadAloudBeta.R, keeping history --- .../analysisReadAloudBeta.R | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename code__analysisReadAloudBeta.R-move-to-code__analysisWordLevelReadAloudBeta.R => code/analysisReadAloudBeta.R (100%) diff --git a/code__analysisReadAloudBeta.R-move-to-code__analysisWordLevelReadAloudBeta.R b/code/analysisReadAloudBeta.R similarity index 100% rename from code__analysisReadAloudBeta.R-move-to-code__analysisWordLevelReadAloudBeta.R rename to code/analysisReadAloudBeta.R From 34b5e472519e189b01ce6296b2b915e735d0e4da Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 1 Feb 2024 02:15:31 -0500 Subject: [PATCH 27/33] Begin revising models and pre-analysis for word level --- code/analysisWordLevelReadAloudBeta.R | 150 ++++++++++++++------------ 1 file changed, 80 insertions(+), 70 deletions(-) diff --git a/code/analysisWordLevelReadAloudBeta.R b/code/analysisWordLevelReadAloudBeta.R index 2509334..246a049 100644 --- a/code/analysisWordLevelReadAloudBeta.R +++ b/code/analysisWordLevelReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Reading Task Analyses # Authors: Luc Sahar, Jessica M. 
Alexander -# Last Updated: 2023-08-25 +# Last Updated: 2024-02-01 # INPUTS # data/df: behavioral data, for each participant on each passage, with relevant participant information and trial-level stimulus information @@ -140,10 +140,11 @@ today <- format(today, "%Y%m%d") # data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230810.csv' # data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230815.csv' # data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230816.csv' -data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230825.csv' -to_omit <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/input/passages-to-omit_20230810.csv' +# data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230825.csv' +data <- '~/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData-wordLevel_20240130.csv' +to_omit <- '/home/luc/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/input/passages-to-omit_20230810.csv' # out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' -out_path <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/' +out_path <- '/home/luc/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/derivatives/' #read in data df <- read.csv(data, row.names = NULL) # output of prep script @@ -159,16 +160,23 @@ df$socclass <- as.factor(df$socclass) # all these values are just in case they're useful - not needed per se for later # steps of the logic in this script -summary(df$age) -sd(df$age) -summary(df$sex)/18 -summary(df$sex)/18 / (nrow(df)/18) -summary(df$pronouns)/18 -summary(df$pronouns)/18 / (nrow(df)/18) -summary(df$ethnic)/18 -summary(df$ethnic)/18 / (nrow(df)/18) -summary(df$socclass)/18 -summary(df$socclass)/18 / (nrow(df)/18) +# from prep script: +demo_cols <- c("age", "sex", "pronouns", "ethnic", "socclass") +sample_size <- select(df, "id") %>% unique %>% nrow + + +summary_unique <- function(df, key, column, f = summary) { + unique(select(df, column, key))[[column]] %>% f +} + +demo_cols %>% # as totals, for each of our demographic columns + map(\(col) summary_unique(df, "id", col)) # `map` not `for`: return, not print + +demo_cols %>% # as a percent + map(\(col) summary_unique(df, "id", col) / sample_size) + +summary_unique(df, "id", "age", f = sd) # also do stdev for age + #remove participants who were not engaged in the task dfTrim <- df @@ -191,18 +199,17 @@ dfTrim <- subset(dfTrim, challengeAvgSub>0.5) length(unique(df$id)) - length(unique(dfTrim$id)) #number of participants removed due to distraction or low accuracy #calculate average accuracy -mean(dfTrim$challengeAvgSub) -sd(dfTrim$challengeAvgSub) - -# calculate average speed -mean(dfTrim$timePerSyllable, na.rm=TRUE) -sd(dfTrim$timePerSyllable, na.rm=TRUE) -mean(dfTrim$timePerWord, na.rm=TRUE) -sd(dfTrim$timePerWord, na.rm=TRUE) +summary_unique(dfTrim, "id", "challengeAvgSub") +summary_unique(dfTrim, "id", "challengeAvgSub", f = sd) +# calculate average speed per *person* per *passage* +unique(select(dfTrim, timePerSyllable, id, passage))$timePerSyllable %>% mean(na.rm = TRUE) +unique(select(dfTrim, timePerSyllable, id, passage))$timePerSyllable %>% sd(na.rm = TRUE) +unique(select(dfTrim, timePerWord, id, passage))$timePerWord %>% mean(na.rm = TRUE) +unique(select(dfTrim, timePerWord, id, passage))$timePerWord %>% sd(na.rm = TRUE) ### 
SECTION 2: PASSAGE-LEVEL TRIMMING -passage_no_before_trimming <- nrow(dfTrim) +passage_no_before_trimming <- summary_unique(dfTrim, "id", "passage", f = length) #remove passages with high omissions (participant did not complete reading) or other problems (someone else is in the room, etc.) # e.g. vegas 150013 @@ -211,7 +218,7 @@ dfTrim <- anti_join(dfTrim, by = join_by(id == participant, passage == passage)) -passage_no_after_trim1 <- nrow(dfTrim) +passage_no_after_trim1 <- summary_unique(dfTrim, "id", "passage", f = length) passage_no_before_trimming - passage_no_after_trim1 #number of passages trimmed (passage_no_before_trimming - passage_no_after_trim1) / passage_no_before_trimming #percentage of passages trimmed @@ -229,7 +236,7 @@ dfTrim <- filter(dfTrim, !is.na(timePerSyllable)) # this ends up only dropping 0083, caramel - the other three already end up # getting dropped based on other criteria -passage_no_after_trim2 <- nrow(dfTrim) +passage_no_after_trim2 <- summary_unique(dfTrim, "id", "passage", f = length) passage_no_after_trim1 - passage_no_after_trim2 #number of passages trimmed (passage_no_after_trim1 - passage_no_after_trim2) / passage_no_after_trim1 #percentage of passages trimmed of last bunch (passage_no_after_trim1 - passage_no_after_trim2) / passage_no_before_trimming #percentage of passages trimmed of whole @@ -259,8 +266,8 @@ errorDat$avgSyllPerWord_gmc <- errorDat$avgSyllPerWord - mean(errorDat$avgSyllPe # LS additions 8/11/23 # errorDat$errors <- errorDat$errors - mean(errorDat$errors) # errorDat$correction <- errorDat$corrections - mean(errorDat$corrections) -errorDat$error_rate <- errorDat$errors / errorDat$lenSyll -errorDat$correction_rate <- errorDat$corrections / errorDat$lenSyll +# errorDat$error_rate <- errorDat$errors / errorDat$lenSyll +# errorDat$correction_rate <- errorDat$corrections / errorDat$lenSyll errorDat$timePerSyllable_gmc <- errorDat$timePerSyllable - mean(errorDat$timePerSyllable) errorDat$timePerWord_gmc <- errorDat$timePerWord - mean(errorDat$timePerWord) @@ -269,24 +276,27 @@ errorDat$timePerWord_gmc <- errorDat$timePerWord - mean(errorDat$timePerWord) #extract demo stats -errorDatStats <- subset(errorDat, !duplicated(errorDat$id)) -summary(errorDatStats$age) -sd(errorDatStats$age) -summary(errorDatStats$sex) -summary(errorDatStats$sex) / length(unique(errorDatStats$id)) -summary(errorDatStats$pronouns) -summary(errorDatStats$pronouns) / length(unique(errorDatStats$id)) -summary(errorDatStats$ethnic) -summary(errorDatStats$ethnic) / length(unique(errorDatStats$id)) -summary(errorDatStats$socclass) -summary(errorDatStats$socclass) / length(unique(errorDatStats$id)) - -# Reading speed stats (ls additions 8/16/23) -summary(errorDatStats$timePerSyllable) -summary(errorDatStats$timePerWord) - -summary(errorDatStats$timePerSyllable_gmc) -summary(errorDatStats$timePerWord_gmc) +# fixme: these are at passage level, we need to `(summary_)unique` +# actually, nvm: we already have all these from prep +# errorDatStats <- subset(errorDat, !duplicated(errorDat$id)) +# errorDatStats <- +# summary(errorDatStats$age) +# sd(errorDatStats$age) +# summary(errorDatStats$sex) +# summary(errorDatStats$sex) / length(unique(errorDatStats$id)) +# summary(errorDatStats$pronouns) +# summary(errorDatStats$pronouns) / length(unique(errorDatStats$id)) +# summary(errorDatStats$ethnic) +# summary(errorDatStats$ethnic) / length(unique(errorDatStats$id)) +# summary(errorDatStats$socclass) +# summary(errorDatStats$socclass) / length(unique(errorDatStats$id)) +# +# # Reading 
speed stats (ls additions 8/16/23) +# summary(errorDatStats$timePerSyllable) +# summary(errorDatStats$timePerWord) +# +# summary(errorDatStats$timePerSyllable_gmc) +# summary(errorDatStats$timePerWord_gmc) ### SECTION 3.5: preparing for misprod-hes sequential analyses @@ -329,24 +339,24 @@ errorDatLongHesWithRelMisprod$misprod_position <- as.factor(errorDatLongHesWithR ### SECTION 4: MODEL RESULTS #misprod_rate x bfne -model1 <- lmerTest::lmer(misprod_rate ~ bfne_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model1) +# model1 <- lmerTest::lmer(misprod ~ bfne_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# summary(model1) #misprod_rate x scaaredSoc -model2 <- lmerTest::lmer(misprod_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), +model2 <- lmerTest::lmer(misprod ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(model2) #misprod_rate x sps -model3 <- lmerTest::lmer(misprod_rate ~ sps_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model3) +# model3 <- lmerTest::lmer(misprod_rate ~ sps_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# summary(model3) #hesitation_rate x bfne -model4 <- lmerTest::lmer(hesitation_rate ~ bfne_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model4) +# model4 <- lmerTest::lmer(hesitation_rate ~ bfne_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# summary(model4) #hesitation_rate x scaaredSoc model5 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), @@ -354,14 +364,14 @@ model5 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + (1|id) + (1|passage) summary(model5) #hesitation_rate x sps -model6 <- lmerTest::lmer(hesitation_rate ~ sps_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model6) +# model6 <- lmerTest::lmer(hesitation_rate ~ sps_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# summary(model6) #words_with_misprod_rate x bfne -model7 <- lmerTest::lmer(words_with_misprod_rate ~ bfne_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model7) +# model7 <- lmerTest::lmer(words_with_misprod_rate ~ bfne_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# summary(model7) #words_with_misprod_rate x scaaredSoc model8 <- lmerTest::lmer(words_with_misprod_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), @@ -369,14 +379,14 @@ model8 <- lmerTest::lmer(words_with_misprod_rate ~ scaaredSoc_gmc + (1|id) + (1| summary(model8) #words_with_misprod_rate x sps -model9 <- lmerTest::lmer(words_with_misprod_rate ~ sps_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model9) +# model9 <- lmerTest::lmer(words_with_misprod_rate ~ sps_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# summary(model9) #words_with_hes_rate x bfne -model10 <- lmerTest::lmer(words_with_hes_rate ~ bfne_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model10) +# model10 <- lmerTest::lmer(words_with_hes_rate ~ bfne_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# summary(model10) #words_with_hes_rate x scaaredSoc model11 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), @@ -384,9 +394,9 @@ model11 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + (1|id) + (1|pas summary(model11) #words_with_hes_rate x sps -model12 <- lmerTest::lmer(words_with_hes_rate ~ sps_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model12) +# model12 <- lmerTest::lmer(words_with_hes_rate ~ sps_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# 
summary(model12) #### supplemental analyses From f4b4662e19ee5f10066d0ba116205ebf1c5183e0 Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 1 Feb 2024 02:42:34 -0500 Subject: [PATCH 28/33] Set up models for word frequency at the word level --- code/analysisWordLevelReadAloudBeta.R | 340 +++++++++++--------------- 1 file changed, 144 insertions(+), 196 deletions(-) diff --git a/code/analysisWordLevelReadAloudBeta.R b/code/analysisWordLevelReadAloudBeta.R index 246a049..8438b18 100644 --- a/code/analysisWordLevelReadAloudBeta.R +++ b/code/analysisWordLevelReadAloudBeta.R @@ -262,139 +262,127 @@ errorDat$sps_gmc <- errorDat$sps - mean(errorDat$sps) errorDat$lenSyll_gmc <- errorDat$lenSyll - mean(errorDat$lenSyll) errorDat$lenWord_gmc <- errorDat$lenWord - mean(errorDat$lenWord) errorDat$avgSyllPerWord_gmc <- errorDat$avgSyllPerWord - mean(errorDat$avgSyllPerWord) - -# LS additions 8/11/23 -# errorDat$errors <- errorDat$errors - mean(errorDat$errors) -# errorDat$correction <- errorDat$corrections - mean(errorDat$corrections) -# errorDat$error_rate <- errorDat$errors / errorDat$lenSyll -# errorDat$correction_rate <- errorDat$corrections / errorDat$lenSyll - errorDat$timePerSyllable_gmc <- errorDat$timePerSyllable - mean(errorDat$timePerSyllable) errorDat$timePerWord_gmc <- errorDat$timePerWord - mean(errorDat$timePerWord) - - -#extract demo stats -# fixme: these are at passage level, we need to `(summary_)unique` -# actually, nvm: we already have all these from prep -# errorDatStats <- subset(errorDat, !duplicated(errorDat$id)) -# errorDatStats <- -# summary(errorDatStats$age) -# sd(errorDatStats$age) -# summary(errorDatStats$sex) -# summary(errorDatStats$sex) / length(unique(errorDatStats$id)) -# summary(errorDatStats$pronouns) -# summary(errorDatStats$pronouns) / length(unique(errorDatStats$id)) -# summary(errorDatStats$ethnic) -# summary(errorDatStats$ethnic) / length(unique(errorDatStats$id)) -# summary(errorDatStats$socclass) -# summary(errorDatStats$socclass) / length(unique(errorDatStats$id)) -# -# # Reading speed stats (ls additions 8/16/23) -# summary(errorDatStats$timePerSyllable) -# summary(errorDatStats$timePerWord) -# -# summary(errorDatStats$timePerSyllable_gmc) -# summary(errorDatStats$timePerWord_gmc) - - ### SECTION 3.5: preparing for misprod-hes sequential analyses # ignore the misprod-hes columns for now -errorDatMisprodHes <- select(errorDat, !contains("_syllables")) - -# First: look at a given misproduction and check for nearby hesitations -justMisprodWithHesBefore <- cbind(errorDatMisprodHes, - hes_position = 0, # "before", - misprod_tally = errorDat$misprod_with_hes_in_previous_syllables) - -justMisprodWithHesAfter <- cbind(errorDatMisprodHes, - hes_position = 1, # "after", - misprod_tally = errorDat$misprod_with_hes_in_next_syllables) - - -# stack the ones before and the ones after as rows of a single df (my attempt at long form) -errorDatLongMisprodWithRelHes <- rbind(justMisprodWithHesBefore, justMisprodWithHesAfter) - -# track the binary relative position as a factor -errorDatLongMisprodWithRelHes$hes_position <- as.factor(errorDatLongMisprodWithRelHes$hes_position) - -# Then: look at a given hesitation and check for nearby misproductions -justHesWithMisprodBefore <- cbind(errorDatMisprodHes, - misprod_position = 0, # "before", - hes_tally = errorDat$hes_with_misprod_in_previous_syllables) +# errorDatMisprodHes <- select(errorDat, !contains("_syllables")) +# +# # First: look at a given misproduction and check for nearby hesitations +# justMisprodWithHesBefore <- 
cbind(errorDatMisprodHes, +# hes_position = 0, # "before", +# misprod_tally = errorDat$misprod_with_hes_in_previous_syllables) +# +# justMisprodWithHesAfter <- cbind(errorDatMisprodHes, +# hes_position = 1, # "after", +# misprod_tally = errorDat$misprod_with_hes_in_next_syllables) -justHesWithMisprodAfter <- cbind(errorDatMisprodHes, - misprod_position = 1, # "after", - hes_tally = errorDat$hes_with_misprod_in_next_syllables) # stack the ones before and the ones after as rows of a single df (my attempt at long form) -errorDatLongHesWithRelMisprod <- rbind(justHesWithMisprodBefore, justHesWithMisprodAfter) - -# track the binary relative position as a factor -errorDatLongHesWithRelMisprod$misprod_position <- as.factor(errorDatLongHesWithRelMisprod$misprod_position) +# errorDatLongMisprodWithRelHes <- rbind(justMisprodWithHesBefore, justMisprodWithHesAfter) +# +# # track the binary relative position as a factor +# errorDatLongMisprodWithRelHes$hes_position <- as.factor(errorDatLongMisprodWithRelHes$hes_position) +# +# # Then: look at a given hesitation and check for nearby misproductions +# justHesWithMisprodBefore <- cbind(errorDatMisprodHes, +# misprod_position = 0, # "before", +# hes_tally = errorDat$hes_with_misprod_in_previous_syllables) +# +# justHesWithMisprodAfter <- cbind(errorDatMisprodHes, +# misprod_position = 1, # "after", +# hes_tally = errorDat$hes_with_misprod_in_next_syllables) +# +# # stack the ones before and the ones after as rows of a single df (my attempt at long form) +# errorDatLongHesWithRelMisprod <- rbind(justHesWithMisprodBefore, justHesWithMisprodAfter) +# +# # track the binary relative position as a factor +# errorDatLongHesWithRelMisprod$misprod_position <- as.factor(errorDatLongHesWithRelMisprod$misprod_position) ### SECTION 4: MODEL RESULTS -#misprod_rate x bfne +#misprod x bfne # model1 <- lmerTest::lmer(misprod ~ bfne_gmc + (1|id) + (1|passage), # data=errorDat, REML=TRUE) # summary(model1) -#misprod_rate x scaaredSoc +#misprod x scaaredSoc model2 <- lmerTest::lmer(misprod ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(model2) -#misprod_rate x sps -# model3 <- lmerTest::lmer(misprod_rate ~ sps_gmc + (1|id) + (1|passage), +#misprod x scaaredSoc control for word +model2.5 <- lmerTest::lmer(misprod ~ scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), + data=errorDat, REML=TRUE) +summary(model2.5) + + + +#misprod x sps +# model3 <- lmerTest::lmer(misprod ~ sps_gmc + (1|id) + (1|passage), # data=errorDat, REML=TRUE) # summary(model3) -#hesitation_rate x bfne -# model4 <- lmerTest::lmer(hesitation_rate ~ bfne_gmc + (1|id) + (1|passage), +#hesitation x bfne +# model4 <- lmerTest::lmer(hesitation ~ bfne_gmc + (1|id) + (1|passage), # data=errorDat, REML=TRUE) # summary(model4) -#hesitation_rate x scaaredSoc -model5 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), +#hesitation x scaaredSoc +model5 <- lmerTest::lmer(hesitation ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(model5) -#hesitation_rate x sps -# model6 <- lmerTest::lmer(hesitation_rate ~ sps_gmc + (1|id) + (1|passage), +# hesitation x scaaredSoc, control for word +model5.5 <- lmerTest::lmer(hesitation ~ scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), + data=errorDat, REML=TRUE) +summary(model5.5) + + +# hesitation x scaaredSoc, only control for word (and id; not passage) +model5.6 <- lmerTest::lmer(hesitation ~ scaaredSoc_gmc + (1|id) + (1|word), + data=errorDat, REML=TRUE) +summary(model5.6) + +# results are similar + + 
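+
+# A minimal follow-up sketch, not part of the original analysis plan (assumes
+# model2, model2.5, model5.5, and model5.6 fit above all converged): since the
+# fixed-effect estimates look similar with and without the extra random
+# intercepts, the nested models can be compared directly. anova() refits the
+# REML models with ML before the likelihood-ratio test and also reports AIC/BIC.
+anova(model2, model2.5)    # does adding (1|word) improve the misprod model?
+anova(model5.6, model5.5)  # is (1|passage) still needed once (1|word) is in?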
+#hesitation x sps +# model6 <- lmerTest::lmer(hesitation ~ sps_gmc + (1|id) + (1|passage), # data=errorDat, REML=TRUE) # summary(model6) -#words_with_misprod_rate x bfne -# model7 <- lmerTest::lmer(words_with_misprod_rate ~ bfne_gmc + (1|id) + (1|passage), +#words_with_misprod x bfne +# model7 <- lmerTest::lmer(words_with_misprod ~ bfne_gmc + (1|id) + (1|passage), # data=errorDat, REML=TRUE) # summary(model7) -#words_with_misprod_rate x scaaredSoc -model8 <- lmerTest::lmer(words_with_misprod_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model8) +#words_with_misprod x scaaredSoc +# model8 <- lmerTest::lmer(words_with_misprod ~ scaaredSoc_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# summary(model8) -#words_with_misprod_rate x sps -# model9 <- lmerTest::lmer(words_with_misprod_rate ~ sps_gmc + (1|id) + (1|passage), +#words_with_misprod x sps +# model9 <- lmerTest::lmer(words_with_misprod ~ sps_gmc + (1|id) + (1|passage), # data=errorDat, REML=TRUE) # summary(model9) -#words_with_hes_rate x bfne -# model10 <- lmerTest::lmer(words_with_hes_rate ~ bfne_gmc + (1|id) + (1|passage), +#words_with_hes x bfne +# model10 <- lmerTest::lmer(words_with_hes ~ bfne_gmc + (1|id) + (1|passage), # data=errorDat, REML=TRUE) # summary(model10) -#words_with_hes_rate x scaaredSoc -model11 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model11) +#words_with_hes x scaaredSoc +# model11 <- lmerTest::lmer(words_with_hes ~ scaaredSoc_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# summary(model11) -#words_with_hes_rate x sps -# model12 <- lmerTest::lmer(words_with_hes_rate ~ sps_gmc + (1|id) + (1|passage), +#words_with_hes x sps +# model12 <- lmerTest::lmer(words_with_hes ~ sps_gmc + (1|id) + (1|passage), # data=errorDat, REML=TRUE) # summary(model12) @@ -425,23 +413,23 @@ summary(f_model3) # Accuracy/comprehension as explained by disfluencies: hesitations per syllable -f_model4 <- glmer(challengeACC ~ hesitation_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model4) +# f_model4 <- glmer(challengeACC ~ hesitation + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model4) # Accuracy/comprehension as explained by disfluencies: hesitations per word -f_model5 <- glmer(challengeACC ~ words_with_hes_rate + (1|id) + (1|passage), +f_model5 <- glmer(challengeACC ~ hesitation + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model5) # Accuracy/comprehension as explained by errors: misproductions per syllable -f_model6 <- glmer(challengeACC ~ misprod_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model6) +# f_model6 <- glmer(challengeACC ~ misprod + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model6) # Accuracy/comprehension as explained by errors: misproductions per word -f_model7 <- glmer(challengeACC ~ words_with_misprod_rate + (1|id) + (1|passage), +f_model7 <- glmer(challengeACC ~ misprod + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model7) @@ -449,69 +437,69 @@ summary(f_model7) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per syllable with scaared -f_model8 <- glmer(challengeACC ~ hesitation_rate * scaaredSoc_gmc + (1|id) + (1|passage), +f_model8 <- glmer(challengeACC ~ hesitation * scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model8) # Accuracy/comprehension as 
explained by disfluencies: hesitations per word with scaared -f_model9 <- glmer(challengeACC ~ words_with_hes_rate * scaaredSoc_gmc + (1|id) + (1|passage), +f_model9 <- glmer(challengeACC ~ words_with_hes * scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model9) # Accuracy/comprehension as explained by errors: misproductions per syllable with scaared -f_model10 <- glmer(challengeACC ~ misprod_rate * scaaredSoc_gmc + (1|id) + (1|passage), +f_model10 <- glmer(challengeACC ~ misprod * scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model10) # Accuracy/comprehension as explained by errors: misproductions per word with scaared -f_model11 <- glmer(challengeACC ~ words_with_misprod_rate * scaaredSoc_gmc + (1|id) + (1|passage), +f_model11 <- glmer(challengeACC ~ words_with_misprod * scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model11) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per syllable with bfne -f_model12 <- glmer(challengeACC ~ hesitation_rate * bfne_gmc + (1|id) + (1|passage), +f_model12 <- glmer(challengeACC ~ hesitation * bfne_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model12) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per word with bfne -f_model13 <- glmer(challengeACC ~ words_with_hes_rate * bfne_gmc + (1|id) + (1|passage), +f_model13 <- glmer(challengeACC ~ words_with_hes * bfne_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model13) # Accuracy/comprehension as explained by errors *and* SA: misproductions per syllable with bfne -f_model14 <- glmer(challengeACC ~ misprod_rate * bfne_gmc + (1|id) + (1|passage), +f_model14 <- glmer(challengeACC ~ misprod * bfne_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model14) # Accuracy/comprehension as explained by errors *and* SA: misproductions per word with bfne -f_model15 <- glmer(challengeACC ~ words_with_misprod_rate * bfne_gmc + (1|id) + (1|passage), +f_model15 <- glmer(challengeACC ~ words_with_misprod * bfne_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model15) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per syllable with sps -f_model16 <- glmer(challengeACC ~ hesitation_rate * sps_gmc + (1|id) + (1|passage), +f_model16 <- glmer(challengeACC ~ hesitation * sps_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model16) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per word with sps -f_model17 <- glmer(challengeACC ~ words_with_hes_rate * sps_gmc + (1|id) + (1|passage), +f_model17 <- glmer(challengeACC ~ words_with_hes * sps_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model17) # Accuracy/comprehension as explained by errors *and* SA: misproductions per syllable with sps -f_model18 <- glmer(challengeACC ~ misprod_rate * sps_gmc + (1|id) + (1|passage), +f_model18 <- glmer(challengeACC ~ misprod * sps_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model18) # Accuracy/comprehension as explained by errors *and* SA: misproductions per word with sps -f_model19 <- glmer(challengeACC ~ words_with_misprod_rate * sps_gmc + (1|id) + (1|passage), +f_model19 <- glmer(challengeACC ~ words_with_misprod * sps_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model19) @@ -520,18 +508,18 @@ 
summary(f_model19) # Now, misproduction-hesitation relationships # Errors as explained by disfluency: rate of misproduced syllables from rate of hesitated syllables -f_model20 <- lmerTest::lmer(misprod_rate ~ hesitation_rate + (1|id) + (1|passage), +f_model20 <- lmerTest::lmer(misprod ~ hesitation + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model20) # *** # Errors as explained by disfluency: rate of misproduced words from rate of hesitated words -f_model21 <- lmerTest::lmer(words_with_misprod_rate ~ words_with_hes_rate + (1|id) + (1|passage), +f_model21 <- lmerTest::lmer(words_with_misprod ~ words_with_hes + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model21) # *** # Errors as explained by disfluency: rate of misproduced words from rate of hesitated syllables -f_model22 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate + (1|id) + (1|passage), +f_model22 <- lmerTest::lmer(words_with_misprod ~ hesitation + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model22) # *** @@ -540,130 +528,69 @@ summary(f_model22) # *** # Now, misproduction-hesitation interactions with social anxiety # Errors as explained by disfluency and SA: rate of misproduced syllables from rate of hesitated syllables and scaared -f_model23 <- lmerTest::lmer(misprod_rate ~ hesitation_rate * scaaredSoc_gmc + (1|id) + (1|passage), +f_model23 <- lmerTest::lmer(misprod ~ hesitation * scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model23) # Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated words and scaared -f_model24 <- lmerTest::lmer(words_with_misprod_rate ~ words_with_hes_rate * scaaredSoc_gmc + (1|id) + (1|passage), +f_model24 <- lmerTest::lmer(words_with_misprod ~ words_with_hes * scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model24) # Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated syllables and scaared -f_model25 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate * scaaredSoc_gmc + (1|id) + (1|passage), +f_model25 <- lmerTest::lmer(words_with_misprod ~ hesitation * scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model25) # Errors as explained by disfluency and SA: rate of misproduced syllables from rate of hesitated syllables and bfne -f_model26 <- lmerTest::lmer(misprod_rate ~ hesitation_rate * bfne_gmc + (1|id) + (1|passage), +f_model26 <- lmerTest::lmer(misprod ~ hesitation * bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model26) # Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated words and bfne -f_model27 <- lmerTest::lmer(words_with_misprod_rate ~ words_with_hes_rate * bfne_gmc + (1|id) + (1|passage), +f_model27 <- lmerTest::lmer(words_with_misprod ~ words_with_hes * bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model27) # Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated syllables and bfne -f_model28 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate * bfne_gmc + (1|id) + (1|passage), +f_model28 <- lmerTest::lmer(words_with_misprod ~ hesitation * bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model28) # Errors as explained by disfluency and SA: rate of misproduced syllables from rate of hesitated syllables and sps -f_model29 <- lmerTest::lmer(misprod_rate ~ hesitation_rate * sps_gmc + (1|id) + (1|passage), +f_model29 <- 
lmerTest::lmer(misprod ~ hesitation * sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model29) # Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated words and sps -f_model30 <- lmerTest::lmer(words_with_misprod_rate ~ words_with_hes_rate * sps_gmc + (1|id) + (1|passage), +f_model30 <- lmerTest::lmer(words_with_misprod ~ words_with_hes * sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model30) # Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated syllables and sps -f_model31 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate * sps_gmc + (1|id) + (1|passage), +f_model31 <- lmerTest::lmer(words_with_misprod ~ hesitation * sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model31) -# Now: see if reading speed plays into it - -# Does scaaredSoc predict reading speed? -# syllable level -rs_model_1 <- lmerTest::lmer(timePerSyllable_gmc ~ scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_1) - -# word level -rs_model_2 <- lmerTest::lmer(timePerWord_gmc ~ scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_2) - - -rs_model_1_bfne <- lmerTest::lmer(timePerSyllable_gmc ~ bfne_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_1_bfne) - -# word level -rs_model_2_bfne <- lmerTest::lmer(timePerWord_gmc ~ bfne_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_2_bfne) - -rs_model_1_sps <- lmerTest::lmer(timePerSyllable_gmc ~ sps_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_1_sps) - -# word level -rs_model_2_bfne <- lmerTest::lmer(timePerWord_gmc ~ sps_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_2_sps) - - - -# Does scaaredSoc predict reading speed? -# syllable level -rs_model_3 <- lmerTest::lmer(timePerSyllable ~ scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_3) - -# word level -rs_model_4 <- lmerTest::lmer(timePerWord ~ scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_4) - - - - # What happens when we control for age? -#hesitation_rate x scaaredSoc -age_model1 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + age_gmc + (1|id) + (1|passage), +#hesitation x scaaredSoc +age_model1 <- lmerTest::lmer(hesitation ~ scaaredSoc_gmc + age_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(age_model1) -#words_with_hes_rate x scaaredSoc -age_model2 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + age_gmc + (1|id) + (1|passage), +#words_with_hes x scaaredSoc +age_model2 <- lmerTest::lmer(words_with_hes ~ scaaredSoc_gmc + age_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(age_model2) -# And now ->> check our work -# Does our hesitation ~ scaaredSoc finding hold with reading speed controlled for? 
-# syllable level -rs_model_3 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + timePerSyllable_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_3) - -# word level -rs_model_4 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + timePerWord_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_4) - - # misprod-hes ordering # Is the number of hesitations adjacent to misproductions in a particular @@ -699,19 +626,40 @@ misprod_with_rel_hes_model_4 <- lmerTest::lmer(misprod_tally ~ hes_position * sc # Word frequency analysis -# Does a passage's average word frequency predict participants' hesitation rate or misproduction rate? -wordfreq_model_1 <- lmerTest::lmer(hesitation_rate ~ avgWordFreq + (1|id) + (1|passage), - data=errorDat, REML=TRUE) +# Does a word's frequency predict hesitation on that word? +errorDatAttestedFreqs <- filter(errorDat, log10frequency > 0) +wordfreq_model_1 <- lmerTest::lmer(hesitation ~ log10frequency + (1|id) + (1|passage), + data=errorDatAttestedFreqs, REML=TRUE) summary(wordfreq_model_1) -wordfreq_model_2 <- lmerTest::lmer(misprod_rate ~ avgWordFreq + (1|id) + (1|passage), - data=errorDat, REML=TRUE) +wordfreq_model_2 <- lmerTest::lmer(misprod ~ log10frequency + (1|id) + (1|passage), + data=errorDatAttestedFreqs, REML=TRUE) summary(wordfreq_model_2) + +# control for word, that must matter right? +wordfreq_model_1.5 <- lmerTest::lmer(hesitation ~ log10frequency + (1|id) + (1|passage) + (1|word), + data=errorDatAttestedFreqs, REML=TRUE) +summary(wordfreq_model_1.5) +wordfreq_model_2.5 <- lmerTest::lmer(misprod ~ log10frequency + (1|id) + (1|passage) + (1|word), + data=errorDatAttestedFreqs, REML=TRUE) +summary(wordfreq_model_2.5) + + # Do social anxiety and frequency interact to predict hesitation rate or misproduction rate? -wordfreq_model_3 <- lmerTest::lmer(hesitation_rate ~ avgWordFreq * scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(wordfreq_model_3) +wordfreq_model_3 <- lmerTest::lmer(hesitation ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDatAttestedFreqs, REML=TRUE) +summary(wordfreq_model_3) # yes! + +wordfreq_model_4 <- lmerTest::lmer(misprod ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDatAttestedFreqs, REML=TRUE) +summary(wordfreq_model_4) # no, not at all - well, p = 0.15 + + +# control for word, that must matter right? +wordfreq_model_3.5 <- lmerTest::lmer(hesitation ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), + data=errorDatAttestedFreqs, REML=TRUE) +summary(wordfreq_model_3.5) # still yes, very slightly higher p -wordfreq_model_4 <- lmerTest::lmer(misprod_rate ~ avgWordFreq * scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(wordfreq_model_4) +wordfreq_model_4.5 <- lmerTest::lmer(misprod ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), + data=errorDatAttestedFreqs, REML=TRUE) +summary(wordfreq_model_4.5) # still no, slightly lower p = 0.114 From e4dfc14dd463c8b6294f1a51579edfbcd601cf1b Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 1 Feb 2024 03:29:52 -0500 Subject: [PATCH 29/33] Set up models for misproduction-hesitation ordering at the word level. Other tweaks. 
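
Note on the reshaping this patch introduces (a toy sketch only, not part of the diff below; the
values are invented, but the object and column names mirror the ones added in this patch): each
word read contributes two rows to the long-form frame, one recording whether a hesitation falls in
the window before a misproduction on that word (hes_position = 0) and one for the window after it
(hes_position = 1).

    # base-R sketch of the stacking, with made-up values
    toy <- data.frame(id   = c("sub1", "sub1"),
                      word = c("ocean", "the"),
                      misprod_with_any_prior_hesitation    = c(1, 0),
                      misprod_with_any_upcoming_hesitation = c(0, 1))

    before <- cbind(toy, hes_position = 0,
                    misprod_in_adjacent_window = toy$misprod_with_any_prior_hesitation)
    after  <- cbind(toy, hes_position = 1,
                    misprod_in_adjacent_window = toy$misprod_with_any_upcoming_hesitation)

    toy_long <- rbind(before, after)                      # two rows per word read
    toy_long$hes_position <- as.factor(toy_long$hes_position)
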
--- code/analysisWordLevelReadAloudBeta.R | 188 +++++++++++++++----------- 1 file changed, 112 insertions(+), 76 deletions(-) diff --git a/code/analysisWordLevelReadAloudBeta.R b/code/analysisWordLevelReadAloudBeta.R index 8438b18..c87d87a 100644 --- a/code/analysisWordLevelReadAloudBeta.R +++ b/code/analysisWordLevelReadAloudBeta.R @@ -269,38 +269,38 @@ errorDat$timePerWord_gmc <- errorDat$timePerWord - mean(errorDat$timePerWord) ### SECTION 3.5: preparing for misprod-hes sequential analyses # ignore the misprod-hes columns for now -# errorDatMisprodHes <- select(errorDat, !contains("_syllables")) -# -# # First: look at a given misproduction and check for nearby hesitations -# justMisprodWithHesBefore <- cbind(errorDatMisprodHes, -# hes_position = 0, # "before", -# misprod_tally = errorDat$misprod_with_hes_in_previous_syllables) -# -# justMisprodWithHesAfter <- cbind(errorDatMisprodHes, -# hes_position = 1, # "after", -# misprod_tally = errorDat$misprod_with_hes_in_next_syllables) +errorDatMisprodHes <- select(errorDat, !contains("any_")) + +# First: look at a given misproduction and check for nearby hesitations +justMisprodWithHesBefore <- cbind(errorDatMisprodHes, + hes_position = 0, # "before", + misprod_in_adjacent_window = errorDat$misprod_with_any_prior_hesitation) + +justMisprodWithHesAfter <- cbind(errorDatMisprodHes, + hes_position = 1, # "after", + misprod_in_adjacent_window = errorDat$misprod_with_any_upcoming_hesitation) # stack the ones before and the ones after as rows of a single df (my attempt at long form) -# errorDatLongMisprodWithRelHes <- rbind(justMisprodWithHesBefore, justMisprodWithHesAfter) -# -# # track the binary relative position as a factor -# errorDatLongMisprodWithRelHes$hes_position <- as.factor(errorDatLongMisprodWithRelHes$hes_position) -# -# # Then: look at a given hesitation and check for nearby misproductions -# justHesWithMisprodBefore <- cbind(errorDatMisprodHes, -# misprod_position = 0, # "before", -# hes_tally = errorDat$hes_with_misprod_in_previous_syllables) -# -# justHesWithMisprodAfter <- cbind(errorDatMisprodHes, -# misprod_position = 1, # "after", -# hes_tally = errorDat$hes_with_misprod_in_next_syllables) -# -# # stack the ones before and the ones after as rows of a single df (my attempt at long form) -# errorDatLongHesWithRelMisprod <- rbind(justHesWithMisprodBefore, justHesWithMisprodAfter) -# -# # track the binary relative position as a factor -# errorDatLongHesWithRelMisprod$misprod_position <- as.factor(errorDatLongHesWithRelMisprod$misprod_position) +errorDatLongMisprodWithRelHes <- rbind(justMisprodWithHesBefore, justMisprodWithHesAfter) + +# track the binary relative position as a factor +errorDatLongMisprodWithRelHes$hes_position <- as.factor(errorDatLongMisprodWithRelHes$hes_position) + +# Then: look at a given hesitation and check for nearby misproductions +justHesWithMisprodBefore <- cbind(errorDatMisprodHes, + misprod_position = 0, # "before", + hes_in_adjacent_window = errorDat$hesitation_with_any_prior_misprod) + +justHesWithMisprodAfter <- cbind(errorDatMisprodHes, + misprod_position = 1, # "after", + hes_in_adjacent_window = errorDat$hesitation_with_any_upcoming_misprod) + +# stack the ones before and the ones after as rows of a single df (my attempt at long form) +errorDatLongHesWithRelMisprod <- rbind(justHesWithMisprodBefore, justHesWithMisprodAfter) + +# track the binary relative position as a factor +errorDatLongHesWithRelMisprod$misprod_position <- as.factor(errorDatLongHesWithRelMisprod$misprod_position) @@ -437,71 
+437,71 @@ summary(f_model7) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per syllable with scaared -f_model8 <- glmer(challengeACC ~ hesitation * scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model8) +# f_model8 <- glmer(challengeACC ~ hesitation * scaaredSoc_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model8) # Accuracy/comprehension as explained by disfluencies: hesitations per word with scaared -f_model9 <- glmer(challengeACC ~ words_with_hes * scaaredSoc_gmc + (1|id) + (1|passage), +f_model9 <- glmer(challengeACC ~ hesitation * scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model9) # Accuracy/comprehension as explained by errors: misproductions per syllable with scaared -f_model10 <- glmer(challengeACC ~ misprod * scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model10) +# f_model10 <- glmer(challengeACC ~ misprod * scaaredSoc_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model10) # Accuracy/comprehension as explained by errors: misproductions per word with scaared -f_model11 <- glmer(challengeACC ~ words_with_misprod * scaaredSoc_gmc + (1|id) + (1|passage), +f_model11 <- glmer(challengeACC ~ misprod * scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model11) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per syllable with bfne -f_model12 <- glmer(challengeACC ~ hesitation * bfne_gmc + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model12) +# f_model12 <- glmer(challengeACC ~ hesitation * bfne_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model12) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per word with bfne -f_model13 <- glmer(challengeACC ~ words_with_hes * bfne_gmc + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model13) +# f_model13 <- glmer(challengeACC ~ words_with_hes * bfne_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model13) # Accuracy/comprehension as explained by errors *and* SA: misproductions per syllable with bfne -f_model14 <- glmer(challengeACC ~ misprod * bfne_gmc + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model14) +# f_model14 <- glmer(challengeACC ~ misprod * bfne_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model14) # Accuracy/comprehension as explained by errors *and* SA: misproductions per word with bfne -f_model15 <- glmer(challengeACC ~ words_with_misprod * bfne_gmc + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model15) +# f_model15 <- glmer(challengeACC ~ words_with_misprod * bfne_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model15) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per syllable with sps -f_model16 <- glmer(challengeACC ~ hesitation * sps_gmc + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model16) +# f_model16 <- glmer(challengeACC ~ hesitation * sps_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model16) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per word with sps -f_model17 <- glmer(challengeACC ~ words_with_hes * sps_gmc + (1|id) + (1|passage), - data=errorDat, family = 
"binomial") -summary(f_model17) +# f_model17 <- glmer(challengeACC ~ words_with_hes * sps_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model17) # Accuracy/comprehension as explained by errors *and* SA: misproductions per syllable with sps -f_model18 <- glmer(challengeACC ~ misprod * sps_gmc + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model18) +# f_model18 <- glmer(challengeACC ~ misprod * sps_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model18) # Accuracy/comprehension as explained by errors *and* SA: misproductions per word with sps -f_model19 <- glmer(challengeACC ~ words_with_misprod * sps_gmc + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model19) +# f_model19 <- glmer(challengeACC ~ words_with_misprod * sps_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model19) @@ -522,7 +522,7 @@ summary(f_model21) # *** f_model22 <- lmerTest::lmer(words_with_misprod ~ hesitation + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model22) # *** - +# NB my * comments here (this section of models at least) are out of date # Now, misproduction-hesitation interactions with social anxiety @@ -593,12 +593,6 @@ summary(age_model2) # misprod-hes ordering -# Is the number of hesitations adjacent to misproductions in a particular -# reading predicted by the - -# Does the position of misproductions relative to hesitations - - # we have a number of occurrences of a misproduction in a particular position # relative to a passage's hesitations. does knowing the position (before/after) # predict the number of these sequences we have? @@ -606,22 +600,58 @@ summary(age_model2) # does misproduction location relative to a hesitation predict how many # instances we get in a particular reading? -hes_with_rel_misprod_model_1 <- lmerTest::lmer(hes_tally ~ misprod_position + (1|id) + (1|passage), +hes_with_rel_misprod_model_1 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position + (1|id) + (1|passage), + data=errorDatLongHesWithRelMisprod, REML=TRUE) +summary(hes_with_rel_misprod_model_1) # n.s., 0.271 + +misprod_with_rel_hes_model_1 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position + (1|id) + (1|passage), + data=errorDatLongMisprodWithRelHes, REML=TRUE) +summary(misprod_with_rel_hes_model_1) # n.s., 0.108 + +## does it interact with SA? +hes_with_rel_misprod_model_3 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDatLongHesWithRelMisprod, REML=TRUE) +summary(hes_with_rel_misprod_model_3) # n.s. + +misprod_with_rel_hes_model_4 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDatLongMisprodWithRelHes, REML=TRUE) +summary(misprod_with_rel_hes_model_4) # n.s. + +# what if we control for word? 
+hes_with_rel_misprod_model_1.5 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position + (1|id) + (1|passage) + (1|word), data=errorDatLongHesWithRelMisprod, REML=TRUE) -summary(hes_with_rel_misprod_model_1) +summary(hes_with_rel_misprod_model_1.5) # n.s., sameish -misprod_with_rel_hes_model_1 <- lmerTest::lmer(misprod_tally ~ hes_position + (1|id) + (1|passage), +misprod_with_rel_hes_model_1.5 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position + (1|id) + (1|passage) + (1|word), data=errorDatLongMisprodWithRelHes, REML=TRUE) -summary(misprod_with_rel_hes_model_1) +summary(misprod_with_rel_hes_model_1.5) # ., 0.0974 ## does it interact with SA? -hes_with_rel_misprod_model_3 <- lmerTest::lmer(hes_tally ~ misprod_position * scaaredSoc_gmc + (1|id) + (1|passage), +hes_with_rel_misprod_model_3.5 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), data=errorDatLongHesWithRelMisprod, REML=TRUE) -# summary(hes_with_rel_misprod_model_3) +summary(hes_with_rel_misprod_model_3.5) # n.s. -misprod_with_rel_hes_model_4 <- lmerTest::lmer(misprod_tally ~ hes_position * scaaredSoc_gmc + (1|id) + (1|passage), +misprod_with_rel_hes_model_4.5 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), data=errorDatLongMisprodWithRelHes, REML=TRUE) -# summary(misprod_with_rel_hes_model_4) +summary(misprod_with_rel_hes_model_4.5) # n.s. + +# and if we ignore passage? +hes_with_rel_misprod_model_1.6 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position + (1|id) + (1|word), + data=errorDatLongHesWithRelMisprod, REML=TRUE) +summary(hes_with_rel_misprod_model_1.6) # n.s., sameish + +misprod_with_rel_hes_model_1.6 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position + (1|id) + (1|word), + data=errorDatLongMisprodWithRelHes, REML=TRUE) +summary(misprod_with_rel_hes_model_1.6) # made no difference, as you might expect + +## does it interact with SA? 
+hes_with_rel_misprod_model_3.6 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position * scaaredSoc_gmc + (1|id) + (1|word), + data=errorDatLongHesWithRelMisprod, REML=TRUE) +summary(hes_with_rel_misprod_model_3.6) # "" + +misprod_with_rel_hes_model_4.6 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position * scaaredSoc_gmc + (1|id) + (1|word), + data=errorDatLongMisprodWithRelHes, REML=TRUE) +summary(misprod_with_rel_hes_model_4.6) # "" @@ -663,3 +693,9 @@ summary(wordfreq_model_3.5) # still yes, very slightly higher p wordfreq_model_4.5 <- lmerTest::lmer(misprod ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), data=errorDatAttestedFreqs, REML=TRUE) summary(wordfreq_model_4.5) # still no, slightly lower p = 0.114 + + +# hesitation ~ wf x SA +interact_plot(model = wordfreq_model_3, + pred = log10frequency, modx = scaaredSoc_gmc, interval = TRUE) + From ac0378d18f51b87fc9d32578d524060ee9bf9bc7 Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 1 Feb 2024 20:40:43 -0500 Subject: [PATCH 30/33] Clean up: sanitize for clarity before revised analyses --- code/analysisWordLevelReadAloudBeta.R | 78 +++++++++++++++------------ 1 file changed, 45 insertions(+), 33 deletions(-) diff --git a/code/analysisWordLevelReadAloudBeta.R b/code/analysisWordLevelReadAloudBeta.R index c87d87a..03dd2e4 100644 --- a/code/analysisWordLevelReadAloudBeta.R +++ b/code/analysisWordLevelReadAloudBeta.R @@ -110,6 +110,7 @@ ### SECTION 1: SETTING UP library(dplyr) +library(purrr) library(lme4) library(lmerTest) library(interactions) @@ -120,31 +121,17 @@ library(gridExtra) library(grid) library(cowplot) library(colorspace) -library(colorblindr) - -# ``` -# Warning in install.packages : -# package ‘colorblindr’ is not available for this version of R -# -# A version of this package for your version of R might be available elsewhere, -# see the ideas at -# https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages -# ``` +# library(colorblindr) #set up date for output file naming today <- Sys.Date() today <- format(today, "%Y%m%d") #set up directories for input/output data -# data <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/readAloudBetaData_20230630.csv' -# data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230810.csv' -# data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230815.csv' -# data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230816.csv' -# data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230825.csv' data <- '~/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData-wordLevel_20240130.csv' -to_omit <- '/home/luc/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/input/passages-to-omit_20230810.csv' +to_omit <- '~/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/input/passages-to-omit_20230810.csv' # out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' -out_path <- '/home/luc/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/derivatives/' +out_path <- '~/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/derivatives/' #read in data df <- read.csv(data, row.names = NULL) # output of prep script @@ -202,11 +189,6 @@ length(unique(df$id)) - length(unique(dfTrim$id)) #number of participants remove summary_unique(dfTrim, "id", "challengeAvgSub") summary_unique(dfTrim, "id", "challengeAvgSub", f = sd) -# calculate 
average speed per *person* per *passage* -unique(select(dfTrim, timePerSyllable, id, passage))$timePerSyllable %>% mean(na.rm = TRUE) -unique(select(dfTrim, timePerSyllable, id, passage))$timePerSyllable %>% sd(na.rm = TRUE) -unique(select(dfTrim, timePerWord, id, passage))$timePerWord %>% mean(na.rm = TRUE) -unique(select(dfTrim, timePerWord, id, passage))$timePerWord %>% sd(na.rm = TRUE) ### SECTION 2: PASSAGE-LEVEL TRIMMING passage_no_before_trimming <- summary_unique(dfTrim, "id", "passage", f = length) @@ -231,11 +213,6 @@ c(150083, "caramel") # N.B.: only one of four passages to have >= 5% of sylla c(150083, "cars") # N.B.: only one of four passages to have >= 5% of syllables omitted -dfTrim <- filter(dfTrim, !is.na(timePerSyllable)) -# itself, but without ones for which we have no reading data -# this ends up only dropping 0083, caramel - the other three already end up -# getting dropped based on other criteria - passage_no_after_trim2 <- summary_unique(dfTrim, "id", "passage", f = length) passage_no_after_trim1 - passage_no_after_trim2 #number of passages trimmed (passage_no_after_trim1 - passage_no_after_trim2) / passage_no_after_trim1 #percentage of passages trimmed of last bunch @@ -259,11 +236,7 @@ errorDat$scaaredTotal_gmc <- errorDat$scaaredTotal - mean(errorDat$scaaredTotal) errorDat$scaaredGA_gmc <- errorDat$scaaredGA - mean(errorDat$scaaredGA) errorDat$scaaredSoc_gmc <- errorDat$scaaredSoc - mean(errorDat$scaaredSoc) errorDat$sps_gmc <- errorDat$sps - mean(errorDat$sps) -errorDat$lenSyll_gmc <- errorDat$lenSyll - mean(errorDat$lenSyll) -errorDat$lenWord_gmc <- errorDat$lenWord - mean(errorDat$lenWord) -errorDat$avgSyllPerWord_gmc <- errorDat$avgSyllPerWord - mean(errorDat$avgSyllPerWord) -errorDat$timePerSyllable_gmc <- errorDat$timePerSyllable - mean(errorDat$timePerSyllable) -errorDat$timePerWord_gmc <- errorDat$timePerWord - mean(errorDat$timePerWord) + ### SECTION 3.5: preparing for misprod-hes sequential analyses @@ -674,6 +647,15 @@ wordfreq_model_2.5 <- lmerTest::lmer(misprod ~ log10frequency + (1|id) + (1|pass data=errorDatAttestedFreqs, REML=TRUE) summary(wordfreq_model_2.5) +# control for word w/o passage +wordfreq_model_1.6 <- lmerTest::lmer(hesitation ~ log10frequency + (1|id) + (1|word), + data=errorDatAttestedFreqs, REML=TRUE) +summary(wordfreq_model_1.6) +wordfreq_model_2.6 <- lmerTest::lmer(misprod ~ log10frequency + (1|id) + (1|word), + data=errorDatAttestedFreqs, REML=TRUE) +summary(wordfreq_model_2.6) + + # Do social anxiety and frequency interact to predict hesitation rate or misproduction rate? 
wordfreq_model_3 <- lmerTest::lmer(hesitation ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage), @@ -696,6 +678,36 @@ summary(wordfreq_model_4.5) # still no, slightly lower p = 0.114 # hesitation ~ wf x SA -interact_plot(model = wordfreq_model_3, +interact_plot(model = wordfreq_model_3.5, + pred = log10frequency, modx = scaaredSoc_gmc, interval = TRUE) + + + +interact_plot(model = wordfreq_model_4.5, pred = log10frequency, modx = scaaredSoc_gmc, interval = TRUE) + + + + + + + +summary(errorDatAttestedFreqs$log10frequency) + + +case_when(errorDatAttestedFreqs$log10frequency <= 2.711 ~ 1, errorDatAttestedFreqs$log10frequency <= 4.058 ~ 2, errorDatAttestedFreqs$log10frequency <= 5.609 ~ 3, .default = 4 ) + + +errorDatAttestedFreqs$freqCat <- case_when(errorDatAttestedFreqs$log10frequency <= 2.711 ~ "low", .default = "high") + +# errorDatAttestedFreqs$freqCat <- as.factor(errorDatAttestedFreqs$freqCat) + + +wordfreq_model_category_1 <- lmerTest::lmer(misprod ~ freqCat * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDatAttestedFreqs, REML=TRUE) +summary(wordfreq_model_category_1) + +interact_plot(model = wordfreq_model_category_1, + pred = freqCat, modx = scaaredSoc_gmc, points = TRUE, interval = TRUE) + From 60950b59ebce022dfd32cfa8697f359146013459 Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 1 Feb 2024 21:02:21 -0500 Subject: [PATCH 31/33] Update data dict to reflect word-level level of analysis --- code/analysisWordLevelReadAloudBeta.R | 87 ++++++++++++++------------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/code/analysisWordLevelReadAloudBeta.R b/code/analysisWordLevelReadAloudBeta.R index 03dd2e4..c115260 100644 --- a/code/analysisWordLevelReadAloudBeta.R +++ b/code/analysisWordLevelReadAloudBeta.R @@ -22,44 +22,45 @@ # # hes_position: # -# for long-form dataframes counting misproductions, this indicates whether the -# relevant count is the number of hesitations before (0) or after (1) those -# misproductions being counted in that row +# for long-form dataframes looking for misproductions, this indicates whether +# the relevant binary value is tracking whether a hesitation comes before (0) +# or after (1) the misproduction being counted in that row # # -# misprod_tally: +# misprod_in_adjacent_window: # -# conversely, in long-form dataframes counting misproductions, this column -# actually tracks how many misproductions there are in that reading -# (participant x passage) that have a hesitation in the relevant relative -# position +# conversely, in long-form dataframes looking at misproductions, this column +# actually tracks whether or not there is a misproduction in the relevant +# relative position (to a hesitation), within the specified window # justMisprodWithHesBefore: # -# this is the dataframe with every (participant x passage) reading, counting -# the number of misproductions with a nearby preceding hesitation +# this is the dataframe with every (participant x passage x word) read, +# checking for each word whether there is a misproduction with a nearby +# preceding hesitation # -# i.e., for each reading, it counts the number of times (misprod_tally) that a -# hesitation comes before a misproduction -- so for every entry, hes_position = 0 +# i.e., for each reading, it only looks at rows where a hesitation comes +# before a misproduction -- so for every entry, hes_position = 0 # # # justMisprodWithHesAfter # -# similarly, this is the dataframe with every (participant x passage) reading, -# counting the number of misproductions with a nearby 
following hesitation +# similarly, this is the dataframe with every (participant x passage x word) +# read, checking each time whether there is a misproduction with a nearby +# following hesitation # -# i.e., for each reading, it counts the number of times (misprod_tally) that a -# hesitation comes after a misproduction -- so for every entry, hes_position = 1 +# i.e., for each reading, it only looks at rows where a hesitation comes after +# a misproduction -- so for every entry, hes_position = 1 # # # errorDatLongMisprodWithRelHes: # -# this is the long-form dataframe, with two rows per reading (participant x -# passage): one for each position for a relative hesitation. i.e. this stacks -# the two dataframes that respectively have (1) every passage, with a count of -# misproductions for hes_position = 0, and (2) every passage, with a count of -# misproductions for hes_position = 1 +# this is the long-form dataframe, with two rows per word read (participant x +# passage x word): one for each position for a relative hesitation, i.e. this +# stacks the two dataframes that respectively have (1) every word, where +# misprod_in_adjacent_window corresponds to hes_position = 0, and (2) every +# word, where misprod_in_adjacent_window corresponds to hes_position = 1 @@ -67,45 +68,45 @@ # misprod_position: # -# for long-form dataframes counting hesitations, this indicates whether the -# relevant count is the number of misproductions before (0) or after (1) those -# hesitations being counted in that row +# for long-form dataframes looking for hesitations, this indicates whether the +# relevant binary value is tracking whether a misproduction comes before (0) +# or after (1) the hesitation being counted in that row # # -# hes_tally: +# hes_in_adjacent_window: # -# conversely, in long-form dataframes counting hesitations, this column -# actually tracks how many hesitations there are in that reading (participant -# x passage) that have a misproduction in the relevant relative position +# conversely, in long-form dataframes looking at hesitations, this column +# actually tracks whether or not there is a hesitation in the relevant +# relative position (to a misproduction), within the specified window # justHesWithMisprodBefore: # -# this is the dataframe with every (participant x passage) reading, counting -# the number of hesitations with a nearby preceding misproduction +# this is the dataframe with every (participant x passage x word) read, +# checking for each word whether there is a hesitation with a nearby +# preceding misproduction # -# i.e., for each reading, it counts the number of times (hes_tally) that a -# misproduction comes before a hesitation -- so for every entry, -# misprod_position = 0 +# i.e., for each reading, it only looks at rows where a misproduction comes +# before a hesitation -- so for every entry, misprod_position = 0 # # # justHesWithMisprodAfter # -# similarly, this is the dataframe with every (participant x passage) reading, -# counting the number of hesitations with a nearby following misproduction +# similarly, this is the dataframe with every (participant x passage x word) +# read, checking for each word whether there is a hesitation with a nearby +# following misproduction # -# i.e., for each reading, it counts the number of times (hes_tally) that a -# misproduction comes after a hesitation -- so for every entry, -# misprod_position = 1 +# i.e., for each reading, it only looks at rows where a misproduction comes +# after a hesitation -- so for every entry, misprod_position 
= 1 # # # errorDatLongHesWithRelMisprod: # -# this is the long-form dataframe, with two rows per reading (participant x -# passage): one for each position for a relative misproduction, i.e. this -# stacks the two dataframes that respectively have (1) every passage, with a -# count of hesitations for misprod_position = 0, and (2) every passage, with a -# count of hesitations for misprod_position = 1 +# this is the long-form dataframe, with two rows per word read (participant x +# passage x word): one for each position for a relative misproduction, i.e. +# this stacks the two dataframes that respectively have (1) every word, where +# hes_in_adjacent_window corresponds to misprod_position = 0, and (2) every +# word, where hes_in_adjacent_window corresponds to misprod_position = 1 ### SECTION 1: SETTING UP From 7492f3c0b1b9691cb6c6374a557bb98a8ce094ac Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 1 Feb 2024 21:32:25 -0500 Subject: [PATCH 32/33] Add models for word frequency, interpolating absent words --- code/analysisWordLevelReadAloudBeta.R | 74 +++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 9 deletions(-) diff --git a/code/analysisWordLevelReadAloudBeta.R b/code/analysisWordLevelReadAloudBeta.R index c115260..0469768 100644 --- a/code/analysisWordLevelReadAloudBeta.R +++ b/code/analysisWordLevelReadAloudBeta.R @@ -629,7 +629,7 @@ summary(misprod_with_rel_hes_model_4.6) # "" -# Word frequency analysis +# Word frequency analysis with words absent from corpus dropped # Does a word's frequency predict hesitation on that word? errorDatAttestedFreqs <- filter(errorDat, log10frequency > 0) wordfreq_model_1 <- lmerTest::lmer(hesitation ~ log10frequency + (1|id) + (1|passage), @@ -697,18 +697,74 @@ interact_plot(model = wordfreq_model_4.5, summary(errorDatAttestedFreqs$log10frequency) -case_when(errorDatAttestedFreqs$log10frequency <= 2.711 ~ 1, errorDatAttestedFreqs$log10frequency <= 4.058 ~ 2, errorDatAttestedFreqs$log10frequency <= 5.609 ~ 3, .default = 4 ) +# case_when(errorDatAttestedFreqs$log10frequency <= 2.711 ~ 1, errorDatAttestedFreqs$log10frequency <= 4.058 ~ 2, errorDatAttestedFreqs$log10frequency <= 5.609 ~ 3, .default = 4 ) +# +# +# errorDatAttestedFreqs$freqCat <- case_when(errorDatAttestedFreqs$log10frequency <= 2.711 ~ "low", .default = "high") +# +# # errorDatAttestedFreqs$freqCat <- as.factor(errorDatAttestedFreqs$freqCat) +# +# +# wordfreq_model_category_1 <- lmerTest::lmer(misprod ~ freqCat * scaaredSoc_gmc + (1|id) + (1|passage), +# data=errorDatAttestedFreqs, REML=TRUE) +# summary(wordfreq_model_category_1) +# +# interact_plot(model = wordfreq_model_category_1, +# pred = freqCat, modx = scaaredSoc_gmc, points = TRUE, interval = TRUE) -errorDatAttestedFreqs$freqCat <- case_when(errorDatAttestedFreqs$log10frequency <= 2.711 ~ "low", .default = "high") -# errorDatAttestedFreqs$freqCat <- as.factor(errorDatAttestedFreqs$freqCat) + +# Word frequency analysis with words absent from corpus set to corpus minimum +subtlexus_minimum = 0.301 # or: # subtlexus %>% select(Lg10WF) %>% min +errorDat$log10frequency_with_absents <- case_match( + errorDat$log10frequency, + 0 ~ subtlexus_minimum, + .default = errorDat$log10frequency) + +compare_freq <- data.frame(cbind(old = errorDat$log10frequency, + new = errorDat$log10frequency_with_absents)) + +filter(compare_freq, old != new) %>% # confirm it worked as expected + filter(old != 0 | new != subtlexus_minimum) %>% + nrow == 0 # TRUE + + +# Does a word's frequency predict hesitation on that word? 
+wordfreq_model_with_absents_1 <- lmerTest::lmer(hesitation ~ log10frequency_with_absents + (1|id) + (1|passage) + (1|word), + data=errorDat, REML=TRUE) +summary(wordfreq_model_with_absents_1) + +# "" misprod on that word? +wordfreq_model_with_absents_2 <- lmerTest::lmer(misprod ~ log10frequency_with_absents + (1|id) + (1|passage) + (1|word), + data=errorDat, REML=TRUE) +summary(wordfreq_model_with_absents_2) + + +# Do social anxiety and frequency interact to predict hesitation rate or misproduction rate? + +# control for word, that must matter right? +wordfreq_model_with_absents_3.5 <- lmerTest::lmer(hesitation ~ log10frequency_with_absents * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), + data=errorDat, REML=TRUE) +summary(wordfreq_model_with_absents_3.5) # still yes, very slightly higher p + +wordfreq_model_with_absents_4.5 <- lmerTest::lmer(misprod ~ log10frequency_with_absents * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), + data=errorDat, REML=TRUE) +summary(wordfreq_model_with_absents_4.5) # still no, slightly lower p = 0.114 + + +# hesitation ~ wf x SA +interact_plot(model = wordfreq_model_with_absents_3.5, + pred = log10frequency_with_absents, modx = scaaredSoc_gmc, interval = TRUE) + + + +interact_plot(model = wordfreq_model_with_absents_4.5, + pred = log10frequency_with_absents, modx = scaaredSoc_gmc, interval = TRUE) + + -wordfreq_model_category_1 <- lmerTest::lmer(misprod ~ freqCat * scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDatAttestedFreqs, REML=TRUE) -summary(wordfreq_model_category_1) -interact_plot(model = wordfreq_model_category_1, - pred = freqCat, modx = scaaredSoc_gmc, points = TRUE, interval = TRUE) +summary(errorDat$log10frequency_with_absents) From a335041e398cc7633c8da11328413e6a4a045224 Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 1 Feb 2024 22:17:15 -0500 Subject: [PATCH 33/33] Prune models and set up plots --- code/analysisWordLevelReadAloudBeta.R | 95 +++++++++++---------------- 1 file changed, 39 insertions(+), 56 deletions(-) diff --git a/code/analysisWordLevelReadAloudBeta.R b/code/analysisWordLevelReadAloudBeta.R index 0469768..64f4590 100644 --- a/code/analysisWordLevelReadAloudBeta.R +++ b/code/analysisWordLevelReadAloudBeta.R @@ -122,6 +122,7 @@ library(gridExtra) library(grid) library(cowplot) library(colorspace) +library(effects) # library(colorblindr) #set up date for output file naming @@ -632,63 +633,22 @@ summary(misprod_with_rel_hes_model_4.6) # "" # Word frequency analysis with words absent from corpus dropped # Does a word's frequency predict hesitation on that word? errorDatAttestedFreqs <- filter(errorDat, log10frequency > 0) -wordfreq_model_1 <- lmerTest::lmer(hesitation ~ log10frequency + (1|id) + (1|passage), +wordfreq_model_1 <- lmerTest::lmer(hesitation ~ log10frequency + (1|id) + (1|passage) + (1|word), data=errorDatAttestedFreqs, REML=TRUE) summary(wordfreq_model_1) -wordfreq_model_2 <- lmerTest::lmer(misprod ~ log10frequency + (1|id) + (1|passage), +wordfreq_model_2 <- lmerTest::lmer(misprod ~ log10frequency + (1|id) + (1|passage) + (1|word), data=errorDatAttestedFreqs, REML=TRUE) summary(wordfreq_model_2) -# control for word, that must matter right? 
-wordfreq_model_1.5 <- lmerTest::lmer(hesitation ~ log10frequency + (1|id) + (1|passage) + (1|word), - data=errorDatAttestedFreqs, REML=TRUE) -summary(wordfreq_model_1.5) -wordfreq_model_2.5 <- lmerTest::lmer(misprod ~ log10frequency + (1|id) + (1|passage) + (1|word), - data=errorDatAttestedFreqs, REML=TRUE) -summary(wordfreq_model_2.5) - -# control for word w/o passage -wordfreq_model_1.6 <- lmerTest::lmer(hesitation ~ log10frequency + (1|id) + (1|word), - data=errorDatAttestedFreqs, REML=TRUE) -summary(wordfreq_model_1.6) -wordfreq_model_2.6 <- lmerTest::lmer(misprod ~ log10frequency + (1|id) + (1|word), - data=errorDatAttestedFreqs, REML=TRUE) -summary(wordfreq_model_2.6) - - - # Do social anxiety and frequency interact to predict hesitation rate or misproduction rate? -wordfreq_model_3 <- lmerTest::lmer(hesitation ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDatAttestedFreqs, REML=TRUE) -summary(wordfreq_model_3) # yes! - -wordfreq_model_4 <- lmerTest::lmer(misprod ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDatAttestedFreqs, REML=TRUE) -summary(wordfreq_model_4) # no, not at all - well, p = 0.15 - - -# control for word, that must matter right? -wordfreq_model_3.5 <- lmerTest::lmer(hesitation ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), +wordfreq_model_3 <- lmerTest::lmer(hesitation ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), data=errorDatAttestedFreqs, REML=TRUE) -summary(wordfreq_model_3.5) # still yes, very slightly higher p +summary(wordfreq_model_3) # todo -wordfreq_model_4.5 <- lmerTest::lmer(misprod ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), +wordfreq_model_4 <- lmerTest::lmer(misprod ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), data=errorDatAttestedFreqs, REML=TRUE) -summary(wordfreq_model_4.5) # still no, slightly lower p = 0.114 - - -# hesitation ~ wf x SA -interact_plot(model = wordfreq_model_3.5, - pred = log10frequency, modx = scaaredSoc_gmc, interval = TRUE) - - - -interact_plot(model = wordfreq_model_4.5, - pred = log10frequency, modx = scaaredSoc_gmc, interval = TRUE) - - - +summary(wordfreq_model_4) # todo @@ -742,29 +702,52 @@ summary(wordfreq_model_with_absents_2) # Do social anxiety and frequency interact to predict hesitation rate or misproduction rate? - -# control for word, that must matter right? 
-wordfreq_model_with_absents_3.5 <- lmerTest::lmer(hesitation ~ log10frequency_with_absents * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), +wordfreq_model_with_absents_3 <- lmerTest::lmer(hesitation ~ log10frequency_with_absents * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), data=errorDat, REML=TRUE) -summary(wordfreq_model_with_absents_3.5) # still yes, very slightly higher p +summary(wordfreq_model_with_absents_3) -wordfreq_model_with_absents_4.5 <- lmerTest::lmer(misprod ~ log10frequency_with_absents * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), +wordfreq_model_with_absents_4 <- lmerTest::lmer(misprod ~ log10frequency_with_absents * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), data=errorDat, REML=TRUE) -summary(wordfreq_model_with_absents_4.5) # still no, slightly lower p = 0.114 +summary(wordfreq_model_with_absents_4) + +# effects +# eff <- effect("log10frequency_with_absents", wordfreq_model_with_absents_1) +# plot(eff, se = TRUE, rug = FALSE, xlab = "log10frequency_with_absents", ylab = "hesitation", col.points = "red", col.lines = "blue", lty = 1) +# eff_noabsents <- effect("log10frequency", wordfreq_model_1) +# plot(eff_noabsents, se = TRUE, rug = FALSE, xlab = "log10frequency", ylab = "hesitation", col.points = "red", col.lines = "blue", lty = 1) + +plot_lmer <- function(model, predictor, outcome) { + # NB `outcome` will not catch your mistake; it's just a label + eff <- effect(predictor, model) + plot(eff, se = TRUE, rug = FALSE, xlab = predictor, ylab = outcome, + col.points = "red", col.lines = "blue", lty = 1) +} + +# as in +plot_lmer(wordfreq_model_1, "log10frequency", "hesitation") +plot_lmer(wordfreq_model_with_absents_1, "log10frequency_with_absents", "hesitation") +plot_lmer(wordfreq_model_with_absents_2, "log10frequency_with_absents", "misprod") +plot_lmer(wordfreq_model_2, "log10frequency", "misprod") # hesitation ~ wf x SA -interact_plot(model = wordfreq_model_with_absents_3.5, +interact_plot(model = wordfreq_model_3, + pred = log10frequency, modx = scaaredSoc_gmc, interval = TRUE) + +interact_plot(model = wordfreq_model_with_absents_3, pred = log10frequency_with_absents, modx = scaaredSoc_gmc, interval = TRUE) +# misprod ~ wf x SA +interact_plot(model = wordfreq_model_4, + pred = log10frequency, modx = scaaredSoc_gmc, interval = TRUE) -interact_plot(model = wordfreq_model_with_absents_4.5, +interact_plot(model = wordfreq_model_with_absents_4, pred = log10frequency_with_absents, modx = scaaredSoc_gmc, interval = TRUE) -summary(errorDat$log10frequency_with_absents) +# summary(errorDat$log10frequency_with_absents)
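
Not part of the patches above: if the interaction plots from the final patch are to be kept, one
possible follow-up is writing them out with the out_path/today naming the script already uses for
other outputs. This is only a sketch under that assumption; it presumes the script has been run (so
out_path, today, and wordfreq_model_with_absents_3 exist in the session), relies on interact_plot()
returning a ggplot object so that ggplot2::ggsave() applies, and the output file name is a
hypothetical example.

    # sketch only: save the hesitation ~ word frequency x scaaredSoc interaction plot
    library(ggplot2)       # ggsave; already loaded earlier in the script
    library(interactions)  # interact_plot; already loaded earlier in the script

    p_hes_wf_sa <- interact_plot(model = wordfreq_model_with_absents_3,
                                 pred = log10frequency_with_absents,
                                 modx = scaaredSoc_gmc, interval = TRUE)

    ggsave(filename = paste(out_path, "hes-by-wordFreq-x-scaaredSoc_", today, ".png", sep = ""),
           plot = p_hes_wf_sa, width = 7, height = 5, dpi = 300)
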