From f627949b76836c044859652e3655418a6506ccad Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 1 Jun 2023 17:43:13 -0400 Subject: [PATCH 01/33] Preliminary analysis prep: collapse across passages --- code/prepReadAloudBeta.R | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 code/prepReadAloudBeta.R diff --git a/code/prepReadAloudBeta.R b/code/prepReadAloudBeta.R new file mode 100644 index 0000000..eacff00 --- /dev/null +++ b/code/prepReadAloudBeta.R @@ -0,0 +1,46 @@ +# Reading in CSVs of preprocessed error data and participant information, +# writing a new CSV including the preprocessed data as well as comprehension, +# demographic, reading, and passage information +# +# Luc Sahar and Jessica M. Alexander -- NDCLab, Florida International University +# last updated 5/31/23 + +# NB passages "sun" and "broccoli" as coded contain errors. Namely, broccoli had +# "iodized _table_ counteracts" instead of the intended "table salt", and sun +# showed participants "_empower_ individuals" whereas it was coded as "enable" + +library(readxl) # read_xlsx +library(stringr) # str_extract +library(dplyr) # most things +library(purrr) # map, map_df; generally good to have +library(lubridate) # now +library(readr) # write_csv + +ext_default = 'csv' +tz_default = "America/New_York" +date_format_default = "%Y%m%d_%I%M%P" + + +build_output_filename <- function(label, ext = ext_default, timezone = tz_default, date_format = date_format_default) { + # `label` may include the destination directory, if different from the working directory when the script is run + current_datetime <- now(timezone) %>% format(date_format) + paste(label, '_', current_datetime, '.', ext, sep = "") +} + +collapse_by_participant <- function(filename_in, filename_out) { + by_participant <- read_csv(filename_in) %>% + unique %>% # dedup + group_by(id) %>% summarize(across(misprod:total_uncorrected_errors, sum)) # summarize by participant, across all passages + # TODO change the columns selected, once more have been added to the output of the preproc script + + write_csv(by_participant, filename_out) + return(filename_out) +} + + +# base = "~/Documents/ndclab/analysis-sandbox/github-structure-mirror/readAloud-valence-dataset/derivatives/preprocessed" +base = "/home/data/NDClab/datasets/readAloud-valence-dataset/derivatives/preprocessed" +preprocessed_summary_filename = "TODO" +collapsed_filename = build_output_filename(label = paste(base, "disfluencies_subject", sep='/')) + +collapse_by_participant(preprocessed_summary_file, collapsed_filename) \ No newline at end of file From 6f9897c2052091003dc0c4839ded02206fe45ffb Mon Sep 17 00:00:00 2001 From: JMA <77893711+jessb0t@users.noreply.github.com> Date: Wed, 14 Jun 2023 11:22:52 -0500 Subject: [PATCH 02/33] Initial prep and analysis scripts --- code/analysisReadAloudBeta.R | 159 ++++++++++++++++++++++ code/prepReadAloudBeta.R | 248 +++++++++++++++++++++++++++++++---- 2 files changed, 379 insertions(+), 28 deletions(-) create mode 100644 code/analysisReadAloudBeta.R diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R new file mode 100644 index 0000000..2f662c3 --- /dev/null +++ b/code/analysisReadAloudBeta.R @@ -0,0 +1,159 @@ +# readAloud-valence-beta Reading Task Analyses +# Authors: Luc Sahar, Jessica M. 
Alexander +# Last Updated: 2023-06-14 + +# INPUTS +# data/df: behavioral data, for each participant on each passage, with relevant participant information and trial-level stimulus information + +# OUTPUTS +# TBD + +# NOTES TO DO +# drop 150086 as only completed 12 of 20 passages and low accuracy + +### SECTION 1: SETTING UP +library(dplyr) +library(lme4) +library(lmerTest) +library(interactions) + +#visualization tools +library(ggplot2) +library(gridExtra) +library(grid) +library(cowplot) +library(colorspace) +library(colorblindr) + +#set up date for output file naming +today <- Sys.Date() +today <- format(today, "%Y%m%d") + +#set up directories for input/output data +data <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/readAloudBetaData_20230614.csv' +out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' + +#read in data +df <- read.csv(data) + +#organize participant demographic variables +df$sex <- as.factor(df$sex) +df$pronouns <- as.factor(df$pronouns) +df$ethnic <- as.factor(df$ethnic) +df$socclass <- as.factor(df$socclass) + +#extract demo stats +summary(df$age) +sd(df$age) +summary(df$sex)/18 +summary(df$sex)/18 / (nrow(df)/18) +summary(df$pronouns)/18 +summary(df$pronouns)/18 / (nrow(df)/18) +summary(df$ethnic)/18 +summary(df$ethnic)/18 / (nrow(df)/18) +summary(df$socclass)/18 +summary(df$socclass)/18 / (nrow(df)/18) + +#remove participants whose challenge question accuracy was below 50% (chance = 25%) +dfTrim <- df +dfTrim <- dfTrim %>% + group_by(id) %>% + mutate(challengeAvgSub = mean(challengeACC)) %>% + ungroup + +dfTrim <- subset(dfTrim, challengeAvgSub>0.5) +length(unique(df$id)) - length(unique(dfTrim$id)) #number of participants removed + +#calculate average accuracy +mean(dfTrim$challengeAvgSub) +sd(dfTrim$challengeAvgSub) + + +### SECTION 2: INITIAL DATA TRIMMING +passage_no_before_trimming <- nrow(dfTrim) + +#insert passage-level trimming here + +passage_no_after_trim1 <- nrow(dfTrim) +passage_no_before_trimming - passage_no_after_trim1 #number of passages trimmed +(passage_no_before_trimming - passage_no_after_trim1) / passage_no_before_trimming #percentage of passages trimmed + + +### SECTION 3: TRANSITION DATA TO LONG FORMAT +errorDat <- data.frame(matrix(ncol=17, nrow=0)) +colnames(errorDat) <- c("passage", "id", + "sex", "pronouns", "age", "ethnic", "socclass", + "bfne", "phq8", "scaaredTotal", "scaaredGA", "scaaredSoc", "sps", + "lenSyll", "lenWord", "avgSyllPerWord", + "errors") +for(j in 1:nrow(dfTrim)){ + passage <- dfTrim$passage[j] + id <- dfTrim$id[j] + sex <- as.character(dfTrim$sex[j]) + pronouns <- as.character(dfTrim$pronouns[j]) + age <- dfTrim$age[j] + ethnic <- as.character(dfTrim$ethnic[j]) + socclass <- as.character(dfTrim$socclass[j]) + bfne <- dfTrim$bfne[j] + phq8 <- dfTrim$phq8[j] + scaaredTotal <- dfTrim$scaaredTotal[j] + scaaredGA <- dfTrim$scaaredGA[j] + scaaredSoc <- dfTrim$scaaredSoc[j] + sps <- dfTrim$sps[j] + lenSyll <- dfTrim$lenSyll[j] + lenWord <- dfTrim$lenWord[j] + avgSyllPerWord <- dfTrim$avgSyllPerWord[j] + errors <- dfTrim$errors[j] + errorDat[nrow(errorDat) + 1,] <-c(passage, id, sex, pronouns, age, ethnic, socclass, bfne, phq8, scaaredTotal, scaaredGA, scaaredSoc, sps, lenSyll, lenWord, avgSyllPerWord, errors) +} + +#organize data types +errorDat$sex <- as.factor(errorDat$sex) +errorDat$pronouns <- as.factor(errorDat$pronouns) +errorDat$age <- as.numeric(errorDat$age) +errorDat$ethnic <- as.factor(errorDat$ethnic) +errorDat$socclass <- as.factor(errorDat$socclass) +errorDat$bfne <- 
as.numeric(errorDat$bfne) +errorDat$phq8 <- as.numeric(errorDat$phq8) +errorDat$scaaredTotal <- as.numeric(errorDat$scaaredTotal) +errorDat$scaaredGA <- as.numeric(errorDat$scaaredGA) +errorDat$scaaredSoc <- as.numeric(errorDat$scaaredSoc) +errorDat$sps <- as.numeric(errorDat$sps) +errorDat$lenSyll <- as.numeric(errorDat$lenSyll) +errorDat$lenWord <- as.numeric(errorDat$lenWord) +errorDat$avgSyllPerWord <- as.numeric(errorDat$avgSyllPerWord) +errorDat$errors <- as.numeric(errorDat$errors) + +#modify contrasts for categorical predictors +contrasts(errorDat$sex) <- contr.sum(2) #male: -1, female: +1 + +#center continuous predictors +errorDat$age_gmc <- errorDat$age - mean(errorDat$age) +errorDat$bfne_gmc <- errorDat$bfne - mean(errorDat$bfne) +errorDat$phq8_gmc <- errorDat$phq8 - mean(errorDat$phq8) +errorDat$scaaredTotal_gmc <- errorDat$scaaredTotal - mean(errorDat$scaaredTotal) +errorDat$scaaredGA_gmc <- errorDat$scaaredGA - mean(errorDat$scaaredGA) +errorDat$scaaredSoc_gmc <- errorDat$scaaredSoc - mean(errorDat$scaaredSoc) +errorDat$sps_gmc <- errorDat$sps - mean(errorDat$sps) +errorDat$lenSyll_gmc <- errorDat$lenSyll - mean(errorDat$lenSyll) +errorDat$lenWord_gmc <- errorDat$lenWord - mean(errorDat$lenWord) +errorDat$avgSyllPerWord_gmc <- errorDat$avgSyllPerWord - mean(errorDat$avgSyllPerWord) +errorDat$errors_gmc <- errorDat$errors - mean(errorDat$errors) + +#extract demo stats +errorDatStats <- subset(errorDat, !duplicated(errorDat$id)) +summary(errorDatStats$age) +sd(errorDatStats$age) +summary(errorDatStats$sex) +summary(errorDatStats$sex) / length(unique(errorDatStats$id)) +summary(errorDatStats$pronouns) +summary(errorDatStats$pronouns) / length(unique(errorDatStats$id)) +summary(errorDatStats$ethnic) +summary(errorDatStats$ethnic) / length(unique(errorDatStats$id)) +summary(errorDatStats$socclass) +summary(errorDatStats$socclass) / length(unique(errorDatStats$id)) + + +### SECTION 4: MODEL RESULTS +modelErr <- lmerTest::lmer(errors ~ scaaredTotal_gmc * lenSyll_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(modelErr) \ No newline at end of file diff --git a/code/prepReadAloudBeta.R b/code/prepReadAloudBeta.R index eacff00..d7a3c67 100644 --- a/code/prepReadAloudBeta.R +++ b/code/prepReadAloudBeta.R @@ -1,14 +1,27 @@ -# Reading in CSVs of preprocessed error data and participant information, -# writing a new CSV including the preprocessed data as well as comprehension, -# demographic, reading, and passage information -# -# Luc Sahar and Jessica M. Alexander -- NDCLab, Florida International University -# last updated 5/31/23 +# readAloud-valence-beta Analysis Preparation +# Authors: Luc Sahar and Jessica M. Alexander -- NDCLab, Florida International University +# Last updated: 2023-06-14 -# NB passages "sun" and "broccoli" as coded contain errors. Namely, broccoli had -# "iodized _table_ counteracts" instead of the intended "table salt", and sun -# showed participants "_empower_ individuals" whereas it was coded as "enable" +# INPUTS +# data/df: behavioral (error-related) data, for each participant on each passage +# accDat: comprehension question accuracy (0/1) for each participant for each passage +# readDat: stimuli characteristics (by passage half) +# redcap: participant data, incl. 
demographics and responses + scored factors for questionnaires: + # bfne (brief fear of negative evaluation): bfne_b_scrdTotal (fear of negative evaluation total) + # phq8 (patient health questionnaire): phq8_scrdTotal (depression scale total) + # scaared, total (screen for adult anxiety disorders): scaared_b_scrdTotal (total anxiety) + # scaared, social (screen for adult anxiety disorders): scaared_b_scrdSoc (social phobias) + # scaared, general (screen for adult anxiety disorders): scaared_b_scrdGA (general anxiety) + # sps (social phobia scale): sias6sps6_b_scrdSPS +# OUTPUTS +# dfTrim: for each passage, for each participant, details on: + # participant behavior: reading errors made, comprehension question accuracy + # passage characteristics: length (syllable and word), average syllables per word + # participant data: demographics, language history, mood and mood disorder scores + + +### SECTION 1: SETTING UP library(readxl) # read_xlsx library(stringr) # str_extract library(dplyr) # most things @@ -16,31 +29,210 @@ library(purrr) # map, map_df; generally good to have library(lubridate) # now library(readr) # write_csv -ext_default = 'csv' -tz_default = "America/New_York" -date_format_default = "%Y%m%d_%I%M%P" +#set up defaults for output file naming +# ext_default = 'csv' +# tz_default = "America/New_York" +# date_format_default = "%Y%m%d_%I%M%P" +# +# build_output_filename <- function(label, ext = ext_default, timezone = tz_default, date_format = date_format_default) { +# # `label` may include the destination directory, if different from the working directory when the script is run +# current_datetime <- now(timezone) %>% format(date_format) +# paste(label, '_', current_datetime, '.', ext, sep = "") +# } +today <- Sys.Date() +today <- format(today, "%Y%m%d") + +#set up directories for input/output +main_dataset <- '/Users/jalexand/github/readAloud-valence-dataset/' +main_analyses <- '/Users/jalexand/github/readAloud-valence-beta/' +out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' + +#load input files +data <- paste(main_dataset, 'derivatives/preprocessed/disfluencies_subject-x-passage_20230613_0450pm.csv', sep="", collapse=NULL) +accDat_path <- paste(main_dataset,'derivatives/preprocessed/readAloud_passage-level_summary_20220812.csv', sep="", collapse=NULL) +readDat_path <- paste(main_dataset, 'derivatives/analysisStimuli_readDat_20230614.csv', sep="", collapse=NULL) +redcap_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019.csv', sep="", collapse=NULL) +agedat_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019_ageonly.csv', sep="", collapse=NULL) + +df <- read.csv(data) +redcap <- read.csv(redcap_path, na.strings='NA') #participant questionnaire responses +agedat <- read.csv(agedat_path, na.strings='NA') #participant age information +readDat <- read.csv(readDat_path, na.strings='N') #passage-level characteristics from analysisStimuli.R +accDat <- read.csv(accDat_path, na.strings='NA', check.names=FALSE) #passage level accuracy for each subject +accDat$passage <- c("dams", "flying", "bats", "broccoli", "realty", "bees", "dogshow", "dolphins", "icefishing", + "cars", "vegas", "sun", "caramel", "congo", "antarctica", "depression", "skunkowl", "grizzly", + "mantis", "dentist") #rename passages with short-name + +#add missing passages for 150086 so that nrow is divisible by 20 +passages_read <- df$passage[which(df$id=="150086")] +all_passages <- unique(df$passage) +tempdf <- 
data.frame(matrix(nrow=0, ncol=ncol(df))) +colnames(tempdf) <- colnames(df) +for(passage in 1:length(all_passages)){ + if(all_passages[passage] %in% passages_read){next}else{ + tempdf[nrow(tempdf) + 1,] <- c("150086", all_passages[passage], rep(NA, 41)) + } +} +df <- rbind(df, tempdf) + +### SECTION 2: BUILD DEMOGRAPHIC DATA DF +demoDat <- redcap[,c(1,5)] +#biological sex: replace numerical values with text description +for(a in 1:nrow(redcap)){ + if(is.na(redcap$demo_b_sex_s1_r1_e1[a])){demoDat$sex[a] <- 'undisclosed'} + else if(redcap$demo_b_sex_s1_r1_e1[a]==1){demoDat$sex[a] <- 'male'} + else if(redcap$demo_b_sex_s1_r1_e1[a]==2){demoDat$sex[a] <- 'female'} + else if(redcap$demo_b_sex_s1_r1_e1[a]==3){demoDat$sex[a] <- 'intersex'} + else if(redcap$demo_b_sex_s1_r1_e1[a]==4){demoDat$sex[a] <- 'other'} + else if(redcap$demo_b_sex_s1_r1_e1[a]==5){demoDat$sex[a] <- 'unknown'} + else{demoDat$sex[a] <- 'undisclosed'} +} + +#preferred pronouns: replace numerical values with text description +for(b in 1:nrow(redcap)){ + if(is.na(redcap$demo_b_pronouns_s1_r1_e1[b])){demoDat$pron[b] <- 'undisclosed'} + else if(redcap$demo_b_pronouns_s1_r1_e1[b]==1){demoDat$pron[b] <- 'she/her'} + else if(redcap$demo_b_pronouns_s1_r1_e1[b]==2){demoDat$pron[b] <- 'he/him'} + else if(redcap$demo_b_pronouns_s1_r1_e1[b]==3){demoDat$pron[b] <- 'they/them'} + else if(redcap$demo_b_pronouns_s1_r1_e1[b]==5){demoDat$pron[b] <- 'other'} + else{demoDat$pron[b] <- 'undisclosed'} +} + +#ethnicity affiliation: map to text description +for(c in 1:nrow(redcap)){ + if(redcap$demo_b_ethnic_s1_r1_e1___1[c]==1){demoDat$ethnic[c] <- 'AI'} #american indian/alaska native + else if(redcap$demo_b_ethnic_s1_r1_e1___2[c]==1){demoDat$ethnic[c] <- 'A'} #asian + else if(redcap$demo_b_ethnic_s1_r1_e1___3[c]==1){demoDat$ethnic[c] <- 'AA'} #african american + else if(redcap$demo_b_ethnic_s1_r1_e1___4[c]==1){demoDat$ethnic[c] <- 'LX'} #hispanic/latinx + else if(redcap$demo_b_ethnic_s1_r1_e1___5[c]==1){demoDat$ethnic[c] <- 'ME'} #middle eastern + else if(redcap$demo_b_ethnic_s1_r1_e1___6[c]==1){demoDat$ethnic[c] <- 'PI'} #pacific islander + else if(redcap$demo_b_ethnic_s1_r1_e1___7[c]==1){demoDat$ethnic[c] <- 'W'} #white + else if(redcap$demo_b_ethnic_s1_r1_e1___8[c]==1){demoDat$ethnic[c] <- 'O'} #other + else{demoDat$ethnic[c] <- 'UND'} #undisclosed +} + +#social class affiliation: replace numerical values with text description +for(d in 1:nrow(redcap)){ + if(is.na(redcap$demo_b_socclass_s1_r1_e1[d])){demoDat$socclass[d] <- 'undisclosed'} + else if(redcap$demo_b_socclass_s1_r1_e1[d]==1){demoDat$socclass[d] <- 'poor'} + else if(redcap$demo_b_socclass_s1_r1_e1[d]==2){demoDat$socclass[d] <- 'working'} + else if(redcap$demo_b_socclass_s1_r1_e1[d]==3){demoDat$socclass[d] <- 'middle'} + else if(redcap$demo_b_socclass_s1_r1_e1[d]==4){demoDat$socclass[d] <- 'affluent'} + else{demoDat$socclass[d] <- 'undisclosed'} +} + +#communication disorders diagnoses: sum across childhood, adolescence, and adulthood +for(e in 1:nrow(redcap)){ + demoDat$commdis[e] <- sum(redcap$demo_b_comdiskid_s1_r1_e1[e], + redcap$demo_b_comdisteen_s1_r1_e1[e], + redcap$demo_b_comdisad_s1_r1_e[e]) +} +#language history: transfer directly +for(f in 1:nrow(redcap)){ + demoDat$eng[f] <- redcap$demo_b_eng_s1_r1_e1[match(demoDat$record_id[f], redcap$record_id)] #participant monolingualism + demoDat$langhis[f] <- redcap$demo_b_langhis_s1_r1_e1[match(demoDat$record_id[f], redcap$record_id)] #participant language history + demoDat$ageen[f] <- 
redcap$demo_b_ageen_s1_r1_e1[match(demoDat$record_id[f], redcap$record_id)] #participant age of English acquisition + demoDat$profen[f] <- redcap$demo_b_profen_s1_r1_e1[match(demoDat$record_id[f], redcap$record_id)] #participant English proficiency +} + +#mood and mood disorders: transfer directly +for(g in 1:nrow(redcap)){ + demoDat$bfne[g] <- redcap$bfne_b_scrdTotal[match(demoDat$record_id[g], redcap$record_id)] #bfne total score + demoDat$phq8[g] <- redcap$phq8_scrdTotal[match(demoDat$record_id[g], redcap$record_id)] #phq8 depression scale + demoDat$scaaredTotal[g] <- redcap$scaared_b_scrdTotal[match(demoDat$record_id[g], redcap$record_id)] #scaared total anxiety + demoDat$scaaredGA[g] <- redcap$scaared_b_scrdGA[match(demoDat$record_id[g], redcap$record_id)] #scaared general anxiety + demoDat$scaaredSoc[g] <- redcap$scaared_b_scrdSoc[match(demoDat$record_id[g], redcap$record_id)] #scaared social phobias + demoDat$sps[g] <- redcap$sias6sps6_b_scrdSPS[match(demoDat$record_id[g], redcap$record_id)] #sps social phobia scale +} -build_output_filename <- function(label, ext = ext_default, timezone = tz_default, date_format = date_format_default) { - # `label` may include the destination directory, if different from the working directory when the script is run - current_datetime <- now(timezone) %>% format(date_format) - paste(label, '_', current_datetime, '.', ext, sep = "") +#age: pull from separate file +for(h in 1:nrow(demoDat)){ + demoDat$age[h] <- agedat$info_age_s1_r1_e1[match(demoDat$record_id[h], agedat$record_id)] } -collapse_by_participant <- function(filename_in, filename_out) { - by_participant <- read_csv(filename_in) %>% - unique %>% # dedup - group_by(id) %>% summarize(across(misprod:total_uncorrected_errors, sum)) # summarize by participant, across all passages - # TODO change the columns selected, once more have been added to the output of the preproc script - write_csv(by_participant, filename_out) - return(filename_out) +### SECTION 3: BUILD TRIAL-LEVEL DF (ADD DEMODAT, READDAT, and ACCDAT to DF) +for(i in 1:nrow(df)){ + subject <- df$id[i] #extract subject number for matching + passage <- df$passage[i] #extract passage name for matching + errors <- df$errors[i] #extract total errors + + #extract passage characteristics from readDat + df$lenSyll[i] <- sum(readDat$lengthSyll[which(readDat$passage==passage)]) #length of passage in syllables + df$lenWord[i] <- sum(readDat$lengthWord[which(readDat$passage==passage)]) #length of passage in words + df$avgSyllPerWord[i] <- df$lenSyll[i]/df$lenWord[i] + + #extract participant accuracy from accDat + df$challengeACC[i] <- accDat[match(passage, accDat$passage), as.character(subject)] #passage-specific challenge question accuracy for subject + + #extract participant demographics from demoDat + df$sex[i] <- demoDat$sex[match(df$id[i], demoDat$record_id)] #participant biological sex + df$pronouns[i] <- demoDat$pron[match(df$id[i], demoDat$record_id)] #participant preferred pronouns + df$age[i] <- demoDat$age[match(df$id[i], demoDat$record_id)] #participant age + df$ethnic[i] <- demoDat$ethnic[match(df$id[i], demoDat$record_id)] #participant ethnic group affiliation + df$socclass[i] <- demoDat$socclass[match(df$id[i], demoDat$record_id)] #participant social class identification + df$eng[i] <- demoDat$eng[match(df$id[i], demoDat$record_id)] #participant multilingualism (0=EN only, 1=EN+another) + df$langhis[i] <- demoDat$langhis[match(df$id[i], demoDat$record_id)] #participant language learning history (1=EN first, 2=other first, 
3=EN+other same, 4=something else) + df$ageen[i] <- demoDat$ageen[match(df$id[i], demoDat$record_id)] #participant age of English acquisition (if not L1) + df$profen[i] <- demoDat$profen[match(df$id[i], demoDat$record_id)] #participant English proficiency (1=Native, 2=Advanced, 3=Intermediate, 4=Elementary, 5=Not proficient) + df$commdis[i] <- demoDat$commdis[match(df$id[i], demoDat$record_id)] #participant communication disorder history (0=none, 1+=diagnoses to review) + df$bfne[i] <- demoDat$bfne[match(df$id[i], demoDat$record_id)] #participant fear of negative evaluation + df$phq8[i] <- demoDat$phq8[match(df$id[i], demoDat$record_id)] #participant depression + df$scaaredTotal[i] <- demoDat$scaaredTotal[match(df$id[i], demoDat$record_id)] #participant overall anxiety + df$scaaredGA[i] <- demoDat$scaaredGA[match(df$id[i], demoDat$record_id)] #participant general anxiety + df$scaaredSoc[i] <- demoDat$scaaredSoc[match(df$id[i], demoDat$record_id)] #participant social phobias (scaared) + df$sps[i] <- demoDat$sps[match(df$id[i], demoDat$record_id)] #participant social phobias (sias6sps6) } +#organize participant demographic variables +df$sex <- as.factor(df$sex) +df$pronouns <- as.factor(df$pronouns) +df$ethnic <- as.factor(df$ethnic) +df$socclass <- as.factor(df$socclass) + + +### SECTION 3: CROSS-CHECK ALL PARTICIPANTS MET INCLUSION CRITERIA +#note: given the time required to annotated errors, only participants who met inclusion criteria were annotated +#sum(df$eng==1 & df$langhis %in% c(2,4) & df$ageen>6) #confirm all subjects monolingual English OR natively bilingual OR learned English before age 6 +#sum(df$commdis>0) #confirm no subject diagnosed with any communication disorder +#sum(df$profen>3, na.rm=TRUE)/20 #one remaining subject (150060) rates own English proficiency as not "elementary" or "not proficient", but reads fluidly and achieved 80% accuracy on challenge questions, so not excluded + +#extract age and sex stats +summary(df$age) #age range and mean +sd(df$age) #age standard deviation +summary(df$sex)/20 #number of participants by sex +summary(df$sex)/20 / (nrow(df)/20) #percentage of participants by sex +summary(df$pronouns)/20 #number of participants by preferred pronoun +summary(df$pronouns)/20 / (nrow(df)/20) #percentage of participants by preferred pronoun +summary(df$ethnic)/20 #number of participants by ethnic affiliation +summary(df$ethnic)/20 / (nrow(df)/20) #percentage of participants by ethnic affiliation +summary(df$socclass)/20 #number of participants by social class affiliation +summary(df$socclass)/20 / (nrow(df)/20) #percentage of participants by social class affiliation + + +### SECTION 4: TRIM PASSAGES DUE TO EXPERIMENTER ERROR +dfTrim <- subset(df, !(df$passage=='broccoli')) #remove broccoli passage due to typo in the last sentence as presented on-screen to participants +dfTrim <- subset(dfTrim, !(dfTrim$passage=='sun')) #remove sun passage due to error in coding Excel + -# base = "~/Documents/ndclab/analysis-sandbox/github-structure-mirror/readAloud-valence-dataset/derivatives/preprocessed" -base = "/home/data/NDClab/datasets/readAloud-valence-dataset/derivatives/preprocessed" -preprocessed_summary_filename = "TODO" -collapsed_filename = build_output_filename(label = paste(base, "disfluencies_subject", sep='/')) +### SECTION 5: OUTPUT DATAFRAME +write.csv(dfTrim, paste(out_path, "readAloudBetaData_", today, ".csv", sep="", collapse=NULL)) -collapse_by_participant(preprocessed_summary_file, collapsed_filename) \ No newline at end of file +# 
collapse_by_participant <- function(filename_in, filename_out) { +# by_participant <- read_csv(filename_in) %>% +# unique %>% # dedup +# group_by(id) %>% summarize(across(misprod:total_uncorrected_errors, sum)) # summarize by participant, across all passages +# # TODO change the columns selected, once more have been added to the output of the preproc script +# +# write_csv(by_participant, filename_out) +# return(filename_out) +# } +# +# +# # base = "~/Documents/ndclab/analysis-sandbox/github-structure-mirror/readAloud-valence-dataset/derivatives/preprocessed" +# base = "/home/data/NDClab/datasets/readAloud-valence-dataset/derivatives/preprocessed" +# preprocessed_summary_filename = "TODO" +# collapsed_filename = build_output_filename(label = paste(base, "disfluencies_subject", sep='/')) +# +# collapse_by_participant(preprocessed_summary_file, collapsed_filename) \ No newline at end of file From 9d0b8c2f9bb7824b5d93d94e51a10c22f931bcd2 Mon Sep 17 00:00:00 2001 From: JMA <77893711+jessb0t@users.noreply.github.com> Date: Fri, 30 Jun 2023 15:01:35 -0500 Subject: [PATCH 03/33] Update scripts with infrastructure to match -alpha --- code/analysisReadAloudBeta.R | 108 ++++++++++++++++++----------------- code/prepReadAloudBeta.R | 20 +++++-- 2 files changed, 73 insertions(+), 55 deletions(-) diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index 2f662c3..6fd2724 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Reading Task Analyses # Authors: Luc Sahar, Jessica M. Alexander -# Last Updated: 2023-06-14 +# Last Updated: 2023-06-30 # INPUTS # data/df: behavioral data, for each participant on each passage, with relevant participant information and trial-level stimulus information @@ -30,7 +30,7 @@ today <- Sys.Date() today <- format(today, "%Y%m%d") #set up directories for input/output data -data <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/readAloudBetaData_20230614.csv' +data <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/readAloudBetaData_20230630.csv' out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' #read in data @@ -54,6 +54,9 @@ summary(df$ethnic)/18 / (nrow(df)/18) summary(df$socclass)/18 summary(df$socclass)/18 / (nrow(df)/18) +#remove participants who were not engaged in the task +#TBD, ex. 
150222 + #remove participants whose challenge question accuracy was below 50% (chance = 25%) dfTrim <- df dfTrim <- dfTrim %>% @@ -69,60 +72,19 @@ mean(dfTrim$challengeAvgSub) sd(dfTrim$challengeAvgSub) -### SECTION 2: INITIAL DATA TRIMMING +### SECTION 2: PASSAGE-LEVEL TRIMMING passage_no_before_trimming <- nrow(dfTrim) -#insert passage-level trimming here +#remove passages with high omissions (participant did not complete reading) +##vegas 150013 passage_no_after_trim1 <- nrow(dfTrim) passage_no_before_trimming - passage_no_after_trim1 #number of passages trimmed (passage_no_before_trimming - passage_no_after_trim1) / passage_no_before_trimming #percentage of passages trimmed -### SECTION 3: TRANSITION DATA TO LONG FORMAT -errorDat <- data.frame(matrix(ncol=17, nrow=0)) -colnames(errorDat) <- c("passage", "id", - "sex", "pronouns", "age", "ethnic", "socclass", - "bfne", "phq8", "scaaredTotal", "scaaredGA", "scaaredSoc", "sps", - "lenSyll", "lenWord", "avgSyllPerWord", - "errors") -for(j in 1:nrow(dfTrim)){ - passage <- dfTrim$passage[j] - id <- dfTrim$id[j] - sex <- as.character(dfTrim$sex[j]) - pronouns <- as.character(dfTrim$pronouns[j]) - age <- dfTrim$age[j] - ethnic <- as.character(dfTrim$ethnic[j]) - socclass <- as.character(dfTrim$socclass[j]) - bfne <- dfTrim$bfne[j] - phq8 <- dfTrim$phq8[j] - scaaredTotal <- dfTrim$scaaredTotal[j] - scaaredGA <- dfTrim$scaaredGA[j] - scaaredSoc <- dfTrim$scaaredSoc[j] - sps <- dfTrim$sps[j] - lenSyll <- dfTrim$lenSyll[j] - lenWord <- dfTrim$lenWord[j] - avgSyllPerWord <- dfTrim$avgSyllPerWord[j] - errors <- dfTrim$errors[j] - errorDat[nrow(errorDat) + 1,] <-c(passage, id, sex, pronouns, age, ethnic, socclass, bfne, phq8, scaaredTotal, scaaredGA, scaaredSoc, sps, lenSyll, lenWord, avgSyllPerWord, errors) -} - -#organize data types -errorDat$sex <- as.factor(errorDat$sex) -errorDat$pronouns <- as.factor(errorDat$pronouns) -errorDat$age <- as.numeric(errorDat$age) -errorDat$ethnic <- as.factor(errorDat$ethnic) -errorDat$socclass <- as.factor(errorDat$socclass) -errorDat$bfne <- as.numeric(errorDat$bfne) -errorDat$phq8 <- as.numeric(errorDat$phq8) -errorDat$scaaredTotal <- as.numeric(errorDat$scaaredTotal) -errorDat$scaaredGA <- as.numeric(errorDat$scaaredGA) -errorDat$scaaredSoc <- as.numeric(errorDat$scaaredSoc) -errorDat$sps <- as.numeric(errorDat$sps) -errorDat$lenSyll <- as.numeric(errorDat$lenSyll) -errorDat$lenWord <- as.numeric(errorDat$lenWord) -errorDat$avgSyllPerWord <- as.numeric(errorDat$avgSyllPerWord) -errorDat$errors <- as.numeric(errorDat$errors) +### SECTION 3: ORGANIZE DATA FOR MODELING +errorDat <- dfTrim #modify contrasts for categorical predictors contrasts(errorDat$sex) <- contr.sum(2) #male: -1, female: +1 @@ -138,7 +100,6 @@ errorDat$sps_gmc <- errorDat$sps - mean(errorDat$sps) errorDat$lenSyll_gmc <- errorDat$lenSyll - mean(errorDat$lenSyll) errorDat$lenWord_gmc <- errorDat$lenWord - mean(errorDat$lenWord) errorDat$avgSyllPerWord_gmc <- errorDat$avgSyllPerWord - mean(errorDat$avgSyllPerWord) -errorDat$errors_gmc <- errorDat$errors - mean(errorDat$errors) #extract demo stats errorDatStats <- subset(errorDat, !duplicated(errorDat$id)) @@ -155,5 +116,50 @@ summary(errorDatStats$socclass) / length(unique(errorDatStats$id)) ### SECTION 4: MODEL RESULTS -modelErr <- lmerTest::lmer(errors ~ scaaredTotal_gmc * lenSyll_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) -summary(modelErr) \ No newline at end of file +#misprod_rate x bfne +model1 <- lmerTest::lmer(misprod_rate ~ bfne_gmc + (1|id) + (1|passage), data=errorDat, 
REML=TRUE) +summary(model1) + +#misprod_rate x scaaredSoc +model2 <- lmerTest::lmer(misprod_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model2) + +#misprod_rate x sps +model3 <- lmerTest::lmer(misprod_rate ~ sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model3) + +#hesitation_rate x bfne +model4 <- lmerTest::lmer(hesitation_rate ~ bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model4) + +#hesitation_rate x scaaredSoc +model5 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model5) + +#hesitation_rate x sps +model6 <- lmerTest::lmer(hesitation_rate ~ sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model6) + +#words_with_misprod_rate x bfne +model7 <- lmerTest::lmer(words_with_misprod_rate ~ bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model7) + +#words_with_misprod_rate x scaaredSoc +model8 <- lmerTest::lmer(words_with_misprod_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model8) + +#words_with_misprod_rate x sps +model9 <- lmerTest::lmer(words_with_misprod_rate ~ sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model9) + +#words_with_hes_rate x bfne +model10 <- lmerTest::lmer(words_with_hes_rate ~ bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model10) + +#words_with_hes_rate x scaaredSoc +model11 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model11) + +#words_with_hes_rate x sps +model12 <- lmerTest::lmer(words_with_hes_rate ~ sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +summary(model12) \ No newline at end of file diff --git a/code/prepReadAloudBeta.R b/code/prepReadAloudBeta.R index d7a3c67..e062d58 100644 --- a/code/prepReadAloudBeta.R +++ b/code/prepReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Analysis Preparation # Authors: Luc Sahar and Jessica M. 
Alexander -- NDCLab, Florida International University -# Last updated: 2023-06-14 +# Last updated: 2023-06-30 # INPUTS # data/df: behavioral (error-related) data, for each participant on each passage @@ -48,7 +48,7 @@ main_analyses <- '/Users/jalexand/github/readAloud-valence-beta/' out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' #load input files -data <- paste(main_dataset, 'derivatives/preprocessed/disfluencies_subject-x-passage_20230613_0450pm.csv', sep="", collapse=NULL) +data <- paste(main_dataset, 'derivatives/preprocessed/disfluencies_subject-x-passage_20230616_1229pm.csv', sep="", collapse=NULL) accDat_path <- paste(main_dataset,'derivatives/preprocessed/readAloud_passage-level_summary_20220812.csv', sep="", collapse=NULL) readDat_path <- paste(main_dataset, 'derivatives/analysisStimuli_readDat_20230614.csv', sep="", collapse=NULL) redcap_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019.csv', sep="", collapse=NULL) @@ -63,6 +63,9 @@ accDat$passage <- c("dams", "flying", "bats", "broccoli", "realty", "bees", "dog "cars", "vegas", "sun", "caramel", "congo", "antarctica", "depression", "skunkowl", "grizzly", "mantis", "dentist") #rename passages with short-name +#organize data types +df[,3:30] <- sapply(df[,3:30],as.numeric) + #add missing passages for 150086 so that nrow is divisible by 20 passages_read <- df$passage[which(df$id=="150086")] all_passages <- unique(df$passage) @@ -70,7 +73,7 @@ tempdf <- data.frame(matrix(nrow=0, ncol=ncol(df))) colnames(tempdf) <- colnames(df) for(passage in 1:length(all_passages)){ if(all_passages[passage] %in% passages_read){next}else{ - tempdf[nrow(tempdf) + 1,] <- c("150086", all_passages[passage], rep(NA, 41)) + tempdf[nrow(tempdf) + 1,] <- c("150086", all_passages[passage], rep(NA, 28)) } } df <- rbind(df, tempdf) @@ -156,7 +159,16 @@ for(h in 1:nrow(demoDat)){ for(i in 1:nrow(df)){ subject <- df$id[i] #extract subject number for matching passage <- df$passage[i] #extract passage name for matching - errors <- df$errors[i] #extract total errors + + #production errors of interest + # misprod = raw misproduction errors + # hesitation = raw hesitations + # words_with_misprod = distinct words with misproduction errors + # words_with_hes = distinct words with pre-word or word-internal hesitation + # misprod_rate = rate of raw misproduction errors + # hesitation_rate = rate of raw hesitations + # words_with_misprod_rate = rate of word-level misproduction errors + # words_with_hes_rate = rate of word-level hesitations #extract passage characteristics from readDat df$lenSyll[i] <- sum(readDat$lengthSyll[which(readDat$passage==passage)]) #length of passage in syllables From a5c853ac2f7a48fde4db6ff2929fc222614d08d7 Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 10 Aug 2023 20:24:17 -0400 Subject: [PATCH 04/33] MWE of looking for outliers --- code/analysisReadAloudBeta.R | 43 ++++++++++++++++---- code/prepReadAloudBeta.R | 42 +++++++++++++------- input/passages-to-omit_20230810.csv | 61 +++++++++++++++++++++++++++++ 3 files changed, 124 insertions(+), 22 deletions(-) create mode 100644 input/passages-to-omit_20230810.csv diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index 6fd2724..51b49ab 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Reading Task Analyses # Authors: Luc Sahar, Jessica M. 
Alexander -# Last Updated: 2023-06-30 +# Last Updated: 2023-08-10 # INPUTS # data/df: behavioral data, for each participant on each passage, with relevant participant information and trial-level stimulus information @@ -25,16 +25,29 @@ library(cowplot) library(colorspace) library(colorblindr) +# ``` +# Warning in install.packages : +# package ‘colorblindr’ is not available for this version of R +# +# A version of this package for your version of R might be available elsewhere, +# see the ideas at +# https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages +# ``` + #set up date for output file naming today <- Sys.Date() today <- format(today, "%Y%m%d") #set up directories for input/output data -data <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/readAloudBetaData_20230630.csv' -out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' +# data <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/readAloudBetaData_20230630.csv' +data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230810.csv' +to_omit <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/input/passages-to-omit_20230810.csv' +# out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' +out_path <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/' #read in data -df <- read.csv(data) +df <- read.csv(data, row.names = NULL) +passage_omissions_df <- read.csv(to_omit, row.names = NULL) #organize participant demographic variables df$sex <- as.factor(df$sex) @@ -43,6 +56,9 @@ df$ethnic <- as.factor(df$ethnic) df$socclass <- as.factor(df$socclass) #extract demo stats + +# all these values are just in case they're useful - not needed per se for later +# steps of the logic in this script summary(df$age) sd(df$age) summary(df$sex)/18 @@ -55,10 +71,17 @@ summary(df$socclass)/18 summary(df$socclass)/18 / (nrow(df)/18) #remove participants who were not engaged in the task -#TBD, ex. 150222 +dfTrim <- df +# wait, are my removals supposed to be in dfTrim or df? dfTrim, right? 
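+# Quick throwaway sketch (toy values, not part of the pipeline) to sanity-check that
+# the anti_join()/join_by() call used for passage-level trimming below drops only the
+# rows whose id x passage pair appears in the omission list; names here are made up:
+#   toy  <- data.frame(id = c(1, 1, 2), passage = c("sun", "bats", "sun"))
+#   omit <- data.frame(participant = 1, passage = "sun")
+#   dplyr::anti_join(toy, omit, by = dplyr::join_by(id == participant, passage == passage))
+#   # expected: rows (1, "bats") and (2, "sun") remain; only the exact id x passage match is removed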
+ +# removal based on coder notes of audible distractions, others in the room, etc.: + +# 150015 +# 150208 + +dfTrim <- subset(dfTrim, !(id %in% c(150015, 150208))) #remove participants whose challenge question accuracy was below 50% (chance = 25%) -dfTrim <- df dfTrim <- dfTrim %>% group_by(id) %>% mutate(challengeAvgSub = mean(challengeACC)) %>% @@ -77,6 +100,12 @@ passage_no_before_trimming <- nrow(dfTrim) #remove passages with high omissions (participant did not complete reading) ##vegas 150013 +# TODO +# my attempt/idea, idk yet if it works +dfTrim <- anti_join(dfTrim, + passage_omissions_df, + by = join_by(id == participant, passage == passage)) + passage_no_after_trim1 <- nrow(dfTrim) passage_no_before_trimming - passage_no_after_trim1 #number of passages trimmed @@ -162,4 +191,4 @@ summary(model11) #words_with_hes_rate x sps model12 <- lmerTest::lmer(words_with_hes_rate ~ sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) -summary(model12) \ No newline at end of file +summary(model12) diff --git a/code/prepReadAloudBeta.R b/code/prepReadAloudBeta.R index e062d58..35b9808 100644 --- a/code/prepReadAloudBeta.R +++ b/code/prepReadAloudBeta.R @@ -33,7 +33,7 @@ library(readr) # write_csv # ext_default = 'csv' # tz_default = "America/New_York" # date_format_default = "%Y%m%d_%I%M%P" -# +# # build_output_filename <- function(label, ext = ext_default, timezone = tz_default, date_format = date_format_default) { # # `label` may include the destination directory, if different from the working directory when the script is run # current_datetime <- now(timezone) %>% format(date_format) @@ -43,17 +43,26 @@ today <- Sys.Date() today <- format(today, "%Y%m%d") #set up directories for input/output -main_dataset <- '/Users/jalexand/github/readAloud-valence-dataset/' -main_analyses <- '/Users/jalexand/github/readAloud-valence-beta/' -out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' +# main_dataset <- '/Users/jalexand/github/readAloud-valence-dataset/' +# main_analyses <- '/Users/jalexand/github/readAloud-valence-beta/' +# out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' +main_dataset <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-dataset/' +main_analyses <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/' +out_path <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/' + #load input files -data <- paste(main_dataset, 'derivatives/preprocessed/disfluencies_subject-x-passage_20230616_1229pm.csv', sep="", collapse=NULL) +# data <- paste(main_dataset, 'derivatives/preprocessed/disfluencies_subject-x-passage_20230616_1229pm.csv', sep="", collapse=NULL) +data <- "/home/luc/Documents/ndclab/analysis-sandbox/output-csvs/disfluencies_subject-x-passage_20230616_1229pm.csv" accDat_path <- paste(main_dataset,'derivatives/preprocessed/readAloud_passage-level_summary_20220812.csv', sep="", collapse=NULL) readDat_path <- paste(main_dataset, 'derivatives/analysisStimuli_readDat_20230614.csv', sep="", collapse=NULL) redcap_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019.csv', sep="", collapse=NULL) agedat_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019_ageonly.csv', sep="", collapse=NULL) +# c(data, accDat_path, readDat_path, redcap_path, agedat_path) %>% fs::as_fs_path() %>% fs::is_file() +# ✅: all TRUE + + df <- read.csv(data) redcap <- read.csv(redcap_path, na.strings='NA') #participant questionnaire responses agedat <- 
read.csv(agedat_path, na.strings='NA') #participant age information @@ -159,7 +168,7 @@ for(h in 1:nrow(demoDat)){ for(i in 1:nrow(df)){ subject <- df$id[i] #extract subject number for matching passage <- df$passage[i] #extract passage name for matching - + #production errors of interest # misprod = raw misproduction errors # hesitation = raw hesitations @@ -169,15 +178,15 @@ for(i in 1:nrow(df)){ # hesitation_rate = rate of raw hesitations # words_with_misprod_rate = rate of word-level misproduction errors # words_with_hes_rate = rate of word-level hesitations - + #extract passage characteristics from readDat df$lenSyll[i] <- sum(readDat$lengthSyll[which(readDat$passage==passage)]) #length of passage in syllables df$lenWord[i] <- sum(readDat$lengthWord[which(readDat$passage==passage)]) #length of passage in words df$avgSyllPerWord[i] <- df$lenSyll[i]/df$lenWord[i] - + #extract participant accuracy from accDat df$challengeACC[i] <- accDat[match(passage, accDat$passage), as.character(subject)] #passage-specific challenge question accuracy for subject - + #extract participant demographics from demoDat df$sex[i] <- demoDat$sex[match(df$id[i], demoDat$record_id)] #participant biological sex df$pronouns[i] <- demoDat$pron[match(df$id[i], demoDat$record_id)] #participant preferred pronouns @@ -211,6 +220,9 @@ df$socclass <- as.factor(df$socclass) #sum(df$profen>3, na.rm=TRUE)/20 #one remaining subject (150060) rates own English proficiency as not "elementary" or "not proficient", but reads fluidly and achieved 80% accuracy on challenge questions, so not excluded #extract age and sex stats + +# all these values are just in case they're useful - not needed per se for later +# steps of the logic in this script summary(df$age) #age range and mean sd(df$age) #age standard deviation summary(df$sex)/20 #number of participants by sex @@ -229,22 +241,22 @@ dfTrim <- subset(dfTrim, !(dfTrim$passage=='sun')) #remove sun passage due to er ### SECTION 5: OUTPUT DATAFRAME -write.csv(dfTrim, paste(out_path, "readAloudBetaData_", today, ".csv", sep="", collapse=NULL)) +write.csv(dfTrim, paste(out_path, "readAloudBetaData_", today, ".csv", sep="", collapse=NULL), row.names = FALSE) # collapse_by_participant <- function(filename_in, filename_out) { # by_participant <- read_csv(filename_in) %>% # unique %>% # dedup # group_by(id) %>% summarize(across(misprod:total_uncorrected_errors, sum)) # summarize by participant, across all passages # # TODO change the columns selected, once more have been added to the output of the preproc script -# +# # write_csv(by_participant, filename_out) # return(filename_out) # } -# -# +# +# # # base = "~/Documents/ndclab/analysis-sandbox/github-structure-mirror/readAloud-valence-dataset/derivatives/preprocessed" # base = "/home/data/NDClab/datasets/readAloud-valence-dataset/derivatives/preprocessed" # preprocessed_summary_filename = "TODO" # collapsed_filename = build_output_filename(label = paste(base, "disfluencies_subject", sep='/')) -# -# collapse_by_participant(preprocessed_summary_file, collapsed_filename) \ No newline at end of file +# +# collapse_by_participant(preprocessed_summary_file, collapsed_filename) diff --git a/input/passages-to-omit_20230810.csv b/input/passages-to-omit_20230810.csv new file mode 100644 index 0000000..07827be --- /dev/null +++ b/input/passages-to-omit_20230810.csv @@ -0,0 +1,61 @@ +participant,passage +150005,vegas +150013,vegas +150022,depression +150026,dams +150030,grizzly +150034,realty +150034,skunkowl +150036,caramel +150036,dams +150056,sun 
+150079,caramel +150079,depression +150079,realty +150079,mantis +150083,bees +150083,cars +150083,skunkowl +150083,broccoli +150086,dams +150086,antarctica +150086,dentist +150086,dogshow +150086,dolphins +150086,flying +150086,grizzly +150086,mantis +150088,grizzly +150088,dams +150091,caramel +150146,caramel +150146,mantis +150146,dogshow +150196,dams +150196,cars +150196,dentist +150196,broccoli +150196,depression +150208,dams +150216,bees +150222,dogshow +150222,sun +150222,mantis +150222,bats +150242,bats +150242,dentist +150245,icefishing +150245,dentist +150245,congo +150245,cars +150245,skunkowl +150245,broccoli +150245,antarctica +150245,grizzly +150245,dogshow +150245,sun +150252,bees +150252,cars +150252,congo +150274,flying +150299,vegas From 95c112f3ee628de2407a8bf3a8b54f4f0b20eb0c Mon Sep 17 00:00:00 2001 From: l-acs Date: Sat, 12 Aug 2023 10:23:37 -0400 Subject: [PATCH 05/33] Add some models for follow-up analyses --- code/analysisReadAloudBeta.R | 252 ++++++++++++++++++++++++++++++++--- code/prepReadAloudBeta.R | 1 + 2 files changed, 236 insertions(+), 17 deletions(-) diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index 51b49ab..905adbb 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Reading Task Analyses # Authors: Luc Sahar, Jessica M. Alexander -# Last Updated: 2023-08-10 +# Last Updated: 2023-08-11 # INPUTS # data/df: behavioral data, for each participant on each passage, with relevant participant information and trial-level stimulus information @@ -78,8 +78,11 @@ dfTrim <- df # 150015 # 150208 +# 150245 had many passages that were entirely or near-entirely inaudible; the +# rest were dropped too under the assumption that the audible ones too +# would be too faint to identify errors in -dfTrim <- subset(dfTrim, !(id %in% c(150015, 150208))) +dfTrim <- subset(dfTrim, !(id %in% c(150015, 150208, 150245))) #remove participants whose challenge question accuracy was below 50% (chance = 25%) dfTrim <- dfTrim %>% @@ -99,9 +102,7 @@ sd(dfTrim$challengeAvgSub) passage_no_before_trimming <- nrow(dfTrim) #remove passages with high omissions (participant did not complete reading) -##vegas 150013 -# TODO -# my attempt/idea, idk yet if it works +# e.g. 
vegas 150013 dfTrim <- anti_join(dfTrim, passage_omissions_df, by = join_by(id == participant, passage == passage)) @@ -130,6 +131,11 @@ errorDat$lenSyll_gmc <- errorDat$lenSyll - mean(errorDat$lenSyll) errorDat$lenWord_gmc <- errorDat$lenWord - mean(errorDat$lenWord) errorDat$avgSyllPerWord_gmc <- errorDat$avgSyllPerWord - mean(errorDat$avgSyllPerWord) +# LS additions 8/11/23 +errorDat$corrections <- errorDat$corrections - mean(errorDat$corrections) +errorDat$errors <- errorDat$errors - mean(errorDat$errors) + + #extract demo stats errorDatStats <- subset(errorDat, !duplicated(errorDat$id)) summary(errorDatStats$age) @@ -146,49 +152,261 @@ summary(errorDatStats$socclass) / length(unique(errorDatStats$id)) ### SECTION 4: MODEL RESULTS #misprod_rate x bfne -model1 <- lmerTest::lmer(misprod_rate ~ bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model1 <- lmerTest::lmer(misprod_rate ~ bfne_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model1) #misprod_rate x scaaredSoc -model2 <- lmerTest::lmer(misprod_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model2 <- lmerTest::lmer(misprod_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model2) #misprod_rate x sps -model3 <- lmerTest::lmer(misprod_rate ~ sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model3 <- lmerTest::lmer(misprod_rate ~ sps_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model3) #hesitation_rate x bfne -model4 <- lmerTest::lmer(hesitation_rate ~ bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model4 <- lmerTest::lmer(hesitation_rate ~ bfne_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model4) #hesitation_rate x scaaredSoc -model5 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model5 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model5) #hesitation_rate x sps -model6 <- lmerTest::lmer(hesitation_rate ~ sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model6 <- lmerTest::lmer(hesitation_rate ~ sps_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model6) #words_with_misprod_rate x bfne -model7 <- lmerTest::lmer(words_with_misprod_rate ~ bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model7 <- lmerTest::lmer(words_with_misprod_rate ~ bfne_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model7) #words_with_misprod_rate x scaaredSoc -model8 <- lmerTest::lmer(words_with_misprod_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model8 <- lmerTest::lmer(words_with_misprod_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model8) #words_with_misprod_rate x sps -model9 <- lmerTest::lmer(words_with_misprod_rate ~ sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model9 <- lmerTest::lmer(words_with_misprod_rate ~ sps_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model9) #words_with_hes_rate x bfne -model10 <- lmerTest::lmer(words_with_hes_rate ~ bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model10 <- lmerTest::lmer(words_with_hes_rate ~ bfne_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model10) #words_with_hes_rate x scaaredSoc -model11 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model11 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + (1|id) + 
(1|passage), + data=errorDat, REML=TRUE) summary(model11) #words_with_hes_rate x sps -model12 <- lmerTest::lmer(words_with_hes_rate ~ sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) +model12 <- lmerTest::lmer(words_with_hes_rate ~ sps_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) summary(model12) + + +#### supplemental analyses +# see notes + + + + + + +# glmer(accuracy ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, family="binomial") +# "f_" : follow-up + +# Accuracy/comprehension as explained by social anxiety: scaaredSoc +f_model1 <- glmer(challengeACC ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model1) + +# Accuracy/comprehension as explained by social anxiety: bfne +f_model2 <- glmer(challengeACC ~ bfne_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model2) + + +# Accuracy/comprehension as explained by social anxiety: sps +f_model3 <- glmer(challengeACC ~ sps_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model3) + + +# Accuracy/comprehension as explained by disfluencies: hesitations per syllable +f_model4 <- glmer(challengeACC ~ hesitation_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model4) + +# Accuracy/comprehension as explained by disfluencies: hesitations per word +f_model5 <- glmer(challengeACC ~ words_with_hes_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model5) + + +# Accuracy/comprehension as explained by errors: misproductions per syllable +f_model6 <- glmer(challengeACC ~ misprod_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model6) + +# Accuracy/comprehension as explained by errors: misproductions per word +f_model7 <- glmer(challengeACC ~ words_with_misprod_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model7) + + + + +# Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per syllable with scaared +f_model8 <- glmer(challengeACC ~ hesitation_rate * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model8) + +# Accuracy/comprehension as explained by disfluencies: hesitations per word with scaared +f_model9 <- glmer(challengeACC ~ words_with_hes_rate * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model9) + + +# Accuracy/comprehension as explained by errors: misproductions per syllable with scaared +f_model10 <- glmer(challengeACC ~ misprod_rate * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model10) + +# Accuracy/comprehension as explained by errors: misproductions per word with scaared +f_model11 <- glmer(challengeACC ~ words_with_misprod_rate * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model11) + + + +# Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per syllable with bfne +f_model12 <- glmer(challengeACC ~ hesitation_rate * bfne_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model12) + +# Accuracy/comprehension as explained by disfluencies: hesitations per word with bfne +f_model13 <- glmer(challengeACC ~ words_with_hes_rate * bfne_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model13) + + +# Accuracy/comprehension as explained by errors: misproductions per syllable with bfne +f_model14 <- glmer(challengeACC ~ misprod_rate * bfne_gmc + 
(1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model14) + +# Accuracy/comprehension as explained by errors: misproductions per word with bfne +f_model15 <- glmer(challengeACC ~ words_with_misprod_rate * bfne_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model15) + + + +# Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per syllable with sps +f_model16 <- glmer(challengeACC ~ hesitation_rate * sps_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model16) + +# Accuracy/comprehension as explained by disfluencies: hesitations per word with sps +f_model17 <- glmer(challengeACC ~ words_with_hes_rate * sps_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model17) + + +# Accuracy/comprehension as explained by errors: misproductions per syllable with sps +f_model18 <- glmer(challengeACC ~ misprod_rate * sps_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model18) + +# Accuracy/comprehension as explained by errors: misproductions per word with sps +f_model19 <- glmer(challengeACC ~ words_with_misprod_rate * sps_gmc + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model19) + + + +# Now, misproduction-hesitation interactions + +# Errors as explained by disfluency: rate of misproduced syllables from rate of hesitated syllables +f_model20 <- lmerTest::lmer(misprod_rate ~ hesitation_rate + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model20) + +# Errors as explained by disfluency: rate of misproduced words from rate of hesitated words +f_model21 <- lmerTest::lmer(words_with_misprod_rate ~ words_with_hes_rate + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model21) + + +# Errors as explained by disfluency: rate of misproduced words from rate of hesitated syllables +f_model22 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model22) + + + +# Now, misproduction-hesitation interactions with social anxiety + +# Errors as explained by disfluency and SA: rate of misproduced syllables from rate of hesitated syllables and scaared +f_model23 <- lmerTest::lmer(misprod_rate ~ hesitation_rate * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model23) + +# Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated words and scaared +f_model24 <- lmerTest::lmer(words_with_misprod_rate ~ words_with_hes_rate * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model24) + + +# Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated syllables and scaared +f_model25 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model25) + + +# Errors as explained by disfluency and SA: rate of misproduced syllables from rate of hesitated syllables and bfne +f_model26 <- lmerTest::lmer(misprod_rate ~ hesitation_rate * bfne_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model26) + +# Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated words and bfne +f_model27 <- lmerTest::lmer(words_with_misprod_rate ~ words_with_hes_rate * bfne_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model27) + + +# Errors as explained by disfluency and SA: rate of misproduced words from 
rate of hesitated syllables and bfne +f_model28 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate * bfne_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model28) + + +# Errors as explained by disfluency and SA: rate of misproduced syllables from rate of hesitated syllables and sps +f_model29 <- lmerTest::lmer(misprod_rate ~ hesitation_rate * sps_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model29) + +# Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated words and sps +f_model30 <- lmerTest::lmer(words_with_misprod_rate ~ words_with_hes_rate * sps_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model30) + + +# Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated syllables and sps +f_model31 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate * sps_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(f_model31) + + + + + + diff --git a/code/prepReadAloudBeta.R b/code/prepReadAloudBeta.R index 35b9808..407fcd5 100644 --- a/code/prepReadAloudBeta.R +++ b/code/prepReadAloudBeta.R @@ -223,6 +223,7 @@ df$socclass <- as.factor(df$socclass) # all these values are just in case they're useful - not needed per se for later # steps of the logic in this script + summary(df$age) #age range and mean sd(df$age) #age standard deviation summary(df$sex)/20 #number of participants by sex From a6656cb4dfd8c4006f130db2e08c72c214a2e922 Mon Sep 17 00:00:00 2001 From: l-acs Date: Sat, 12 Aug 2023 23:45:19 -0400 Subject: [PATCH 06/33] Comment some things and add additional follow-up models --- code/analysisReadAloudBeta.R | 126 +++++++++++++++++++++++++++++------ 1 file changed, 106 insertions(+), 20 deletions(-) diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index 905adbb..dd159c0 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Reading Task Analyses # Authors: Luc Sahar, Jessica M. Alexander -# Last Updated: 2023-08-11 +# Last Updated: 2023-08-12 # INPUTS # data/df: behavioral data, for each participant on each passage, with relevant participant information and trial-level stimulus information @@ -46,8 +46,8 @@ to_omit <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/input/passa out_path <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/' #read in data -df <- read.csv(data, row.names = NULL) -passage_omissions_df <- read.csv(to_omit, row.names = NULL) +df <- read.csv(data, row.names = NULL) # output of prep script +passage_omissions_df <- read.csv(to_omit, row.names = NULL) # hand-crafted list of participant x passage entries to exclude, based on coder comments #organize participant demographic variables df$sex <- as.factor(df$sex) @@ -72,16 +72,13 @@ summary(df$socclass)/18 / (nrow(df)/18) #remove participants who were not engaged in the task dfTrim <- df -# wait, are my removals supposed to be in dfTrim or df? dfTrim, right? 
# removal based on coder notes of audible distractions, others in the room, etc.: - # 150015 # 150208 # 150245 had many passages that were entirely or near-entirely inaudible; the # rest were dropped too under the assumption that the audible ones too # would be too faint to identify errors in - dfTrim <- subset(dfTrim, !(id %in% c(150015, 150208, 150245))) #remove participants whose challenge question accuracy was below 50% (chance = 25%) @@ -91,7 +88,7 @@ dfTrim <- dfTrim %>% ungroup dfTrim <- subset(dfTrim, challengeAvgSub>0.5) -length(unique(df$id)) - length(unique(dfTrim$id)) #number of participants removed +length(unique(df$id)) - length(unique(dfTrim$id)) #number of participants removed due to distraction or low accuracy #calculate average accuracy mean(dfTrim$challengeAvgSub) @@ -101,7 +98,7 @@ sd(dfTrim$challengeAvgSub) ### SECTION 2: PASSAGE-LEVEL TRIMMING passage_no_before_trimming <- nrow(dfTrim) -#remove passages with high omissions (participant did not complete reading) +#remove passages with high omissions (participant did not complete reading) or other problems (someone else is in the room, etc.) # e.g. vegas 150013 dfTrim <- anti_join(dfTrim, passage_omissions_df, @@ -132,8 +129,10 @@ errorDat$lenWord_gmc <- errorDat$lenWord - mean(errorDat$lenWord) errorDat$avgSyllPerWord_gmc <- errorDat$avgSyllPerWord - mean(errorDat$avgSyllPerWord) # LS additions 8/11/23 -errorDat$corrections <- errorDat$corrections - mean(errorDat$corrections) -errorDat$errors <- errorDat$errors - mean(errorDat$errors) +# errorDat$errors <- errorDat$errors - mean(errorDat$errors) +# errorDat$correction <- errorDat$corrections - mean(errorDat$corrections) +errorDat$error_rate <- errorDat$errors / errorDat$lenSyll +errorDat$correction_rate <- errorDat$corrections / errorDat$lenSyll #extract demo stats @@ -292,18 +291,18 @@ f_model12 <- glmer(challengeACC ~ hesitation_rate * bfne_gmc + (1|id) + (1|passa data=errorDat, family = "binomial") summary(f_model12) -# Accuracy/comprehension as explained by disfluencies: hesitations per word with bfne +# Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per word with bfne f_model13 <- glmer(challengeACC ~ words_with_hes_rate * bfne_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model13) -# Accuracy/comprehension as explained by errors: misproductions per syllable with bfne +# Accuracy/comprehension as explained by errors *and* SA: misproductions per syllable with bfne f_model14 <- glmer(challengeACC ~ misprod_rate * bfne_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model14) -# Accuracy/comprehension as explained by errors: misproductions per word with bfne +# Accuracy/comprehension as explained by errors *and* SA: misproductions per word with bfne f_model15 <- glmer(challengeACC ~ words_with_misprod_rate * bfne_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model15) @@ -315,41 +314,41 @@ f_model16 <- glmer(challengeACC ~ hesitation_rate * sps_gmc + (1|id) + (1|passag data=errorDat, family = "binomial") summary(f_model16) -# Accuracy/comprehension as explained by disfluencies: hesitations per word with sps +# Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per word with sps f_model17 <- glmer(challengeACC ~ words_with_hes_rate * sps_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model17) -# Accuracy/comprehension as explained by errors: misproductions per syllable with sps +# Accuracy/comprehension as 
explained by errors *and* SA: misproductions per syllable with sps f_model18 <- glmer(challengeACC ~ misprod_rate * sps_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model18) -# Accuracy/comprehension as explained by errors: misproductions per word with sps +# Accuracy/comprehension as explained by errors *and* SA: misproductions per word with sps f_model19 <- glmer(challengeACC ~ words_with_misprod_rate * sps_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model19) -# Now, misproduction-hesitation interactions +# Now, misproduction-hesitation relationships # Errors as explained by disfluency: rate of misproduced syllables from rate of hesitated syllables f_model20 <- lmerTest::lmer(misprod_rate ~ hesitation_rate + (1|id) + (1|passage), data=errorDat, REML=TRUE) -summary(f_model20) +summary(f_model20) # *** # Errors as explained by disfluency: rate of misproduced words from rate of hesitated words f_model21 <- lmerTest::lmer(words_with_misprod_rate ~ words_with_hes_rate + (1|id) + (1|passage), data=errorDat, REML=TRUE) -summary(f_model21) +summary(f_model21) # *** # Errors as explained by disfluency: rate of misproduced words from rate of hesitated syllables f_model22 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate + (1|id) + (1|passage), data=errorDat, REML=TRUE) -summary(f_model22) +summary(f_model22) # *** @@ -406,7 +405,94 @@ f_model31 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate * sps_gmc summary(f_model31) +# LS ideas: +# error rate ~ SA +# correction rate ~ error rate * SA + +# comprehension ~ correction rate +# comprehension ~ SA * correction rate +# comprehension ~ error rate +# comprehension ~ SA * error rate +# comprehension ~ error rate * correction rate +# comprehension ~ error rate * correction rate * SA + +# comprehension ~ hes rate * error rate +# comprehension ~ hes rate * correction rate + +# comprehension ~ hes rate * error rate * SA +# comprehension ~ hes rate * correction rate * SA +# comprehension ~ hes rate * error rate * correction rate * SA + +# error rate ~ hes rate * SA +# correction rate ~ hes rate * SA, control for error rate + +# corr_ : corrections rate +# err_ : errors rate + + +# error rate as explained by social anxiety +fls_model_err_scaared <- lmer(error_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(fls_model_err_scaared) + +# correction rate as explained by social anxiety +fls_model_corr_scaared <- lmer(correction_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(fls_model_corr_scaared) + +# correction rate as explained by social anxiety and errors +fls_model_corr_scaared_err <- lmer(correction_rate ~ scaaredSoc_gmc * error_rate + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(fls_model_corr_scaared_err) + +# comprehension ~ correction rate +fls_model_comp_corr <- glmer(challengeACC ~ correction_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(fls_model_comp_corr) + +# comprehension ~ SA * correction rate +fls_model_comp_scaared_corr <- glmer(challengeACC ~ scaaredSoc_gmc * correction_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(fls_model_comp_scaared_corr) # * + +# comprehension ~ error rate +fls_model_comp_err <- glmer(challengeACC ~ error_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(fls_model_comp_err) + +# comprehension ~ SA * error rate +fls_model_comp_scaared_err <- glmer(challengeACC ~ 
scaaredSoc_gmc * error_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(fls_model_comp_scaared_err) + +# comprehension ~ error rate * correction rate +fls_model_comp_err_corr <- glmer(challengeACC ~ error_rate * correction_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(fls_model_comp_err_corr) + +# comprehension ~ hes rate * error rate +fls_model_comp_hes_err <- glmer(challengeACC ~ hesitation_rate * error_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(fls_model_comp_hes_err) + +# comprehension ~ hes rate * correction rate +fls_model_comp_hes_corr <- glmer(challengeACC ~ hesitation_rate * correction_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(fls_model_comp_hes_corr) # *** + + +# Check the main one of interest, comprehension as explained by SA and +# corrections (see fls_model_comp_scaared_corr above), with all three tests +f_model_comp_scaared_corr <- glmer(challengeACC ~ scaaredSoc_gmc * correction_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model_comp_scaared_corr) # * +f_model_comp_bfne_corr <- glmer(challengeACC ~ bfne_gmc * correction_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model_comp_bfne_corr) # N.S. +f_model_comp_sps_corr <- glmer(challengeACC ~ sps_gmc * correction_rate + (1|id) + (1|passage), + data=errorDat, family = "binomial") +summary(f_model_comp_sps_corr) # N.S., but p for interaction is at 0.0698 From 7d5efa67db6c78fce5e1279084beb41721ae5b5b Mon Sep 17 00:00:00 2001 From: l-acs Date: Wed, 16 Aug 2023 19:37:28 -0400 Subject: [PATCH 07/33] WIP: Reading speed analysis --- code/analysisReadAloudBeta.R | 109 +++++++++++++++++++++++++++++++++-- code/prepReadAloudBeta.R | 26 ++++++--- 2 files changed, 124 insertions(+), 11 deletions(-) diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index dd159c0..7b71881 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Reading Task Analyses # Authors: Luc Sahar, Jessica M. 
Alexander -# Last Updated: 2023-08-12 +# Last Updated: 2023-08-16 # INPUTS # data/df: behavioral data, for each participant on each passage, with relevant participant information and trial-level stimulus information @@ -40,7 +40,9 @@ today <- format(today, "%Y%m%d") #set up directories for input/output data # data <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/readAloudBetaData_20230630.csv' -data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230810.csv' +# data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230810.csv' +# data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230815.csv' +data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230816.csv' to_omit <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/input/passages-to-omit_20230810.csv' # out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' out_path <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/' @@ -55,6 +57,9 @@ df$pronouns <- as.factor(df$pronouns) df$ethnic <- as.factor(df$ethnic) df$socclass <- as.factor(df$socclass) + +# FIXME: stats on df and dfTrim for reading speed are NA + #extract demo stats # all these values are just in case they're useful - not needed per se for later @@ -77,8 +82,8 @@ dfTrim <- df # 150015 # 150208 # 150245 had many passages that were entirely or near-entirely inaudible; the -# rest were dropped too under the assumption that the audible ones too -# would be too faint to identify errors in +# rest were dropped too under the assumption that the audible ones too would be +# too faint to identify errors in dfTrim <- subset(dfTrim, !(id %in% c(150015, 150208, 150245))) #remove participants whose challenge question accuracy was below 50% (chance = 25%) @@ -94,6 +99,12 @@ length(unique(df$id)) - length(unique(dfTrim$id)) #number of participants remove mean(dfTrim$challengeAvgSub) sd(dfTrim$challengeAvgSub) +# calculate average speed +mean(dfTrim$timePerSyllable, na.rm=TRUE) +sd(dfTrim$timePerSyllable, na.rm=TRUE) +mean(dfTrim$timePerWord, na.rm=TRUE) +sd(dfTrim$timePerWord, na.rm=TRUE) + ### SECTION 2: PASSAGE-LEVEL TRIMMING passage_no_before_trimming <- nrow(dfTrim) @@ -110,6 +121,34 @@ passage_no_before_trimming - passage_no_after_trim1 #number of passages trimmed (passage_no_before_trimming - passage_no_after_trim1) / passage_no_before_trimming #percentage of passages trimmed +# band-aid fix: remove passages without reading speed data so that we can run +# our analyses on them nonetheless +# todo + +# these are the only four passages without reading time data... +# and incidentally? well, see their comments here... 
+c(150013, "vegas") # N.B.: 161 omitted syllables of 318 total in passage +c(150022, "depression") # N.B.: 160 omitted syllables of 362 total in passage +c(150083, "caramel") # N.B.: only one of four passages to have >= 5% of syllables omitted +c(150083, "cars") # N.B.: only one of four passages to have >= 5% of syllables omitted + + +dfTrim <- filter(dfTrim, !is.na(timePerSyllable)) +# itself, but without ones for which we have no reading data + + + +"TODO" + +passage_no_after_trim2 <- nrow(dfTrim) +passage_no_after_trim1 - passage_no_after_trim2 #number of passages trimmed +(passage_no_after_trim1 - passage_no_after_trim2) / passage_no_after_trim1 #percentage of passages trimmed of last bunch +(passage_no_after_trim1 - passage_no_after_trim2) / passage_no_before_trimming #percentage of passages trimmed of whole + + + + + ### SECTION 3: ORGANIZE DATA FOR MODELING errorDat <- dfTrim @@ -134,6 +173,11 @@ errorDat$avgSyllPerWord_gmc <- errorDat$avgSyllPerWord - mean(errorDat$avgSyllPe errorDat$error_rate <- errorDat$errors / errorDat$lenSyll errorDat$correction_rate <- errorDat$corrections / errorDat$lenSyll +errorDat$timePerSyllable_gmc <- errorDat$timePerSyllable - mean(errorDat$timePerSyllable) +errorDat$timePerWord_gmc <- errorDat$timePerWord - mean(errorDat$timePerWord) + +# todo center avg reading speed, probably + #extract demo stats errorDatStats <- subset(errorDat, !duplicated(errorDat$id)) @@ -148,6 +192,12 @@ summary(errorDatStats$ethnic) / length(unique(errorDatStats$id)) summary(errorDatStats$socclass) summary(errorDatStats$socclass) / length(unique(errorDatStats$id)) +# Reading speed stats (ls additions 8/16/23) +summary(errorDatStats$timePerSyllable) +summary(errorDatStats$timePerWord) + +summary(errorDatStats$timePerSyllable_gmc) +summary(errorDatStats$timePerWord_gmc) ### SECTION 4: MODEL RESULTS #misprod_rate x bfne @@ -405,6 +455,57 @@ f_model31 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate * sps_gmc summary(f_model31) + +# Now: see if reading speed plays into it + +# Does scaaredSoc predict reading speed? +# syllable level +rs_model_1 <- lmerTest::lmer(timePerSyllable_gmc ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_1) + +# word level +rs_model_2 <- lmerTest::lmer(timePerWord_gmc ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_2) + + +rs_model_1_bfne <- lmerTest::lmer(timePerSyllable_gmc ~ bfne_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_1_bfne) + +# word level +rs_model_2_bfne <- lmerTest::lmer(timePerWord_gmc ~ bfne_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_2_bfne) + +rs_model_1_sps <- lmerTest::lmer(timePerSyllable_gmc ~ sps_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_1_sps) + +# word level +rs_model_2_bfne <- lmerTest::lmer(timePerWord_gmc ~ sps_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_2_sps) + + + +# And now ->> check out work +# Does our hesitation ~ scaaredSoc finding hold with reading speed controlled for? 
+# syllable level +rs_model_3 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + timePerSyllable_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_3) + +# word level +rs_model_4 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + timePerWord_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_4) + + +# todo: check models 1 and 2 when hesitation rate is held still + + # LS ideas: # error rate ~ SA # correction rate ~ error rate * SA diff --git a/code/prepReadAloudBeta.R b/code/prepReadAloudBeta.R index 407fcd5..878e824 100644 --- a/code/prepReadAloudBeta.R +++ b/code/prepReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Analysis Preparation # Authors: Luc Sahar and Jessica M. Alexander -- NDCLab, Florida International University -# Last updated: 2023-06-30 +# Last updated: 2023-08-15 # INPUTS # data/df: behavioral (error-related) data, for each participant on each passage @@ -58,12 +58,13 @@ accDat_path <- paste(main_dataset,'derivatives/preprocessed/readAloud_passage-le readDat_path <- paste(main_dataset, 'derivatives/analysisStimuli_readDat_20230614.csv', sep="", collapse=NULL) redcap_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019.csv', sep="", collapse=NULL) agedat_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019_ageonly.csv', sep="", collapse=NULL) +speedDat_path <- paste(main_dataset, "derivatives/preprocessed/valence-timing/timingpitch_subject-by-passage_2022-09-09.csv", sep="", collapse=NULL) -# c(data, accDat_path, readDat_path, redcap_path, agedat_path) %>% fs::as_fs_path() %>% fs::is_file() +# c(data, accDat_path, readDat_path, redcap_path, agedat_path, speedDat_path) %>% fs::as_fs_path() %>% fs::is_file() # ✅: all TRUE -df <- read.csv(data) +df <- read.csv(data, na.strings='NA') redcap <- read.csv(redcap_path, na.strings='NA') #participant questionnaire responses agedat <- read.csv(agedat_path, na.strings='NA') #participant age information readDat <- read.csv(readDat_path, na.strings='N') #passage-level characteristics from analysisStimuli.R @@ -71,6 +72,7 @@ accDat <- read.csv(accDat_path, na.strings='NA', check.names=FALSE) #passage lev accDat$passage <- c("dams", "flying", "bats", "broccoli", "realty", "bees", "dogshow", "dolphins", "icefishing", "cars", "vegas", "sun", "caramel", "congo", "antarctica", "depression", "skunkowl", "grizzly", "mantis", "dentist") #rename passages with short-name +speedDat <- read.csv(speedDat_path, na.strings='NA') #organize data types df[,3:30] <- sapply(df[,3:30],as.numeric) @@ -164,7 +166,13 @@ for(h in 1:nrow(demoDat)){ } -### SECTION 3: BUILD TRIAL-LEVEL DF (ADD DEMODAT, READDAT, and ACCDAT to DF) +### SECTION 3: SET UP DERIVED FIELDS FOR SPEED ANALYSES +speedDat$readingTime <- speedDat$readEnd - speedDat$readStart +speedDat$id <- as.character(speedDat$id) # so we can join and it doesn't complain about type comparison +df <- left_join(df, speedDat, by = c("id", "passage")) # now reading timestamps and duration are looped into df + + +### SECTION 4: BUILD TRIAL-LEVEL DF (ADD DEMODAT, READDAT, and ACCDAT to DF) for(i in 1:nrow(df)){ subject <- df$id[i] #extract subject number for matching passage <- df$passage[i] #extract passage name for matching @@ -212,8 +220,12 @@ df$pronouns <- as.factor(df$pronouns) df$ethnic <- as.factor(df$ethnic) df$socclass <- as.factor(df$socclass) +# compute speed +df$timePerSyllable <- df$readingTime / df$lenSyll +df$timePerWord <- df$readingTime / df$lenWord + 
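# A minimal toy sketch of how these derived speed fields behave (editorial
# illustration; values invented, column names as in this script). Readings
# whose (id, passage) pair is absent from speedDat leave the left_join with
# readingTime = NA, so both speed fields are NA as well; that is why the
# analysis script later drops such rows with filter(!is.na(timePerSyllable)).
toy <- data.frame(readingTime = c(95.2, NA), lenSyll = c(280, 318), lenWord = c(170, 190))
toy$timePerSyllable <- toy$readingTime / toy$lenSyll # 0.34, NA
toy$timePerWord     <- toy$readingTime / toy$lenWord # 0.56, NA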
-### SECTION 3: CROSS-CHECK ALL PARTICIPANTS MET INCLUSION CRITERIA +### SECTION 5: CROSS-CHECK ALL PARTICIPANTS MET INCLUSION CRITERIA #note: given the time required to annotated errors, only participants who met inclusion criteria were annotated #sum(df$eng==1 & df$langhis %in% c(2,4) & df$ageen>6) #confirm all subjects monolingual English OR natively bilingual OR learned English before age 6 #sum(df$commdis>0) #confirm no subject diagnosed with any communication disorder @@ -236,12 +248,12 @@ summary(df$socclass)/20 #number of participants by social class affiliation summary(df$socclass)/20 / (nrow(df)/20) #percentage of participants by social class affiliation -### SECTION 4: TRIM PASSAGES DUE TO EXPERIMENTER ERROR +### SECTION 6: TRIM PASSAGES DUE TO EXPERIMENTER ERROR dfTrim <- subset(df, !(df$passage=='broccoli')) #remove broccoli passage due to typo in the last sentence as presented on-screen to participants dfTrim <- subset(dfTrim, !(dfTrim$passage=='sun')) #remove sun passage due to error in coding Excel -### SECTION 5: OUTPUT DATAFRAME +### SECTION 7: OUTPUT DATAFRAME write.csv(dfTrim, paste(out_path, "readAloudBetaData_", today, ".csv", sep="", collapse=NULL), row.names = FALSE) # collapse_by_participant <- function(filename_in, filename_out) { From 9d77a5a109e96b6f32973f1d6d6a2d8d00757a54 Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 17 Aug 2023 09:17:08 -0400 Subject: [PATCH 08/33] WIP: Control for age --- code/analysisReadAloudBeta.R | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index 7b71881..39d9b20 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -490,6 +490,18 @@ summary(rs_model_2_sps) +# What happens when we control for age? +#hesitation_rate x scaaredSoc +age_model1 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + age_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(age_model1) + +#words_with_hes_rate x scaaredSoc +age_model2 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + age_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(age_model2) + + # And now ->> check out work # Does our hesitation ~ scaaredSoc finding hold with reading speed controlled for? # syllable level @@ -506,6 +518,9 @@ summary(rs_model_4) # todo: check models 1 and 2 when hesitation rate is held still + + + # LS ideas: # error rate ~ SA # correction rate ~ error rate * SA From 9f953416ceafa0c7f181313231f7514b95bdf0d6 Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 17 Aug 2023 09:37:07 -0400 Subject: [PATCH 09/33] Check if reading speed mean-centering changes things (it doesn't) --- code/analysisReadAloudBeta.R | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index 39d9b20..2e93828 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -135,6 +135,8 @@ c(150083, "cars") # N.B.: only one of four passages to have >= 5% of sylla dfTrim <- filter(dfTrim, !is.na(timePerSyllable)) # itself, but without ones for which we have no reading data +# this ends up only dropping 0083, caramel - the other three already end up +# getting dropped based on other criteria @@ -490,6 +492,22 @@ summary(rs_model_2_sps) +# Does scaaredSoc predict reading speed? 
+# syllable level +rs_model_3 <- lmerTest::lmer(timePerSyllable ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_3) + +# word level +rs_model_4 <- lmerTest::lmer(timePerWord ~ scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(rs_model_4) + + + + + + # What happens when we control for age? #hesitation_rate x scaaredSoc age_model1 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + age_gmc + (1|id) + (1|passage), From ab8787b9087c94f83b5c7a1ed4e440c22bae2d5d Mon Sep 17 00:00:00 2001 From: l-acs Date: Tue, 22 Aug 2023 21:46:15 -0400 Subject: [PATCH 10/33] WIP: Misproduction-hesitation sequences --- code/analysisReadAloudBeta.R | 163 +++++++++++++---------------------- code/prepReadAloudBeta.R | 9 +- 2 files changed, 65 insertions(+), 107 deletions(-) diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index 2e93828..44a3deb 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Reading Task Analyses # Authors: Luc Sahar, Jessica M. Alexander -# Last Updated: 2023-08-16 +# Last Updated: 2023-08-24 # INPUTS # data/df: behavioral data, for each participant on each passage, with relevant participant information and trial-level stimulus information @@ -42,7 +42,8 @@ today <- format(today, "%Y%m%d") # data <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/readAloudBetaData_20230630.csv' # data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230810.csv' # data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230815.csv' -data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230816.csv' +# data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230816.csv' +data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230821.csv' to_omit <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/input/passages-to-omit_20230810.csv' # out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' out_path <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/' @@ -57,9 +58,6 @@ df$pronouns <- as.factor(df$pronouns) df$ethnic <- as.factor(df$ethnic) df$socclass <- as.factor(df$socclass) - -# FIXME: stats on df and dfTrim for reading speed are NA - #extract demo stats # all these values are just in case they're useful - not needed per se for later @@ -123,7 +121,6 @@ passage_no_before_trimming - passage_no_after_trim1 #number of passages trimmed # band-aid fix: remove passages without reading speed data so that we can run # our analyses on them nonetheless -# todo # these are the only four passages without reading time data... # and incidentally? well, see their comments here... 
@@ -138,10 +135,6 @@ dfTrim <- filter(dfTrim, !is.na(timePerSyllable)) # this ends up only dropping 0083, caramel - the other three already end up # getting dropped based on other criteria - - -"TODO" - passage_no_after_trim2 <- nrow(dfTrim) passage_no_after_trim1 - passage_no_after_trim2 #number of passages trimmed (passage_no_after_trim1 - passage_no_after_trim2) / passage_no_after_trim1 #percentage of passages trimmed of last bunch @@ -178,7 +171,7 @@ errorDat$correction_rate <- errorDat$corrections / errorDat$lenSyll errorDat$timePerSyllable_gmc <- errorDat$timePerSyllable - mean(errorDat$timePerSyllable) errorDat$timePerWord_gmc <- errorDat$timePerWord - mean(errorDat$timePerWord) -# todo center avg reading speed, probably + #extract demo stats @@ -201,6 +194,45 @@ summary(errorDatStats$timePerWord) summary(errorDatStats$timePerSyllable_gmc) summary(errorDatStats$timePerWord_gmc) + +### SECTION 3.5: preparing for misprod-hes sequential analyses + +# ignore the misprod-hes columns for now +errorDatMisprodHes <- select(errorDat, !contains("_syllables")) + +# First: look at a given misproduction and check for nearby hesitations +justMisprodWithHesBefore <- cbind(errorDatMisprodHes, + hes_position = 0, # "before", + misprod_tally = errorDat$misprod_with_hes_in_previous_syllables) + +justMisprodWithHesAfter <- cbind(errorDatMisprodHes, + hes_position = 1, # "after", + misprod_tally = errorDat$misprod_with_hes_in_next_syllables) + + +# stack the ones before and the ones after as rows of a single df (my attempt at long form) +errorDatLongMisprodWithRelHes <- rbind(justMisprodWithHesBefore, justMisprodWithHesAfter) + +# track the binary relative position as a factor +errorDatLongMisprodWithRelHes$hes_position <- as.factor(errorDatLongMisprodWithRelHes$hes_position) + +# Then: look at a given hesitation and check for nearby misproductions +justHesWithMisprodBefore <- cbind(errorDatMisprodHes, + misprod_position = 0, # "before", + hes_tally = errorDat$hes_with_misprod_in_previous_syllables) + +justHesWithMisprodAfter <- cbind(errorDatMisprodHes, + misprod_position = 1, # "after", + hes_tally = errorDat$hes_with_misprod_in_next_syllables) + +# stack the ones before and the ones after as rows of a single df (my attempt at long form) +errorDatLongHesWithRelMisprod <- rbind(justHesWithMisprodBefore, justHesWithMisprodAfter) + +# track the binary relative position as a factor +errorDatLongHesWithRelMisprod$misprod_position <- as.factor(errorDatLongHesWithRelMisprod$misprod_position) + + + ### SECTION 4: MODEL RESULTS #misprod_rate x bfne model1 <- lmerTest::lmer(misprod_rate ~ bfne_gmc + (1|id) + (1|passage), @@ -506,8 +538,6 @@ summary(rs_model_4) - - # What happens when we control for age? #hesitation_rate x scaaredSoc age_model1 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + age_gmc + (1|id) + (1|passage), @@ -520,7 +550,7 @@ age_model2 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + age_gmc + (1 summary(age_model2) -# And now ->> check out work +# And now ->> check our work # Does our hesitation ~ scaaredSoc finding hold with reading speed controlled for? 
# syllable level rs_model_3 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + timePerSyllable_gmc + (1|id) + (1|passage), @@ -533,100 +563,25 @@ rs_model_4 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + timePerWord_ summary(rs_model_4) -# todo: check models 1 and 2 when hesitation rate is held still - - - - - -# LS ideas: -# error rate ~ SA -# correction rate ~ error rate * SA - -# comprehension ~ correction rate -# comprehension ~ SA * correction rate -# comprehension ~ error rate -# comprehension ~ SA * error rate -# comprehension ~ error rate * correction rate -# comprehension ~ error rate * correction rate * SA - -# comprehension ~ hes rate * error rate -# comprehension ~ hes rate * correction rate - -# comprehension ~ hes rate * error rate * SA -# comprehension ~ hes rate * correction rate * SA -# comprehension ~ hes rate * error rate * correction rate * SA - -# error rate ~ hes rate * SA -# correction rate ~ hes rate * SA, control for error rate - -# corr_ : corrections rate -# err_ : errors rate - - -# error rate as explained by social anxiety -fls_model_err_scaared <- lmer(error_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(fls_model_err_scaared) - -# correction rate as explained by social anxiety -fls_model_corr_scaared <- lmer(correction_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(fls_model_corr_scaared) - -# correction rate as explained by social anxiety and errors -fls_model_corr_scaared_err <- lmer(correction_rate ~ scaaredSoc_gmc * error_rate + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(fls_model_corr_scaared_err) - -# comprehension ~ correction rate -fls_model_comp_corr <- glmer(challengeACC ~ correction_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(fls_model_comp_corr) - -# comprehension ~ SA * correction rate -fls_model_comp_scaared_corr <- glmer(challengeACC ~ scaaredSoc_gmc * correction_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(fls_model_comp_scaared_corr) # * - -# comprehension ~ error rate -fls_model_comp_err <- glmer(challengeACC ~ error_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(fls_model_comp_err) - -# comprehension ~ SA * error rate -fls_model_comp_scaared_err <- glmer(challengeACC ~ scaaredSoc_gmc * error_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(fls_model_comp_scaared_err) - -# comprehension ~ error rate * correction rate -fls_model_comp_err_corr <- glmer(challengeACC ~ error_rate * correction_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(fls_model_comp_err_corr) - -# comprehension ~ hes rate * error rate -fls_model_comp_hes_err <- glmer(challengeACC ~ hesitation_rate * error_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(fls_model_comp_hes_err) +# misprod-hes ordering -# comprehension ~ hes rate * correction rate -fls_model_comp_hes_corr <- glmer(challengeACC ~ hesitation_rate * correction_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(fls_model_comp_hes_corr) # *** +# Is the number of hesitations adjacent to misproductions in a particular +# reading predicted by the +# Does the position of misproductions relative to hesitations -# Check the main one of interest, comprehension as explained by SA and -# corrections (see fls_model_comp_scaared_corr above), with all three tests -f_model_comp_scaared_corr <- glmer(challengeACC ~ scaaredSoc_gmc * 
correction_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model_comp_scaared_corr) # * -f_model_comp_bfne_corr <- glmer(challengeACC ~ bfne_gmc * correction_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model_comp_bfne_corr) # N.S. +# we have a number of occurrences of a misproduction in a particular position +# relative to a passage's hesitations. does knowing the position (before/after) +# predict the number of these sequences we have? -f_model_comp_sps_corr <- glmer(challengeACC ~ sps_gmc * correction_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model_comp_sps_corr) # N.S., but p for interaction is at 0.0698 +# does misproduction location relative to a hesitation predict how many +# instances we get in a particular reading? +hes_with_rel_misprod_model_1 <- lmerTest::lmer(hes_tally ~ misprod_position + (1|id) + (1|passage), + data=errorDatLongHesWithRelMisprod, REML=TRUE) +summary(hes_with_rel_misprod_model_1) +misprod_with_rel_hes_model_1 <- lmerTest::lmer(misprod_tally ~ hes_position + (1|id) + (1|passage), + data=errorDatLongMisprodWithRelHes, REML=TRUE) +summary(misprod_with_rel_hes_model_1) diff --git a/code/prepReadAloudBeta.R b/code/prepReadAloudBeta.R index 878e824..04f5476 100644 --- a/code/prepReadAloudBeta.R +++ b/code/prepReadAloudBeta.R @@ -53,7 +53,8 @@ out_path <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivative #load input files # data <- paste(main_dataset, 'derivatives/preprocessed/disfluencies_subject-x-passage_20230616_1229pm.csv', sep="", collapse=NULL) -data <- "/home/luc/Documents/ndclab/analysis-sandbox/output-csvs/disfluencies_subject-x-passage_20230616_1229pm.csv" +# data <- "/home/luc/Documents/ndclab/analysis-sandbox/output-csvs/disfluencies_subject-x-passage_20230616_1229pm.csv" +data <- "/home/luc/Documents/ndclab/analysis-sandbox/output-csvs/disfluencies_subject-x-passage_20230818_1042pm.csv" accDat_path <- paste(main_dataset,'derivatives/preprocessed/readAloud_passage-level_summary_20220812.csv', sep="", collapse=NULL) readDat_path <- paste(main_dataset, 'derivatives/analysisStimuli_readDat_20230614.csv', sep="", collapse=NULL) redcap_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019.csv', sep="", collapse=NULL) @@ -84,7 +85,7 @@ tempdf <- data.frame(matrix(nrow=0, ncol=ncol(df))) colnames(tempdf) <- colnames(df) for(passage in 1:length(all_passages)){ if(all_passages[passage] %in% passages_read){next}else{ - tempdf[nrow(tempdf) + 1,] <- c("150086", all_passages[passage], rep(NA, 28)) + tempdf[nrow(tempdf) + 1,] <- c("150086", all_passages[passage], rep(NA, 30)) } } df <- rbind(df, tempdf) @@ -254,7 +255,9 @@ dfTrim <- subset(dfTrim, !(dfTrim$passage=='sun')) #remove sun passage due to er ### SECTION 7: OUTPUT DATAFRAME -write.csv(dfTrim, paste(out_path, "readAloudBetaData_", today, ".csv", sep="", collapse=NULL), row.names = FALSE) +out_filename <- paste(out_path, "readAloudBetaData_", today, ".csv", sep="", collapse=NULL) +write.csv(dfTrim, out_filename, row.names = FALSE) +out_filename # collapse_by_participant <- function(filename_in, filename_out) { # by_participant <- read_csv(filename_in) %>% From a66e53669c453dc7a5035fb1a4929e86c5102ec1 Mon Sep 17 00:00:00 2001 From: l-acs Date: Fri, 25 Aug 2023 11:12:17 -0400 Subject: [PATCH 11/33] Add prep script for word freq analysis, after `dataset`'s analysisStimuli.R --- code/prepWordFreqReadAloudBeta.R | 138 +++++++++++++++++++++++++++++++ 1 file 
changed, 138 insertions(+) create mode 100644 code/prepWordFreqReadAloudBeta.R diff --git a/code/prepWordFreqReadAloudBeta.R b/code/prepWordFreqReadAloudBeta.R new file mode 100644 index 0000000..e1fd57a --- /dev/null +++ b/code/prepWordFreqReadAloudBeta.R @@ -0,0 +1,138 @@ +# readAloud-valence-beta Analysis Preparation for word frequency +# Authors: Jessica M. Alexander and Luc Sahar -- NDCLab, Florida International University +# Last updated: 2023-08-25 + +# Input: +# - SUBTLEXus corpus +# - Scaffolds xlsx, used only as the simplest way to enumerate the passages +# - Stimulus characteristics for our passages + +# Output: +# - a CSV, with one row per passage, showing the average word frequency (in +# SUBTLEXus) for the words in that passage + +### SECTION 1: SETTING UP +library(readxl) #load excel files +library(textstem) #lemmatize_words function + +#set up date for output file naming +today <- Sys.Date() +today <- format(today, "%Y%m%d") + +#set up directories for input/output +# main_dataset <- '/Users/jalexand/github/readAloud-valence-dataset/' +main_dataset <- '~/Documents/ndclab/analysis-sandbox/rwe-dataset/' +main_analysis <- '~/Documents/ndclab/analysis-sandbox/rwe-analysis/' +# out_path_readDat <- '/Users/jalexand/github/readAloud-valence-dataset/derivatives/' +out_path_readDat <- paste(main_analysis, 'derivatives/', sep = '') + +#load input files +scaffolds <- paste(main_dataset, 'code/scaffolds.xlsx', sep="", collapse=NULL) +readAloudStimChar <- paste(main_dataset, 'materials/readAloud-ldt/stimuli/readAloud/readAloud-stimuli_characteristics.xlsx', sep="", collapse=NULL) +SUBList <- paste(main_dataset, 'materials/readAloud-ldt/stimuli/resources/SUBTLEXus74286wordstextversion.txt', sep="", collapse=NULL) #downloaded from https://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexus on 06/13/2022 + +# maybe include passDat? it has something about word data, and seems to have flesch too? +subtlexus <- read.table(SUBList, header=TRUE) +subtlexus$Word <- tolower(subtlexus$Word) #make all entries in SUBTLEXUS lower-case +passages <-excel_sheets(scaffolds) # antarctica ... 
vegas + +# Jess' suggestions +# L72-124 from analysisStimuli.R + +### SECTION 2: SET UP PASSAGE LIST AND PREPARE LEMMAS FOR ACCESSING FREQUENCY INFO +#create manual mapping of words to SUBTLEXUS corpus when lemma doesn't automatically match +manualLemma <- data.frame(matrix(ncol=2, nrow=0)) #initialize a table to hold words without a frequency match +colnames(manualLemma) <- c("stimWord", + "lemma") +for(i in 1:length(passages)){ + passage <- passages[i] + passageDat <- read_xlsx(readAloudStimChar, sheet=passage, skip=1, na="#") #read in passage data + passWords <- passageDat[,1:2] #pull word list + for(a in 1:length(passWords$stimWord)){ #correct apostrophes (curly to straight) + string <- passWords$stimWord[a] + passWords$stimWord[a] <- gsub("’", "'",string) + } + passWords$stimWord <- tolower(passWords$stimWord) #shift word list to lowercase to match SUBTLEXUS + passWords$lemma <- lemmatize_words(passWords$stimWord) #lemmatize word list + passWords$freq <- rep(0, nrow(passWords)) #initialize new column to hold word frequency data + for(f in 1:nrow(passWords)){ #add log word frequency from SUBTLEXUS corpus for each word lemma in the passage + passWords$freq[f] <- subtlexus$Lg10WF[match(passWords$lemma[f], subtlexus$Word)] + } + noFreqTable <- subset(passWords, is.na(passWords$freq))[,2:3] #extract words that did not get a frequency match + manualLemma <- rbind(manualLemma, noFreqTable) #bind to running table of words without a frequency match +} +manualLemma <- manualLemma[!duplicated(manualLemma$stimWord),] #remove duplicate rows +manualLemma <- manualLemma[order(manualLemma$stimWord),] #alphabetize +manualLemma$newLemma <- c(rep(0,nrow(manualLemma))) #initialize column to hold manual lemmas extracted by researcher +for(lemma in 1:nrow(manualLemma)){ #manually adjust possessives by dropping "'s" (except for "it's") + string <- manualLemma$lemma[lemma] + if(substr(string, nchar(string)-1, nchar(string))=="'s" & string!="it's"){ + manualLemma$newLemma[lemma] <- substr(string, 0, nchar(string)-2) + } +} +#manually adjust plurals +manualLemma[match("brittles", manualLemma$stimWord),3] <- "brittle" +#manually adjust adjectives +manualLemma[match("club-like", manualLemma$stimWord),3] <- "club" +manualLemma[match("in-flight", manualLemma$stimWord),3] <- "flight" +manualLemma[match("mid-", manualLemma$stimWord),3] <- "middle" +#no manual adjustment of the following categories: +#compound words with no obvious "primary" lemma: ccc, don't, it's, long-term +#proper nouns: delano, nissan +#words simply not in the SUBTLEXUS database: ecotourism, hydropower, jetsetter, megabat, microbats, photoreceptor, plumicorn, powertrain, spearer, trinocular +#ordinal numbers: nineteeth, second, twentieth +manualLemma <- subset(manualLemma, manualLemma$newLemma!=0) #drop words without a manual mapping +manualLemma$freq <- rep(0, nrow(manualLemma)) #add log word frequency from SUBTLEXUS corpus +for(f in 1:nrow(manualLemma)){ + manualLemma$freq[f] <- subtlexus$Lg10WF[match(manualLemma$newLemma[f], subtlexus$Word)] +} + +# Jess' suggestions +# L170-189 from analysisStimuli.R + +### SECTION 3: BUILD AND OUTPUT READDAT MATRIX +readDat <- data.frame(matrix(ncol=2, nrow=0)) +colnames(readDat) <- c("passage", "avgFreq") + +#calculate characteristics per passage half +for(j in 1:length(passages)){ + passage <- passages[j] + passageDat <- read_xlsx(readAloudStimChar, sheet=passage, skip=1, na="#") #read in passage word list + + #extract passage word list + passWords <- passageDat[,1:2] #pull word list + for(a in 
1:length(passWords$stimWord)){ #correct apostrophes (curly to straight) + string <- passWords$stimWord[a] + passWords$stimWord[a] <- gsub("’", "'",string) + } + passWords$stimWord <- tolower(passWords$stimWord) #shift word list to lowercase to match SUBTLEXUS + passWords$lemma <- lemmatize_words(passWords$stimWord) #lemmatize word list + + #add frequency data + passWords$freq <- rep(0, nrow(passWords)) #initialize column to hold frequency data + for(f in 1:nrow(passWords)){ + if(!is.na(subtlexus$Lg10WF[match(passWords$stimWord[f], subtlexus$Word)])){ #automatic matching of full word to SUBTLEXUS corpus + passWords$freq[f] <- subtlexus$Lg10WF[match(passWords$stimWord[f], subtlexus$Word)] + } else if(!is.na(subtlexus$Lg10WF[match(passWords$lemma[f], subtlexus$Word)])){ #automatic matching of lemma to SUBTLEXUS corpus + passWords$freq[f] <- subtlexus$Lg10WF[match(passWords$lemma[f], subtlexus$Word)] + } else if (passWords$lemma[f] %in% manualLemma$lemma){ #check manual table if no auto-match + passWords$freq[f] <- manualLemma$freq[match(passWords$lemma[f], manualLemma$lemma)] + } else {passWords$freq[f] <- median(subtlexus$Lg10WF)} #impute median value for non-matches + } + + + #calculate average frequency + freqAvg <- mean(passWords$freq) #calculate mean frequency for whole passage + + #create vectors for each passage and add to readDat + vectorWholePassage <- c(passage, freqAvg) # instead of vectorPre and vectorPost, as in analysisStimuli.R + readDat[nrow(readDat) + 1,] <- c(vectorWholePassage) +} + +#organize data types +readDat$avgFreq <- as.numeric(readDat$avgFreq) + +#output readDat +out_path_readDat_csv <- paste(out_path_readDat, 'prepWordFreq_readDat', today, '.csv', sep="", collapse=NULL) +write.csv(readDat, out_path_readDat_csv) +out_path_readDat_csv # print out to terminal so it's easy to open! From e2af6ce1653154b22f81bd3bff9580aa5ff96b0d Mon Sep 17 00:00:00 2001 From: l-acs Date: Mon, 28 Aug 2023 09:31:53 -0400 Subject: [PATCH 12/33] WIP: word freq prep/analysis --- code/analysisReadAloudBeta.R | 24 ++++++++++++++++++++++-- code/prepReadAloudBeta.R | 8 ++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index 44a3deb..4ed68fd 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Reading Task Analyses # Authors: Luc Sahar, Jessica M. 
Alexander -# Last Updated: 2023-08-24 +# Last Updated: 2023-08-25 # INPUTS # data/df: behavioral data, for each participant on each passage, with relevant participant information and trial-level stimulus information @@ -43,7 +43,7 @@ today <- format(today, "%Y%m%d") # data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230810.csv' # data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230815.csv' # data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230816.csv' -data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230821.csv' +data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230825.csv' to_omit <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/input/passages-to-omit_20230810.csv' # out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' out_path <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/' @@ -585,3 +585,23 @@ summary(hes_with_rel_misprod_model_1) misprod_with_rel_hes_model_1 <- lmerTest::lmer(misprod_tally ~ hes_position + (1|id) + (1|passage), data=errorDatLongMisprodWithRelHes, REML=TRUE) summary(misprod_with_rel_hes_model_1) + + + +# Word frequency analysis +# Does a passage's average word frequency predict participants' hesitation rate or misproduction rate? +wordfreq_model_1 <- lmerTest::lmer(hesitation_rate ~ avgWordFreq + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(wordfreq_model_1) +wordfreq_model_2 <- lmerTest::lmer(misprod_rate ~ avgWordFreq + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(wordfreq_model_2) + +# Do social anxiety and frequency interact to predict hesitation rate or misproduction rate? 
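# The two frequency-by-social-anxiety models just below test this. As an
# editorial follow-up sketch (assuming the interactions package is available
# and that one of the interaction terms below proves reliable), a helper to
# probe simple slopes of word frequency at -1 SD / mean / +1 SD of scaaredSoc
# can be defined here and called once the models are fit, e.g.
# probe_freq_by_sa(wordfreq_model_3). The helper name is illustrative only.
probe_freq_by_sa <- function(m) {
  # simple slopes of avgWordFreq at -1 SD, mean, and +1 SD of scaaredSoc_gmc
  print(interactions::sim_slopes(m, pred = avgWordFreq, modx = scaaredSoc_gmc))
  # companion plot of the same interaction, with confidence bands
  interactions::interact_plot(m, pred = avgWordFreq, modx = scaaredSoc_gmc, interval = TRUE)
}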
+wordfreq_model_3 <- lmerTest::lmer(hesitation_rate ~ avgWordFreq * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(wordfreq_model_3) + +wordfreq_model_4 <- lmerTest::lmer(misprod_rate ~ avgWordFreq * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDat, REML=TRUE) +summary(wordfreq_model_4) diff --git a/code/prepReadAloudBeta.R b/code/prepReadAloudBeta.R index 04f5476..afb78d8 100644 --- a/code/prepReadAloudBeta.R +++ b/code/prepReadAloudBeta.R @@ -60,7 +60,7 @@ readDat_path <- paste(main_dataset, 'derivatives/analysisStimuli_readDat_2023061 redcap_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019.csv', sep="", collapse=NULL) agedat_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019_ageonly.csv', sep="", collapse=NULL) speedDat_path <- paste(main_dataset, "derivatives/preprocessed/valence-timing/timingpitch_subject-by-passage_2022-09-09.csv", sep="", collapse=NULL) - +freqDat_path <- paste(main_analyses, "derivatives/prepWordFreq_readDat20230825.csv", sep="") # c(data, accDat_path, readDat_path, redcap_path, agedat_path, speedDat_path) %>% fs::as_fs_path() %>% fs::is_file() # ✅: all TRUE @@ -74,6 +74,7 @@ accDat$passage <- c("dams", "flying", "bats", "broccoli", "realty", "bees", "dog "cars", "vegas", "sun", "caramel", "congo", "antarctica", "depression", "skunkowl", "grizzly", "mantis", "dentist") #rename passages with short-name speedDat <- read.csv(speedDat_path, na.strings='NA') +freqDat <- read.csv(freqDat_path, na.strings = 'NA') #organize data types df[,3:30] <- sapply(df[,3:30],as.numeric) @@ -173,7 +174,7 @@ speedDat$id <- as.character(speedDat$id) # so we can join and it doesn't complai df <- left_join(df, speedDat, by = c("id", "passage")) # now reading timestamps and duration are looped into df -### SECTION 4: BUILD TRIAL-LEVEL DF (ADD DEMODAT, READDAT, and ACCDAT to DF) +### SECTION 4: BUILD TRIAL-LEVEL DF (ADD DEMODAT, READDAT, ACCDAT, AND FREQDAT to DF) for(i in 1:nrow(df)){ subject <- df$id[i] #extract subject number for matching passage <- df$passage[i] #extract passage name for matching @@ -193,6 +194,9 @@ for(i in 1:nrow(df)){ df$lenWord[i] <- sum(readDat$lengthWord[which(readDat$passage==passage)]) #length of passage in words df$avgSyllPerWord[i] <- df$lenSyll[i]/df$lenWord[i] + #extract passage characteristics from freqDat + df$avgWordFreq[i] <- freqDat$avgFreq[which(freqDat$passage==passage)] + #extract participant accuracy from accDat df$challengeACC[i] <- accDat[match(passage, accDat$passage), as.character(subject)] #passage-specific challenge question accuracy for subject From 1b21dac7138121eb7d2421a7b5c2c508918745b8 Mon Sep 17 00:00:00 2001 From: l-acs Date: Tue, 30 Jan 2024 10:42:52 -0500 Subject: [PATCH 13/33] Add data dict and a few more possible models --- code/analysisReadAloudBeta.R | 112 +++++++++++++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 6 deletions(-) diff --git a/code/analysisReadAloudBeta.R b/code/analysisReadAloudBeta.R index 4ed68fd..2509334 100644 --- a/code/analysisReadAloudBeta.R +++ b/code/analysisReadAloudBeta.R @@ -11,6 +11,103 @@ # NOTES TO DO # drop 150086 as only completed 12 of 20 passages and low accuracy +# Data dict + +# errorDatMisprodHes: +# +# our errorDat dataframe, just without the misprod-sequencing columns (which +# we'll add in piecemeal by different names later) + +# First, look at a given misproduction and check for nearby hesitations +# +# hes_position: +# +# for long-form dataframes 
counting misproductions, this indicates whether the +# relevant count is the number of hesitations before (0) or after (1) those +# misproductions being counted in that row +# +# +# misprod_tally: +# +# conversely, in long-form dataframes counting misproductions, this column +# actually tracks how many misproductions there are in that reading +# (participant x passage) that have a hesitation in the relevant relative +# position + + +# justMisprodWithHesBefore: +# +# this is the dataframe with every (participant x passage) reading, counting +# the number of misproductions with a nearby preceding hesitation +# +# i.e., for each reading, it counts the number of times (misprod_tally) that a +# hesitation comes before a misproduction -- so for every entry, hes_position = 0 +# +# +# justMisprodWithHesAfter +# +# similarly, this is the dataframe with every (participant x passage) reading, +# counting the number of misproductions with a nearby following hesitation +# +# i.e., for each reading, it counts the number of times (misprod_tally) that a +# hesitation comes after a misproduction -- so for every entry, hes_position = 1 +# +# +# errorDatLongMisprodWithRelHes: +# +# this is the long-form dataframe, with two rows per reading (participant x +# passage): one for each position for a relative hesitation. i.e. this stacks +# the two dataframes that respectively have (1) every passage, with a count of +# misproductions for hes_position = 0, and (2) every passage, with a count of +# misproductions for hes_position = 1 + + + +# Then, look at a given hesitation and check for nearby misproductions + +# misprod_position: +# +# for long-form dataframes counting hesitations, this indicates whether the +# relevant count is the number of misproductions before (0) or after (1) those +# hesitations being counted in that row +# +# +# hes_tally: +# +# conversely, in long-form dataframes counting hesitations, this column +# actually tracks how many hesitations there are in that reading (participant +# x passage) that have a misproduction in the relevant relative position + + +# justHesWithMisprodBefore: +# +# this is the dataframe with every (participant x passage) reading, counting +# the number of hesitations with a nearby preceding misproduction +# +# i.e., for each reading, it counts the number of times (hes_tally) that a +# misproduction comes before a hesitation -- so for every entry, +# misprod_position = 0 +# +# +# justHesWithMisprodAfter +# +# similarly, this is the dataframe with every (participant x passage) reading, +# counting the number of hesitations with a nearby following misproduction +# +# i.e., for each reading, it counts the number of times (hes_tally) that a +# misproduction comes after a hesitation -- so for every entry, +# misprod_position = 1 +# +# +# errorDatLongHesWithRelMisprod: +# +# this is the long-form dataframe, with two rows per reading (participant x +# passage): one for each position for a relative misproduction, i.e. 
this +# stacks the two dataframes that respectively have (1) every passage, with a +# count of hesitations for misprod_position = 0, and (2) every passage, with a +# count of hesitations for misprod_position = 1 + + ### SECTION 1: SETTING UP library(dplyr) library(lme4) @@ -119,9 +216,6 @@ passage_no_before_trimming - passage_no_after_trim1 #number of passages trimmed (passage_no_before_trimming - passage_no_after_trim1) / passage_no_before_trimming #percentage of passages trimmed -# band-aid fix: remove passages without reading speed data so that we can run -# our analyses on them nonetheless - # these are the only four passages without reading time data... # and incidentally? well, see their comments here... c(150013, "vegas") # N.B.: 161 omitted syllables of 318 total in passage @@ -300,9 +394,6 @@ summary(model12) - - - # glmer(accuracy ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, family="binomial") # "f_" : follow-up @@ -586,6 +677,15 @@ misprod_with_rel_hes_model_1 <- lmerTest::lmer(misprod_tally ~ hes_position + (1 data=errorDatLongMisprodWithRelHes, REML=TRUE) summary(misprod_with_rel_hes_model_1) +## does it interact with SA? +hes_with_rel_misprod_model_3 <- lmerTest::lmer(hes_tally ~ misprod_position * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDatLongHesWithRelMisprod, REML=TRUE) +# summary(hes_with_rel_misprod_model_3) + +misprod_with_rel_hes_model_4 <- lmerTest::lmer(misprod_tally ~ hes_position * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDatLongMisprodWithRelHes, REML=TRUE) +# summary(misprod_with_rel_hes_model_4) + # Word frequency analysis From b6fc56e9ea727c339087fc0ffcbb3fe908e39524 Mon Sep 17 00:00:00 2001 From: l-acs Date: Tue, 30 Jan 2024 16:23:43 -0500 Subject: [PATCH 14/33] Keep code/prepReadAloudBeta.R --- ...prepReadAloudBeta.R-move-to-code__prepWordLevelReadAloudBeta.R | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename code/prepReadAloudBeta.R => code__prepReadAloudBeta.R-move-to-code__prepWordLevelReadAloudBeta.R (100%) diff --git a/code/prepReadAloudBeta.R b/code__prepReadAloudBeta.R-move-to-code__prepWordLevelReadAloudBeta.R similarity index 100% rename from code/prepReadAloudBeta.R rename to code__prepReadAloudBeta.R-move-to-code__prepWordLevelReadAloudBeta.R From fd71d50d0f3942ab14ddab925158e19fbf136f4b Mon Sep 17 00:00:00 2001 From: l-acs Date: Tue, 30 Jan 2024 16:23:43 -0500 Subject: [PATCH 15/33] Copy code/prepReadAloudBeta.R into code/prepWordLevelReadAloudBeta.R --- code/{prepReadAloudBeta.R => prepWordLevelReadAloudBeta.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename code/{prepReadAloudBeta.R => prepWordLevelReadAloudBeta.R} (100%) diff --git a/code/prepReadAloudBeta.R b/code/prepWordLevelReadAloudBeta.R similarity index 100% rename from code/prepReadAloudBeta.R rename to code/prepWordLevelReadAloudBeta.R From 80ab7a810e0856ddcd9401880f62f5b69f43defb Mon Sep 17 00:00:00 2001 From: l-acs Date: Tue, 30 Jan 2024 16:23:43 -0500 Subject: [PATCH 16/33] Revert prepReadAloudBeta.R, keeping history --- .../prepReadAloudBeta.R | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename code__prepReadAloudBeta.R-move-to-code__prepWordLevelReadAloudBeta.R => code/prepReadAloudBeta.R (100%) diff --git a/code__prepReadAloudBeta.R-move-to-code__prepWordLevelReadAloudBeta.R b/code/prepReadAloudBeta.R similarity index 100% rename from code__prepReadAloudBeta.R-move-to-code__prepWordLevelReadAloudBeta.R rename to code/prepReadAloudBeta.R From 6e629e8f4032b10e2e9b8d906f7bedc143e568f8 Mon Sep 17 00:00:00 2001 From: l-acs 
Date: Tue, 30 Jan 2024 12:11:00 -0500 Subject: [PATCH 17/33] Add initial prep script for word level analysis, a copy of the passage-level script --- code/prepWordLevelReadAloudBeta.R | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/code/prepWordLevelReadAloudBeta.R b/code/prepWordLevelReadAloudBeta.R index afb78d8..3ce7172 100644 --- a/code/prepWordLevelReadAloudBeta.R +++ b/code/prepWordLevelReadAloudBeta.R @@ -1,11 +1,13 @@ # readAloud-valence-beta Analysis Preparation # Authors: Luc Sahar and Jessica M. Alexander -- NDCLab, Florida International University -# Last updated: 2023-08-15 +# Last updated: 2024-01-30 # INPUTS -# data/df: behavioral (error-related) data, for each participant on each passage +# data/df: behavioral (error-related) data, for each participant for each word # accDat: comprehension question accuracy (0/1) for each participant for each passage -# readDat: stimuli characteristics (by passage half) +# as of 1/30/24 this is alphabetical by *passage* now (more easily +# automated, less stateful, less hard coding) +# readDat: stimuli characteristics (by passage half) - 2024-01-30 - now NA? # redcap: participant data, incl. demographics and responses + scored factors for questionnaires: # bfne (brief fear of negative evaluation): bfne_b_scrdTotal (fear of negative evaluation total) # phq8 (patient health questionnaire): phq8_scrdTotal (depression scale total) @@ -39,29 +41,28 @@ library(readr) # write_csv # current_datetime <- now(timezone) %>% format(date_format) # paste(label, '_', current_datetime, '.', ext, sep = "") # } -today <- Sys.Date() -today <- format(today, "%Y%m%d") +today <- Sys.Date() %>% format("%Y%m%d") #set up directories for input/output # main_dataset <- '/Users/jalexand/github/readAloud-valence-dataset/' # main_analyses <- '/Users/jalexand/github/readAloud-valence-beta/' # out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' -main_dataset <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-dataset/' -main_analyses <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/' -out_path <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/' +main_dataset <- '~/Documents/ndclab/rwe-analysis-sandbox/rwe-dataset/' +main_analyses <- '~/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/' +out_path <- '~/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/derivatives/' #load input files # data <- paste(main_dataset, 'derivatives/preprocessed/disfluencies_subject-x-passage_20230616_1229pm.csv', sep="", collapse=NULL) -# data <- "/home/luc/Documents/ndclab/analysis-sandbox/output-csvs/disfluencies_subject-x-passage_20230616_1229pm.csv" -data <- "/home/luc/Documents/ndclab/analysis-sandbox/output-csvs/disfluencies_subject-x-passage_20230818_1042pm.csv" +data <- "~/Documents/ndclab/rwe-analysis-sandbox/rwe-dataset/derivatives/disfluencies_subject-x-passage-x-word_20240126_0253pm.csv" accDat_path <- paste(main_dataset,'derivatives/preprocessed/readAloud_passage-level_summary_20220812.csv', sep="", collapse=NULL) readDat_path <- paste(main_dataset, 'derivatives/analysisStimuli_readDat_20230614.csv', sep="", collapse=NULL) redcap_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019.csv', sep="", collapse=NULL) agedat_path <- paste(main_dataset,'derivatives/preprocessed/202201v0readAloudval_SCRD_2022-06-20_1019_ageonly.csv', sep="", collapse=NULL) speedDat_path <- paste(main_dataset, 
"derivatives/preprocessed/valence-timing/timingpitch_subject-by-passage_2022-09-09.csv", sep="", collapse=NULL) freqDat_path <- paste(main_analyses, "derivatives/prepWordFreq_readDat20230825.csv", sep="") -# c(data, accDat_path, readDat_path, redcap_path, agedat_path, speedDat_path) %>% fs::as_fs_path() %>% fs::is_file() +scaffolds_path <- paste(main_dataset, 'code/scaffolds.xlsx', sep="", collapse=NULL) +# c(data, accDat_path, readDat_path, redcap_path, agedat_path, speedDat_path, scaffolds_path) %>% fs::as_fs_path() %>% fs::is_file() # ✅: all TRUE @@ -73,6 +74,7 @@ accDat <- read.csv(accDat_path, na.strings='NA', check.names=FALSE) #passage lev accDat$passage <- c("dams", "flying", "bats", "broccoli", "realty", "bees", "dogshow", "dolphins", "icefishing", "cars", "vegas", "sun", "caramel", "congo", "antarctica", "depression", "skunkowl", "grizzly", "mantis", "dentist") #rename passages with short-name + # can't we use scaffolds for this? speedDat <- read.csv(speedDat_path, na.strings='NA') freqDat <- read.csv(freqDat_path, na.strings = 'NA') From c0b1ae48f25f503aa1d5da1c433d9be20d99f23a Mon Sep 17 00:00:00 2001 From: l-acs Date: Tue, 30 Jan 2024 13:33:04 -0500 Subject: [PATCH 18/33] Limit block comments to RStudio's prescribed 80 character width --- code/prepWordLevelReadAloudBeta.R | 40 +++++++++++++++++++------------ 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/code/prepWordLevelReadAloudBeta.R b/code/prepWordLevelReadAloudBeta.R index 3ce7172..1d9d2bc 100644 --- a/code/prepWordLevelReadAloudBeta.R +++ b/code/prepWordLevelReadAloudBeta.R @@ -1,26 +1,35 @@ # readAloud-valence-beta Analysis Preparation -# Authors: Luc Sahar and Jessica M. Alexander -- NDCLab, Florida International University +# Authors: Luc Sahar and Jessica M. Alexander +# NDCLab, Florida International University # Last updated: 2024-01-30 # INPUTS # data/df: behavioral (error-related) data, for each participant for each word -# accDat: comprehension question accuracy (0/1) for each participant for each passage -# as of 1/30/24 this is alphabetical by *passage* now (more easily -# automated, less stateful, less hard coding) +# accDat: comprehension question accuracy (0/1) for each participant for each +# passage +# as of 1/30/24 this is alphabetical by *passage* now (more easily +# automated, less stateful, less hard coding) # readDat: stimuli characteristics (by passage half) - 2024-01-30 - now NA? -# redcap: participant data, incl. demographics and responses + scored factors for questionnaires: - # bfne (brief fear of negative evaluation): bfne_b_scrdTotal (fear of negative evaluation total) - # phq8 (patient health questionnaire): phq8_scrdTotal (depression scale total) - # scaared, total (screen for adult anxiety disorders): scaared_b_scrdTotal (total anxiety) - # scaared, social (screen for adult anxiety disorders): scaared_b_scrdSoc (social phobias) - # scaared, general (screen for adult anxiety disorders): scaared_b_scrdGA (general anxiety) - # sps (social phobia scale): sias6sps6_b_scrdSPS +# redcap: participant data, incl. 
demographics and responses + scored factors +# for questionnaires: +# bfne (brief fear of negative evaluation): bfne_b_scrdTotal (fear of negative +# evaluation total) +# phq8 (patient health questionnaire): phq8_scrdTotal (depression scale total) +# scaared, total (screen for adult anxiety disorders): scaared_b_scrdTotal +# (total anxiety) +# scaared, social (screen for adult anxiety disorders): scaared_b_scrdSoc +# (social phobias) +# scaared, general (screen for adult anxiety disorders): scaared_b_scrdGA +# (general anxiety) +# sps (social phobia scale): sias6sps6_b_scrdSPS # OUTPUTS # dfTrim: for each passage, for each participant, details on: - # participant behavior: reading errors made, comprehension question accuracy - # passage characteristics: length (syllable and word), average syllables per word - # participant data: demographics, language history, mood and mood disorder scores +# participant behavior: reading errors made, comprehension question accuracy +# passage characteristics: length (syllable and word), average syllables per +# word +# participant data: demographics, language history, mood and mood disorder +# scores ### SECTION 1: SETTING UP @@ -233,7 +242,8 @@ df$timePerWord <- df$readingTime / df$lenWord ### SECTION 5: CROSS-CHECK ALL PARTICIPANTS MET INCLUSION CRITERIA -#note: given the time required to annotated errors, only participants who met inclusion criteria were annotated +# note: given the time required to annotated errors, only participants who met +# inclusion criteria were annotated #sum(df$eng==1 & df$langhis %in% c(2,4) & df$ageen>6) #confirm all subjects monolingual English OR natively bilingual OR learned English before age 6 #sum(df$commdis>0) #confirm no subject diagnosed with any communication disorder #sum(df$profen>3, na.rm=TRUE)/20 #one remaining subject (150060) rates own English proficiency as not "elementary" or "not proficient", but reads fluidly and achieved 80% accuracy on challenge questions, so not excluded From 8165183beed30500775abadfe59231e6ea935967 Mon Sep 17 00:00:00 2001 From: l-acs Date: Tue, 30 Jan 2024 13:45:38 -0500 Subject: [PATCH 19/33] Declutter --- code/prepWordLevelReadAloudBeta.R | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/code/prepWordLevelReadAloudBeta.R b/code/prepWordLevelReadAloudBeta.R index 1d9d2bc..b0ca2ee 100644 --- a/code/prepWordLevelReadAloudBeta.R +++ b/code/prepWordLevelReadAloudBeta.R @@ -41,15 +41,6 @@ library(lubridate) # now library(readr) # write_csv #set up defaults for output file naming -# ext_default = 'csv' -# tz_default = "America/New_York" -# date_format_default = "%Y%m%d_%I%M%P" -# -# build_output_filename <- function(label, ext = ext_default, timezone = tz_default, date_format = date_format_default) { -# # `label` may include the destination directory, if different from the working directory when the script is run -# current_datetime <- now(timezone) %>% format(date_format) -# paste(label, '_', current_datetime, '.', ext, sep = "") -# } today <- Sys.Date() %>% format("%Y%m%d") #set up directories for input/output @@ -80,10 +71,7 @@ redcap <- read.csv(redcap_path, na.strings='NA') #participant questionnaire resp agedat <- read.csv(agedat_path, na.strings='NA') #participant age information readDat <- read.csv(readDat_path, na.strings='N') #passage-level characteristics from analysisStimuli.R accDat <- read.csv(accDat_path, na.strings='NA', check.names=FALSE) #passage level accuracy for each subject -accDat$passage <- c("dams", "flying", "bats", "broccoli", 
"realty", "bees", "dogshow", "dolphins", "icefishing", - "cars", "vegas", "sun", "caramel", "congo", "antarctica", "depression", "skunkowl", "grizzly", - "mantis", "dentist") #rename passages with short-name - # can't we use scaffolds for this? +accDat$passage <- excel_sheets(scaffolds_path) #rename passages with short-name speedDat <- read.csv(speedDat_path, na.strings='NA') freqDat <- read.csv(freqDat_path, na.strings = 'NA') @@ -289,6 +277,6 @@ out_filename # # base = "~/Documents/ndclab/analysis-sandbox/github-structure-mirror/readAloud-valence-dataset/derivatives/preprocessed" # base = "/home/data/NDClab/datasets/readAloud-valence-dataset/derivatives/preprocessed" # preprocessed_summary_filename = "TODO" -# collapsed_filename = build_output_filename(label = paste(base, "disfluencies_subject", sep='/')) +# collapsed_filename = "TODO" # # collapse_by_participant(preprocessed_summary_file, collapsed_filename) From c1b3048f28bd21dc7aca5aab73a7a794aefa77c9 Mon Sep 17 00:00:00 2001 From: l-acs Date: Wed, 31 Jan 2024 12:54:04 -0500 Subject: [PATCH 20/33] Simplify data wrangling --- code/prepWordLevelReadAloudBeta.R | 134 ++++++++++++++---------------- 1 file changed, 62 insertions(+), 72 deletions(-) diff --git a/code/prepWordLevelReadAloudBeta.R b/code/prepWordLevelReadAloudBeta.R index b0ca2ee..c249858 100644 --- a/code/prepWordLevelReadAloudBeta.R +++ b/code/prepWordLevelReadAloudBeta.R @@ -65,111 +65,99 @@ scaffolds_path <- paste(main_dataset, 'code/scaffolds.xlsx', sep="", collapse=NU # c(data, accDat_path, readDat_path, redcap_path, agedat_path, speedDat_path, scaffolds_path) %>% fs::as_fs_path() %>% fs::is_file() # ✅: all TRUE - +all_passages <- excel_sheets(scaffolds_path) df <- read.csv(data, na.strings='NA') redcap <- read.csv(redcap_path, na.strings='NA') #participant questionnaire responses agedat <- read.csv(agedat_path, na.strings='NA') #participant age information readDat <- read.csv(readDat_path, na.strings='N') #passage-level characteristics from analysisStimuli.R accDat <- read.csv(accDat_path, na.strings='NA', check.names=FALSE) #passage level accuracy for each subject -accDat$passage <- excel_sheets(scaffolds_path) #rename passages with short-name +accDat$passage <- all_passages #rename passages with short-name speedDat <- read.csv(speedDat_path, na.strings='NA') freqDat <- read.csv(freqDat_path, na.strings = 'NA') #organize data types -df[,3:30] <- sapply(df[,3:30],as.numeric) +# df[,3:30] <- sapply(df[,3:30],as.numeric) +# not applicable for now, we'll see #add missing passages for 150086 so that nrow is divisible by 20 -passages_read <- df$passage[which(df$id=="150086")] -all_passages <- unique(df$passage) -tempdf <- data.frame(matrix(nrow=0, ncol=ncol(df))) -colnames(tempdf) <- colnames(df) -for(passage in 1:length(all_passages)){ - if(all_passages[passage] %in% passages_read){next}else{ - tempdf[nrow(tempdf) + 1,] <- c("150086", all_passages[passage], rep(NA, 30)) - } -} -df <- rbind(df, tempdf) +# passages_read <- df$passage[which(df$id=="150086")] +# tempdf <- data.frame(matrix(nrow=0, ncol=ncol(df))) +# colnames(tempdf) <- colnames(df) +# for(passage in 1:length(all_passages)){ +# if(all_passages[passage] %in% passages_read){next}else{ +# tempdf[nrow(tempdf) + 1,] <- c("150086", all_passages[passage], rep(NA, 30)) +# } +# } +# df <- rbind(df, tempdf) ### SECTION 2: BUILD DEMOGRAPHIC DATA DF demoDat <- redcap[,c(1,5)] +# demoDat_imperative <- redcap[,c(1,5)]; demoDat_dplyr <- demoDat_imperative +# as we refactor, our test case is: all.equal(demoDat_imperative, 
demoDat_dplyr) +# this has been confirmed to work for with every new column added + #biological sex: replace numerical values with text description -for(a in 1:nrow(redcap)){ - if(is.na(redcap$demo_b_sex_s1_r1_e1[a])){demoDat$sex[a] <- 'undisclosed'} - else if(redcap$demo_b_sex_s1_r1_e1[a]==1){demoDat$sex[a] <- 'male'} - else if(redcap$demo_b_sex_s1_r1_e1[a]==2){demoDat$sex[a] <- 'female'} - else if(redcap$demo_b_sex_s1_r1_e1[a]==3){demoDat$sex[a] <- 'intersex'} - else if(redcap$demo_b_sex_s1_r1_e1[a]==4){demoDat$sex[a] <- 'other'} - else if(redcap$demo_b_sex_s1_r1_e1[a]==5){demoDat$sex[a] <- 'unknown'} - else{demoDat$sex[a] <- 'undisclosed'} -} +demoDat$sex <- case_match(redcap$demo_b_sex_s1_r1_e1, + 1 ~ 'male', 2 ~ 'female', 3 ~ 'intersex', + 4 ~ 'other', 5 ~ 'unknown', .default = 'undisclosed') #preferred pronouns: replace numerical values with text description -for(b in 1:nrow(redcap)){ - if(is.na(redcap$demo_b_pronouns_s1_r1_e1[b])){demoDat$pron[b] <- 'undisclosed'} - else if(redcap$demo_b_pronouns_s1_r1_e1[b]==1){demoDat$pron[b] <- 'she/her'} - else if(redcap$demo_b_pronouns_s1_r1_e1[b]==2){demoDat$pron[b] <- 'he/him'} - else if(redcap$demo_b_pronouns_s1_r1_e1[b]==3){demoDat$pron[b] <- 'they/them'} - else if(redcap$demo_b_pronouns_s1_r1_e1[b]==5){demoDat$pron[b] <- 'other'} - else{demoDat$pron[b] <- 'undisclosed'} -} +# for(b in 1:nrow(redcap)){ ... } +# 8 lines, 466 chars; boilerplatey + +# try rewriting as case_when +demoDat$pron <- case_match (redcap$demo_b_pronouns_s1_r1_e1, + 1 ~ "she/her", 2 ~ "he/him", 3 ~ "they/them", + 5 ~ "other", .default = "undisclosed") +# `.default` catches both NA and everything else + #ethnicity affiliation: map to text description -for(c in 1:nrow(redcap)){ - if(redcap$demo_b_ethnic_s1_r1_e1___1[c]==1){demoDat$ethnic[c] <- 'AI'} #american indian/alaska native - else if(redcap$demo_b_ethnic_s1_r1_e1___2[c]==1){demoDat$ethnic[c] <- 'A'} #asian - else if(redcap$demo_b_ethnic_s1_r1_e1___3[c]==1){demoDat$ethnic[c] <- 'AA'} #african american - else if(redcap$demo_b_ethnic_s1_r1_e1___4[c]==1){demoDat$ethnic[c] <- 'LX'} #hispanic/latinx - else if(redcap$demo_b_ethnic_s1_r1_e1___5[c]==1){demoDat$ethnic[c] <- 'ME'} #middle eastern - else if(redcap$demo_b_ethnic_s1_r1_e1___6[c]==1){demoDat$ethnic[c] <- 'PI'} #pacific islander - else if(redcap$demo_b_ethnic_s1_r1_e1___7[c]==1){demoDat$ethnic[c] <- 'W'} #white - else if(redcap$demo_b_ethnic_s1_r1_e1___8[c]==1){demoDat$ethnic[c] <- 'O'} #other - else{demoDat$ethnic[c] <- 'UND'} #undisclosed -} +# wait, I don't think this lets people be multiple races +demoDat$ethnic <- case_when( + redcap$demo_b_ethnic_s1_r1_e1___1 == 1 ~ 'AI', #american indian/alaska native + redcap$demo_b_ethnic_s1_r1_e1___2 == 1 ~ 'A', #asian + redcap$demo_b_ethnic_s1_r1_e1___3 == 1 ~ 'AA', #african american + redcap$demo_b_ethnic_s1_r1_e1___4 == 1 ~ 'LX', #hispanic/latinx + redcap$demo_b_ethnic_s1_r1_e1___5 == 1 ~ 'ME', #middle eastern + redcap$demo_b_ethnic_s1_r1_e1___6 == 1 ~ 'PI', #pacific islander + redcap$demo_b_ethnic_s1_r1_e1___7 == 1 ~ 'W', #white + redcap$demo_b_ethnic_s1_r1_e1___8 == 1 ~ 'O', #other + .default = 'UND' #undisclosed +) #social class affiliation: replace numerical values with text description -for(d in 1:nrow(redcap)){ - if(is.na(redcap$demo_b_socclass_s1_r1_e1[d])){demoDat$socclass[d] <- 'undisclosed'} - else if(redcap$demo_b_socclass_s1_r1_e1[d]==1){demoDat$socclass[d] <- 'poor'} - else if(redcap$demo_b_socclass_s1_r1_e1[d]==2){demoDat$socclass[d] <- 'working'} - else 
if(redcap$demo_b_socclass_s1_r1_e1[d]==3){demoDat$socclass[d] <- 'middle'} - else if(redcap$demo_b_socclass_s1_r1_e1[d]==4){demoDat$socclass[d] <- 'affluent'} - else{demoDat$socclass[d] <- 'undisclosed'} -} +demoDat$socclass <- case_match(redcap$demo_b_socclass_s1_r1_e1, + 1 ~ "poor", 2 ~ "working", 3 ~ "middle", + 4 ~ "affluent", .default = "undisclosed") #communication disorders diagnoses: sum across childhood, adolescence, and adulthood -for(e in 1:nrow(redcap)){ - demoDat$commdis[e] <- sum(redcap$demo_b_comdiskid_s1_r1_e1[e], - redcap$demo_b_comdisteen_s1_r1_e1[e], - redcap$demo_b_comdisad_s1_r1_e[e]) -} +# nb. there was a typo in the old version: no adult diagnoses were being checked +# because the column name did not exist and `sum` with the df$col syntax did not +# catch that +demoDat$commdis <- select(redcap, matches("demo_b_comdis.*e1")) %>% rowSums #language history: transfer directly -for(f in 1:nrow(redcap)){ - demoDat$eng[f] <- redcap$demo_b_eng_s1_r1_e1[match(demoDat$record_id[f], redcap$record_id)] #participant monolingualism - demoDat$langhis[f] <- redcap$demo_b_langhis_s1_r1_e1[match(demoDat$record_id[f], redcap$record_id)] #participant language history - demoDat$ageen[f] <- redcap$demo_b_ageen_s1_r1_e1[match(demoDat$record_id[f], redcap$record_id)] #participant age of English acquisition - demoDat$profen[f] <- redcap$demo_b_profen_s1_r1_e1[match(demoDat$record_id[f], redcap$record_id)] #participant English proficiency -} +demoDat$eng <- redcap$demo_b_eng_s1_r1_e1 #participant monolingualism +demoDat$langhis <- redcap$demo_b_langhis_s1_r1_e1 #participant language history +demoDat$ageen <- redcap$demo_b_ageen_s1_r1_e1 #participant age of English acquisition +demoDat$profen <- redcap$demo_b_profen_s1_r1_e1 #participant English proficiency #mood and mood disorders: transfer directly -for(g in 1:nrow(redcap)){ - demoDat$bfne[g] <- redcap$bfne_b_scrdTotal[match(demoDat$record_id[g], redcap$record_id)] #bfne total score - demoDat$phq8[g] <- redcap$phq8_scrdTotal[match(demoDat$record_id[g], redcap$record_id)] #phq8 depression scale - demoDat$scaaredTotal[g] <- redcap$scaared_b_scrdTotal[match(demoDat$record_id[g], redcap$record_id)] #scaared total anxiety - demoDat$scaaredGA[g] <- redcap$scaared_b_scrdGA[match(demoDat$record_id[g], redcap$record_id)] #scaared general anxiety - demoDat$scaaredSoc[g] <- redcap$scaared_b_scrdSoc[match(demoDat$record_id[g], redcap$record_id)] #scaared social phobias - demoDat$sps[g] <- redcap$sias6sps6_b_scrdSPS[match(demoDat$record_id[g], redcap$record_id)] #sps social phobia scale -} +demoDat$bfne <- redcap$bfne_b_scrdTotal +demoDat$phq8 <- redcap$phq8_scrdTotal #phq8 depression scale +demoDat$scaaredTotal <- redcap$scaared_b_scrdTotal #scaared total anxiety +demoDat$scaaredGA <- redcap$scaared_b_scrdGA #scaared general anxiety +demoDat$scaaredSoc <- redcap$scaared_b_scrdSoc #scaared social phobias +demoDat$sps <- redcap$sias6sps6_b_scrdSPS #sps social phobia scale #age: pull from separate file -for(h in 1:nrow(demoDat)){ - demoDat$age[h] <- agedat$info_age_s1_r1_e1[match(demoDat$record_id[h], agedat$record_id)] -} +demoDat <- left_join(demoDat, # can't just assign: matching matters given new df + select(agedat, record_id, age = info_age_s1_r1_e1)) ### SECTION 3: SET UP DERIVED FIELDS FOR SPEED ANALYSES speedDat$readingTime <- speedDat$readEnd - speedDat$readStart -speedDat$id <- as.character(speedDat$id) # so we can join and it doesn't complain about type comparison df <- left_join(df, speedDat, by = c("id", "passage")) # now reading timestamps 
and duration are looped into df @@ -217,6 +205,8 @@ for(i in 1:nrow(df)){ df$scaaredSoc[i] <- demoDat$scaaredSoc[match(df$id[i], demoDat$record_id)] #participant social phobias (scaared) df$sps[i] <- demoDat$sps[match(df$id[i], demoDat$record_id)] #participant social phobias (sias6sps6) } +# first, test it- then we can delete the old versions +# expect it to fail tho #organize participant demographic variables df$sex <- as.factor(df$sex) From 6a5360bb7601fc63dc3caf2b3fee33de72550b1b Mon Sep 17 00:00:00 2001 From: l-acs Date: Wed, 31 Jan 2024 15:59:13 -0500 Subject: [PATCH 21/33] Revise for word level: inclusion checks, summaries --- code/prepWordLevelReadAloudBeta.R | 36 +++++++++++++++---------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/code/prepWordLevelReadAloudBeta.R b/code/prepWordLevelReadAloudBeta.R index c249858..1e30014 100644 --- a/code/prepWordLevelReadAloudBeta.R +++ b/code/prepWordLevelReadAloudBeta.R @@ -113,8 +113,8 @@ demoDat$pron <- case_match (redcap$demo_b_pronouns_s1_r1_e1, #ethnicity affiliation: map to text description -# wait, I don't think this lets people be multiple races -demoDat$ethnic <- case_when( +demoDat$ethnic <- case_when( # first, check for self-identifying 2+ ethnicities + redcap %>% select(matches("demo_b_ethnic_*")) %>% rowSums >= 2 ~ 'M', # multi redcap$demo_b_ethnic_s1_r1_e1___1 == 1 ~ 'AI', #american indian/alaska native redcap$demo_b_ethnic_s1_r1_e1___2 == 1 ~ 'A', #asian redcap$demo_b_ethnic_s1_r1_e1___3 == 1 ~ 'AA', #african american @@ -162,6 +162,8 @@ df <- left_join(df, speedDat, by = c("id", "passage")) # now reading timestamps ### SECTION 4: BUILD TRIAL-LEVEL DF (ADD DEMODAT, READDAT, ACCDAT, AND FREQDAT to DF) +# this takes over an hour and a half to run - todo: reimplement for efficiency +# (worthwhile given it's 200,000 rows to loop over) for(i in 1:nrow(df)){ subject <- df$id[i] #extract subject number for matching passage <- df$passage[i] #extract passage name for matching @@ -205,8 +207,7 @@ for(i in 1:nrow(df)){ df$scaaredSoc[i] <- demoDat$scaaredSoc[match(df$id[i], demoDat$record_id)] #participant social phobias (scaared) df$sps[i] <- demoDat$sps[match(df$id[i], demoDat$record_id)] #participant social phobias (sias6sps6) } -# first, test it- then we can delete the old versions -# expect it to fail tho +# succeeded given changes above it; to be revised #organize participant demographic variables df$sex <- as.factor(df$sex) @@ -215,6 +216,7 @@ df$ethnic <- as.factor(df$ethnic) df$socclass <- as.factor(df$socclass) # compute speed +# nb these are passage level data but are tracked by word df$timePerSyllable <- df$readingTime / df$lenSyll df$timePerWord <- df$readingTime / df$lenWord @@ -222,25 +224,21 @@ df$timePerWord <- df$readingTime / df$lenWord ### SECTION 5: CROSS-CHECK ALL PARTICIPANTS MET INCLUSION CRITERIA # note: given the time required to annotated errors, only participants who met # inclusion criteria were annotated -#sum(df$eng==1 & df$langhis %in% c(2,4) & df$ageen>6) #confirm all subjects monolingual English OR natively bilingual OR learned English before age 6 -#sum(df$commdis>0) #confirm no subject diagnosed with any communication disorder -#sum(df$profen>3, na.rm=TRUE)/20 #one remaining subject (150060) rates own English proficiency as not "elementary" or "not proficient", but reads fluidly and achieved 80% accuracy on challenge questions, so not excluded +sum(df$eng==1 & df$langhis %in% c(2,4) & df$ageen>6) #confirm all subjects monolingual English OR natively bilingual OR learned English 
before age 6 +sum(df$commdis>0) #confirm no subject diagnosed with any communication disorder +filter(df, profen > 3) %>% select(id) %>% unique %>% nrow #one remaining subject (150060) rates own English proficiency as not "elementary" or "not proficient", but reads fluidly and achieved 80% accuracy on challenge questions, so not excluded #extract age and sex stats - # all these values are just in case they're useful - not needed per se for later # steps of the logic in this script +summary_unique <- function(df, key, column, f = summary) { + unique(select(df, column, key))[[column]] %>% f +} + +c("age", "sex", "pronouns", "ethnic", "socclass") %>% + map(\(col) summary_unique(df, "id", col)) # `map` not `for`: return, not print -summary(df$age) #age range and mean -sd(df$age) #age standard deviation -summary(df$sex)/20 #number of participants by sex -summary(df$sex)/20 / (nrow(df)/20) #percentage of participants by sex -summary(df$pronouns)/20 #number of participants by preferred pronoun -summary(df$pronouns)/20 / (nrow(df)/20) #percentage of participants by preferred pronoun -summary(df$ethnic)/20 #number of participants by ethnic affiliation -summary(df$ethnic)/20 / (nrow(df)/20) #percentage of participants by ethnic affiliation -summary(df$socclass)/20 #number of participants by social class affiliation -summary(df$socclass)/20 / (nrow(df)/20) #percentage of participants by social class affiliation +summary_unique(df, "id", "age", f = sd) # also do stdev for age ### SECTION 6: TRIM PASSAGES DUE TO EXPERIMENTER ERROR @@ -249,7 +247,7 @@ dfTrim <- subset(dfTrim, !(dfTrim$passage=='sun')) #remove sun passage due to er ### SECTION 7: OUTPUT DATAFRAME -out_filename <- paste(out_path, "readAloudBetaData_", today, ".csv", sep="", collapse=NULL) +out_filename <- paste(out_path, "readAloudBetaData-wordLevel_", today, ".csv", sep="", collapse=NULL) write.csv(dfTrim, out_filename, row.names = FALSE) out_filename From 4e4d66069bb627a82ed38220d92ae4743bd27aa3 Mon Sep 17 00:00:00 2001 From: l-acs Date: Wed, 31 Jan 2024 21:00:55 -0500 Subject: [PATCH 22/33] Also summarize demographic stats by percent --- code/prepWordLevelReadAloudBeta.R | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/code/prepWordLevelReadAloudBeta.R b/code/prepWordLevelReadAloudBeta.R index 1e30014..aca099c 100644 --- a/code/prepWordLevelReadAloudBeta.R +++ b/code/prepWordLevelReadAloudBeta.R @@ -1,7 +1,7 @@ # readAloud-valence-beta Analysis Preparation # Authors: Luc Sahar and Jessica M. 
Alexander # NDCLab, Florida International University -# Last updated: 2024-01-30 +# Last updated: 2024-01-31 # INPUTS # data/df: behavioral (error-related) data, for each participant for each word @@ -235,9 +235,15 @@ summary_unique <- function(df, key, column, f = summary) { unique(select(df, column, key))[[column]] %>% f } -c("age", "sex", "pronouns", "ethnic", "socclass") %>% +demo_cols <- c("age", "sex", "pronouns", "ethnic", "socclass") +sample_size <- select(df, "id") %>% unique %>% nrow + +demo_cols %>% # as totals, for each of our demographic columns map(\(col) summary_unique(df, "id", col)) # `map` not `for`: return, not print +demo_cols %>% # as a percent + map(\(col) summary_unique(df, "id", col) / sample_size) + summary_unique(df, "id", "age", f = sd) # also do stdev for age @@ -250,21 +256,3 @@ dfTrim <- subset(dfTrim, !(dfTrim$passage=='sun')) #remove sun passage due to er out_filename <- paste(out_path, "readAloudBetaData-wordLevel_", today, ".csv", sep="", collapse=NULL) write.csv(dfTrim, out_filename, row.names = FALSE) out_filename - -# collapse_by_participant <- function(filename_in, filename_out) { -# by_participant <- read_csv(filename_in) %>% -# unique %>% # dedup -# group_by(id) %>% summarize(across(misprod:total_uncorrected_errors, sum)) # summarize by participant, across all passages -# # TODO change the columns selected, once more have been added to the output of the preproc script -# -# write_csv(by_participant, filename_out) -# return(filename_out) -# } -# -# -# # base = "~/Documents/ndclab/analysis-sandbox/github-structure-mirror/readAloud-valence-dataset/derivatives/preprocessed" -# base = "/home/data/NDClab/datasets/readAloud-valence-dataset/derivatives/preprocessed" -# preprocessed_summary_filename = "TODO" -# collapsed_filename = "TODO" -# -# collapse_by_participant(preprocessed_summary_file, collapsed_filename) From 086753ada89cbca0187f18d99146a6fe61911500 Mon Sep 17 00:00:00 2001 From: l-acs Date: Wed, 31 Jan 2024 21:09:20 -0500 Subject: [PATCH 23/33] Clean up unused material & out-of-date notes --- code/prepWordLevelReadAloudBeta.R | 37 +++++-------------------------- 1 file changed, 6 insertions(+), 31 deletions(-) diff --git a/code/prepWordLevelReadAloudBeta.R b/code/prepWordLevelReadAloudBeta.R index aca099c..1129de2 100644 --- a/code/prepWordLevelReadAloudBeta.R +++ b/code/prepWordLevelReadAloudBeta.R @@ -76,25 +76,12 @@ speedDat <- read.csv(speedDat_path, na.strings='NA') freqDat <- read.csv(freqDat_path, na.strings = 'NA') #organize data types -# df[,3:30] <- sapply(df[,3:30],as.numeric) -# not applicable for now, we'll see - -#add missing passages for 150086 so that nrow is divisible by 20 -# passages_read <- df$passage[which(df$id=="150086")] -# tempdf <- data.frame(matrix(nrow=0, ncol=ncol(df))) -# colnames(tempdf) <- colnames(df) -# for(passage in 1:length(all_passages)){ -# if(all_passages[passage] %in% passages_read){next}else{ -# tempdf[nrow(tempdf) + 1,] <- c("150086", all_passages[passage], rep(NA, 30)) -# } -# } -# df <- rbind(df, tempdf) + ### SECTION 2: BUILD DEMOGRAPHIC DATA DF demoDat <- redcap[,c(1,5)] -# demoDat_imperative <- redcap[,c(1,5)]; demoDat_dplyr <- demoDat_imperative -# as we refactor, our test case is: all.equal(demoDat_imperative, demoDat_dplyr) -# this has been confirmed to work for with every new column added +# while refactoring, the test case: all.equal(demoDat_imperative, demoDat_dplyr) +# has been confirmed to work with every new column added #biological sex: replace numerical values with text description 
demoDat$sex <- case_match(redcap$demo_b_sex_s1_r1_e1, @@ -102,10 +89,6 @@ demoDat$sex <- case_match(redcap$demo_b_sex_s1_r1_e1, 4 ~ 'other', 5 ~ 'unknown', .default = 'undisclosed') #preferred pronouns: replace numerical values with text description -# for(b in 1:nrow(redcap)){ ... } -# 8 lines, 466 chars; boilerplatey - -# try rewriting as case_when demoDat$pron <- case_match (redcap$demo_b_pronouns_s1_r1_e1, 1 ~ "she/her", 2 ~ "he/him", 3 ~ "they/them", 5 ~ "other", .default = "undisclosed") @@ -132,9 +115,6 @@ demoDat$socclass <- case_match(redcap$demo_b_socclass_s1_r1_e1, 4 ~ "affluent", .default = "undisclosed") #communication disorders diagnoses: sum across childhood, adolescence, and adulthood -# nb. there was a typo in the old version: no adult diagnoses were being checked -# because the column name did not exist and `sum` with the df$col syntax did not -# catch that demoDat$commdis <- select(redcap, matches("demo_b_comdis.*e1")) %>% rowSums #language history: transfer directly @@ -168,15 +148,10 @@ for(i in 1:nrow(df)){ subject <- df$id[i] #extract subject number for matching passage <- df$passage[i] #extract passage name for matching - #production errors of interest + # production errors of interest are already included # misprod = raw misproduction errors # hesitation = raw hesitations - # words_with_misprod = distinct words with misproduction errors - # words_with_hes = distinct words with pre-word or word-internal hesitation - # misprod_rate = rate of raw misproduction errors - # hesitation_rate = rate of raw hesitations - # words_with_misprod_rate = rate of word-level misproduction errors - # words_with_hes_rate = rate of word-level hesitations + # misprod-hesitation linear sequences and vice versa #extract passage characteristics from readDat df$lenSyll[i] <- sum(readDat$lengthSyll[which(readDat$passage==passage)]) #length of passage in syllables @@ -216,7 +191,7 @@ df$ethnic <- as.factor(df$ethnic) df$socclass <- as.factor(df$socclass) # compute speed -# nb these are passage level data but are tracked by word +# nb these are passage-level data but are tracked in every row, i.e. 
word level df$timePerSyllable <- df$readingTime / df$lenSyll df$timePerWord <- df$readingTime / df$lenWord From e16cd3e93ee019dbc18be72ea0a8f85a1f8c2b91 Mon Sep 17 00:00:00 2001 From: l-acs Date: Wed, 31 Jan 2024 21:21:10 -0500 Subject: [PATCH 24/33] Keep code/analysisReadAloudBeta.R --- ...ReadAloudBeta.R-move-to-code__analysisWordLevelReadAloudBeta.R | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename code/analysisReadAloudBeta.R => code__analysisReadAloudBeta.R-move-to-code__analysisWordLevelReadAloudBeta.R (100%) diff --git a/code/analysisReadAloudBeta.R b/code__analysisReadAloudBeta.R-move-to-code__analysisWordLevelReadAloudBeta.R similarity index 100% rename from code/analysisReadAloudBeta.R rename to code__analysisReadAloudBeta.R-move-to-code__analysisWordLevelReadAloudBeta.R From 0e41693addbbce5c420a70c85649de45a270b267 Mon Sep 17 00:00:00 2001 From: l-acs Date: Wed, 31 Jan 2024 21:21:10 -0500 Subject: [PATCH 25/33] Copy code/analysisReadAloudBeta.R into code/analysisWordLevelReadAloudBeta.R --- .../{analysisReadAloudBeta.R => analysisWordLevelReadAloudBeta.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename code/{analysisReadAloudBeta.R => analysisWordLevelReadAloudBeta.R} (100%) diff --git a/code/analysisReadAloudBeta.R b/code/analysisWordLevelReadAloudBeta.R similarity index 100% rename from code/analysisReadAloudBeta.R rename to code/analysisWordLevelReadAloudBeta.R From f27304c4f3fa5346808d77151546640f83be8bb2 Mon Sep 17 00:00:00 2001 From: l-acs Date: Wed, 31 Jan 2024 21:21:10 -0500 Subject: [PATCH 26/33] Revert analysisReadAloudBeta.R, keeping history --- .../analysisReadAloudBeta.R | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename code__analysisReadAloudBeta.R-move-to-code__analysisWordLevelReadAloudBeta.R => code/analysisReadAloudBeta.R (100%) diff --git a/code__analysisReadAloudBeta.R-move-to-code__analysisWordLevelReadAloudBeta.R b/code/analysisReadAloudBeta.R similarity index 100% rename from code__analysisReadAloudBeta.R-move-to-code__analysisWordLevelReadAloudBeta.R rename to code/analysisReadAloudBeta.R From 34b5e472519e189b01ce6296b2b915e735d0e4da Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 1 Feb 2024 02:15:31 -0500 Subject: [PATCH 27/33] Begin revising models and pre-analysis for word level --- code/analysisWordLevelReadAloudBeta.R | 150 ++++++++++++++------------ 1 file changed, 80 insertions(+), 70 deletions(-) diff --git a/code/analysisWordLevelReadAloudBeta.R b/code/analysisWordLevelReadAloudBeta.R index 2509334..246a049 100644 --- a/code/analysisWordLevelReadAloudBeta.R +++ b/code/analysisWordLevelReadAloudBeta.R @@ -1,6 +1,6 @@ # readAloud-valence-beta Reading Task Analyses # Authors: Luc Sahar, Jessica M. 
Alexander -# Last Updated: 2023-08-25 +# Last Updated: 2024-02-01 # INPUTS # data/df: behavioral data, for each participant on each passage, with relevant participant information and trial-level stimulus information @@ -140,10 +140,11 @@ today <- format(today, "%Y%m%d") # data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230810.csv' # data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230815.csv' # data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230816.csv' -data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230825.csv' -to_omit <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/input/passages-to-omit_20230810.csv' +# data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230825.csv' +data <- '~/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData-wordLevel_20240130.csv' +to_omit <- '/home/luc/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/input/passages-to-omit_20230810.csv' # out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' -out_path <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/' +out_path <- '/home/luc/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/derivatives/' #read in data df <- read.csv(data, row.names = NULL) # output of prep script @@ -159,16 +160,23 @@ df$socclass <- as.factor(df$socclass) # all these values are just in case they're useful - not needed per se for later # steps of the logic in this script -summary(df$age) -sd(df$age) -summary(df$sex)/18 -summary(df$sex)/18 / (nrow(df)/18) -summary(df$pronouns)/18 -summary(df$pronouns)/18 / (nrow(df)/18) -summary(df$ethnic)/18 -summary(df$ethnic)/18 / (nrow(df)/18) -summary(df$socclass)/18 -summary(df$socclass)/18 / (nrow(df)/18) +# from prep script: +demo_cols <- c("age", "sex", "pronouns", "ethnic", "socclass") +sample_size <- select(df, "id") %>% unique %>% nrow + + +summary_unique <- function(df, key, column, f = summary) { + unique(select(df, column, key))[[column]] %>% f +} + +demo_cols %>% # as totals, for each of our demographic columns + map(\(col) summary_unique(df, "id", col)) # `map` not `for`: return, not print + +demo_cols %>% # as a percent + map(\(col) summary_unique(df, "id", col) / sample_size) + +summary_unique(df, "id", "age", f = sd) # also do stdev for age + #remove participants who were not engaged in the task dfTrim <- df @@ -191,18 +199,17 @@ dfTrim <- subset(dfTrim, challengeAvgSub>0.5) length(unique(df$id)) - length(unique(dfTrim$id)) #number of participants removed due to distraction or low accuracy #calculate average accuracy -mean(dfTrim$challengeAvgSub) -sd(dfTrim$challengeAvgSub) - -# calculate average speed -mean(dfTrim$timePerSyllable, na.rm=TRUE) -sd(dfTrim$timePerSyllable, na.rm=TRUE) -mean(dfTrim$timePerWord, na.rm=TRUE) -sd(dfTrim$timePerWord, na.rm=TRUE) +summary_unique(dfTrim, "id", "challengeAvgSub") +summary_unique(dfTrim, "id", "challengeAvgSub", f = sd) +# calculate average speed per *person* per *passage* +unique(select(dfTrim, timePerSyllable, id, passage))$timePerSyllable %>% mean(na.rm = TRUE) +unique(select(dfTrim, timePerSyllable, id, passage))$timePerSyllable %>% sd(na.rm = TRUE) +unique(select(dfTrim, timePerWord, id, passage))$timePerWord %>% mean(na.rm = TRUE) +unique(select(dfTrim, timePerWord, id, passage))$timePerWord %>% sd(na.rm = TRUE) ### 
SECTION 2: PASSAGE-LEVEL TRIMMING -passage_no_before_trimming <- nrow(dfTrim) +passage_no_before_trimming <- summary_unique(dfTrim, "id", "passage", f = length) #remove passages with high omissions (participant did not complete reading) or other problems (someone else is in the room, etc.) # e.g. vegas 150013 @@ -211,7 +218,7 @@ dfTrim <- anti_join(dfTrim, by = join_by(id == participant, passage == passage)) -passage_no_after_trim1 <- nrow(dfTrim) +passage_no_after_trim1 <- summary_unique(dfTrim, "id", "passage", f = length) passage_no_before_trimming - passage_no_after_trim1 #number of passages trimmed (passage_no_before_trimming - passage_no_after_trim1) / passage_no_before_trimming #percentage of passages trimmed @@ -229,7 +236,7 @@ dfTrim <- filter(dfTrim, !is.na(timePerSyllable)) # this ends up only dropping 0083, caramel - the other three already end up # getting dropped based on other criteria -passage_no_after_trim2 <- nrow(dfTrim) +passage_no_after_trim2 <- summary_unique(dfTrim, "id", "passage", f = length) passage_no_after_trim1 - passage_no_after_trim2 #number of passages trimmed (passage_no_after_trim1 - passage_no_after_trim2) / passage_no_after_trim1 #percentage of passages trimmed of last bunch (passage_no_after_trim1 - passage_no_after_trim2) / passage_no_before_trimming #percentage of passages trimmed of whole @@ -259,8 +266,8 @@ errorDat$avgSyllPerWord_gmc <- errorDat$avgSyllPerWord - mean(errorDat$avgSyllPe # LS additions 8/11/23 # errorDat$errors <- errorDat$errors - mean(errorDat$errors) # errorDat$correction <- errorDat$corrections - mean(errorDat$corrections) -errorDat$error_rate <- errorDat$errors / errorDat$lenSyll -errorDat$correction_rate <- errorDat$corrections / errorDat$lenSyll +# errorDat$error_rate <- errorDat$errors / errorDat$lenSyll +# errorDat$correction_rate <- errorDat$corrections / errorDat$lenSyll errorDat$timePerSyllable_gmc <- errorDat$timePerSyllable - mean(errorDat$timePerSyllable) errorDat$timePerWord_gmc <- errorDat$timePerWord - mean(errorDat$timePerWord) @@ -269,24 +276,27 @@ errorDat$timePerWord_gmc <- errorDat$timePerWord - mean(errorDat$timePerWord) #extract demo stats -errorDatStats <- subset(errorDat, !duplicated(errorDat$id)) -summary(errorDatStats$age) -sd(errorDatStats$age) -summary(errorDatStats$sex) -summary(errorDatStats$sex) / length(unique(errorDatStats$id)) -summary(errorDatStats$pronouns) -summary(errorDatStats$pronouns) / length(unique(errorDatStats$id)) -summary(errorDatStats$ethnic) -summary(errorDatStats$ethnic) / length(unique(errorDatStats$id)) -summary(errorDatStats$socclass) -summary(errorDatStats$socclass) / length(unique(errorDatStats$id)) - -# Reading speed stats (ls additions 8/16/23) -summary(errorDatStats$timePerSyllable) -summary(errorDatStats$timePerWord) - -summary(errorDatStats$timePerSyllable_gmc) -summary(errorDatStats$timePerWord_gmc) +# fixme: these are at passage level, we need to `(summary_)unique` +# actually, nvm: we already have all these from prep +# errorDatStats <- subset(errorDat, !duplicated(errorDat$id)) +# errorDatStats <- +# summary(errorDatStats$age) +# sd(errorDatStats$age) +# summary(errorDatStats$sex) +# summary(errorDatStats$sex) / length(unique(errorDatStats$id)) +# summary(errorDatStats$pronouns) +# summary(errorDatStats$pronouns) / length(unique(errorDatStats$id)) +# summary(errorDatStats$ethnic) +# summary(errorDatStats$ethnic) / length(unique(errorDatStats$id)) +# summary(errorDatStats$socclass) +# summary(errorDatStats$socclass) / length(unique(errorDatStats$id)) +# +# # Reading 
speed stats (ls additions 8/16/23) +# summary(errorDatStats$timePerSyllable) +# summary(errorDatStats$timePerWord) +# +# summary(errorDatStats$timePerSyllable_gmc) +# summary(errorDatStats$timePerWord_gmc) ### SECTION 3.5: preparing for misprod-hes sequential analyses @@ -329,24 +339,24 @@ errorDatLongHesWithRelMisprod$misprod_position <- as.factor(errorDatLongHesWithR ### SECTION 4: MODEL RESULTS #misprod_rate x bfne -model1 <- lmerTest::lmer(misprod_rate ~ bfne_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model1) +# model1 <- lmerTest::lmer(misprod ~ bfne_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# summary(model1) #misprod_rate x scaaredSoc -model2 <- lmerTest::lmer(misprod_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), +model2 <- lmerTest::lmer(misprod ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(model2) #misprod_rate x sps -model3 <- lmerTest::lmer(misprod_rate ~ sps_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model3) +# model3 <- lmerTest::lmer(misprod_rate ~ sps_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# summary(model3) #hesitation_rate x bfne -model4 <- lmerTest::lmer(hesitation_rate ~ bfne_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model4) +# model4 <- lmerTest::lmer(hesitation_rate ~ bfne_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# summary(model4) #hesitation_rate x scaaredSoc model5 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), @@ -354,14 +364,14 @@ model5 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + (1|id) + (1|passage) summary(model5) #hesitation_rate x sps -model6 <- lmerTest::lmer(hesitation_rate ~ sps_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model6) +# model6 <- lmerTest::lmer(hesitation_rate ~ sps_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# summary(model6) #words_with_misprod_rate x bfne -model7 <- lmerTest::lmer(words_with_misprod_rate ~ bfne_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model7) +# model7 <- lmerTest::lmer(words_with_misprod_rate ~ bfne_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# summary(model7) #words_with_misprod_rate x scaaredSoc model8 <- lmerTest::lmer(words_with_misprod_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), @@ -369,14 +379,14 @@ model8 <- lmerTest::lmer(words_with_misprod_rate ~ scaaredSoc_gmc + (1|id) + (1| summary(model8) #words_with_misprod_rate x sps -model9 <- lmerTest::lmer(words_with_misprod_rate ~ sps_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model9) +# model9 <- lmerTest::lmer(words_with_misprod_rate ~ sps_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# summary(model9) #words_with_hes_rate x bfne -model10 <- lmerTest::lmer(words_with_hes_rate ~ bfne_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model10) +# model10 <- lmerTest::lmer(words_with_hes_rate ~ bfne_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# summary(model10) #words_with_hes_rate x scaaredSoc model11 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), @@ -384,9 +394,9 @@ model11 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + (1|id) + (1|pas summary(model11) #words_with_hes_rate x sps -model12 <- lmerTest::lmer(words_with_hes_rate ~ sps_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model12) +# model12 <- lmerTest::lmer(words_with_hes_rate ~ sps_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# 
summary(model12) #### supplemental analyses From f4b4662e19ee5f10066d0ba116205ebf1c5183e0 Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 1 Feb 2024 02:42:34 -0500 Subject: [PATCH 28/33] Set up models for word frequency at the word level --- code/analysisWordLevelReadAloudBeta.R | 340 +++++++++++--------------- 1 file changed, 144 insertions(+), 196 deletions(-) diff --git a/code/analysisWordLevelReadAloudBeta.R b/code/analysisWordLevelReadAloudBeta.R index 246a049..8438b18 100644 --- a/code/analysisWordLevelReadAloudBeta.R +++ b/code/analysisWordLevelReadAloudBeta.R @@ -262,139 +262,127 @@ errorDat$sps_gmc <- errorDat$sps - mean(errorDat$sps) errorDat$lenSyll_gmc <- errorDat$lenSyll - mean(errorDat$lenSyll) errorDat$lenWord_gmc <- errorDat$lenWord - mean(errorDat$lenWord) errorDat$avgSyllPerWord_gmc <- errorDat$avgSyllPerWord - mean(errorDat$avgSyllPerWord) - -# LS additions 8/11/23 -# errorDat$errors <- errorDat$errors - mean(errorDat$errors) -# errorDat$correction <- errorDat$corrections - mean(errorDat$corrections) -# errorDat$error_rate <- errorDat$errors / errorDat$lenSyll -# errorDat$correction_rate <- errorDat$corrections / errorDat$lenSyll - errorDat$timePerSyllable_gmc <- errorDat$timePerSyllable - mean(errorDat$timePerSyllable) errorDat$timePerWord_gmc <- errorDat$timePerWord - mean(errorDat$timePerWord) - - -#extract demo stats -# fixme: these are at passage level, we need to `(summary_)unique` -# actually, nvm: we already have all these from prep -# errorDatStats <- subset(errorDat, !duplicated(errorDat$id)) -# errorDatStats <- -# summary(errorDatStats$age) -# sd(errorDatStats$age) -# summary(errorDatStats$sex) -# summary(errorDatStats$sex) / length(unique(errorDatStats$id)) -# summary(errorDatStats$pronouns) -# summary(errorDatStats$pronouns) / length(unique(errorDatStats$id)) -# summary(errorDatStats$ethnic) -# summary(errorDatStats$ethnic) / length(unique(errorDatStats$id)) -# summary(errorDatStats$socclass) -# summary(errorDatStats$socclass) / length(unique(errorDatStats$id)) -# -# # Reading speed stats (ls additions 8/16/23) -# summary(errorDatStats$timePerSyllable) -# summary(errorDatStats$timePerWord) -# -# summary(errorDatStats$timePerSyllable_gmc) -# summary(errorDatStats$timePerWord_gmc) - - ### SECTION 3.5: preparing for misprod-hes sequential analyses # ignore the misprod-hes columns for now -errorDatMisprodHes <- select(errorDat, !contains("_syllables")) - -# First: look at a given misproduction and check for nearby hesitations -justMisprodWithHesBefore <- cbind(errorDatMisprodHes, - hes_position = 0, # "before", - misprod_tally = errorDat$misprod_with_hes_in_previous_syllables) - -justMisprodWithHesAfter <- cbind(errorDatMisprodHes, - hes_position = 1, # "after", - misprod_tally = errorDat$misprod_with_hes_in_next_syllables) - - -# stack the ones before and the ones after as rows of a single df (my attempt at long form) -errorDatLongMisprodWithRelHes <- rbind(justMisprodWithHesBefore, justMisprodWithHesAfter) - -# track the binary relative position as a factor -errorDatLongMisprodWithRelHes$hes_position <- as.factor(errorDatLongMisprodWithRelHes$hes_position) - -# Then: look at a given hesitation and check for nearby misproductions -justHesWithMisprodBefore <- cbind(errorDatMisprodHes, - misprod_position = 0, # "before", - hes_tally = errorDat$hes_with_misprod_in_previous_syllables) +# errorDatMisprodHes <- select(errorDat, !contains("_syllables")) +# +# # First: look at a given misproduction and check for nearby hesitations +# justMisprodWithHesBefore <- 
cbind(errorDatMisprodHes, +# hes_position = 0, # "before", +# misprod_tally = errorDat$misprod_with_hes_in_previous_syllables) +# +# justMisprodWithHesAfter <- cbind(errorDatMisprodHes, +# hes_position = 1, # "after", +# misprod_tally = errorDat$misprod_with_hes_in_next_syllables) -justHesWithMisprodAfter <- cbind(errorDatMisprodHes, - misprod_position = 1, # "after", - hes_tally = errorDat$hes_with_misprod_in_next_syllables) # stack the ones before and the ones after as rows of a single df (my attempt at long form) -errorDatLongHesWithRelMisprod <- rbind(justHesWithMisprodBefore, justHesWithMisprodAfter) - -# track the binary relative position as a factor -errorDatLongHesWithRelMisprod$misprod_position <- as.factor(errorDatLongHesWithRelMisprod$misprod_position) +# errorDatLongMisprodWithRelHes <- rbind(justMisprodWithHesBefore, justMisprodWithHesAfter) +# +# # track the binary relative position as a factor +# errorDatLongMisprodWithRelHes$hes_position <- as.factor(errorDatLongMisprodWithRelHes$hes_position) +# +# # Then: look at a given hesitation and check for nearby misproductions +# justHesWithMisprodBefore <- cbind(errorDatMisprodHes, +# misprod_position = 0, # "before", +# hes_tally = errorDat$hes_with_misprod_in_previous_syllables) +# +# justHesWithMisprodAfter <- cbind(errorDatMisprodHes, +# misprod_position = 1, # "after", +# hes_tally = errorDat$hes_with_misprod_in_next_syllables) +# +# # stack the ones before and the ones after as rows of a single df (my attempt at long form) +# errorDatLongHesWithRelMisprod <- rbind(justHesWithMisprodBefore, justHesWithMisprodAfter) +# +# # track the binary relative position as a factor +# errorDatLongHesWithRelMisprod$misprod_position <- as.factor(errorDatLongHesWithRelMisprod$misprod_position) ### SECTION 4: MODEL RESULTS -#misprod_rate x bfne +#misprod x bfne # model1 <- lmerTest::lmer(misprod ~ bfne_gmc + (1|id) + (1|passage), # data=errorDat, REML=TRUE) # summary(model1) -#misprod_rate x scaaredSoc +#misprod x scaaredSoc model2 <- lmerTest::lmer(misprod ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(model2) -#misprod_rate x sps -# model3 <- lmerTest::lmer(misprod_rate ~ sps_gmc + (1|id) + (1|passage), +#misprod x scaaredSoc control for word +model2.5 <- lmerTest::lmer(misprod ~ scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), + data=errorDat, REML=TRUE) +summary(model2.5) + + + +#misprod x sps +# model3 <- lmerTest::lmer(misprod ~ sps_gmc + (1|id) + (1|passage), # data=errorDat, REML=TRUE) # summary(model3) -#hesitation_rate x bfne -# model4 <- lmerTest::lmer(hesitation_rate ~ bfne_gmc + (1|id) + (1|passage), +#hesitation x bfne +# model4 <- lmerTest::lmer(hesitation ~ bfne_gmc + (1|id) + (1|passage), # data=errorDat, REML=TRUE) # summary(model4) -#hesitation_rate x scaaredSoc -model5 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), +#hesitation x scaaredSoc +model5 <- lmerTest::lmer(hesitation ~ scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(model5) -#hesitation_rate x sps -# model6 <- lmerTest::lmer(hesitation_rate ~ sps_gmc + (1|id) + (1|passage), +# hesitation x scaaredSoc, control for word +model5.5 <- lmerTest::lmer(hesitation ~ scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), + data=errorDat, REML=TRUE) +summary(model5.5) + + +# hesitation x scaaredSoc, only control for word (and id; not passage) +model5.6 <- lmerTest::lmer(hesitation ~ scaaredSoc_gmc + (1|id) + (1|word), + data=errorDat, REML=TRUE) +summary(model5.6) + +# results are similar + + 
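+
+# A minimal follow-up sketch, not part of the original analysis plan (assumes
+# model2, model2.5, model5.5, and model5.6 fit above all converged): since the
+# fixed-effect estimates look similar with and without the extra random
+# intercepts, the nested models can be compared directly. anova() refits the
+# REML models with ML before the likelihood-ratio test and also reports AIC/BIC.
+anova(model2, model2.5)    # does adding (1|word) improve the misprod model?
+anova(model5.6, model5.5)  # is (1|passage) still needed once (1|word) is in?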
+#hesitation x sps +# model6 <- lmerTest::lmer(hesitation ~ sps_gmc + (1|id) + (1|passage), # data=errorDat, REML=TRUE) # summary(model6) -#words_with_misprod_rate x bfne -# model7 <- lmerTest::lmer(words_with_misprod_rate ~ bfne_gmc + (1|id) + (1|passage), +#words_with_misprod x bfne +# model7 <- lmerTest::lmer(words_with_misprod ~ bfne_gmc + (1|id) + (1|passage), # data=errorDat, REML=TRUE) # summary(model7) -#words_with_misprod_rate x scaaredSoc -model8 <- lmerTest::lmer(words_with_misprod_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model8) +#words_with_misprod x scaaredSoc +# model8 <- lmerTest::lmer(words_with_misprod ~ scaaredSoc_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# summary(model8) -#words_with_misprod_rate x sps -# model9 <- lmerTest::lmer(words_with_misprod_rate ~ sps_gmc + (1|id) + (1|passage), +#words_with_misprod x sps +# model9 <- lmerTest::lmer(words_with_misprod ~ sps_gmc + (1|id) + (1|passage), # data=errorDat, REML=TRUE) # summary(model9) -#words_with_hes_rate x bfne -# model10 <- lmerTest::lmer(words_with_hes_rate ~ bfne_gmc + (1|id) + (1|passage), +#words_with_hes x bfne +# model10 <- lmerTest::lmer(words_with_hes ~ bfne_gmc + (1|id) + (1|passage), # data=errorDat, REML=TRUE) # summary(model10) -#words_with_hes_rate x scaaredSoc -model11 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(model11) +#words_with_hes x scaaredSoc +# model11 <- lmerTest::lmer(words_with_hes ~ scaaredSoc_gmc + (1|id) + (1|passage), +# data=errorDat, REML=TRUE) +# summary(model11) -#words_with_hes_rate x sps -# model12 <- lmerTest::lmer(words_with_hes_rate ~ sps_gmc + (1|id) + (1|passage), +#words_with_hes x sps +# model12 <- lmerTest::lmer(words_with_hes ~ sps_gmc + (1|id) + (1|passage), # data=errorDat, REML=TRUE) # summary(model12) @@ -425,23 +413,23 @@ summary(f_model3) # Accuracy/comprehension as explained by disfluencies: hesitations per syllable -f_model4 <- glmer(challengeACC ~ hesitation_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model4) +# f_model4 <- glmer(challengeACC ~ hesitation + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model4) # Accuracy/comprehension as explained by disfluencies: hesitations per word -f_model5 <- glmer(challengeACC ~ words_with_hes_rate + (1|id) + (1|passage), +f_model5 <- glmer(challengeACC ~ hesitation + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model5) # Accuracy/comprehension as explained by errors: misproductions per syllable -f_model6 <- glmer(challengeACC ~ misprod_rate + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model6) +# f_model6 <- glmer(challengeACC ~ misprod + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model6) # Accuracy/comprehension as explained by errors: misproductions per word -f_model7 <- glmer(challengeACC ~ words_with_misprod_rate + (1|id) + (1|passage), +f_model7 <- glmer(challengeACC ~ misprod + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model7) @@ -449,69 +437,69 @@ summary(f_model7) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per syllable with scaared -f_model8 <- glmer(challengeACC ~ hesitation_rate * scaaredSoc_gmc + (1|id) + (1|passage), +f_model8 <- glmer(challengeACC ~ hesitation * scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model8) # Accuracy/comprehension as 
explained by disfluencies: hesitations per word with scaared -f_model9 <- glmer(challengeACC ~ words_with_hes_rate * scaaredSoc_gmc + (1|id) + (1|passage), +f_model9 <- glmer(challengeACC ~ words_with_hes * scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model9) # Accuracy/comprehension as explained by errors: misproductions per syllable with scaared -f_model10 <- glmer(challengeACC ~ misprod_rate * scaaredSoc_gmc + (1|id) + (1|passage), +f_model10 <- glmer(challengeACC ~ misprod * scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model10) # Accuracy/comprehension as explained by errors: misproductions per word with scaared -f_model11 <- glmer(challengeACC ~ words_with_misprod_rate * scaaredSoc_gmc + (1|id) + (1|passage), +f_model11 <- glmer(challengeACC ~ words_with_misprod * scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model11) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per syllable with bfne -f_model12 <- glmer(challengeACC ~ hesitation_rate * bfne_gmc + (1|id) + (1|passage), +f_model12 <- glmer(challengeACC ~ hesitation * bfne_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model12) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per word with bfne -f_model13 <- glmer(challengeACC ~ words_with_hes_rate * bfne_gmc + (1|id) + (1|passage), +f_model13 <- glmer(challengeACC ~ words_with_hes * bfne_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model13) # Accuracy/comprehension as explained by errors *and* SA: misproductions per syllable with bfne -f_model14 <- glmer(challengeACC ~ misprod_rate * bfne_gmc + (1|id) + (1|passage), +f_model14 <- glmer(challengeACC ~ misprod * bfne_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model14) # Accuracy/comprehension as explained by errors *and* SA: misproductions per word with bfne -f_model15 <- glmer(challengeACC ~ words_with_misprod_rate * bfne_gmc + (1|id) + (1|passage), +f_model15 <- glmer(challengeACC ~ words_with_misprod * bfne_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model15) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per syllable with sps -f_model16 <- glmer(challengeACC ~ hesitation_rate * sps_gmc + (1|id) + (1|passage), +f_model16 <- glmer(challengeACC ~ hesitation * sps_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model16) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per word with sps -f_model17 <- glmer(challengeACC ~ words_with_hes_rate * sps_gmc + (1|id) + (1|passage), +f_model17 <- glmer(challengeACC ~ words_with_hes * sps_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model17) # Accuracy/comprehension as explained by errors *and* SA: misproductions per syllable with sps -f_model18 <- glmer(challengeACC ~ misprod_rate * sps_gmc + (1|id) + (1|passage), +f_model18 <- glmer(challengeACC ~ misprod * sps_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model18) # Accuracy/comprehension as explained by errors *and* SA: misproductions per word with sps -f_model19 <- glmer(challengeACC ~ words_with_misprod_rate * sps_gmc + (1|id) + (1|passage), +f_model19 <- glmer(challengeACC ~ words_with_misprod * sps_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model19) @@ -520,18 +508,18 @@ 
summary(f_model19) # Now, misproduction-hesitation relationships # Errors as explained by disfluency: rate of misproduced syllables from rate of hesitated syllables -f_model20 <- lmerTest::lmer(misprod_rate ~ hesitation_rate + (1|id) + (1|passage), +f_model20 <- lmerTest::lmer(misprod ~ hesitation + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model20) # *** # Errors as explained by disfluency: rate of misproduced words from rate of hesitated words -f_model21 <- lmerTest::lmer(words_with_misprod_rate ~ words_with_hes_rate + (1|id) + (1|passage), +f_model21 <- lmerTest::lmer(words_with_misprod ~ words_with_hes + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model21) # *** # Errors as explained by disfluency: rate of misproduced words from rate of hesitated syllables -f_model22 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate + (1|id) + (1|passage), +f_model22 <- lmerTest::lmer(words_with_misprod ~ hesitation + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model22) # *** @@ -540,130 +528,69 @@ summary(f_model22) # *** # Now, misproduction-hesitation interactions with social anxiety # Errors as explained by disfluency and SA: rate of misproduced syllables from rate of hesitated syllables and scaared -f_model23 <- lmerTest::lmer(misprod_rate ~ hesitation_rate * scaaredSoc_gmc + (1|id) + (1|passage), +f_model23 <- lmerTest::lmer(misprod ~ hesitation * scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model23) # Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated words and scaared -f_model24 <- lmerTest::lmer(words_with_misprod_rate ~ words_with_hes_rate * scaaredSoc_gmc + (1|id) + (1|passage), +f_model24 <- lmerTest::lmer(words_with_misprod ~ words_with_hes * scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model24) # Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated syllables and scaared -f_model25 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate * scaaredSoc_gmc + (1|id) + (1|passage), +f_model25 <- lmerTest::lmer(words_with_misprod ~ hesitation * scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model25) # Errors as explained by disfluency and SA: rate of misproduced syllables from rate of hesitated syllables and bfne -f_model26 <- lmerTest::lmer(misprod_rate ~ hesitation_rate * bfne_gmc + (1|id) + (1|passage), +f_model26 <- lmerTest::lmer(misprod ~ hesitation * bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model26) # Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated words and bfne -f_model27 <- lmerTest::lmer(words_with_misprod_rate ~ words_with_hes_rate * bfne_gmc + (1|id) + (1|passage), +f_model27 <- lmerTest::lmer(words_with_misprod ~ words_with_hes * bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model27) # Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated syllables and bfne -f_model28 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate * bfne_gmc + (1|id) + (1|passage), +f_model28 <- lmerTest::lmer(words_with_misprod ~ hesitation * bfne_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model28) # Errors as explained by disfluency and SA: rate of misproduced syllables from rate of hesitated syllables and sps -f_model29 <- lmerTest::lmer(misprod_rate ~ hesitation_rate * sps_gmc + (1|id) + (1|passage), +f_model29 <- 
lmerTest::lmer(misprod ~ hesitation * sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model29) # Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated words and sps -f_model30 <- lmerTest::lmer(words_with_misprod_rate ~ words_with_hes_rate * sps_gmc + (1|id) + (1|passage), +f_model30 <- lmerTest::lmer(words_with_misprod ~ words_with_hes * sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model30) # Errors as explained by disfluency and SA: rate of misproduced words from rate of hesitated syllables and sps -f_model31 <- lmerTest::lmer(words_with_misprod_rate ~ hesitation_rate * sps_gmc + (1|id) + (1|passage), +f_model31 <- lmerTest::lmer(words_with_misprod ~ hesitation * sps_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model31) -# Now: see if reading speed plays into it - -# Does scaaredSoc predict reading speed? -# syllable level -rs_model_1 <- lmerTest::lmer(timePerSyllable_gmc ~ scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_1) - -# word level -rs_model_2 <- lmerTest::lmer(timePerWord_gmc ~ scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_2) - - -rs_model_1_bfne <- lmerTest::lmer(timePerSyllable_gmc ~ bfne_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_1_bfne) - -# word level -rs_model_2_bfne <- lmerTest::lmer(timePerWord_gmc ~ bfne_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_2_bfne) - -rs_model_1_sps <- lmerTest::lmer(timePerSyllable_gmc ~ sps_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_1_sps) - -# word level -rs_model_2_bfne <- lmerTest::lmer(timePerWord_gmc ~ sps_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_2_sps) - - - -# Does scaaredSoc predict reading speed? -# syllable level -rs_model_3 <- lmerTest::lmer(timePerSyllable ~ scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_3) - -# word level -rs_model_4 <- lmerTest::lmer(timePerWord ~ scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_4) - - - - # What happens when we control for age? -#hesitation_rate x scaaredSoc -age_model1 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + age_gmc + (1|id) + (1|passage), +#hesitation x scaaredSoc +age_model1 <- lmerTest::lmer(hesitation ~ scaaredSoc_gmc + age_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(age_model1) -#words_with_hes_rate x scaaredSoc -age_model2 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + age_gmc + (1|id) + (1|passage), +#words_with_hes x scaaredSoc +age_model2 <- lmerTest::lmer(words_with_hes ~ scaaredSoc_gmc + age_gmc + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(age_model2) -# And now ->> check our work -# Does our hesitation ~ scaaredSoc finding hold with reading speed controlled for? 
-# syllable level -rs_model_3 <- lmerTest::lmer(hesitation_rate ~ scaaredSoc_gmc + timePerSyllable_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_3) - -# word level -rs_model_4 <- lmerTest::lmer(words_with_hes_rate ~ scaaredSoc_gmc + timePerWord_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(rs_model_4) - - # misprod-hes ordering # Is the number of hesitations adjacent to misproductions in a particular @@ -699,19 +626,40 @@ misprod_with_rel_hes_model_4 <- lmerTest::lmer(misprod_tally ~ hes_position * sc # Word frequency analysis -# Does a passage's average word frequency predict participants' hesitation rate or misproduction rate? -wordfreq_model_1 <- lmerTest::lmer(hesitation_rate ~ avgWordFreq + (1|id) + (1|passage), - data=errorDat, REML=TRUE) +# Does a word's frequency predict hesitation on that word? +errorDatAttestedFreqs <- filter(errorDat, log10frequency > 0) +wordfreq_model_1 <- lmerTest::lmer(hesitation ~ log10frequency + (1|id) + (1|passage), + data=errorDatAttestedFreqs, REML=TRUE) summary(wordfreq_model_1) -wordfreq_model_2 <- lmerTest::lmer(misprod_rate ~ avgWordFreq + (1|id) + (1|passage), - data=errorDat, REML=TRUE) +wordfreq_model_2 <- lmerTest::lmer(misprod ~ log10frequency + (1|id) + (1|passage), + data=errorDatAttestedFreqs, REML=TRUE) summary(wordfreq_model_2) + +# control for word, that must matter right? +wordfreq_model_1.5 <- lmerTest::lmer(hesitation ~ log10frequency + (1|id) + (1|passage) + (1|word), + data=errorDatAttestedFreqs, REML=TRUE) +summary(wordfreq_model_1.5) +wordfreq_model_2.5 <- lmerTest::lmer(misprod ~ log10frequency + (1|id) + (1|passage) + (1|word), + data=errorDatAttestedFreqs, REML=TRUE) +summary(wordfreq_model_2.5) + + # Do social anxiety and frequency interact to predict hesitation rate or misproduction rate? -wordfreq_model_3 <- lmerTest::lmer(hesitation_rate ~ avgWordFreq * scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(wordfreq_model_3) +wordfreq_model_3 <- lmerTest::lmer(hesitation ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDatAttestedFreqs, REML=TRUE) +summary(wordfreq_model_3) # yes! + +wordfreq_model_4 <- lmerTest::lmer(misprod ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDatAttestedFreqs, REML=TRUE) +summary(wordfreq_model_4) # no, not at all - well, p = 0.15 + + +# control for word, that must matter right? +wordfreq_model_3.5 <- lmerTest::lmer(hesitation ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), + data=errorDatAttestedFreqs, REML=TRUE) +summary(wordfreq_model_3.5) # still yes, very slightly higher p -wordfreq_model_4 <- lmerTest::lmer(misprod_rate ~ avgWordFreq * scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, REML=TRUE) -summary(wordfreq_model_4) +wordfreq_model_4.5 <- lmerTest::lmer(misprod ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), + data=errorDatAttestedFreqs, REML=TRUE) +summary(wordfreq_model_4.5) # still no, slightly lower p = 0.114 From e4dfc14dd463c8b6294f1a51579edfbcd601cf1b Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 1 Feb 2024 03:29:52 -0500 Subject: [PATCH 29/33] Set up models for misproduction-hesitation ordering at the word level. Other tweaks. 
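
Note on the reshaping this patch introduces (a toy sketch only, not part of the diff below; the
values are invented, but the object and column names mirror the ones added in this patch): each
word read contributes two rows to the long-form frame, one recording whether a hesitation falls in
the window before a misproduction on that word (hes_position = 0) and one for the window after it
(hes_position = 1).

    # base-R sketch of the stacking, with made-up values
    toy <- data.frame(id   = c("sub1", "sub1"),
                      word = c("ocean", "the"),
                      misprod_with_any_prior_hesitation    = c(1, 0),
                      misprod_with_any_upcoming_hesitation = c(0, 1))

    before <- cbind(toy, hes_position = 0,
                    misprod_in_adjacent_window = toy$misprod_with_any_prior_hesitation)
    after  <- cbind(toy, hes_position = 1,
                    misprod_in_adjacent_window = toy$misprod_with_any_upcoming_hesitation)

    toy_long <- rbind(before, after)                      # two rows per word read
    toy_long$hes_position <- as.factor(toy_long$hes_position)
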
--- code/analysisWordLevelReadAloudBeta.R | 188 +++++++++++++++----------- 1 file changed, 112 insertions(+), 76 deletions(-) diff --git a/code/analysisWordLevelReadAloudBeta.R b/code/analysisWordLevelReadAloudBeta.R index 8438b18..c87d87a 100644 --- a/code/analysisWordLevelReadAloudBeta.R +++ b/code/analysisWordLevelReadAloudBeta.R @@ -269,38 +269,38 @@ errorDat$timePerWord_gmc <- errorDat$timePerWord - mean(errorDat$timePerWord) ### SECTION 3.5: preparing for misprod-hes sequential analyses # ignore the misprod-hes columns for now -# errorDatMisprodHes <- select(errorDat, !contains("_syllables")) -# -# # First: look at a given misproduction and check for nearby hesitations -# justMisprodWithHesBefore <- cbind(errorDatMisprodHes, -# hes_position = 0, # "before", -# misprod_tally = errorDat$misprod_with_hes_in_previous_syllables) -# -# justMisprodWithHesAfter <- cbind(errorDatMisprodHes, -# hes_position = 1, # "after", -# misprod_tally = errorDat$misprod_with_hes_in_next_syllables) +errorDatMisprodHes <- select(errorDat, !contains("any_")) + +# First: look at a given misproduction and check for nearby hesitations +justMisprodWithHesBefore <- cbind(errorDatMisprodHes, + hes_position = 0, # "before", + misprod_in_adjacent_window = errorDat$misprod_with_any_prior_hesitation) + +justMisprodWithHesAfter <- cbind(errorDatMisprodHes, + hes_position = 1, # "after", + misprod_in_adjacent_window = errorDat$misprod_with_any_upcoming_hesitation) # stack the ones before and the ones after as rows of a single df (my attempt at long form) -# errorDatLongMisprodWithRelHes <- rbind(justMisprodWithHesBefore, justMisprodWithHesAfter) -# -# # track the binary relative position as a factor -# errorDatLongMisprodWithRelHes$hes_position <- as.factor(errorDatLongMisprodWithRelHes$hes_position) -# -# # Then: look at a given hesitation and check for nearby misproductions -# justHesWithMisprodBefore <- cbind(errorDatMisprodHes, -# misprod_position = 0, # "before", -# hes_tally = errorDat$hes_with_misprod_in_previous_syllables) -# -# justHesWithMisprodAfter <- cbind(errorDatMisprodHes, -# misprod_position = 1, # "after", -# hes_tally = errorDat$hes_with_misprod_in_next_syllables) -# -# # stack the ones before and the ones after as rows of a single df (my attempt at long form) -# errorDatLongHesWithRelMisprod <- rbind(justHesWithMisprodBefore, justHesWithMisprodAfter) -# -# # track the binary relative position as a factor -# errorDatLongHesWithRelMisprod$misprod_position <- as.factor(errorDatLongHesWithRelMisprod$misprod_position) +errorDatLongMisprodWithRelHes <- rbind(justMisprodWithHesBefore, justMisprodWithHesAfter) + +# track the binary relative position as a factor +errorDatLongMisprodWithRelHes$hes_position <- as.factor(errorDatLongMisprodWithRelHes$hes_position) + +# Then: look at a given hesitation and check for nearby misproductions +justHesWithMisprodBefore <- cbind(errorDatMisprodHes, + misprod_position = 0, # "before", + hes_in_adjacent_window = errorDat$hesitation_with_any_prior_misprod) + +justHesWithMisprodAfter <- cbind(errorDatMisprodHes, + misprod_position = 1, # "after", + hes_in_adjacent_window = errorDat$hesitation_with_any_upcoming_misprod) + +# stack the ones before and the ones after as rows of a single df (my attempt at long form) +errorDatLongHesWithRelMisprod <- rbind(justHesWithMisprodBefore, justHesWithMisprodAfter) + +# track the binary relative position as a factor +errorDatLongHesWithRelMisprod$misprod_position <- as.factor(errorDatLongHesWithRelMisprod$misprod_position) @@ -437,71 
+437,71 @@ summary(f_model7) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per syllable with scaared -f_model8 <- glmer(challengeACC ~ hesitation * scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model8) +# f_model8 <- glmer(challengeACC ~ hesitation * scaaredSoc_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model8) # Accuracy/comprehension as explained by disfluencies: hesitations per word with scaared -f_model9 <- glmer(challengeACC ~ words_with_hes * scaaredSoc_gmc + (1|id) + (1|passage), +f_model9 <- glmer(challengeACC ~ hesitation * scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model9) # Accuracy/comprehension as explained by errors: misproductions per syllable with scaared -f_model10 <- glmer(challengeACC ~ misprod * scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model10) +# f_model10 <- glmer(challengeACC ~ misprod * scaaredSoc_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model10) # Accuracy/comprehension as explained by errors: misproductions per word with scaared -f_model11 <- glmer(challengeACC ~ words_with_misprod * scaaredSoc_gmc + (1|id) + (1|passage), +f_model11 <- glmer(challengeACC ~ misprod * scaaredSoc_gmc + (1|id) + (1|passage), data=errorDat, family = "binomial") summary(f_model11) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per syllable with bfne -f_model12 <- glmer(challengeACC ~ hesitation * bfne_gmc + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model12) +# f_model12 <- glmer(challengeACC ~ hesitation * bfne_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model12) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per word with bfne -f_model13 <- glmer(challengeACC ~ words_with_hes * bfne_gmc + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model13) +# f_model13 <- glmer(challengeACC ~ words_with_hes * bfne_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model13) # Accuracy/comprehension as explained by errors *and* SA: misproductions per syllable with bfne -f_model14 <- glmer(challengeACC ~ misprod * bfne_gmc + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model14) +# f_model14 <- glmer(challengeACC ~ misprod * bfne_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model14) # Accuracy/comprehension as explained by errors *and* SA: misproductions per word with bfne -f_model15 <- glmer(challengeACC ~ words_with_misprod * bfne_gmc + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model15) +# f_model15 <- glmer(challengeACC ~ words_with_misprod * bfne_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model15) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per syllable with sps -f_model16 <- glmer(challengeACC ~ hesitation * sps_gmc + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model16) +# f_model16 <- glmer(challengeACC ~ hesitation * sps_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model16) # Accuracy/comprehension as explained by disfluencies *and* SA: hesitations per word with sps -f_model17 <- glmer(challengeACC ~ words_with_hes * sps_gmc + (1|id) + (1|passage), - data=errorDat, family = 
"binomial") -summary(f_model17) +# f_model17 <- glmer(challengeACC ~ words_with_hes * sps_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model17) # Accuracy/comprehension as explained by errors *and* SA: misproductions per syllable with sps -f_model18 <- glmer(challengeACC ~ misprod * sps_gmc + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model18) +# f_model18 <- glmer(challengeACC ~ misprod * sps_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model18) # Accuracy/comprehension as explained by errors *and* SA: misproductions per word with sps -f_model19 <- glmer(challengeACC ~ words_with_misprod * sps_gmc + (1|id) + (1|passage), - data=errorDat, family = "binomial") -summary(f_model19) +# f_model19 <- glmer(challengeACC ~ words_with_misprod * sps_gmc + (1|id) + (1|passage), +# data=errorDat, family = "binomial") +# summary(f_model19) @@ -522,7 +522,7 @@ summary(f_model21) # *** f_model22 <- lmerTest::lmer(words_with_misprod ~ hesitation + (1|id) + (1|passage), data=errorDat, REML=TRUE) summary(f_model22) # *** - +# NB my * comments here (this section of models at least) are out of date # Now, misproduction-hesitation interactions with social anxiety @@ -593,12 +593,6 @@ summary(age_model2) # misprod-hes ordering -# Is the number of hesitations adjacent to misproductions in a particular -# reading predicted by the - -# Does the position of misproductions relative to hesitations - - # we have a number of occurrences of a misproduction in a particular position # relative to a passage's hesitations. does knowing the position (before/after) # predict the number of these sequences we have? @@ -606,22 +600,58 @@ summary(age_model2) # does misproduction location relative to a hesitation predict how many # instances we get in a particular reading? -hes_with_rel_misprod_model_1 <- lmerTest::lmer(hes_tally ~ misprod_position + (1|id) + (1|passage), +hes_with_rel_misprod_model_1 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position + (1|id) + (1|passage), + data=errorDatLongHesWithRelMisprod, REML=TRUE) +summary(hes_with_rel_misprod_model_1) # n.s., 0.271 + +misprod_with_rel_hes_model_1 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position + (1|id) + (1|passage), + data=errorDatLongMisprodWithRelHes, REML=TRUE) +summary(misprod_with_rel_hes_model_1) # n.s., 0.108 + +## does it interact with SA? +hes_with_rel_misprod_model_3 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDatLongHesWithRelMisprod, REML=TRUE) +summary(hes_with_rel_misprod_model_3) # n.s. + +misprod_with_rel_hes_model_4 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDatLongMisprodWithRelHes, REML=TRUE) +summary(misprod_with_rel_hes_model_4) # n.s. + +# what if we control for word? 
+hes_with_rel_misprod_model_1.5 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position + (1|id) + (1|passage) + (1|word), data=errorDatLongHesWithRelMisprod, REML=TRUE) -summary(hes_with_rel_misprod_model_1) +summary(hes_with_rel_misprod_model_1.5) # n.s., sameish -misprod_with_rel_hes_model_1 <- lmerTest::lmer(misprod_tally ~ hes_position + (1|id) + (1|passage), +misprod_with_rel_hes_model_1.5 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position + (1|id) + (1|passage) + (1|word), data=errorDatLongMisprodWithRelHes, REML=TRUE) -summary(misprod_with_rel_hes_model_1) +summary(misprod_with_rel_hes_model_1.5) # ., 0.0974 ## does it interact with SA? -hes_with_rel_misprod_model_3 <- lmerTest::lmer(hes_tally ~ misprod_position * scaaredSoc_gmc + (1|id) + (1|passage), +hes_with_rel_misprod_model_3.5 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), data=errorDatLongHesWithRelMisprod, REML=TRUE) -# summary(hes_with_rel_misprod_model_3) +summary(hes_with_rel_misprod_model_3.5) # n.s. -misprod_with_rel_hes_model_4 <- lmerTest::lmer(misprod_tally ~ hes_position * scaaredSoc_gmc + (1|id) + (1|passage), +misprod_with_rel_hes_model_4.5 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), data=errorDatLongMisprodWithRelHes, REML=TRUE) -# summary(misprod_with_rel_hes_model_4) +summary(misprod_with_rel_hes_model_4.5) # n.s. + +# and if we ignore passage? +hes_with_rel_misprod_model_1.6 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position + (1|id) + (1|word), + data=errorDatLongHesWithRelMisprod, REML=TRUE) +summary(hes_with_rel_misprod_model_1.6) # n.s., sameish + +misprod_with_rel_hes_model_1.6 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position + (1|id) + (1|word), + data=errorDatLongMisprodWithRelHes, REML=TRUE) +summary(misprod_with_rel_hes_model_1.6) # made no difference, as you might expect + +## does it interact with SA? 
+hes_with_rel_misprod_model_3.6 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position * scaaredSoc_gmc + (1|id) + (1|word), + data=errorDatLongHesWithRelMisprod, REML=TRUE) +summary(hes_with_rel_misprod_model_3.6) # "" + +misprod_with_rel_hes_model_4.6 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position * scaaredSoc_gmc + (1|id) + (1|word), + data=errorDatLongMisprodWithRelHes, REML=TRUE) +summary(misprod_with_rel_hes_model_4.6) # "" @@ -663,3 +693,9 @@ summary(wordfreq_model_3.5) # still yes, very slightly higher p wordfreq_model_4.5 <- lmerTest::lmer(misprod ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), data=errorDatAttestedFreqs, REML=TRUE) summary(wordfreq_model_4.5) # still no, slightly lower p = 0.114 + + +# hesitation ~ wf x SA +interact_plot(model = wordfreq_model_3, + pred = log10frequency, modx = scaaredSoc_gmc, interval = TRUE) + From ac0378d18f51b87fc9d32578d524060ee9bf9bc7 Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 1 Feb 2024 20:40:43 -0500 Subject: [PATCH 30/33] Clean up: sanitize for clarity before revised analyses --- code/analysisWordLevelReadAloudBeta.R | 78 +++++++++++++++------------ 1 file changed, 45 insertions(+), 33 deletions(-) diff --git a/code/analysisWordLevelReadAloudBeta.R b/code/analysisWordLevelReadAloudBeta.R index c87d87a..03dd2e4 100644 --- a/code/analysisWordLevelReadAloudBeta.R +++ b/code/analysisWordLevelReadAloudBeta.R @@ -110,6 +110,7 @@ ### SECTION 1: SETTING UP library(dplyr) +library(purrr) library(lme4) library(lmerTest) library(interactions) @@ -120,31 +121,17 @@ library(gridExtra) library(grid) library(cowplot) library(colorspace) -library(colorblindr) - -# ``` -# Warning in install.packages : -# package ‘colorblindr’ is not available for this version of R -# -# A version of this package for your version of R might be available elsewhere, -# see the ideas at -# https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages -# ``` +# library(colorblindr) #set up date for output file naming today <- Sys.Date() today <- format(today, "%Y%m%d") #set up directories for input/output data -# data <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/readAloudBetaData_20230630.csv' -# data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230810.csv' -# data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230815.csv' -# data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230816.csv' -# data <- '/home/luc/Documents/ndclab/analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData_20230825.csv' data <- '~/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/derivatives/readAloudBetaData-wordLevel_20240130.csv' -to_omit <- '/home/luc/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/input/passages-to-omit_20230810.csv' +to_omit <- '~/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/input/passages-to-omit_20230810.csv' # out_path <- '/Users/jalexand/github/readAloud-valence-beta/derivatives/' -out_path <- '/home/luc/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/derivatives/' +out_path <- '~/Documents/ndclab/rwe-analysis-sandbox/rwe-analysis/derivatives/' #read in data df <- read.csv(data, row.names = NULL) # output of prep script @@ -202,11 +189,6 @@ length(unique(df$id)) - length(unique(dfTrim$id)) #number of participants remove summary_unique(dfTrim, "id", "challengeAvgSub") summary_unique(dfTrim, "id", "challengeAvgSub", f = sd) -# calculate 
average speed per *person* per *passage* -unique(select(dfTrim, timePerSyllable, id, passage))$timePerSyllable %>% mean(na.rm = TRUE) -unique(select(dfTrim, timePerSyllable, id, passage))$timePerSyllable %>% sd(na.rm = TRUE) -unique(select(dfTrim, timePerWord, id, passage))$timePerWord %>% mean(na.rm = TRUE) -unique(select(dfTrim, timePerWord, id, passage))$timePerWord %>% sd(na.rm = TRUE) ### SECTION 2: PASSAGE-LEVEL TRIMMING passage_no_before_trimming <- summary_unique(dfTrim, "id", "passage", f = length) @@ -231,11 +213,6 @@ c(150083, "caramel") # N.B.: only one of four passages to have >= 5% of sylla c(150083, "cars") # N.B.: only one of four passages to have >= 5% of syllables omitted -dfTrim <- filter(dfTrim, !is.na(timePerSyllable)) -# itself, but without ones for which we have no reading data -# this ends up only dropping 0083, caramel - the other three already end up -# getting dropped based on other criteria - passage_no_after_trim2 <- summary_unique(dfTrim, "id", "passage", f = length) passage_no_after_trim1 - passage_no_after_trim2 #number of passages trimmed (passage_no_after_trim1 - passage_no_after_trim2) / passage_no_after_trim1 #percentage of passages trimmed of last bunch @@ -259,11 +236,7 @@ errorDat$scaaredTotal_gmc <- errorDat$scaaredTotal - mean(errorDat$scaaredTotal) errorDat$scaaredGA_gmc <- errorDat$scaaredGA - mean(errorDat$scaaredGA) errorDat$scaaredSoc_gmc <- errorDat$scaaredSoc - mean(errorDat$scaaredSoc) errorDat$sps_gmc <- errorDat$sps - mean(errorDat$sps) -errorDat$lenSyll_gmc <- errorDat$lenSyll - mean(errorDat$lenSyll) -errorDat$lenWord_gmc <- errorDat$lenWord - mean(errorDat$lenWord) -errorDat$avgSyllPerWord_gmc <- errorDat$avgSyllPerWord - mean(errorDat$avgSyllPerWord) -errorDat$timePerSyllable_gmc <- errorDat$timePerSyllable - mean(errorDat$timePerSyllable) -errorDat$timePerWord_gmc <- errorDat$timePerWord - mean(errorDat$timePerWord) + ### SECTION 3.5: preparing for misprod-hes sequential analyses @@ -674,6 +647,15 @@ wordfreq_model_2.5 <- lmerTest::lmer(misprod ~ log10frequency + (1|id) + (1|pass data=errorDatAttestedFreqs, REML=TRUE) summary(wordfreq_model_2.5) +# control for word w/o passage +wordfreq_model_1.6 <- lmerTest::lmer(hesitation ~ log10frequency + (1|id) + (1|word), + data=errorDatAttestedFreqs, REML=TRUE) +summary(wordfreq_model_1.6) +wordfreq_model_2.6 <- lmerTest::lmer(misprod ~ log10frequency + (1|id) + (1|word), + data=errorDatAttestedFreqs, REML=TRUE) +summary(wordfreq_model_2.6) + + # Do social anxiety and frequency interact to predict hesitation rate or misproduction rate? 
wordfreq_model_3 <- lmerTest::lmer(hesitation ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage), @@ -696,6 +678,36 @@ summary(wordfreq_model_4.5) # still no, slightly lower p = 0.114 # hesitation ~ wf x SA -interact_plot(model = wordfreq_model_3, +interact_plot(model = wordfreq_model_3.5, + pred = log10frequency, modx = scaaredSoc_gmc, interval = TRUE) + + + +interact_plot(model = wordfreq_model_4.5, pred = log10frequency, modx = scaaredSoc_gmc, interval = TRUE) + + + + + + + +summary(errorDatAttestedFreqs$log10frequency) + + +case_when(errorDatAttestedFreqs$log10frequency <= 2.711 ~ 1, errorDatAttestedFreqs$log10frequency <= 4.058 ~ 2, errorDatAttestedFreqs$log10frequency <= 5.609 ~ 3, .default = 4 ) + + +errorDatAttestedFreqs$freqCat <- case_when(errorDatAttestedFreqs$log10frequency <= 2.711 ~ "low", .default = "high") + +# errorDatAttestedFreqs$freqCat <- as.factor(errorDatAttestedFreqs$freqCat) + + +wordfreq_model_category_1 <- lmerTest::lmer(misprod ~ freqCat * scaaredSoc_gmc + (1|id) + (1|passage), + data=errorDatAttestedFreqs, REML=TRUE) +summary(wordfreq_model_category_1) + +interact_plot(model = wordfreq_model_category_1, + pred = freqCat, modx = scaaredSoc_gmc, points = TRUE, interval = TRUE) + From 60950b59ebce022dfd32cfa8697f359146013459 Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 1 Feb 2024 21:02:21 -0500 Subject: [PATCH 31/33] Update data dict to reflect word-level level of analysis --- code/analysisWordLevelReadAloudBeta.R | 87 ++++++++++++++------------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/code/analysisWordLevelReadAloudBeta.R b/code/analysisWordLevelReadAloudBeta.R index 03dd2e4..c115260 100644 --- a/code/analysisWordLevelReadAloudBeta.R +++ b/code/analysisWordLevelReadAloudBeta.R @@ -22,44 +22,45 @@ # # hes_position: # -# for long-form dataframes counting misproductions, this indicates whether the -# relevant count is the number of hesitations before (0) or after (1) those -# misproductions being counted in that row +# for long-form dataframes looking for misproductions, this indicates whether +# the relevant binary value is tracking whether a hesitation comes before (0) +# or after (1) the misproduction being counted in that row # # -# misprod_tally: +# misprod_in_adjacent_window: # -# conversely, in long-form dataframes counting misproductions, this column -# actually tracks how many misproductions there are in that reading -# (participant x passage) that have a hesitation in the relevant relative -# position +# conversely, in long-form dataframes looking at misproductions, this column +# actually tracks whether or not there is a misproduction in the relevant +# relative position (to a hesitation), within the specified window # justMisprodWithHesBefore: # -# this is the dataframe with every (participant x passage) reading, counting -# the number of misproductions with a nearby preceding hesitation +# this is the dataframe with every (participant x passage x word) read, +# checking for each word whether there is a misproduction with a nearby +# preceding hesitation # -# i.e., for each reading, it counts the number of times (misprod_tally) that a -# hesitation comes before a misproduction -- so for every entry, hes_position = 0 +# i.e., for each reading, it only looks at rows where a hesitation comes +# before a misproduction -- so for every entry, hes_position = 0 # # # justMisprodWithHesAfter # -# similarly, this is the dataframe with every (participant x passage) reading, -# counting the number of misproductions with a nearby 
following hesitation +# similarly, this is the dataframe with every (participant x passage x word) +# read, checking each time whether there is a misproduction with a nearby +# following hesitation # -# i.e., for each reading, it counts the number of times (misprod_tally) that a -# hesitation comes after a misproduction -- so for every entry, hes_position = 1 +# i.e., for each reading, it only looks at rows where a hesitation comes after +# a misproduction -- so for every entry, hes_position = 1 # # # errorDatLongMisprodWithRelHes: # -# this is the long-form dataframe, with two rows per reading (participant x -# passage): one for each position for a relative hesitation. i.e. this stacks -# the two dataframes that respectively have (1) every passage, with a count of -# misproductions for hes_position = 0, and (2) every passage, with a count of -# misproductions for hes_position = 1 +# this is the long-form dataframe, with two rows per word read (participant x +# passage x word): one for each position for a relative hesitation, i.e. this +# stacks the two dataframes that respectively have (1) every word, where +# misprod_in_adjacent_window corresponds to hes_position = 0, and (2) every +# word, where misprod_in_adjacent_window corresponds to hes_position = 1 @@ -67,45 +68,45 @@ # misprod_position: # -# for long-form dataframes counting hesitations, this indicates whether the -# relevant count is the number of misproductions before (0) or after (1) those -# hesitations being counted in that row +# for long-form dataframes looking for hesitations, this indicates whether the +# relevant binary value is tracking whether a misproduction comes before (0) +# or after (1) the hesitation being counted in that row # # -# hes_tally: +# hes_in_adjacent_window: # -# conversely, in long-form dataframes counting hesitations, this column -# actually tracks how many hesitations there are in that reading (participant -# x passage) that have a misproduction in the relevant relative position +# conversely, in long-form dataframes looking at hesitations, this column +# actually tracks whether or not there is a hesitation in the relevant +# relative position (to a misproduction), within the specified window # justHesWithMisprodBefore: # -# this is the dataframe with every (participant x passage) reading, counting -# the number of hesitations with a nearby preceding misproduction +# this is the dataframe with every (participant x passage x word) read, +# checking for each word whether there is a hesitation with a nearby +# preceding misproduction # -# i.e., for each reading, it counts the number of times (hes_tally) that a -# misproduction comes before a hesitation -- so for every entry, -# misprod_position = 0 +# i.e., for each reading, it only looks at rows where a misproduction comes +# before a hesitation -- so for every entry, misprod_position = 0 # # # justHesWithMisprodAfter # -# similarly, this is the dataframe with every (participant x passage) reading, -# counting the number of hesitations with a nearby following misproduction +# similarly, this is the dataframe with every (participant x passage x word) +# read, checking for each word whether there is a hesitation with a nearby +# following misproduction # -# i.e., for each reading, it counts the number of times (hes_tally) that a -# misproduction comes after a hesitation -- so for every entry, -# misprod_position = 1 +# i.e., for each reading, it only looks at rows where a misproduction comes +# after a hesitation -- so for every entry, misprod_position 
= 1 # # # errorDatLongHesWithRelMisprod: # -# this is the long-form dataframe, with two rows per reading (participant x -# passage): one for each position for a relative misproduction, i.e. this -# stacks the two dataframes that respectively have (1) every passage, with a -# count of hesitations for misprod_position = 0, and (2) every passage, with a -# count of hesitations for misprod_position = 1 +# this is the long-form dataframe, with two rows per word read (participant x +# passage x word): one for each position for a relative misproduction, i.e. +# this stacks the two dataframes that respectively have (1) every word, where +# hes_in_adjacent_window corresponds to misprod_position = 0, and (2) every +# word, where hes_in_adjacent_window corresponds to misprod_position = 1 ### SECTION 1: SETTING UP From 7492f3c0b1b9691cb6c6374a557bb98a8ce094ac Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 1 Feb 2024 21:32:25 -0500 Subject: [PATCH 32/33] Add models for word frequency, interpolating absent words --- code/analysisWordLevelReadAloudBeta.R | 74 +++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 9 deletions(-) diff --git a/code/analysisWordLevelReadAloudBeta.R b/code/analysisWordLevelReadAloudBeta.R index c115260..0469768 100644 --- a/code/analysisWordLevelReadAloudBeta.R +++ b/code/analysisWordLevelReadAloudBeta.R @@ -629,7 +629,7 @@ summary(misprod_with_rel_hes_model_4.6) # "" -# Word frequency analysis +# Word frequency analysis with words absent from corpus dropped # Does a word's frequency predict hesitation on that word? errorDatAttestedFreqs <- filter(errorDat, log10frequency > 0) wordfreq_model_1 <- lmerTest::lmer(hesitation ~ log10frequency + (1|id) + (1|passage), @@ -697,18 +697,74 @@ interact_plot(model = wordfreq_model_4.5, summary(errorDatAttestedFreqs$log10frequency) -case_when(errorDatAttestedFreqs$log10frequency <= 2.711 ~ 1, errorDatAttestedFreqs$log10frequency <= 4.058 ~ 2, errorDatAttestedFreqs$log10frequency <= 5.609 ~ 3, .default = 4 ) +# case_when(errorDatAttestedFreqs$log10frequency <= 2.711 ~ 1, errorDatAttestedFreqs$log10frequency <= 4.058 ~ 2, errorDatAttestedFreqs$log10frequency <= 5.609 ~ 3, .default = 4 ) +# +# +# errorDatAttestedFreqs$freqCat <- case_when(errorDatAttestedFreqs$log10frequency <= 2.711 ~ "low", .default = "high") +# +# # errorDatAttestedFreqs$freqCat <- as.factor(errorDatAttestedFreqs$freqCat) +# +# +# wordfreq_model_category_1 <- lmerTest::lmer(misprod ~ freqCat * scaaredSoc_gmc + (1|id) + (1|passage), +# data=errorDatAttestedFreqs, REML=TRUE) +# summary(wordfreq_model_category_1) +# +# interact_plot(model = wordfreq_model_category_1, +# pred = freqCat, modx = scaaredSoc_gmc, points = TRUE, interval = TRUE) -errorDatAttestedFreqs$freqCat <- case_when(errorDatAttestedFreqs$log10frequency <= 2.711 ~ "low", .default = "high") -# errorDatAttestedFreqs$freqCat <- as.factor(errorDatAttestedFreqs$freqCat) + +# Word frequency analysis with words absent from corpus set to corpus minimum +subtlexus_minimum = 0.301 # or: # subtlexus %>% select(Lg10WF) %>% min +errorDat$log10frequency_with_absents <- case_match( + errorDat$log10frequency, + 0 ~ subtlexus_minimum, + .default = errorDat$log10frequency) + +compare_freq <- data.frame(cbind(old = errorDat$log10frequency, + new = errorDat$log10frequency_with_absents)) + +filter(compare_freq, old != new) %>% # confirm it worked as expected + filter(old != 0 | new != subtlexus_minimum) %>% + nrow == 0 # TRUE + + +# Does a word's frequency predict hesitation on that word? 
+wordfreq_model_with_absents_1 <- lmerTest::lmer(hesitation ~ log10frequency_with_absents + (1|id) + (1|passage) + (1|word), + data=errorDat, REML=TRUE) +summary(wordfreq_model_with_absents_1) + +# "" misprod on that word? +wordfreq_model_with_absents_2 <- lmerTest::lmer(misprod ~ log10frequency_with_absents + (1|id) + (1|passage) + (1|word), + data=errorDat, REML=TRUE) +summary(wordfreq_model_with_absents_2) + + +# Do social anxiety and frequency interact to predict hesitation rate or misproduction rate? + +# control for word, that must matter right? +wordfreq_model_with_absents_3.5 <- lmerTest::lmer(hesitation ~ log10frequency_with_absents * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), + data=errorDat, REML=TRUE) +summary(wordfreq_model_with_absents_3.5) # still yes, very slightly higher p + +wordfreq_model_with_absents_4.5 <- lmerTest::lmer(misprod ~ log10frequency_with_absents * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), + data=errorDat, REML=TRUE) +summary(wordfreq_model_with_absents_4.5) # still no, slightly lower p = 0.114 + + +# hesitation ~ wf x SA +interact_plot(model = wordfreq_model_with_absents_3.5, + pred = log10frequency_with_absents, modx = scaaredSoc_gmc, interval = TRUE) + + + +interact_plot(model = wordfreq_model_with_absents_4.5, + pred = log10frequency_with_absents, modx = scaaredSoc_gmc, interval = TRUE) + + -wordfreq_model_category_1 <- lmerTest::lmer(misprod ~ freqCat * scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDatAttestedFreqs, REML=TRUE) -summary(wordfreq_model_category_1) -interact_plot(model = wordfreq_model_category_1, - pred = freqCat, modx = scaaredSoc_gmc, points = TRUE, interval = TRUE) +summary(errorDat$log10frequency_with_absents) From a335041e398cc7633c8da11328413e6a4a045224 Mon Sep 17 00:00:00 2001 From: l-acs Date: Thu, 1 Feb 2024 22:17:15 -0500 Subject: [PATCH 33/33] Prune models and set up plots --- code/analysisWordLevelReadAloudBeta.R | 95 +++++++++++---------------- 1 file changed, 39 insertions(+), 56 deletions(-) diff --git a/code/analysisWordLevelReadAloudBeta.R b/code/analysisWordLevelReadAloudBeta.R index 0469768..64f4590 100644 --- a/code/analysisWordLevelReadAloudBeta.R +++ b/code/analysisWordLevelReadAloudBeta.R @@ -122,6 +122,7 @@ library(gridExtra) library(grid) library(cowplot) library(colorspace) +library(effects) # library(colorblindr) #set up date for output file naming @@ -632,63 +633,22 @@ summary(misprod_with_rel_hes_model_4.6) # "" # Word frequency analysis with words absent from corpus dropped # Does a word's frequency predict hesitation on that word? errorDatAttestedFreqs <- filter(errorDat, log10frequency > 0) -wordfreq_model_1 <- lmerTest::lmer(hesitation ~ log10frequency + (1|id) + (1|passage), +wordfreq_model_1 <- lmerTest::lmer(hesitation ~ log10frequency + (1|id) + (1|passage) + (1|word), data=errorDatAttestedFreqs, REML=TRUE) summary(wordfreq_model_1) -wordfreq_model_2 <- lmerTest::lmer(misprod ~ log10frequency + (1|id) + (1|passage), +wordfreq_model_2 <- lmerTest::lmer(misprod ~ log10frequency + (1|id) + (1|passage) + (1|word), data=errorDatAttestedFreqs, REML=TRUE) summary(wordfreq_model_2) -# control for word, that must matter right? 
-wordfreq_model_1.5 <- lmerTest::lmer(hesitation ~ log10frequency + (1|id) + (1|passage) + (1|word), - data=errorDatAttestedFreqs, REML=TRUE) -summary(wordfreq_model_1.5) -wordfreq_model_2.5 <- lmerTest::lmer(misprod ~ log10frequency + (1|id) + (1|passage) + (1|word), - data=errorDatAttestedFreqs, REML=TRUE) -summary(wordfreq_model_2.5) - -# control for word w/o passage -wordfreq_model_1.6 <- lmerTest::lmer(hesitation ~ log10frequency + (1|id) + (1|word), - data=errorDatAttestedFreqs, REML=TRUE) -summary(wordfreq_model_1.6) -wordfreq_model_2.6 <- lmerTest::lmer(misprod ~ log10frequency + (1|id) + (1|word), - data=errorDatAttestedFreqs, REML=TRUE) -summary(wordfreq_model_2.6) - - - # Do social anxiety and frequency interact to predict hesitation rate or misproduction rate? -wordfreq_model_3 <- lmerTest::lmer(hesitation ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDatAttestedFreqs, REML=TRUE) -summary(wordfreq_model_3) # yes! - -wordfreq_model_4 <- lmerTest::lmer(misprod ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage), - data=errorDatAttestedFreqs, REML=TRUE) -summary(wordfreq_model_4) # no, not at all - well, p = 0.15 - - -# control for word, that must matter right? -wordfreq_model_3.5 <- lmerTest::lmer(hesitation ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), +wordfreq_model_3 <- lmerTest::lmer(hesitation ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), data=errorDatAttestedFreqs, REML=TRUE) -summary(wordfreq_model_3.5) # still yes, very slightly higher p +summary(wordfreq_model_3) # todo -wordfreq_model_4.5 <- lmerTest::lmer(misprod ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), +wordfreq_model_4 <- lmerTest::lmer(misprod ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), data=errorDatAttestedFreqs, REML=TRUE) -summary(wordfreq_model_4.5) # still no, slightly lower p = 0.114 - - -# hesitation ~ wf x SA -interact_plot(model = wordfreq_model_3.5, - pred = log10frequency, modx = scaaredSoc_gmc, interval = TRUE) - - - -interact_plot(model = wordfreq_model_4.5, - pred = log10frequency, modx = scaaredSoc_gmc, interval = TRUE) - - - +summary(wordfreq_model_4) # todo @@ -742,29 +702,52 @@ summary(wordfreq_model_with_absents_2) # Do social anxiety and frequency interact to predict hesitation rate or misproduction rate? - -# control for word, that must matter right? 
-wordfreq_model_with_absents_3.5 <- lmerTest::lmer(hesitation ~ log10frequency_with_absents * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), +wordfreq_model_with_absents_3 <- lmerTest::lmer(hesitation ~ log10frequency_with_absents * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), data=errorDat, REML=TRUE) -summary(wordfreq_model_with_absents_3.5) # still yes, very slightly higher p +summary(wordfreq_model_with_absents_3) -wordfreq_model_with_absents_4.5 <- lmerTest::lmer(misprod ~ log10frequency_with_absents * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), +wordfreq_model_with_absents_4 <- lmerTest::lmer(misprod ~ log10frequency_with_absents * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word), data=errorDat, REML=TRUE) -summary(wordfreq_model_with_absents_4.5) # still no, slightly lower p = 0.114 +summary(wordfreq_model_with_absents_4) + +# effects +# eff <- effect("log10frequency_with_absents", wordfreq_model_with_absents_1) +# plot(eff, se = TRUE, rug = FALSE, xlab = "log10frequency_with_absents", ylab = "hesitation", col.points = "red", col.lines = "blue", lty = 1) +# eff_noabsents <- effect("log10frequency", wordfreq_model_1) +# plot(eff_noabsents, se = TRUE, rug = FALSE, xlab = "log10frequency", ylab = "hesitation", col.points = "red", col.lines = "blue", lty = 1) + +plot_lmer <- function(model, predictor, outcome) { + # NB `outcome` will not catch your mistake; it's just a label + eff <- effect(predictor, model) + plot(eff, se = TRUE, rug = FALSE, xlab = predictor, ylab = outcome, + col.points = "red", col.lines = "blue", lty = 1) +} + +# as in +plot_lmer(wordfreq_model_1, "log10frequency", "hesitation") +plot_lmer(wordfreq_model_with_absents_1, "log10frequency_with_absents", "hesitation") +plot_lmer(wordfreq_model_with_absents_2, "log10frequency_with_absents", "misprod") +plot_lmer(wordfreq_model_2, "log10frequency", "misprod") # hesitation ~ wf x SA -interact_plot(model = wordfreq_model_with_absents_3.5, +interact_plot(model = wordfreq_model_3, + pred = log10frequency, modx = scaaredSoc_gmc, interval = TRUE) + +interact_plot(model = wordfreq_model_with_absents_3, pred = log10frequency_with_absents, modx = scaaredSoc_gmc, interval = TRUE) +# misprod ~ wf x SA +interact_plot(model = wordfreq_model_4, + pred = log10frequency, modx = scaaredSoc_gmc, interval = TRUE) -interact_plot(model = wordfreq_model_with_absents_4.5, +interact_plot(model = wordfreq_model_with_absents_4, pred = log10frequency_with_absents, modx = scaaredSoc_gmc, interval = TRUE) -summary(errorDat$log10frequency_with_absents) +# summary(errorDat$log10frequency_with_absents)
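
Not part of the patches above: if the interaction plots from the final patch are to be kept, one
possible follow-up is writing them out with the out_path/today naming the script already uses for
other outputs. This is only a sketch under that assumption; it presumes the script has been run (so
out_path, today, and wordfreq_model_with_absents_3 exist in the session), relies on interact_plot()
returning a ggplot object so that ggplot2::ggsave() applies, and the output file name is a
hypothetical example.

    # sketch only: save the hesitation ~ word frequency x scaaredSoc interaction plot
    library(ggplot2)       # ggsave; already loaded earlier in the script
    library(interactions)  # interact_plot; already loaded earlier in the script

    p_hes_wf_sa <- interact_plot(model = wordfreq_model_with_absents_3,
                                 pred = log10frequency_with_absents,
                                 modx = scaaredSoc_gmc, interval = TRUE)

    ggsave(filename = paste(out_path, "hes-by-wordFreq-x-scaaredSoc_", today, ".png", sep = ""),
           plot = p_hes_wf_sa, width = 7, height = 5, dpi = 300)
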