From 918bc5320306106910c64ef93bc7f12fefe2dc6b Mon Sep 17 00:00:00 2001
From: vboyce <vero.boyce@gmail.com>
Date: Thu, 29 Aug 2024 16:20:19 -0700
Subject: [PATCH 01/14] ""working"" yoon simp imp

---
 data/yoon_simpimp_2015/import.R             | 148 +++++++++++++++++++-
 data/yoon_simpimp_2015/simpimp_preprocess.R |  82 +++++++++++
 data/yoon_simpimp_2015/useful.R             |  72 ++++++++++
 3 files changed, 295 insertions(+), 7 deletions(-)
 create mode 100644 data/yoon_simpimp_2015/simpimp_preprocess.R
 create mode 100644 data/yoon_simpimp_2015/useful.R

diff --git a/data/yoon_simpimp_2015/import.R b/data/yoon_simpimp_2015/import.R
index 67c9a35..e13b370 100644
--- a/data/yoon_simpimp_2015/import.R
+++ b/data/yoon_simpimp_2015/import.R
@@ -1,3 +1,4 @@
+library(tidyverse)
 library(here)
 
 source(here("helper_functions", "idless_draft.R"))
@@ -5,17 +6,150 @@ source(here("helper_functions", "common.R"))
 dataset_name <- "yoon_simpimp_2015"
 data_path <- init(dataset_name)
 
-# TODO: figure out what part of the data we actually need here
 
-wide.table <- tibble()
+generate_aois <- function(x, y, target_side, 
+                          l_x_max, l_x_min, l_y_max, l_y_min,
+                          r_x_max, r_x_min, r_y_max, r_y_min,
+                          monitor_size_x = 1e6,
+                          monitor_size_y = 1e6) {
+  ifelse(target_side == "left",
+         case_when( # left target
+           x <= l_x_max & x >= l_x_min & y <= l_y_max & y >= l_y_min ~ "target",
+           x <= r_x_max & x >= r_x_min & y <= r_y_max & y >= r_y_min ~ "distractor",
+           is.na(x) | is.na(y) | x > monitor_size_x | x < 0 | y > monitor_size_y | y < 0 ~ "missing",
+           .default = "other"
+         ),
+         case_when( # right target
+           x <= l_x_max & x >= l_x_min & y <= l_y_max & y >= l_y_min ~ "distractor",
+           x <= r_x_max & x >= r_x_min & y <= r_y_max & y >= r_y_min ~ "target",
+           is.na(x) | is.na(y) | x > monitor_size_x | x < 0 | y > monitor_size_y | y < 0 ~ "missing",
+           .default = "other"
+         ))
+}
+# 
 eyetracking_path <- here(data_path, "eyetracking")
-# TODO t.crit is seconds right now
-data_ex_1B <- read.csv(here(eyetracking_path, "simpimp_processed_3v1.csv"))
-data_ex_1A <- read.csv(here(eyetracking_path, "simpimp_processed_2v1.csv"))
+data_ex_1<- read_csv(here(eyetracking_path, "eyetrack_expt1.csv")) |> mutate(expt="0")
+data_ex_2 <- read_csv(here(eyetracking_path, "eyetrack_expt2.csv")) |> mutate(expt="scale")
 
-exclusion_data <- read.csv(here(eyetracking_path, "simpimp_et_log.csv"))
-order_data <- read.csv(here(eyetracking_path, "simpimp_et_order.csv"))
+exclusion_data <- read_csv(here(eyetracking_path, "simpimp_et_log.csv")) |> 
+  filter(age!="adult")
+order_data <- read_csv(here(eyetracking_path, "simpimp_et_order.csv"))
 
+draft_data <- data_ex_1 |> bind_rows(data_ex_2) |> left_join(order_data) |> 
+  mutate(time=ifelse(is.na(order), NA, time), #this is janky - VB
+         stimulus=ifelse(is.na(order), NA, stimulus)) |> 
+  group_by(subid) |> 
+       fill(order:targetOnset) |> 
+       fill(time:stimulus) |> 
+  ungroup() |> 
+  filter(!is.na(time)) |> 
+  mutate(time=time/1000,
+         t=t-time-1000*targetOnset,
+         point_of_disambiguation=time+1000*targetOnset) |> 
+  select(-lx, -ly, -rx, -ry) |> inner_join(exclusion_data) |> 
+  mutate(target_side=ifelse(targetPos=="R","right", "left"),
+         condition=case_when(
+           trial_type=="cs" ~ "control-single",
+           trial_type=="cd" ~ "control-double",
+           trial_type=="inf" ~ "inference"
+         ),
+         vanilla_trial = FALSE, # there are decisions to be made about whether
+         # the cs trials are vanilla, we're leaning not, but shrug
+         excluded = keep_drop=="drop", # note there should also be trial level exclusions, but we haven't tracked those down
+         exclusion_reason = ifelse(keep_drop=="drop", "participant level some reason", NA),
+         target_stimulus_label_original = str_c("target_",condition, "_",item),
+         distractor_stimulus_label_original=str_c("distractor_", condition,"_", item)
+         # will need to fix this
+  )
+
+
+
+wide.table <- draft_data |> 
+  mutate(age_units="years",
+         age=as.numeric(age),
+         full_phrase=NA,
+         native_language="eng",
+         full_phrase_language="eng",
+         session_num=0, 
+         sample_rate=NA,
+         tracker = NA,
+         coding_method = "eyetracking",
+         # note we will need to figure this out off of item at some point
+         target_stimulus_label_english = target_stimulus_label_original,
+         target_stimulus_novelty = "familiar",
+         target_stimulus_image_path = "stimulus_image_path",
+         target_image_description = "image",
+         target_image_description_source = "Peekbank discretion",
+         distractor_stimulus_label_english = distractor_stimulus_label_original,
+         distractor_stimulus_novelty = "familiar",
+         distractor_stimulus_image_path = "distrator_image_path",
+         distractor_image_description = "tbd",
+         distractor_image_description_source = "Peekbank discretion"
+  ) |> 
+  select(
+  subject_id = subid,
+  sex = sex,
+  native_language,
+  age = age,
+  age_units,
+  t,
+  #aoi = NA,
+  full_phrase,
+  full_phrase_language,
+  point_of_disambiguation,
+  target_side,
+  condition,
+  vanilla_trial,
+  excluded,
+  exclusion_reason,
+  session_num,
+  sample_rate,
+  tracker,
+  coding_method,
+  target_stimulus_label_original,
+  target_stimulus_label_english,
+  target_stimulus_novelty,
+  target_stimulus_image_path,
+  target_image_description,
+  target_image_description_source,
+  distractor_stimulus_label_original,
+  distractor_stimulus_label_english,
+  distractor_stimulus_novelty,
+  distractor_stimulus_image_path,
+  distractor_image_description,
+  distractor_image_description_source,
+  x,
+  y
+) %>%
+  # optional 
+  mutate(
+    # fill out all of these if you have xy data
+    l_x_max = 840,
+    l_x_min = 0,
+    l_y_max = 1000,
+    l_y_min = 250,
+    r_x_max = 1680,
+    r_x_min = 840,
+    r_y_max = 1000,
+    r_y_min = 250,
+    x = x,
+    y = y,
+    monitor_size_x = NA,
+    monitor_size_y = NA,
+    # if two subsequent trials can have the same stimuli combination,
+    # use this to indicate the trial order within an administration
+    trial_index = NA,
+    # lab specific name for trials
+    trial_name = NA,
+    # lab specific names for stimuli
+    target_stimulus_name = NA, 
+    distractor_stimulus_name = NA
+  ) |> mutate(aoi=generate_aois(x, y, target_side, 
+                                l_x_max, l_x_min, l_y_max, l_y_min,
+                                r_x_max, r_x_min, r_y_max, r_y_min))
+
+    
+  
 dataset_list <- digest.dataset(
   dataset_name = dataset_name,
   lab_dataset_id = NA,
diff --git a/data/yoon_simpimp_2015/simpimp_preprocess.R b/data/yoon_simpimp_2015/simpimp_preprocess.R
new file mode 100644
index 0000000..db1640e
--- /dev/null
+++ b/data/yoon_simpimp_2015/simpimp_preprocess.R
@@ -0,0 +1,82 @@
+# based on pre-processing of the original study, 
+# but updated to use tidyverse and be idiomatic
+# and to be selective for what downstream processing
+# is already easy for peekbank to do
+
+library(tidyverse)
+library(janitor)
+source("useful.R")
+
+select_msg <- function(df){
+  if (raw.data.path=="../raw_data/new_data/")
+  {df |>   select(time, message=l_raw_x_px)}
+  else {df |>  select(time, message=l_por_x_px)}
+  
+}
+preprocess.data <- function(file.name, x.max = 1680, y.max = 1050,
+                            samp.rate = 120,
+                            avg.eyes = TRUE) {
+  
+  
+  ## DATA CLEANING
+  # read in data and get rid of header rows
+  all.d <- read_tsv(str_c(raw.data.path, file.name), comment = "##") |> clean_names()
+  
+  
+  ## split data into messages and data
+  ## First get data:
+  dat <- all.d |>
+    filter(type == "SMP") |>
+    mutate(
+      lx = to.n(l_por_x_px),
+      rx = to.n(r_por_x_px),
+      ly = to.n(l_por_y_px),
+      ry = to.n(r_por_y_px)
+    ) |>
+    select(t = time, lx, ly, rx, ry)
+  
+  print(file.name)
+  
+ # head(dat) |> print()
+  # get messages (whatever those are)
+  msgs <- all.d |>
+    filter(str_detect(type,"MSG")) |> 
+    select_msg() |> 
+    mutate(stimulus = as.character(message) |> 
+             str_replace("# Message: ", "") |> 
+             str_replace(".jpg","")) |> 
+    select(-message)
+  
+  msgs |> select(stimulus) |> unique() |> head() |> print()
+
+  #print(head(msgs))
+  # attach the most recently started stimulus
+  d <- dat |> left_join(msgs, by = join_by(closest(t > time))) |> 
+    filter(!str_detect(stimulus,".avi")) |> 
+    filter(!stimulus=="blank") |> 
+    filter(!is.na(stimulus)) |> 
+    rowwise() |> 
+    mutate(x=mean(c(lx,rx), na.rm=T), #average eyes
+           y=mean(c(ly,ry), na.rm=T),
+           x=ifelse(0<x & x < x.max, x, NA),
+           y=ifelse(0<y & y < y.max, y, NA),
+           y=y.max-y, # move to cartesian origin
+           t=t/1000, #into milliseconds
+           subid=file.name)
+  
+  return(d)
+}
+
+raw.data.path = "../raw_data/new_data/"
+
+stuff <- list.files(raw.data.path) |>  # big issues
+  map(preprocess.data) |>  bind_rows() |> mutate(expt="expt2") |> write_csv("eyetrack_expt2.csv")
+
+raw.data.path = "../raw_data/old_data/"
+
+#foo <- preprocess.data("140217-02-L1.txt")
+stuff2 <- list.files(raw.data.path) |> 
+  map(preprocess.data) |>  bind_rows() |> mutate(expt="expt1") |> write_csv("eyetrack_expt1.csv")
+
+
+
diff --git a/data/yoon_simpimp_2015/useful.R b/data/yoon_simpimp_2015/useful.R
new file mode 100644
index 0000000..f90a74e
--- /dev/null
+++ b/data/yoon_simpimp_2015/useful.R
@@ -0,0 +1,72 @@
+################################################################################
+## USEFUL.R
+## a variety of useful libraries and commands
+## mcf 6/13 etc.
+################################################################################
+
+library(ggplot2)
+library(bootstrap)
+library(lme4)
+library(stringr)
+library(reshape2)
+library(plyr)
+
+## add some style elements for ggplot2
+theme_set(theme_bw())
+
+## standard error of the mean
+sem <- function (x) {
+  sd(x,na.rm=TRUE) / sqrt(length(x))
+}
+
+## standard error of the mean
+se <- function(x) {
+  y <- x[!is.na(x)] # remove the missing values, if any
+  sqrt(var(as.vector(y))/length(y))
+}
+
+## NA functions
+na.mean <- function(x) {mean(x,na.rm=T)}
+na.sum <- function(x) {sum(x,na.rm=T)}
+
+## convert to number
+to.n <- function(x) {
+  as.numeric(as.character(x))
+}
+
+## inverse logistic
+inv.logit <- function (x) {
+  exp(x) / (1 + exp(x)) 
+}
+
+## number of unique subs
+n.unique <- function (x) {
+  length(unique(x))
+}
+
+## for bootstrapping 95% confidence intervals
+theta <- function(x,xdata,na.rm=T) {mean(xdata[x],na.rm=na.rm)}
+ci.low <- function(x,na.rm=T) {
+  mean(x,na.rm=na.rm) - quantile(bootstrap(1:length(x),1000,theta,x,na.rm=na.rm)$thetastar,.025,na.rm=na.rm)}
+ci.high <- function(x,na.rm=T) {
+  quantile(bootstrap(1:length(x),1000,theta,x,na.rm=na.rm)$thetastar,.975,na.rm=na.rm) - mean(x,na.rm=na.rm)}
+
+## for basic plots, add linear models with correlations
+lm.txt <- function (p1,p2,x=7.5,yoff=.05,lt=2,c="black",data=data)
+{
+  l <- lm(p2 ~ p1)
+  regLine(l,lty=lt,col=c)
+  cl <- coef(l)
+  text(x,cl[1] + cl[2] * x + yoff,
+       paste("r = ",sprintf("%2.2f",sqrt(summary(l)$r.squared)),
+             getstars(anova(l)$"Pr(>F)"[1]),sep=""),
+       xpd="n")
+}
+
+## get stars for significance testing
+getstars <- function(x) {
+  if (x > .1) {return("")}
+  if (x < .001) {return("***")}
+  if (x < .01) {return("**")}
+  if (x < .05) {return("*")}
+}

From 280a73eb31c5bbb7e41b26b7304e0d2043721e8b Mon Sep 17 00:00:00 2001
From: vboyce <vero.boyce@gmail.com>
Date: Thu, 29 Aug 2024 16:34:40 -0700
Subject: [PATCH 02/14] Adrian types 5 characters

---
 helper_functions/idless_draft.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/helper_functions/idless_draft.R b/helper_functions/idless_draft.R
index d236875..25d6234 100644
--- a/helper_functions/idless_draft.R
+++ b/helper_functions/idless_draft.R
@@ -212,7 +212,7 @@ digest.dataset <- function(
       age = case_when(
         lab_age_units == "months" ~ lab_age,
         lab_age_units == "days" ~ lab_age/(365.25/12),
-        lab_age_units == "years" ~ 12*lab_age + ifelse(lab_age-floor(lab_age) == 0, 6, 0),
+        lab_age_units == "years" ~ 12*lab_age + ifelse(all(lab_age-floor(lab_age) == 0), 6, 0),
         .default = NA
       ),
       administration_aux_data = NA

From 641d8aa02cb5b71a354812f4dbf0a80fda6c9afb Mon Sep 17 00:00:00 2001
From: Alvin Tan <66404649+alvinwmtan@users.noreply.github.com>
Date: Thu, 29 Aug 2024 16:50:48 -0700
Subject: [PATCH 03/14] fixed aoi_region_set_id definition

---
 helper_functions/idless_draft.R | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/helper_functions/idless_draft.R b/helper_functions/idless_draft.R
index 25d6234..6dc1fe9 100644
--- a/helper_functions/idless_draft.R
+++ b/helper_functions/idless_draft.R
@@ -168,6 +168,10 @@ digest.dataset <- function(
     ungroup() %>% 
     group_by(administration_id, trial_type_id, trial_order) %>%
     mutate(trial_id = cur_group_id() - 1) %>%
+    ungroup() %>%
+    group_by(l_x_max, l_x_min, l_y_max, l_y_min,
+             r_x_max, r_x_min, r_y_max, r_y_min) %>%
+    mutate(aoi_region_set_id = cur_group_id() - 1) %>%
     ungroup()
   
   datasets <- tibble(
@@ -230,7 +234,8 @@ digest.dataset <- function(
       vanilla_trial,
       dataset_id,
       distractor_id,
-      target_id
+      target_id,
+      aoi_region_set_id
     ) %>% 
     mutate(
       target_side = tolower(target_side),
@@ -241,7 +246,6 @@ digest.dataset <- function(
         target_side == "r" ~ "right",
         .default="ERROR"),
       trial_type_aux_data = NA,
-      aoi_region_set_id = NA # set for now, set to 0 further down below if we actually have that table
       )
   
   
@@ -289,8 +293,8 @@ digest.dataset <- function(
         r_x_min,
         r_y_max,
         r_y_min,
-      ) %>% 
-      mutate(aoi_region_set_id = 0)
+        aoi_region_set_id
+      )
     
     xy_timepoints <- data %>%
       {if (rezero) peekds::rezero_times(.) else rename(., t_zeroed = t)} %>%
@@ -304,6 +308,8 @@ digest.dataset <- function(
         trial_id,
         administration_id
       )
+  } else {
+    trial_types$aoi_region_set_id <- NA
   }
   
   return(list(

From 80c691e77021ce42195f045312887a1a086118d0 Mon Sep 17 00:00:00 2001
From: Alvin Tan <66404649+alvinwmtan@users.noreply.github.com>
Date: Thu, 29 Aug 2024 16:51:00 -0700
Subject: [PATCH 04/14] added kremin_2021 import

---
 data/kremin_2021/import.R | 255 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 255 insertions(+)
 create mode 100644 data/kremin_2021/import.R

diff --git a/data/kremin_2021/import.R b/data/kremin_2021/import.R
new file mode 100644
index 0000000..547e05e
--- /dev/null
+++ b/data/kremin_2021/import.R
@@ -0,0 +1,255 @@
+library(tidyverse)
+library(janitor)
+library(here)
+library(glue)
+library(readxl)
+
+source(here("helper_functions", "idless_draft.R"))
+source(here("helper_functions", "common.R"))
+dataset_name <- "kremin_2021"
+read_path <- init(dataset_name)
+
+# Montreal ####
+## loading files ####
+mtl_path <- here(read_path, "Montreal")
+
+# et data
+load(here(mtl_path, "mtl_raw_gaze_anon.rda"))
+
+# demog data
+mtl_demog <- read_csv(here(mtl_path, "mtl_msl_anon.csv")) |> 
+  clean_names() |> 
+  mutate(testing_loc = "Montreal")
+# grab exposure values from sander-montant_2022
+load(here(mtl_path, "demo_comp.Rda"))
+demo_comp <- demo_comp |> 
+  select(study_id, eng_exp, fre_exp)
+
+## getting trial info ####
+# trial info
+trial_info <- read_csv(here(read_path, "target-distractor-pairs.csv"))
+trial_info_fr <- read_csv(here(read_path, "trial_info_fr.csv")) |> 
+  mutate(media_name = str_remove(media_name, "\\.wmv"))
+
+aois <- read_csv(here(mtl_path, "compmix_aois.csv")) |> 
+  separate_wider_delim(aoi_name, delim = "_",
+                       names = c("language", "object_type", "object", "trial_type", "location")) |> 
+  mutate(x_min = x1,
+         x_max = x2,
+         y_min = y2,
+         y_max = y3,
+         x_width = x_max - x_min,
+         y_height = y_max - y_min) |> 
+  filter(y_max %% 1 == 0,
+         x_max %% 1 == 0)
+
+# aoi cleaning
+aoi_one <- aois |> 
+  select(object, location, x_min, x_max, y_min, y_max, x_width, y_height) |> 
+  distinct() |> 
+  group_by(object, location) |> 
+  summarise(n_aois = n(),
+            across(starts_with("x"), min),
+            across(starts_with("y"), min)) |> 
+  filter(n_aois == 1)
+
+aoi_fixed <- aois |> 
+  select(object, location, x_min, x_max, y_min, y_max, x_width, y_height) |> 
+  distinct() |> 
+  # the following two lines are based on the original processing code 
+  # (https://osf.io/ug7t3/files/github/01_load.R)
+  filter(x_width %in% aoi_one$x_width,
+         y_height != 493) 
+
+## processing et data ####
+mtl_data_cleaned <- mtl_raw_data_anon |> 
+  rename(x = gaze_point_x_adc_spx, 
+         y = gaze_point_y_adc_spx) |> 
+  separate_wider_delim(recording_name, delim = "_",
+                       names = c("study_name", "study_id", "study_order"),
+                       cols_remove = FALSE) |> 
+  mutate(media_name = str_remove(media_name, "\\.wmv"),
+         media_name = case_when(
+           media_name == "Cow_FrSingle_L" & study_order == "F1" ~ "Cow_FrSingle_R",
+           .default = media_name
+         )) |> 
+  filter(str_detect(media_name, "(Single|Mixed)"),
+         is.na(studio_event)) |> 
+  separate_wider_delim(media_name, delim = "_",
+                       names = c("object", "trial_type", "location"),
+                       cols_remove = FALSE) |> 
+  select(-starts_with("aoi_"))
+
+## merging data ####
+mtl_wide.table <- mtl_data_cleaned |> 
+  left_join(trial_info, by = join_by(object == target)) |> 
+  left_join(trial_info_fr, by = join_by(media_name)) |> 
+  left_join(aoi_fixed, by = join_by(object, location)) |> # target
+  left_join(aoi_fixed |> mutate(location = ifelse(location == "L", "R", "L")), # distractor
+            by = join_by(distractor == object, location),
+            suffix = c("_t", "_d")) |> 
+  left_join(mtl_demog, by = join_by(recording_name, study_id)) |> 
+  mutate(
+    subject_id = glue("{study_name}_{study_id}"),
+    sex = gender,
+    native_language = "eng, fre",
+    age = years * 365.2425 + months * (365.2425/12) + days,
+    age_units = "days",
+    t = recording_timestamp,
+    # aoi,
+    # full_phrase
+    # full_phrase_language
+    point_of_disambiguation = 3000,
+    target_side = ifelse(location == "L", "left", "right"),
+    # condition
+    vanilla_trial = 0,
+    excluded = keeper == 0,
+    exclusion_reason = exclusion,
+    session_num = 1,
+    sample_rate = 60,
+    tracker = "Tobii T60-XL",
+    coding_method = "eyetracking",
+    target_stimulus_label_original = target_label,
+    target_stimulus_label_english = target_image,
+    target_stimulus_novelty = "familiar",
+    target_stimulus_image_path = NA,
+    target_image_description = target_image,
+    target_image_description_source = "experiment documentation",
+    distractor_stimulus_label_original = distractor_label,
+    distractor_stimulus_label_english = distractor_image,
+    distractor_stimulus_novelty = "familiar",
+    distractor_stimulus_image_path = NA,
+    distractor_image_description = distractor_image,
+    distractor_image_description_source = "experiment documentation",
+    l_x_max = ifelse(location == "L", x_max_t, x_max_d),
+    l_x_min = ifelse(location == "L", x_min_t, x_min_d),
+    l_y_max = ifelse(location == "L", y_max_t, y_max_d),
+    l_y_min = ifelse(location == "L", y_min_t, y_min_d),
+    r_x_max = ifelse(location == "R", x_max_t, x_max_d),
+    r_x_min = ifelse(location == "R", x_min_t, x_min_d),
+    r_y_max = ifelse(location == "R", y_max_t, y_max_d),
+    r_y_min = ifelse(location == "R", y_min_t, y_min_d),
+    # x
+    # y
+    monitor_size_x = 1920,
+    monitor_size_y = 1200,
+    aoi = ifelse(target_side == "left",
+                 case_when( # left target
+                   x <= l_x_max & x >= l_x_min & y <= l_y_max & y >= l_y_min ~ "target",
+                   x <= r_x_max & x >= r_x_min & y <= r_y_max & y >= r_y_min ~ "distractor",
+                   is.na(x) | is.na(y) | x > monitor_size_x | x < 0 | y > monitor_size_y | y < 0 ~ "missing",
+                   .default = "other"
+                 ),
+                 case_when( # right target
+                   x <= l_x_max & x >= l_x_min & y <= l_y_max & y >= l_y_min ~ "distractor",
+                   x <= r_x_max & x >= r_x_min & y <= r_y_max & y >= r_y_min ~ "target",
+                   is.na(x) | is.na(y) | x > monitor_size_x | x < 0 | y > monitor_size_y | y < 0 ~ "missing",
+                   .default = "other"
+                 ))
+  )
+
+# Princeton ####
+## loading files ####
+pct_path <- here(read_path, "Princeton")
+
+# et data
+pct_raw_data_anon <- read_csv(here(pct_path, "3aMixData_col-names.csv"),
+                              na = c(".", "-"))
+
+# demog data
+pct_comp <- read_excel(here(pct_path, "Princeton_LangComprehension.xlsx")) |> 
+  clean_names() |> 
+  rename(study_id = subj_id)
+pct_keepers <- read_csv(here(pct_path, "Princeton_keepers.csv"))
+pct_demog <- read_csv(here(pct_path, "pct_msl_anon.csv")) |> 
+  left_join(pct_comp, by = join_by(study_id)) |> 
+  left_join(pct_keepers, by = join_by(study_id)) |> 
+  mutate(testing_loc = "Princeton",
+         study_id = as.character(study_id))
+
+## getting trial info ####
+pct_trial_info <- read_csv(here(pct_path, "CompMix_trial-numbers_PCT.csv"))
+trial_info_sp <- read_csv(here(read_path, "trial_info_sp.csv"))
+
+## processing et data ####
+pct_data_cleaned <- pct_raw_data_anon |> 
+  filter(`Sub Num` != "Sub Num", # duplicate header rows
+         `Sub Num` != "") |>  
+  mutate(across(`-3100`:`4767`, as.numeric)) |> 
+  pivot_longer(cols = `-3100`:`4767`, 
+               names_to = "t_norm", 
+               values_to = "look") |> 
+  clean_names() |> 
+  rename(study_id = sub_num,
+         trial_type = condition,
+         target = target_image) |> 
+  mutate(trial_type = case_when(
+    trial_type == "Switch" ~ "Mixed",
+    .default = "Single"
+  )) |> 
+  separate_wider_delim(order, delim = "_",
+                       names = c("study_name", "study_order"),
+                       too_many = "drop",
+                       cols_remove = TRUE) |> 
+  left_join(pct_trial_info, by = join_by(study_order, target, target_side, trial_type)) |> 
+  select(-tr_num)
+
+## merging data ####
+pct_wide.table <- pct_data_cleaned |> 
+  left_join(pct_trial_info,
+            by = join_by(study_order, target_side, target, trial_type, trial_number)) |> 
+  left_join(trial_info_sp,
+            by = join_by(study_order, target_side, target, trial_type)) |> 
+  left_join(pct_demog, by = join_by(study_id)) |> 
+  mutate(
+    vocab_eng = as.numeric(vocab_eng),
+    subject_id = glue("{study_name}_{study_id}"),
+    sex = gender,
+    native_language = "eng, spa",
+    age = years * 365.2425 + months.y * (365.2425/12) + days,
+    age_units = "days",
+    t = as.numeric(t_norm),
+    aoi = case_when(
+      look == 1 ~ "target",
+      look == 0 ~ "distractor",
+      .default = "missing"
+    ),
+    # full_phrase
+    # full_phrase_language
+    point_of_disambiguation = 3100,
+    target_side = ifelse(target_side == "l", "left", "right"),
+    condition = tolower(trial_type),
+    vanilla_trial = 0,
+    excluded = keeper == "N",
+    exclusion_reason = reason,
+    session_num = 1,
+    sample_rate = NA,
+    tracker = NA,
+    coding_method = "manual gaze coding",
+    target_stimulus_label_original = target_label,
+    target_stimulus_label_english = tolower(target),
+    target_stimulus_novelty = "familiar",
+    target_stimulus_image_path = NA,
+    target_image_description = tolower(target),
+    target_image_description_source = "experiment documentation",
+    distractor_stimulus_label_original = distractor_label,
+    distractor_stimulus_label_english = distractor_image,
+    distractor_stimulus_novelty = "familiar",
+    distractor_stimulus_image_path = NA,
+    distractor_image_description = distractor_image,
+    distractor_image_description_source = "experiment documentation"
+  )
+
+# combine both datasets ####
+wide.table <- bind_rows(mtl_wide.table |> select(-keeper), 
+                        pct_wide.table |> select(-keeper))
+
+dataset_list <- digest.dataset(
+  dataset_name = dataset_name,
+  lab_dataset_id = NA,
+  cite = "Kremin, L. V., Jardak, A., Lew-Williams, C., & Byers-Heinlein, K. (2023). Bilingual children’s comprehension of code-switching at an uninformative adjective. Language Development Research 3(1), 249–276.",
+  shortcite = "Kremin et al. 2023",
+  wide.table = wide.table
+)
+
+write_and_validate_list(dataset_list, cdi_expected = FALSE, upload = TRUE)

From 5bf551d99ed8a42ddcafe42d7792e947a2e1f770 Mon Sep 17 00:00:00 2001
From: Stephan Meylan <meylan.stephan@gmail.com>
Date: Thu, 29 Aug 2024 17:01:36 -0700
Subject: [PATCH 05/14] Create test_import.yml

For smoke testing data import
---
 .github/workflows/test_import.yml | 52 +++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 .github/workflows/test_import.yml

diff --git a/.github/workflows/test_import.yml b/.github/workflows/test_import.yml
new file mode 100644
index 0000000..f3fd470
--- /dev/null
+++ b/.github/workflows/test_import.yml
@@ -0,0 +1,52 @@
+# For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag.
+# https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions
+on:
+  push:
+    branches:
+      - main
+      - master
+  pull_request:
+    branches:
+      - main
+      - master
+
+name: test-import
+
+jobs:
+  test-import:
+    runs-on: macOS-latest
+
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: r-lib/actions/setup-r@v2
+        
+      - uses: r-lib/actions/setup-pandoc@v1
+
+      - name: Query dependencies
+        run: |
+          install.packages('remotes')
+          saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
+          writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
+        shell: Rscript {0}
+
+      - name: Restore R package cache
+        uses: actions/cache@v2
+        with:
+          path: ${{ env.R_LIBS_USER }}
+          key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
+          restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-
+      
+      - name: Install dependencies
+        run: |
+          remotes::install_deps(dependencies = TRUE)
+          install.packages("pkgdown", type = "binary")
+        shell: Rscript {0}
+
+      - name: Run pipeline 
+        run: R helper_functions/pipeline.R
+
+      

From 9bc41adcb513298c282804be5074bc2d0b92de3e Mon Sep 17 00:00:00 2001
From: Alvin Tan <66404649+alvinwmtan@users.noreply.github.com>
Date: Thu, 29 Aug 2024 17:03:52 -0700
Subject: [PATCH 06/14] added kremin_2021 README

---
 data/kremin_2021/README.md | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 data/kremin_2021/README.md

diff --git a/data/kremin_2021/README.md b/data/kremin_2021/README.md
new file mode 100644
index 0000000..778af0e
--- /dev/null
+++ b/data/kremin_2021/README.md
@@ -0,0 +1,31 @@
+# kremin_2021 dataset
+
+## Reference
+Kremin, L. V., Jardak, A., Lew-Williams, C., & Byers-Heinlein, K. (2023). Bilingual children’s comprehension of code-switching at an uninformative adjective. Language Development Research 3(1), 249–276.
+
+## Abstract
+Bilingual children regularly hear sentences that contain words from both languages, also known as code-switching. Investigating how bilinguals process code-switching is important for understanding bilingual language acquisition, because young bilinguals have been shown to experience processing costs and reduced comprehension when encountering code-switched nouns. Studies have yet to investigate if processing costs are present when children encounter code-switches at other parts of speech within a sentence. The current study examined how 30 young bilinguals (age range: 37 – 48 months) processed sentences with code-switches at an uninformative determiner-adjective pair before the target noun (e.g., “Can you find le bon [the good] duck?) compared to single-language sentences (e.g., “Can you find the good duck?”). Surprisingly, bilingual children accurately identified the target object in both sentence types, contrasting with previous findings that sentences containing codeswitching lead to processing difficulties. Indeed, children showed similar (and in some cases, better) comprehension of sentences with a code-switch at an uninformative adjective phrase, relative to single-language sentences. We conclude that functional information conveyed by a code-switch may contribute to bilingual children’s sentence processing.
+
+## Original study info
+Participants were 36-month-old bilinguals (Eng-Fre from Montreal, and Eng-Spa from Princeton).
+The key manipulation was code-mixing in prenominal adjectives before the target noun (e.g., "Can you see the good cow?" vs "Can you see le bon cow?"); note that the adjectives were uninformative.
+
+Data from Montreal was collected with a Tobii T60-XL eyetracker, and data from Princeton was collected using a video camera and manual gaze coding.
+
+Note that the data only include "single" and "mixed" conditions; there are also other "filler" trials that were not in the data (although the filler data for the Eng-Fra subset can be found in Sander-Montant et al. ([2022](osf.io/2m345/))).
+
+## Importing decisions
+Stimuli are available although these are in video files so will need to be extracted.
+Language exposure data are also available in Perez et al. ([2024](https://osf.io/mxksz/)) and could be imported (although they have not yet been).
+
+We included importing decisions from the [processing script](https://osf.io/ug7t3/files/github/01_load.R), including the rectification of AOIs and the removal of duplicated header rows.
+Age was calculated from years, months, and days using the formula: years * 365.2425 + months * (365.2425/12) + days.
+
+Monitor size for the Montreal data was set as 1920x1200 based on the Tobii export.
+
+We decided that the data reflect non-vanilla trials, although the "single" condition trials (e.g., "Can you see the good cow?") could be construed as "vanilla" in the sense that the carrier phrase is unlikely to bias the results.
+(This would also be true for the "filler" condition trials should we decide to include these data).
+The data may be worth further analysis despite their non-vanilla status.
+
+## Importing ambiguity
+None other than those reported above.

From b518663be7276d36e2f23d73f350e4c4e4e21022 Mon Sep 17 00:00:00 2001
From: Stephan Meylan <meylan.stephan@gmail.com>
Date: Thu, 29 Aug 2024 17:08:11 -0700
Subject: [PATCH 07/14] Simplified test_import.yml for testing

---
 .github/workflows/test_import.yml | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/test_import.yml b/.github/workflows/test_import.yml
index f3fd470..4fcf702 100644
--- a/.github/workflows/test_import.yml
+++ b/.github/workflows/test_import.yml
@@ -26,25 +26,10 @@ jobs:
         
       - uses: r-lib/actions/setup-pandoc@v1
 
-      - name: Query dependencies
-        run: |
-          install.packages('remotes')
-          saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
-          writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
-        shell: Rscript {0}
-
-      - name: Restore R package cache
-        uses: actions/cache@v2
-        with:
-          path: ${{ env.R_LIBS_USER }}
-          key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
-          restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-
-      
       - name: Install dependencies
         run: |
-          remotes::install_deps(dependencies = TRUE)
-          install.packages("pkgdown", type = "binary")
-        shell: Rscript {0}
+          install.packages('remotes')
+          remotes::install_github("langcog/peekbankr",  force=T)          
 
       - name: Run pipeline 
         run: R helper_functions/pipeline.R

From 25da777cab35c0efbdb3d9b1bb04834d417b4cc9 Mon Sep 17 00:00:00 2001
From: Alvin Tan <66404649+alvinwmtan@users.noreply.github.com>
Date: Thu, 29 Aug 2024 17:08:35 -0700
Subject: [PATCH 08/14] updated kremin_2021 README

---
 data/kremin_2021/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/data/kremin_2021/README.md b/data/kremin_2021/README.md
index 778af0e..7a5504d 100644
--- a/data/kremin_2021/README.md
+++ b/data/kremin_2021/README.md
@@ -17,6 +17,7 @@ Note that the data only include "single" and "mixed" conditions; there are also
 ## Importing decisions
 Stimuli are available although these are in video files so will need to be extracted.
 Language exposure data are also available in Perez et al. ([2024](https://osf.io/mxksz/)) and could be imported (although they have not yet been).
+There are no CDI data, but there are vocabulary data from DVAP; these could be imported in the lang_measures field for subject_aux_data.
 
 We included importing decisions from the [processing script](https://osf.io/ug7t3/files/github/01_load.R), including the rectification of AOIs and the removal of duplicated header rows.
 Age was calculated from years, months, and days using the formula: years * 365.2425 + months * (365.2425/12) + days.

From 9bc6f6f314a92f7f3f72305377ca478c766f87ad Mon Sep 17 00:00:00 2001
From: Alvin Tan <66404649+alvinwmtan@users.noreply.github.com>
Date: Thu, 29 Aug 2024 17:11:02 -0700
Subject: [PATCH 09/14] updated kremin_2021 README

---
 data/kremin_2021/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data/kremin_2021/README.md b/data/kremin_2021/README.md
index 7a5504d..faa1181 100644
--- a/data/kremin_2021/README.md
+++ b/data/kremin_2021/README.md
@@ -16,8 +16,8 @@ Note that the data only include "single" and "mixed" conditions; there are also
 
 ## Importing decisions
 Stimuli are available although these are in video files so will need to be extracted.
-Language exposure data are also available in Perez et al. ([2024](https://osf.io/mxksz/)) and could be imported (although they have not yet been).
-There are no CDI data, but there are vocabulary data from DVAP; these could be imported in the lang_measures field for subject_aux_data.
+CDI and Language exposure data for the Montreal subset are also available in Perez et al. ([2024](https://osf.io/mxksz/)) and could be imported (although they have not yet been).
+There are also vocabulary data from DVAP; these could be imported in the lang_measures field for subject_aux_data.
 
 We included importing decisions from the [processing script](https://osf.io/ug7t3/files/github/01_load.R), including the rectification of AOIs and the removal of duplicated header rows.
 Age was calculated from years, months, and days using the formula: years * 365.2425 + months * (365.2425/12) + days.

From 320e0b1376f7a2aecaafe043e1c4b69cde2d8dda Mon Sep 17 00:00:00 2001
From: Stephan Meylan <meylan.stephan@gmail.com>
Date: Thu, 29 Aug 2024 17:12:43 -0700
Subject: [PATCH 10/14] Update test_import.yml

---
 .github/workflows/test_import.yml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/test_import.yml b/.github/workflows/test_import.yml
index 4fcf702..581e14c 100644
--- a/.github/workflows/test_import.yml
+++ b/.github/workflows/test_import.yml
@@ -30,8 +30,7 @@ jobs:
         run: |
           install.packages('remotes')
           remotes::install_github("langcog/peekbankr",  force=T)          
-
+        shell: Rscript {0}
+          
       - name: Run pipeline 
-        run: R helper_functions/pipeline.R
-
-      
+        run: Rscript helper_functions/pipeline.R

From 0d466c8ec5a7c6422927e92fdfab6550ad3af9d1 Mon Sep 17 00:00:00 2001
From: Stephan Meylan <meylan.stephan@gmail.com>
Date: Thu, 29 Aug 2024 17:15:58 -0700
Subject: [PATCH 11/14] Update test_import.yml

---
 .github/workflows/test_import.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test_import.yml b/.github/workflows/test_import.yml
index 581e14c..2c4532a 100644
--- a/.github/workflows/test_import.yml
+++ b/.github/workflows/test_import.yml
@@ -28,7 +28,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          install.packages('remotes')
+          install.packages('remotes','purrr','dplyr','here','tidyr')
           remotes::install_github("langcog/peekbankr",  force=T)          
         shell: Rscript {0}
           

From 102f904e835f74cdd92fa4e769fcfaa88c9bae91 Mon Sep 17 00:00:00 2001
From: Stephan Meylan <meylan.stephan@gmail.com>
Date: Thu, 29 Aug 2024 17:18:08 -0700
Subject: [PATCH 12/14] Update test_import.yml

---
 .github/workflows/test_import.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test_import.yml b/.github/workflows/test_import.yml
index 2c4532a..470beeb 100644
--- a/.github/workflows/test_import.yml
+++ b/.github/workflows/test_import.yml
@@ -28,7 +28,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          install.packages('remotes','purrr','dplyr','here','tidyr')
+          install.packages(c('remotes','purrr','dplyr','here','tidyr'))
           remotes::install_github("langcog/peekbankr",  force=T)          
         shell: Rscript {0}
           

From 2750c9fbef2198b19cd43fc9884f21728cf5d624 Mon Sep 17 00:00:00 2001
From: vboyce <vero.boyce@gmail.com>
Date: Thu, 29 Aug 2024 17:35:13 -0700
Subject: [PATCH 13/14] time points are mostly fixed, something is (very) wrong
 with aoi coordinates

---
 data/yoon_simpimp_2015/import.R             | 24 +++++++++++----------
 data/yoon_simpimp_2015/simpimp_preprocess.R | 19 ++++++++--------
 2 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/data/yoon_simpimp_2015/import.R b/data/yoon_simpimp_2015/import.R
index e13b370..facfa41 100644
--- a/data/yoon_simpimp_2015/import.R
+++ b/data/yoon_simpimp_2015/import.R
@@ -28,24 +28,26 @@ generate_aois <- function(x, y, target_side,
 }
 # 
 eyetracking_path <- here(data_path, "eyetracking")
-data_ex_1<- read_csv(here(eyetracking_path, "eyetrack_expt1.csv")) |> mutate(expt="0")
-data_ex_2 <- read_csv(here(eyetracking_path, "eyetrack_expt2.csv")) |> mutate(expt="scale")
+data_ex_1<- read_csv(here(eyetracking_path, "eyetrack_expt1.csv")) |> 
+  mutate(expt="0")
+
+data_ex_2 <- read_csv(here(eyetracking_path, "eyetrack_expt2.csv")) |> 
+  mutate(expt="scale") 
 
 exclusion_data <- read_csv(here(eyetracking_path, "simpimp_et_log.csv")) |> 
   filter(age!="adult")
 order_data <- read_csv(here(eyetracking_path, "simpimp_et_order.csv"))
 
-draft_data <- data_ex_1 |> bind_rows(data_ex_2) |> left_join(order_data) |> 
-  mutate(time=ifelse(is.na(order), NA, time), #this is janky - VB
-         stimulus=ifelse(is.na(order), NA, stimulus)) |> 
-  group_by(subid) |> 
-       fill(order:targetOnset) |> 
-       fill(time:stimulus) |> 
+draft_data <- data_ex_1 |> bind_rows(data_ex_2) |> inner_join(order_data) |> 
+  #mutate(time=ifelse(is.na(order), NA, time), #this is janky - VB
+  #       stimulus=ifelse(is.na(order), NA, stimulus)) |> 
+  #group_by(subid) |> 
+  #     fill(order:targetOnset) |> 
+  #     fill(time:stimulus) |> 
   ungroup() |> 
   filter(!is.na(time)) |> 
-  mutate(time=time/1000,
-         t=t-time-1000*targetOnset,
-         point_of_disambiguation=time+1000*targetOnset) |> 
+  mutate(t=t-time,
+         point_of_disambiguation=1000*targetOnset) |> 
   select(-lx, -ly, -rx, -ry) |> inner_join(exclusion_data) |> 
   mutate(target_side=ifelse(targetPos=="R","right", "left"),
          condition=case_when(
diff --git a/data/yoon_simpimp_2015/simpimp_preprocess.R b/data/yoon_simpimp_2015/simpimp_preprocess.R
index db1640e..6f4014d 100644
--- a/data/yoon_simpimp_2015/simpimp_preprocess.R
+++ b/data/yoon_simpimp_2015/simpimp_preprocess.R
@@ -8,7 +8,7 @@ library(janitor)
 source("useful.R")
 
 select_msg <- function(df){
-  if (raw.data.path=="../raw_data/new_data/")
+  if (raw.data.path=="raw_data/eyetracking/new_data/")
   {df |>   select(time, message=l_raw_x_px)}
   else {df |>  select(time, message=l_por_x_px)}
   
@@ -31,9 +31,10 @@ preprocess.data <- function(file.name, x.max = 1680, y.max = 1050,
       lx = to.n(l_por_x_px),
       rx = to.n(r_por_x_px),
       ly = to.n(l_por_y_px),
-      ry = to.n(r_por_y_px)
+      ry = to.n(r_por_y_px),
+      t=time/1000 #convert ms
     ) |>
-    select(t = time, lx, ly, rx, ry)
+    select(t = t, lx, ly, rx, ry)
   
   print(file.name)
   
@@ -42,7 +43,8 @@ preprocess.data <- function(file.name, x.max = 1680, y.max = 1050,
   msgs <- all.d |>
     filter(str_detect(type,"MSG")) |> 
     select_msg() |> 
-    mutate(stimulus = as.character(message) |> 
+    mutate(time=time/1000, #to ms
+           stimulus = as.character(message) |> 
              str_replace("# Message: ", "") |> 
              str_replace(".jpg","")) |> 
     select(-message)
@@ -61,22 +63,21 @@ preprocess.data <- function(file.name, x.max = 1680, y.max = 1050,
            x=ifelse(0<x & x < x.max, x, NA),
            y=ifelse(0<y & y < y.max, y, NA),
            y=y.max-y, # move to cartesian origin
-           t=t/1000, #into milliseconds
            subid=file.name)
   
   return(d)
 }
 
-raw.data.path = "../raw_data/new_data/"
+raw.data.path = "raw_data/eyetracking/new_data/"
 
 stuff <- list.files(raw.data.path) |>  # big issues
-  map(preprocess.data) |>  bind_rows() |> mutate(expt="expt2") |> write_csv("eyetrack_expt2.csv")
+  map(preprocess.data) |>  bind_rows() |> mutate(expt="expt2") |> write_csv("raw_data/eyetracking/eyetrack_expt2.csv")
 
-raw.data.path = "../raw_data/old_data/"
+raw.data.path = "raw_data/eyetracking/old_data/"
 
 #foo <- preprocess.data("140217-02-L1.txt")
 stuff2 <- list.files(raw.data.path) |> 
-  map(preprocess.data) |>  bind_rows() |> mutate(expt="expt1") |> write_csv("eyetrack_expt1.csv")
+  map(preprocess.data) |>  bind_rows() |> mutate(expt="expt1") |> write_csv("raw_data/eyetracking/eyetrack_expt1.csv")
 
 
 

From 63817793349291e38050ff4ef6fff36e12350198 Mon Sep 17 00:00:00 2001
From: mzettersten <mzettersten@users.noreply.github.com>
Date: Thu, 29 Aug 2024 17:41:51 -0700
Subject: [PATCH 14/14] initial fixes to fernald marchman

---
 data/fernald_marchman_2012/ReadME.md | 11 +++++---
 data/fernald_marchman_2012/import.R  | 39 ++++++++++++++++------------
 data/fernald_marchman_2012/notes     |  6 -----
 3 files changed, 30 insertions(+), 26 deletions(-)
 delete mode 100644 data/fernald_marchman_2012/notes

diff --git a/data/fernald_marchman_2012/ReadME.md b/data/fernald_marchman_2012/ReadME.md
index ff5da82..90c657a 100644
--- a/data/fernald_marchman_2012/ReadME.md
+++ b/data/fernald_marchman_2012/ReadME.md
@@ -27,10 +27,15 @@ Note: for images, some images were shown in slightly different versions or mirro
 
 For the manju and tempo trials, some were exposure where the object was on a background and some were tests where they were not on the background.
 
-5. Importing ambiguity
+Some images were mirrored depending on left/right positioning - image labels L and R are from the participants' perspective.
 
+IMPORTANT: for related/unrelated prime noun/verb trials, the trials are represented in the raw data TWICE - once centered on the onset of the verb and once centered on the onset of the noun. We only keep the trial representation centered on the onset of the noun.
 
+5. Importing ambiguity
 
-ToDos:
-* check with Martin and/or Virginia about whether slightly different images (mirroring) matter
+Point of disambiguation is tricky for verb and adjective trials - should this be the first informative moment (e.g. when an informative verb was mentioned) or at the onset of the noun?
 
+In the raw data, point of disambiguation:
+- exposure novel trials: F0 is the onset of the verb
+- 24mos: adjective: word onset is the adjective
+- 30mos: hard adjective trials: onset of the color/ size
\ No newline at end of file
diff --git a/data/fernald_marchman_2012/import.R b/data/fernald_marchman_2012/import.R
index 772b3e0..50ae8a8 100644
--- a/data/fernald_marchman_2012/import.R
+++ b/data/fernald_marchman_2012/import.R
@@ -45,6 +45,12 @@ d_processed_24 <- d_raw_24 %>%
 d_raw_30 <- read_delim(fs::path(read_path, "TL230ABoriginalichartsn1-121toMF.txt"),
   delim = "\t"
 )
+# remove duplicated trials (recentered on verb instead of noun)
+d_raw_30 <- d_raw_30 |>
+  filter(
+    !(OriginalCondition %in% c("R-primeVerb","UR-primeVerb"))
+  )
+
 # d_raw_30 has two slightly different types of rows mixed together
 d_processed_30_part_1 <- d_raw_30 |>
   filter(is.na(Shifts)) |>
@@ -60,15 +66,17 @@ d_processed_30_part_1 <- d_raw_30 |>
 
 d_processed_30_part_2 <- d_raw_30 |>
   filter(!is.na(Shifts)) |>
-  # these *do* have looking data in non-looking cols
-  rename(
-    f01 = `Frames - word starts at frame 45 `,
-    f02 = `First Shift Gap`,
-    f03 = `RT`,
-    f04 = `CritOnSet`,
-    f05 = `CritOffSet`
-  ) |>
+  # # these *do* have looking data in non-looking cols
+  # rename(
+  #   f01 = `Frames - word starts at frame 45 `,
+  #   f02 = `First Shift Gap`,
+  #   f03 = `RT`,
+  #   f04 = `CritOnSet`,
+  #   f05 = `CritOffSet`
+  # ) |>
   preprocess_raw_data() %>%
+  #drop final x column
+  select(-x270) %>%
   relabel_time_cols(
     metadata_names = extract_col_types(.)[["metadata_names"]],
     pre_dis_names = extract_col_types(.)[["pre_dis_names"]],
@@ -143,9 +151,6 @@ d_tidy <- d_tidy %>%
     TRUE ~ right_image
   ))
 
-
-## TODO See Readme for some questions about stimulus table
-
 # create stimulus table
 stimulus_table_link <- d_tidy %>%
   distinct(target_image, target_label) |>
@@ -230,7 +235,7 @@ d_tidy <- d_tidy %>%
   )
 
 # create zero-indexed ids for trial_types
-d_trial_type_ids <- d_tidy %>% 
+d_trial_type_ids <- d_tidy %>%
   distinct(
     target_id, distractor_id, target_side,
     condition
@@ -252,7 +257,7 @@ d_tidy_semifinal <- d_tidy %>%
   left_join(d_administration_ids) %>%
   left_join(d_trial_type_ids) |>
   select(-condition2, -original_condition, -cond_orig)
-  
+
 
 # get zero-indexed trial ids for the trials table
 d_trial_ids <- d_tidy_semifinal %>%
@@ -262,13 +267,13 @@ d_trial_ids <- d_tidy_semifinal %>%
   ) %>%
   # the prescreen notes are not attached to all rows of a trial (sub_num x session x months x trial_type_id), so we fix this
   group_by(sub_num, session, months, trial_type_id) %>%
-  summarize(prescreen_notes = first(na.omit(prescreen_notes)), .groups = 'drop') %>% 
+  summarize(prescreen_notes = first(na.omit(prescreen_notes)), .groups = 'drop') %>%
   mutate(excluded = !is.na(prescreen_notes)) |>
   rename(exclusion_reason = prescreen_notes) |>
   group_by(sub_num, session, months) %>%
   mutate(trial_order = cumsum(trial_type_id != lag(trial_type_id, default = first(trial_type_id)))) %>%
-  ungroup() %>% 
-  mutate(trial_id = 0:(n()-1)) %>% 
+  ungroup() %>%
+  mutate(trial_id = 0:(n()-1)) %>%
   distinct()
 
 # join
@@ -464,5 +469,5 @@ write_and_validate(
   aoi_region_sets = NA,
   xy_timepoints = NA,
   aoi_timepoints,
-  upload = TRUE
+  upload = FALSE
 )
diff --git a/data/fernald_marchman_2012/notes b/data/fernald_marchman_2012/notes
deleted file mode 100644
index c4cc450..0000000
--- a/data/fernald_marchman_2012/notes
+++ /dev/null
@@ -1,6 +0,0 @@
-PO - 2800 ms
-sound on: carrier
-F0 target noun onset
-exposure novel trials: F0 is the onset of the verb
-30mos: hard adjective trials: onset of the color/ size
-24mos: adjective: word onset is the adjective