diff --git a/.github/workflows/test_import.yml b/.github/workflows/test_import.yml new file mode 100644 index 0000000..470beeb --- /dev/null +++ b/.github/workflows/test_import.yml @@ -0,0 +1,36 @@ +# For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. +# https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions +on: + push: + branches: + - main + - master + pull_request: + branches: + - main + - master + +name: test-import + +jobs: + test-import: + runs-on: macOS-latest + + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v2 + + - uses: r-lib/actions/setup-pandoc@v1 + + - name: Install dependencies + run: | + install.packages(c('remotes','purrr','dplyr','here','tidyr')) + remotes::install_github("langcog/peekbankr", force=T) + shell: Rscript {0} + + - name: Run pipeline + run: Rscript helper_functions/pipeline.R diff --git a/data/fernald_marchman_2012/ReadME.md b/data/fernald_marchman_2012/ReadME.md index ff5da82..90c657a 100644 --- a/data/fernald_marchman_2012/ReadME.md +++ b/data/fernald_marchman_2012/ReadME.md @@ -27,10 +27,15 @@ Note: for images, some images were shown in slightly different versions or mirro For the manju and tempo trials, some were exposure where the object was on a background and some were tests where they were not on the background. -5. Importing ambiguity +Some images were mirrored depending on left/right positioning - image labels L and R are from the participants' perspective. +IMPORTANT: for related/unrelated prime noun/verb trials, the trials are represented in the raw data TWICE - once centered on the onset of the verb and once centered on the onset of the noun. We only keep the trial representation centered on the onset of the noun. +5. Importing ambiguity -ToDos: -* check with Martin and/or Virginia about whether slightly different images (mirroring) matter +Point of disambiguation is tricky for verb and adjective trials - should this be the first informative moment (e.g. when an informative verb was mentioned) or at the onset of the noun? 
+In the raw data, point of disambiguation: +- exposure novel trials: F0 is the onset of the verb +- 24mos: adjective: word onset is the adjective +- 30mos: hard adjective trials: onset of the color/ size \ No newline at end of file diff --git a/data/fernald_marchman_2012/import.R b/data/fernald_marchman_2012/import.R index 772b3e0..50ae8a8 100644 --- a/data/fernald_marchman_2012/import.R +++ b/data/fernald_marchman_2012/import.R @@ -45,6 +45,12 @@ d_processed_24 <- d_raw_24 %>% d_raw_30 <- read_delim(fs::path(read_path, "TL230ABoriginalichartsn1-121toMF.txt"), delim = "\t" ) +# remove duplicated trials (recentered on verb instead of noun) +d_raw_30 <- d_raw_30 |> + filter( + !(OriginalCondition %in% c("R-primeVerb","UR-primeVerb")) + ) + # d_raw_30 has two slightly different types of rows mixed together d_processed_30_part_1 <- d_raw_30 |> filter(is.na(Shifts)) |> @@ -60,15 +66,17 @@ d_processed_30_part_1 <- d_raw_30 |> d_processed_30_part_2 <- d_raw_30 |> filter(!is.na(Shifts)) |> - # these *do* have looking data in non-looking cols - rename( - f01 = `Frames - word starts at frame 45 `, - f02 = `First Shift Gap`, - f03 = `RT`, - f04 = `CritOnSet`, - f05 = `CritOffSet` - ) |> + # # these *do* have looking data in non-looking cols + # rename( + # f01 = `Frames - word starts at frame 45 `, + # f02 = `First Shift Gap`, + # f03 = `RT`, + # f04 = `CritOnSet`, + # f05 = `CritOffSet` + # ) |> preprocess_raw_data() %>% + #drop final x column + select(-x270) %>% relabel_time_cols( metadata_names = extract_col_types(.)[["metadata_names"]], pre_dis_names = extract_col_types(.)[["pre_dis_names"]], @@ -143,9 +151,6 @@ d_tidy <- d_tidy %>% TRUE ~ right_image )) - -## TODO See Readme for some questions about stimulus table - # create stimulus table stimulus_table_link <- d_tidy %>% distinct(target_image, target_label) |> @@ -230,7 +235,7 @@ d_tidy <- d_tidy %>% ) # create zero-indexed ids for trial_types -d_trial_type_ids <- d_tidy %>% +d_trial_type_ids <- d_tidy %>% distinct( target_id, distractor_id, target_side, condition @@ -252,7 +257,7 @@ d_tidy_semifinal <- d_tidy %>% left_join(d_administration_ids) %>% left_join(d_trial_type_ids) |> select(-condition2, -original_condition, -cond_orig) - + # get zero-indexed trial ids for the trials table d_trial_ids <- d_tidy_semifinal %>% @@ -262,13 +267,13 @@ d_trial_ids <- d_tidy_semifinal %>% ) %>% # the prescreen notes are not attached to all rows of a trial (sub_num x session x months x trial_type_id), so we fix this group_by(sub_num, session, months, trial_type_id) %>% - summarize(prescreen_notes = first(na.omit(prescreen_notes)), .groups = 'drop') %>% + summarize(prescreen_notes = first(na.omit(prescreen_notes)), .groups = 'drop') %>% mutate(excluded = !is.na(prescreen_notes)) |> rename(exclusion_reason = prescreen_notes) |> group_by(sub_num, session, months) %>% mutate(trial_order = cumsum(trial_type_id != lag(trial_type_id, default = first(trial_type_id)))) %>% - ungroup() %>% - mutate(trial_id = 0:(n()-1)) %>% + ungroup() %>% + mutate(trial_id = 0:(n()-1)) %>% distinct() # join @@ -464,5 +469,5 @@ write_and_validate( aoi_region_sets = NA, xy_timepoints = NA, aoi_timepoints, - upload = TRUE + upload = FALSE ) diff --git a/data/fernald_marchman_2012/notes b/data/fernald_marchman_2012/notes deleted file mode 100644 index c4cc450..0000000 --- a/data/fernald_marchman_2012/notes +++ /dev/null @@ -1,6 +0,0 @@ -PO - 2800 ms -sound on: carrier -F0 target noun onset -exposure novel trials: F0 is the onset of the verb -30mos: hard adjective trials: onset of the 
color/ size -24mos: adjective: word onset is the adjective diff --git a/data/kremin_2021/README.md b/data/kremin_2021/README.md new file mode 100644 index 0000000..faa1181 --- /dev/null +++ b/data/kremin_2021/README.md @@ -0,0 +1,32 @@ +# kremin_2021 dataset + +## Reference +Kremin, L. V., Jardak, A., Lew-Williams, C., & Byers-Heinlein, K. (2023). Bilingual children’s comprehension of code-switching at an uninformative adjective. Language Development Research, 3(1), 249–276. + +## Abstract +Bilingual children regularly hear sentences that contain words from both languages, also known as code-switching. Investigating how bilinguals process code-switching is important for understanding bilingual language acquisition, because young bilinguals have been shown to experience processing costs and reduced comprehension when encountering code-switched nouns. Studies have yet to investigate if processing costs are present when children encounter code-switches at other parts of speech within a sentence. The current study examined how 30 young bilinguals (age range: 37–48 months) processed sentences with code-switches at an uninformative determiner-adjective pair before the target noun (e.g., “Can you find le bon [the good] duck?”) compared to single-language sentences (e.g., “Can you find the good duck?”). Surprisingly, bilingual children accurately identified the target object in both sentence types, contrasting with previous findings that sentences containing code-switching lead to processing difficulties. Indeed, children showed similar (and in some cases, better) comprehension of sentences with a code-switch at an uninformative adjective phrase, relative to single-language sentences. We conclude that functional information conveyed by a code-switch may contribute to bilingual children’s sentence processing. + +## Original study info +Participants were 36-month-old bilinguals (Eng-Fre from Montreal and Eng-Spa from Princeton). +The key manipulation was code-mixing in prenominal adjectives before the target noun (e.g., "Can you see the good cow?" vs. "Can you see le bon cow?"); note that the adjectives were uninformative. + +Data from Montreal were collected with a Tobii T60-XL eyetracker, and data from Princeton were collected using a video camera and manual gaze coding. + +Note that the data include only the "single" and "mixed" conditions; the "filler" trials are not included in the data (although the filler data for the Eng-Fre subset can be found in Sander-Montant et al. ([2022](https://osf.io/2m345/))). + +## Importing decisions +Stimuli are available, although they are embedded in video files and will need to be extracted. +CDI and language exposure data for the Montreal subset are also available in Perez et al. ([2024](https://osf.io/mxksz/)) and could be imported (although they have not yet been). +There are also vocabulary data from the DVAP; these could be imported in the lang_measures field for subject_aux_data. + +We followed the importing decisions of the original [processing script](https://osf.io/ug7t3/files/github/01_load.R), including the rectification of AOIs and the removal of duplicated header rows. +Age was calculated from years, months, and days using the formula: years * 365.2425 + months * (365.2425/12) + days. + +Monitor size for the Montreal data was set to 1920x1200 based on the Tobii export.
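+For reference, here is a minimal sketch of that age conversion (the `years`, `months`, and `days` column names follow the demographics file; the input values below are hypothetical):
+
+```r
+library(dplyr)
+
+# hypothetical demographics row, for illustration only
+demog <- tibble(years = 3, months = 1, days = 5)
+
+demog |>
+  mutate(
+    # an average Gregorian year is 365.2425 days, so a month is 365.2425/12 days
+    age = years * 365.2425 + months * (365.2425 / 12) + days,
+    age_units = "days"
+  )
+# age comes out to roughly 1131.2 days
+```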
+ +We decided that the data reflect non-vanilla trials, although the "single" condition trials (e.g., "Can you see the good cow?") could be construed as "vanilla" in the sense that the carrier phrase is unlikely to bias the results. +(This would also be true for the "filler" condition trials should we decide to include these data). +The data may be worth further analysis despite their non-vanilla status. + +## Importing ambiguity +None other than those reported above. diff --git a/data/kremin_2021/import.R b/data/kremin_2021/import.R new file mode 100644 index 0000000..547e05e --- /dev/null +++ b/data/kremin_2021/import.R @@ -0,0 +1,255 @@ +library(tidyverse) +library(janitor) +library(here) +library(glue) +library(readxl) + +source(here("helper_functions", "idless_draft.R")) +source(here("helper_functions", "common.R")) +dataset_name <- "kremin_2021" +read_path <- init(dataset_name) + +# Montreal #### +## loading files #### +mtl_path <- here(read_path, "Montreal") + +# et data +load(here(mtl_path, "mtl_raw_gaze_anon.rda")) + +# demog data +mtl_demog <- read_csv(here(mtl_path, "mtl_msl_anon.csv")) |> + clean_names() |> + mutate(testing_loc = "Montreal") +# grab exposure values from sander-montant_2022 +load(here(mtl_path, "demo_comp.Rda")) +demo_comp <- demo_comp |> + select(study_id, eng_exp, fre_exp) + +## getting trial info #### +# trial info +trial_info <- read_csv(here(read_path, "target-distractor-pairs.csv")) +trial_info_fr <- read_csv(here(read_path, "trial_info_fr.csv")) |> + mutate(media_name = str_remove(media_name, "\\.wmv")) + +aois <- read_csv(here(mtl_path, "compmix_aois.csv")) |> + separate_wider_delim(aoi_name, delim = "_", + names = c("language", "object_type", "object", "trial_type", "location")) |> + mutate(x_min = x1, + x_max = x2, + y_min = y2, + y_max = y3, + x_width = x_max - x_min, + y_height = y_max - y_min) |> + filter(y_max %% 1 == 0, + x_max %% 1 == 0) + +# aoi cleaning +aoi_one <- aois |> + select(object, location, x_min, x_max, y_min, y_max, x_width, y_height) |> + distinct() |> + group_by(object, location) |> + summarise(n_aois = n(), + across(starts_with("x"), min), + across(starts_with("y"), min)) |> + filter(n_aois == 1) + +aoi_fixed <- aois |> + select(object, location, x_min, x_max, y_min, y_max, x_width, y_height) |> + distinct() |> + # the following two lines are based on the original processing code + # (https://osf.io/ug7t3/files/github/01_load.R) + filter(x_width %in% aoi_one$x_width, + y_height != 493) + +## processing et data #### +mtl_data_cleaned <- mtl_raw_data_anon |> + rename(x = gaze_point_x_adc_spx, + y = gaze_point_y_adc_spx) |> + separate_wider_delim(recording_name, delim = "_", + names = c("study_name", "study_id", "study_order"), + cols_remove = FALSE) |> + mutate(media_name = str_remove(media_name, "\\.wmv"), + media_name = case_when( + media_name == "Cow_FrSingle_L" & study_order == "F1" ~ "Cow_FrSingle_R", + .default = media_name + )) |> + filter(str_detect(media_name, "(Single|Mixed)"), + is.na(studio_event)) |> + separate_wider_delim(media_name, delim = "_", + names = c("object", "trial_type", "location"), + cols_remove = FALSE) |> + select(-starts_with("aoi_")) + +## merging data #### +mtl_wide.table <- mtl_data_cleaned |> + left_join(trial_info, by = join_by(object == target)) |> + left_join(trial_info_fr, by = join_by(media_name)) |> + left_join(aoi_fixed, by = join_by(object, location)) |> # target + left_join(aoi_fixed |> mutate(location = ifelse(location == "L", "R", "L")), # distractor + by = join_by(distractor == object, 
location), + suffix = c("_t", "_d")) |> + left_join(mtl_demog, by = join_by(recording_name, study_id)) |> + mutate( + subject_id = glue("{study_name}_{study_id}"), + sex = gender, + native_language = "eng, fre", + age = years * 365.2425 + months * (365.2425/12) + days, + age_units = "days", + t = recording_timestamp, + # aoi, + # full_phrase + # full_phrase_language + point_of_disambiguation = 3000, + target_side = ifelse(location == "L", "left", "right"), + # condition + vanilla_trial = 0, + excluded = keeper == 0, + exclusion_reason = exclusion, + session_num = 1, + sample_rate = 60, + tracker = "Tobii T60-XL", + coding_method = "eyetracking", + target_stimulus_label_original = target_label, + target_stimulus_label_english = target_image, + target_stimulus_novelty = "familiar", + target_stimulus_image_path = NA, + target_image_description = target_image, + target_image_description_source = "experiment documentation", + distractor_stimulus_label_original = distractor_label, + distractor_stimulus_label_english = distractor_image, + distractor_stimulus_novelty = "familiar", + distractor_stimulus_image_path = NA, + distractor_image_description = distractor_image, + distractor_image_description_source = "experiment documentation", + l_x_max = ifelse(location == "L", x_max_t, x_max_d), + l_x_min = ifelse(location == "L", x_min_t, x_min_d), + l_y_max = ifelse(location == "L", y_max_t, y_max_d), + l_y_min = ifelse(location == "L", y_min_t, y_min_d), + r_x_max = ifelse(location == "R", x_max_t, x_max_d), + r_x_min = ifelse(location == "R", x_min_t, x_min_d), + r_y_max = ifelse(location == "R", y_max_t, y_max_d), + r_y_min = ifelse(location == "R", y_min_t, y_min_d), + # x + # y + monitor_size_x = 1920, + monitor_size_y = 1200, + aoi = ifelse(target_side == "left", + case_when( # left target + x <= l_x_max & x >= l_x_min & y <= l_y_max & y >= l_y_min ~ "target", + x <= r_x_max & x >= r_x_min & y <= r_y_max & y >= r_y_min ~ "distractor", + is.na(x) | is.na(y) | x > monitor_size_x | x < 0 | y > monitor_size_y | y < 0 ~ "missing", + .default = "other" + ), + case_when( # right target + x <= l_x_max & x >= l_x_min & y <= l_y_max & y >= l_y_min ~ "distractor", + x <= r_x_max & x >= r_x_min & y <= r_y_max & y >= r_y_min ~ "target", + is.na(x) | is.na(y) | x > monitor_size_x | x < 0 | y > monitor_size_y | y < 0 ~ "missing", + .default = "other" + )) + ) + +# Princeton #### +## loading files #### +pct_path <- here(read_path, "Princeton") + +# et data +pct_raw_data_anon <- read_csv(here(pct_path, "3aMixData_col-names.csv"), + na = c(".", "-")) + +# demog data +pct_comp <- read_excel(here(pct_path, "Princeton_LangComprehension.xlsx")) |> + clean_names() |> + rename(study_id = subj_id) +pct_keepers <- read_csv(here(pct_path, "Princeton_keepers.csv")) +pct_demog <- read_csv(here(pct_path, "pct_msl_anon.csv")) |> + left_join(pct_comp, by = join_by(study_id)) |> + left_join(pct_keepers, by = join_by(study_id)) |> + mutate(testing_loc = "Princeton", + study_id = as.character(study_id)) + +## getting trial info #### +pct_trial_info <- read_csv(here(pct_path, "CompMix_trial-numbers_PCT.csv")) +trial_info_sp <- read_csv(here(read_path, "trial_info_sp.csv")) + +## processing et data #### +pct_data_cleaned <- pct_raw_data_anon |> + filter(`Sub Num` != "Sub Num", # duplicate header rows + `Sub Num` != "") |> + mutate(across(`-3100`:`4767`, as.numeric)) |> + pivot_longer(cols = `-3100`:`4767`, + names_to = "t_norm", + values_to = "look") |> + clean_names() |> + rename(study_id = sub_num, + trial_type = condition, + 
target = target_image) |> + mutate(trial_type = case_when( + trial_type == "Switch" ~ "Mixed", + .default = "Single" + )) |> + separate_wider_delim(order, delim = "_", + names = c("study_name", "study_order"), + too_many = "drop", + cols_remove = TRUE) |> + left_join(pct_trial_info, by = join_by(study_order, target, target_side, trial_type)) |> + select(-tr_num) + +## merging data #### +pct_wide.table <- pct_data_cleaned |> + left_join(pct_trial_info, + by = join_by(study_order, target_side, target, trial_type, trial_number)) |> + left_join(trial_info_sp, + by = join_by(study_order, target_side, target, trial_type)) |> + left_join(pct_demog, by = join_by(study_id)) |> + mutate( + vocab_eng = as.numeric(vocab_eng), + subject_id = glue("{study_name}_{study_id}"), + sex = gender, + native_language = "eng, spa", + age = years * 365.2425 + months.y * (365.2425/12) + days, + age_units = "days", + t = as.numeric(t_norm), + aoi = case_when( + look == 1 ~ "target", + look == 0 ~ "distractor", + .default = "missing" + ), + # full_phrase + # full_phrase_language + point_of_disambiguation = 3100, + target_side = ifelse(target_side == "l", "left", "right"), + condition = tolower(trial_type), + vanilla_trial = 0, + excluded = keeper == "N", + exclusion_reason = reason, + session_num = 1, + sample_rate = NA, + tracker = NA, + coding_method = "manual gaze coding", + target_stimulus_label_original = target_label, + target_stimulus_label_english = tolower(target), + target_stimulus_novelty = "familiar", + target_stimulus_image_path = NA, + target_image_description = tolower(target), + target_image_description_source = "experiment documentation", + distractor_stimulus_label_original = distractor_label, + distractor_stimulus_label_english = distractor_image, + distractor_stimulus_novelty = "familiar", + distractor_stimulus_image_path = NA, + distractor_image_description = distractor_image, + distractor_image_description_source = "experiment documentation" + ) + +# combine both datasets #### +wide.table <- bind_rows(mtl_wide.table |> select(-keeper), + pct_wide.table |> select(-keeper)) + +dataset_list <- digest.dataset( + dataset_name = dataset_name, + lab_dataset_id = NA, + cite = "Kremin, L. V., Jardak, A., Lew-Williams, C., & Byers-Heinlein, K. (2023). Bilingual children’s comprehension of code-switching at an uninformative adjective. Language Development Research 3(1), 249–276.", + shortcite = "Kremin et al. 
2023", + wide.table = wide.table +) + +write_and_validate_list(dataset_list, cdi_expected = FALSE, upload = TRUE) diff --git a/data/yoon_simpimp_2015/import.R b/data/yoon_simpimp_2015/import.R index 67c9a35..facfa41 100644 --- a/data/yoon_simpimp_2015/import.R +++ b/data/yoon_simpimp_2015/import.R @@ -1,3 +1,4 @@ +library(tidyverse) library(here) source(here("helper_functions", "idless_draft.R")) @@ -5,17 +6,152 @@ source(here("helper_functions", "common.R")) dataset_name <- "yoon_simpimp_2015" data_path <- init(dataset_name) -# TODO: figure out what part of the data we actually need here -wide.table <- tibble() +generate_aois <- function(x, y, target_side, + l_x_max, l_x_min, l_y_max, l_y_min, + r_x_max, r_x_min, r_y_max, r_y_min, + monitor_size_x = 1e6, + monitor_size_y = 1e6) { + ifelse(target_side == "left", + case_when( # left target + x <= l_x_max & x >= l_x_min & y <= l_y_max & y >= l_y_min ~ "target", + x <= r_x_max & x >= r_x_min & y <= r_y_max & y >= r_y_min ~ "distractor", + is.na(x) | is.na(y) | x > monitor_size_x | x < 0 | y > monitor_size_y | y < 0 ~ "missing", + .default = "other" + ), + case_when( # right target + x <= l_x_max & x >= l_x_min & y <= l_y_max & y >= l_y_min ~ "distractor", + x <= r_x_max & x >= r_x_min & y <= r_y_max & y >= r_y_min ~ "target", + is.na(x) | is.na(y) | x > monitor_size_x | x < 0 | y > monitor_size_y | y < 0 ~ "missing", + .default = "other" + )) +} +# eyetracking_path <- here(data_path, "eyetracking") -# TODO t.crit is seconds right now -data_ex_1B <- read.csv(here(eyetracking_path, "simpimp_processed_3v1.csv")) -data_ex_1A <- read.csv(here(eyetracking_path, "simpimp_processed_2v1.csv")) +data_ex_1<- read_csv(here(eyetracking_path, "eyetrack_expt1.csv")) |> + mutate(expt="0") -exclusion_data <- read.csv(here(eyetracking_path, "simpimp_et_log.csv")) -order_data <- read.csv(here(eyetracking_path, "simpimp_et_order.csv")) +data_ex_2 <- read_csv(here(eyetracking_path, "eyetrack_expt2.csv")) |> + mutate(expt="scale") +exclusion_data <- read_csv(here(eyetracking_path, "simpimp_et_log.csv")) |> + filter(age!="adult") +order_data <- read_csv(here(eyetracking_path, "simpimp_et_order.csv")) + +draft_data <- data_ex_1 |> bind_rows(data_ex_2) |> inner_join(order_data) |> + #mutate(time=ifelse(is.na(order), NA, time), #this is janky - VB + # stimulus=ifelse(is.na(order), NA, stimulus)) |> + #group_by(subid) |> + # fill(order:targetOnset) |> + # fill(time:stimulus) |> + ungroup() |> + filter(!is.na(time)) |> + mutate(t=t-time, + point_of_disambiguation=1000*targetOnset) |> + select(-lx, -ly, -rx, -ry) |> inner_join(exclusion_data) |> + mutate(target_side=ifelse(targetPos=="R","right", "left"), + condition=case_when( + trial_type=="cs" ~ "control-single", + trial_type=="cd" ~ "control-double", + trial_type=="inf" ~ "inference" + ), + vanilla_trial = FALSE, # there are decisions to be made about whether + # the cs trials are vanilla, we're leaning not, but shrug + excluded = keep_drop=="drop", # note there should also be trial level exclusions, but we haven't tracked those down + exclusion_reason = ifelse(keep_drop=="drop", "participant level some reason", NA), + target_stimulus_label_original = str_c("target_",condition, "_",item), + distractor_stimulus_label_original=str_c("distractor_", condition,"_", item) + # will need to fix this + ) + + + +wide.table <- draft_data |> + mutate(age_units="years", + age=as.numeric(age), + full_phrase=NA, + native_language="eng", + full_phrase_language="eng", + session_num=0, + sample_rate=NA, + tracker = NA, + coding_method 
= "eyetracking", + # note we will need to figure this out off of item at some point + target_stimulus_label_english = target_stimulus_label_original, + target_stimulus_novelty = "familiar", + target_stimulus_image_path = "stimulus_image_path", + target_image_description = "image", + target_image_description_source = "Peekbank discretion", + distractor_stimulus_label_english = distractor_stimulus_label_original, + distractor_stimulus_novelty = "familiar", + distractor_stimulus_image_path = "distrator_image_path", + distractor_image_description = "tbd", + distractor_image_description_source = "Peekbank discretion" + ) |> + select( + subject_id = subid, + sex = sex, + native_language, + age = age, + age_units, + t, + #aoi = NA, + full_phrase, + full_phrase_language, + point_of_disambiguation, + target_side, + condition, + vanilla_trial, + excluded, + exclusion_reason, + session_num, + sample_rate, + tracker, + coding_method, + target_stimulus_label_original, + target_stimulus_label_english, + target_stimulus_novelty, + target_stimulus_image_path, + target_image_description, + target_image_description_source, + distractor_stimulus_label_original, + distractor_stimulus_label_english, + distractor_stimulus_novelty, + distractor_stimulus_image_path, + distractor_image_description, + distractor_image_description_source, + x, + y +) %>% + # optional + mutate( + # fill out all of these if you have xy data + l_x_max = 840, + l_x_min = 0, + l_y_max = 1000, + l_y_min = 250, + r_x_max = 1680, + r_x_min = 840, + r_y_max = 1000, + r_y_min = 250, + x = x, + y = y, + monitor_size_x = NA, + monitor_size_y = NA, + # if two subsequent trials can have the same stimuli combination, + # use this to indicate the trial order within an administration + trial_index = NA, + # lab specific name for trials + trial_name = NA, + # lab specific names for stimuli + target_stimulus_name = NA, + distractor_stimulus_name = NA + ) |> mutate(aoi=generate_aois(x, y, target_side, + l_x_max, l_x_min, l_y_max, l_y_min, + r_x_max, r_x_min, r_y_max, r_y_min)) + + + dataset_list <- digest.dataset( dataset_name = dataset_name, lab_dataset_id = NA, diff --git a/data/yoon_simpimp_2015/simpimp_preprocess.R b/data/yoon_simpimp_2015/simpimp_preprocess.R new file mode 100644 index 0000000..6f4014d --- /dev/null +++ b/data/yoon_simpimp_2015/simpimp_preprocess.R @@ -0,0 +1,83 @@ +# based on pre-processing of the original study, +# but updated to use tidyverse and be idiomatic +# and to be selective for what downstream processing +# is already easy for peekbank to do + +library(tidyverse) +library(janitor) +source("useful.R") + +select_msg <- function(df){ + if (raw.data.path=="raw_data/eyetracking/new_data/") + {df |> select(time, message=l_raw_x_px)} + else {df |> select(time, message=l_por_x_px)} + +} +preprocess.data <- function(file.name, x.max = 1680, y.max = 1050, + samp.rate = 120, + avg.eyes = TRUE) { + + + ## DATA CLEANING + # read in data and get rid of header rows + all.d <- read_tsv(str_c(raw.data.path, file.name), comment = "##") |> clean_names() + + + ## split data into messages and data + ## First get data: + dat <- all.d |> + filter(type == "SMP") |> + mutate( + lx = to.n(l_por_x_px), + rx = to.n(r_por_x_px), + ly = to.n(l_por_y_px), + ry = to.n(r_por_y_px), + t=time/1000 #convert ms + ) |> + select(t = t, lx, ly, rx, ry) + + print(file.name) + + # head(dat) |> print() + # get messages (whatever those are) + msgs <- all.d |> + filter(str_detect(type,"MSG")) |> + select_msg() |> + mutate(time=time/1000, #to ms + stimulus = 
as.character(message) |> + str_replace("# Message: ", "") |> + str_replace(".jpg","")) |> + select(-message) + + msgs |> select(stimulus) |> unique() |> head() |> print() + + #print(head(msgs)) + # attach the most recently started stimulus + d <- dat |> left_join(msgs, by = join_by(closest(t > time))) |> + filter(!str_detect(stimulus,".avi")) |> + filter(!stimulus=="blank") |> + filter(!is.na(stimulus)) |> + rowwise() |> + mutate(x=mean(c(lx,rx), na.rm=T), #average eyes + y=mean(c(ly,ry), na.rm=T), + x=ifelse(0 <= x & x <= x.max, x, NA), # big issues: NA out off-screen samples + y=ifelse(0 <= y & y <= y.max, y, NA)) |> + ungroup() + + return(d) +} + +raw.data.path = "raw_data/eyetracking/new_data/" + +stuff <- list.files(raw.data.path) |> + map(preprocess.data) |> bind_rows() |> mutate(expt="expt2") |> write_csv("raw_data/eyetracking/eyetrack_expt2.csv") + +raw.data.path = "raw_data/eyetracking/old_data/" + +#foo <- preprocess.data("140217-02-L1.txt") +stuff2 <- list.files(raw.data.path) |> + map(preprocess.data) |> bind_rows() |> mutate(expt="expt1") |> write_csv("raw_data/eyetracking/eyetrack_expt1.csv") + + + diff --git a/data/yoon_simpimp_2015/useful.R b/data/yoon_simpimp_2015/useful.R new file mode 100644 index 0000000..f90a74e --- /dev/null +++ b/data/yoon_simpimp_2015/useful.R @@ -0,0 +1,72 @@ +################################################################################ +## USEFUL.R +## a variety of useful libraries and commands +## mcf 6/13 etc. +################################################################################ + +library(ggplot2) +library(bootstrap) +library(lme4) +library(stringr) +library(reshape2) +library(plyr) +library(car) # for regLine() in lm.txt() + +## add some style elements for ggplot2 +theme_set(theme_bw()) + +## standard error of the mean +sem <- function (x) { + sd(x,na.rm=TRUE) / sqrt(length(x)) +} + +## standard error of the mean, removing NAs first +se <- function(x) { + y <- x[!is.na(x)] # remove the missing values, if any + sqrt(var(as.vector(y))/length(y)) +} + +## NA functions +na.mean <- function(x) {mean(x,na.rm=T)} +na.sum <- function(x) {sum(x,na.rm=T)} + +## convert to number +to.n <- function(x) { + as.numeric(as.character(x)) +} + +## inverse logistic +inv.logit <- function (x) { + exp(x) / (1 + exp(x)) +} + +## number of unique subs +n.unique <- function (x) { + length(unique(x)) +} + +## for bootstrapping 95% confidence intervals +theta <- function(x,xdata,na.rm=T) {mean(xdata[x],na.rm=na.rm)} +ci.low <- function(x,na.rm=T) { + mean(x,na.rm=na.rm) - quantile(bootstrap(1:length(x),1000,theta,x,na.rm=na.rm)$thetastar,.025,na.rm=na.rm)} +ci.high <- function(x,na.rm=T) { + quantile(bootstrap(1:length(x),1000,theta,x,na.rm=na.rm)$thetastar,.975,na.rm=na.rm) - mean(x,na.rm=na.rm)} + +## for basic plots, add linear models with correlations +lm.txt <- function (p1,p2,x=7.5,yoff=.05,lt=2,c="black",data=data) +{ + l <- lm(p2 ~ p1) + regLine(l,lty=lt,col=c) + cl <- coef(l) + text(x,cl[1] + cl[2] * x + yoff, + paste("r = ",sprintf("%2.2f",sqrt(summary(l)$r.squared)), + getstars(anova(l)$"Pr(>F)"[1]),sep=""), + xpd="n") +} + +## get stars for significance testing +getstars <- function(x) { + if (x > .1) {return("")} + if (x < .001) {return("***")} + if (x < .01) {return("**")} + if (x < .05) {return("*")} + return("") # fall-through for .05 <= x <= .1 +} diff --git a/helper_functions/idless_draft.R b/helper_functions/idless_draft.R index d236875..6dc1fe9 100644 --- a/helper_functions/idless_draft.R +++ b/helper_functions/idless_draft.R @@ -168,6 +168,10 @@ digest.dataset <- function( ungroup() %>% group_by(administration_id, trial_type_id, trial_order) %>% mutate(trial_id = cur_group_id() - 1) %>% + ungroup() %>% + group_by(l_x_max, l_x_min, l_y_max, l_y_min, + r_x_max, r_x_min, r_y_max, r_y_min) %>% + mutate(aoi_region_set_id = cur_group_id() - 1) %>% + ungroup()
datasets <- tibble( @@ -212,7 +216,7 @@ digest.dataset <- function( age = case_when( lab_age_units == "months" ~ lab_age, lab_age_units == "days" ~ lab_age/(365.25/12), - lab_age_units == "years" ~ 12*lab_age + ifelse(lab_age-floor(lab_age) == 0, 6, 0), + lab_age_units == "years" ~ 12*lab_age + ifelse(all(lab_age-floor(lab_age) == 0), 6, 0), .default = NA ), administration_aux_data = NA @@ -230,7 +234,8 @@ digest.dataset <- function( vanilla_trial, dataset_id, distractor_id, - target_id + target_id, + aoi_region_set_id ) %>% mutate( target_side = tolower(target_side), @@ -241,7 +246,6 @@ digest.dataset <- function( target_side == "r" ~ "right", .default="ERROR"), trial_type_aux_data = NA, - aoi_region_set_id = NA # set for now, set to 0 further down below if we actually have that table ) @@ -289,8 +293,8 @@ digest.dataset <- function( r_x_min, r_y_max, r_y_min, - ) %>% - mutate(aoi_region_set_id = 0) + aoi_region_set_id + ) xy_timepoints <- data %>% {if (rezero) peekds::rezero_times(.) else rename(., t_zeroed = t)} %>% @@ -304,6 +308,8 @@ digest.dataset <- function( trial_id, administration_id ) + } else { + trial_types$aoi_region_set_id <- NA } return(list(