# Data processing.R
#Chapter 2 - Cognitive Task Data Processing
#the following code processes the output of the cognitive task
#developed by Stephen Fleming and Max Rollwage
#the following functions were developed by Audrey Mazancieux
#and Oliver Warrington based on Stephen Fleming's stan functions:
#Function_trials2counts.R
#Function_metad_group.R
library(magrittr)
library(dplyr)
library(tidyverse)
library(data.table)
library(ggplot2)
library(corrplot)
#--------------------------------------------------------------------------------------------------------------
#DATA PRE-PROCESSING
# Build one data frame holding every participant's raw task data,
# for both groups ("stickers" and "switchers").

# Stickers: read the v1-v3 exports and join them on their shared columns.
stickersv1 <- read.csv("../Data/Stickers v1/data_exp_70615-v1_task-6ybr.csv")
stickersv2 <- read.csv("../Data/Stickers v2/data_exp_70615-v2_task-6ybr.csv")
stickersv3 <- read.csv("../Data/Stickers v3/data_exp_70615-v3_task-6ybr.csv")
stickers <- Reduce(full_join, list(stickersv1, stickersv2, stickersv3))
# switch = 0 flags participants who did not switch
stickers <- mutate(stickers, switch = 0)
# participant 5249280 made an incomplete submission: drop their rows
stickers <- stickers[stickers$Participant.Private.ID != "5249280", ]

# Switchers: same procedure, flagged with switch = 1.
switchersv1 <- read.csv("../Data/Switchers v1/data_exp_49341-v1_task-6ybr.csv")
switchersv2 <- read.csv("../Data/Switchers v2/data_exp_49341-v2_task-6ybr.csv")
switchersv3 <- read.csv("../Data/Switchers v3/data_exp_78465-v3_task-6ybr.csv")
switchers <- Reduce(full_join, list(switchersv1, switchersv2, switchersv3))
switchers <- mutate(switchers, switch = 1)

# Both groups together.
rawData <- full_join(switchers, stickers)

# Keep only participants who completed the experiment
# (rows whose status is anything else, e.g. still 'live', are dropped).
rawData <- subset(rawData, Participant.Status == "complete")

# Apply the exclusion criteria: the IDs of invalid submissions are stored
# in column `x` of all_invalid_submissions.csv.
allInvalidSubmissions <- as.character(
  read.csv("../Data/all_invalid_submissions.csv")$x
)
# Drop every row whose Participant.Public.ID is in that list.
rawData <- rawData[!(rawData$Participant.Public.ID %in% allInvalidSubmissions), ]

# Number of ID levels before rebuilding the factor ...
length(levels(rawData$Participant.Public.ID))
# ... rebuild the factor so that IDs whose rows were excluded above no longer
# appear as (empty) levels ...
rawData$Participant.Public.ID <- factor(rawData$Participant.Public.ID,
                                        exclude = NULL)
# ... and the number of levels afterwards, as a check.
length(levels(rawData$Participant.Public.ID))
#----------------------------------------------------------------------------------------------------------------------------
# Demographic data: one export per group.
demoStickers <- read.csv("../Data/Demographic data/Stickers_demo.csv")
demoSwitchers <- read.csv("../Data/Demographic data/Switchers.csv")

# Keep approved submissions only, and add the `switch` indicator
# (0 = did not switch, 1 = switched).
demoStickers <- mutate(subset(demoStickers, status == "APPROVED"), switch = 0)
demoSwitchers <- mutate(subset(demoSwitchers, status == "APPROVED"), switch = 1)

# Combine both groups and keep only the columns of interest.
demoAll <- dplyr::select(
  full_join(demoStickers, demoSwitchers),
  participant_id,
  switch,
  age,
  Current.U.S.state.of.residence,
  Employment.Status,
  Ethnicity,
  Highest.education.level.completed,
  Sex,
  time_taken
)

# Express time taken in minutes rather than seconds.
demoAll$time_taken <- demoAll$time_taken / 60

# Remove invalid submissions, then rebuild the ID factor so the removed IDs
# do not linger as empty levels.
demoAll <- demoAll[!(demoAll$participant_id %in% allInvalidSubmissions), ]
demoAll$participant_id <- factor(demoAll$participant_id)

# Use the same ID column name as the task data so the two can be merged later.
demoAll <- dplyr::rename(demoAll, Participant.Public.ID = participant_id)

# Check that the Prolific and Gorilla data frames contain the same IDs.
prolificIDs <- demoAll$Participant.Public.ID
gorillaIDs <- levels(rawData$Participant.Public.ID)
intersect(prolificIDs, gorillaIDs)
setdiff(prolificIDs, gorillaIDs)
# Whether participants pass the exclusion criteria is checked in
# "Exclusion criteria script".
#--------------------------------------------------------------------------------------------------------------------
# Keep only the columns that the rest of the script uses.
myData <- rawData %>%
  dplyr::select(
    Participant.Private.ID,
    Participant.Public.ID,
    time_elapsed,
    Reactiontime,
    responses,
    confidence_rating,
    key_press,
    correct,
    trial_type,
    Trial_real_experiment,
    Task_type,
    current_trial_difficuly,
    label,
    switch
  )

# Recode `correct` so that incorrect trials are 0 rather than NA.
myData$correct <- tidyr::replace_na(myData$correct, 0)

# Participant.Public.ID as a factor: later steps iterate over its levels,
# i.e. over participants.
myData$Participant.Public.ID <- as.factor(myData$Participant.Public.ID)

# Restrict to the two task types analysed here.
df2 <- subset(myData, Task_type %in% c("simpleperceptual", "Change_of_mind"))
df2$responses <- factor(df2$responses)

# Drop the rows where participants were practising the confidence slider.
df2 <- df2[df2$label != "confidprac", ]

# Derived trial-level columns (later columns may use earlier ones):
#   Direction        - side with more flickering dots, reconstructed from the
#                      key pressed (87 = Left, 69 = Right) and whether the
#                      response was correct
#   StimID           - Direction coded as Left = 0, Right = 1
#   Response         - key pressed coded as 87 = Left = 0, 69 = Right = 1
#   confidenceRating - numeric rating 1-9 taken from the `responses` string
df2 <- df2 %>%
  mutate(
    Direction = case_when(
      key_press == "87" & correct == 1 ~ "Left",
      key_press == "87" & correct == 0 ~ "Right",
      key_press == "69" & correct == 1 ~ "Right",
      key_press == "69" & correct == 0 ~ "Left"
    ),
    StimID = case_when(
      Direction == "Left" ~ 0,
      Direction == "Right" ~ 1
    ),
    Response = case_when(
      key_press == "87" ~ 0,
      key_press == "69" ~ 1
    ),
    confidenceRating = case_when(
      responses == '{"Q0":1}' ~ 1,
      responses == '{"Q0":2}' ~ 2,
      responses == '{"Q0":3}' ~ 3,
      responses == '{"Q0":4}' ~ 4,
      responses == '{"Q0":5}' ~ 5,
      responses == '{"Q0":6}' ~ 6,
      responses == '{"Q0":7}' ~ 7,
      responses == '{"Q0":8}' ~ 8,
      responses == '{"Q0":9}' ~ 9
    )
  )
# Build the inputs to trials2counts(): one row per trial holding the stimulus
# side, the response and the confidence rating.

# Select the needed columns BY NAME (same columns and order as the previous
# positional selection 16, 17, 18, 2, 14), so that adding or reordering
# columns upstream cannot silently pick the wrong ones.
trialColumns <- c("StimID", "Response", "confidenceRating",
                  "Participant.Public.ID", "switch")
stopifnot(all(trialColumns %in% colnames(df2)))
df3 <- df2[, trialColumns]

# Align the confidence ratings with the other columns: every odd-numbered row
# receives the rating stored in the row that follows it, and every
# even-numbered row is set to NA. Indexing one past the end yields NA, so a
# trailing odd row gets NA, and an empty df3 is handled without error.
# NOTE(review): this assumes rows strictly alternate decision row / rating row
# across the whole data frame - confirm against the task export.
rowIndex <- seq_len(nrow(df3))
alignedRating <- df3$confidenceRating[rowIndex + 1L]
alignedRating[rowIndex %% 2 == 0] <- NA
df3$confidenceRating <- alignedRating

# Rows with any NA are dropped, which removes the emptied even-numbered rows.
df4 <- na.omit(df3)
write.csv(df4, '../Data/df4.csv', row.names = FALSE)

# INPUTS to trials2counts()
stimID <- df4$StimID
response <- df4$Response
rating <- df4$confidenceRating
nRatings <- 9
source('Function_trials2counts.R')
newlist <- trials2counts(stimID, response, rating, nRatings)
print(newlist)
#-----------------------------------------------------------------------------
# Measuring confirmation bias, i.e. post-decision evidence integration.
# A few preparation steps are needed first.
#
# Confirmatory and disconfirmatory evidence integration were measured as
# changes in confidence induced by post-decision evidence. Trial-by-trial
# linear models were constructed for every participant, separately for
# correct and incorrect trials, on data pooled across Tasks 1 and 2, with
#   predictor: post-decision evidence strength
#              (confidence task = 0, low post-decision evidence = 1,
#               high post-decision evidence = 2)
#   outcome:   confidence rating.
# The beta weight on correct trials (increase in confidence due to
# post-decision evidence) is the measure of confirmatory evidence
# integration. The beta weight on incorrect trials is the measure of
# disconfirmatory evidence integration (its sign was reversed in the figures
# so that higher values indicate greater disconfirmatory evidence
# integration).

# Correct trials: rows labelled 'confidencerating' with correct == 1,
# plus the evidence-strength predictor.
correctTrials <- subset(df2, correct == 1 & label == "confidencerating")
correctTrials <- mutate(
  correctTrials,
  postDecisionEvidenceStrength = case_when(
    current_trial_difficuly == "1.3" ~ 1,
    current_trial_difficuly == "1" ~ 2,
    Task_type == "simpleperceptual" ~ 0
  )
)

# Incorrect trials: same selection with correct == 0.
incorrectTrials <- subset(df2, correct == 0 & label == "confidencerating")
incorrectTrials <- mutate(
  incorrectTrials,
  postDecisionEvidenceStrength = case_when(
    current_trial_difficuly == "1.3" ~ 1,
    current_trial_difficuly == "1" ~ 2,
    Task_type == "simpleperceptual" ~ 0
  )
)

correctTrials
write.csv(correctTrials, "../Data/correctTrials.csv", row.names = FALSE)
write.csv(incorrectTrials, "../Data/incorrectTrials.csv", row.names = FALSE)
#-----------------------------------------------------------------------------------------------------------
# Calculate individual evidence-integration scores automatically:
#   - disconfirmatory (rejection of disconfirmatory evidence): slope on
#     incorrect trials, sign-flipped
#   - confirmatory (endorsement of confirmatory evidence): slope on correct
#     trials

# Per-participant slope of confidenceRating on postDecisionEvidenceStrength.
#
# trials     data frame with a factor column Participant.Public.ID and the
#            numeric columns confidenceRating and postDecisionEvidenceStrength
# score_name name given to the score column of the result
# flip_sign  if TRUE the slopes are multiplied by -1
#
# Returns a data frame with one row per level of Participant.Public.ID and
# the columns <score_name> and Participant.Public.ID. A participant with no
# usable trials gets NA instead of stopping the script with an lm() error;
# a participant whose trials all share one evidence strength gets the NA
# slope that lm() itself reports.
evidence_integration_scores <- function(trials, score_name, flip_sign = FALSE) {
  stopifnot(is.factor(trials$Participant.Public.ID))
  ids <- levels(trials$Participant.Public.ID)
  slopes <- vapply(ids, function(id) {
    # %in% (unlike ==) never yields NA, so no all-NA rows are picked up
    one <- trials[trials$Participant.Public.ID %in% id, , drop = FALSE]
    usable <- !is.na(one$confidenceRating) &
      !is.na(one$postDecisionEvidenceStrength)
    if (!any(usable)) {
      return(NA_real_)
    }
    fit <- lm(confidenceRating ~ postDecisionEvidenceStrength, data = one)
    # coefficient 1 is the intercept, coefficient 2 the slope
    unname(coef(fit)[2])
  }, numeric(1), USE.NAMES = FALSE)
  if (flip_sign) {
    slopes <- -slopes
  }
  scores <- data.frame(slopes, ids)
  colnames(scores) <- c(score_name, "Participant.Public.ID")
  scores
}

# Disconfirmatory evidence integration: the sign is flipped so that higher
# values indicate greater integration of disconfirmatory evidence.
disconfirmatoryDF <- evidence_integration_scores(
  incorrectTrials,
  'Disconfirmatory evidence integration',
  flip_sign = TRUE
)
print(disconfirmatoryDF)

# Confirmatory evidence integration.
confirmatoryDF <- evidence_integration_scores(
  correctTrials,
  'Confirmatory evidence integration'
)
print(confirmatoryDF)

# One row per participant with both scores.
merged_conf_bias <- merge(disconfirmatoryDF, confirmatoryDF,
                          by = "Participant.Public.ID")
#---------------------------------------------------------------------------------------------------------------------------------------------------
# Merge the confirmation / disconfirmation bias scores with the demographic
# data, then with the values data (child-rearing values and economic-social
# values), clean the categorical variables and save the result.

# Turn a categorical column into a factor with only the levels that occur.
#
# x          character vector or factor
# na_values  values to be treated as missing (set to NA before levels are
#            computed, so they never become a level)
# ref        optional reference level, moved to the first position
#
# Works whether read.csv() delivered the column as a factor (R < 4.0 default)
# or as character (R >= 4.0 default); droplevels()/relevel() applied directly
# to a character column would stop with an error.
tidy_factor <- function(x, na_values = character(0), ref = NULL) {
  x <- as.character(x)
  x[x %in% na_values] <- NA
  x <- factor(x)
  if (!is.null(ref)) {
    x <- relevel(x, ref = ref)
  }
  x
}

demo_and_bias_df <- merge(merged_conf_bias, demoAll,
                          by = 'Participant.Public.ID')

values <- read.csv('../Data/values.csv')
# remove participants who revoked their consent for participation
# in the task from the values df
revoked_consent <- c("5f869b703a15e41ee32f5b19",
                     "60c17595bf53c3dff68487d3",
                     "613b53e4038b14ab5bf1f889")
values <- values[!(values$Participant.Public.ID %in% revoked_consent), ]
demo_and_bias_df <- merge(demo_and_bias_df, values,
                          by = 'Participant.Public.ID')

# remove the 3 participants who revoked their consent
# (rows where Sex is NA are dropped as well, as subset() treats NA as FALSE)
demo_and_bias_df <- subset(demo_and_bias_df,
                           demo_and_bias_df$Sex != "CONSENT REVOKED")

# Categorical variables: "DATA EXPIRED" becomes NA in employment status and
# education, unused levels are discarded, and the reference levels are set.
demo_and_bias_df$Highest.education.level.completed <- tidy_factor(
  demo_and_bias_df$Highest.education.level.completed,
  na_values = "DATA EXPIRED",
  ref = 'High school diploma/A-levels'
)
demo_and_bias_df$Employment.Status <- tidy_factor(
  demo_and_bias_df$Employment.Status,
  na_values = "DATA EXPIRED",
  ref = "Full-Time"
)
demo_and_bias_df$Ethnicity <- tidy_factor(
  demo_and_bias_df$Ethnicity,
  ref = 'White/Caucasian'
)
demo_and_bias_df$Sex <- tidy_factor(demo_and_bias_df$Sex)

# Restore the value that is missing in row 215, column 13 (the auth_score,
# according to the original comment).
# NOTE(review): this patch is positional - it depends on the exact row order
# and column layout produced by the merges above; confirm it still targets
# the intended participant whenever the input data change.
# The guard prevents the assignment from silently appending a new row or
# column when the table is smaller than expected.
stopifnot(nrow(demo_and_bias_df) >= 215, ncol(demo_and_bias_df) >= 13)
demo_and_bias_df[215, 13] <- 1

write.csv(demo_and_bias_df, '../Data/demo_and_bias_df.csv', row.names = FALSE)