forked from djangraw/MoodDrift
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ImportNimhMmiData.py
302 lines (268 loc) · 12.5 KB
/
ImportNimhMmiData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
ImportNimhMmiData.py
Import MMI data from in-person NIMH subjects performing the task online during
the COVID-19 pandemic.
Created on Tue May 19 13:41:11 2020
@author: jangrawdc
- Updated 6/2/20 by DJ - renamed dfDataCheck
- Updated 3/31/21 by DJ - adapted for shared code structure.
- Updated 4/2/21 by DJ - allowed date strings in MM/DD/YY format, use and accommodate de-ID'ed demographics file.
- Updated 12/10/21 by DJ - added dfProbe output to GetMmiRatingsAndTimes.
"""
# %% Set up
# Import packages
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import glob
import dateutil.parser as dparser
import datetime
import MoodDrift.Analysis.PlotMmiData as pmd
from MoodDrift.Preprocessing.GetMmiRatingsAndTimes import GetMmiRatingsAndTimes
import os.path
def GetYearMonthDay(dateStr, centuryPivot=21):
    """Convert a date string to integer year, month, and day.

    Accepted formats:
      - 'YYYY-MM-DD'
      - 'MM/DD/YY' (2-digit year) or 'MM/DD/YYYY' (4-digit year)

    Parameters
    ----------
    dateStr : str
        Date string in one of the formats above.
    centuryPivot : int, optional
        2-digit years below this value are placed in the 2000s, all others in
        the 1900s. The default of 21 keeps the original behavior.

    Returns
    -------
    (dateYear, dateMonth, dateDay) : tuple of int

    Raises
    ------
    ValueError
        If dateStr contains neither '-' nor '/', or a field is not an integer.
    """
    if '-' in dateStr:  # YYYY-MM-DD format
        dateYear, dateMonth, dateDay = [int(x) for x in dateStr.split('-')]
    elif '/' in dateStr:  # MM/DD/YY or MM/DD/YYYY format
        dateMonth, dateDay, dateYear = [int(x) for x in dateStr.split('/')]
        # Only expand genuine 2-digit years; a 4-digit year is already complete
        # (previously 1900 was added to it, e.g. 2005 -> 3905).
        if dateYear < 100:
            if dateYear < centuryPivot:
                dateYear = 2000 + dateYear
            else:
                dateYear = 1900 + dateYear
    else:
        raise ValueError('date string must be in YYYY-MM-DD or MM/DD/YY format')
    return dateYear, dateMonth, dateDay
def AgeFromDateStrings(currStr, dobStr):
    """Return the age, in fractional years, at date currStr for a person born on dobStr.

    Both arguments are date strings in a format accepted by GetYearMonthDay.
    The age is the number of whole days between the two dates divided by
    365.25 (the .25 accounts for leap years).
    """
    # Parse both strings into (year, month, day) integer triples
    currParts = GetYearMonthDay(currStr)
    dobParts = GetYearMonthDay(dobStr)
    # Build datetime objects and take their difference
    elapsed = datetime.datetime(*currParts) - datetime.datetime(*dobParts)
    # Convert elapsed days to years
    return elapsed.days / 365.25
# %% Load demographics data
# Settings for this batch: where inputs live, where outputs go, and run options
rawDataDir = '../Data/PilotData' # where pilot data can be found
dataCheckDir = '../Data/DataChecks' # where data check files should be saved
procDataDir = '../Data/OutFiles' # where preprocessed data should be saved
outFigDir = '../Figures' # where figures should be saved
batchName = 'RecoveryNimh' # Name of this batch
plotEveryParticipant = False # should we make a plot for every participant?
overwrite = True # overwrite previous results if they already exist?
#demoFile = '%s/MmiRecoveryNimh_Demographics.csv'%rawDataDir # Demographics file for NIMH participants including DOB
demoFile = '%s/MmiRecoveryNimh_Demographics_nodob.csv'%rawDataDir # de-ID'ed demographics file for NIMH participants in which DOBs are replaced with floored age

# Load demographics data
print('=== Reading and cropping %s...'%demoFile)
dfData = pd.read_csv(demoFile)
if 'AllData' in demoFile:
    # extract info for each subject
    cols = ['participant','SEX','DOB','Participant_Type','Age']
    dfSurvey = dfData.loc[:,cols].drop_duplicates().reset_index(drop=True)
    # adjust to match MTurk names/values
    dfSurvey.loc[dfSurvey.SEX=='MALE','SEX'] = 'Male'
    dfSurvey.loc[dfSurvey.SEX=='FEMALE','SEX'] = 'Female'
    dfSurvey = dfSurvey.rename(columns={'SEX':'gender',
                                        'Age':'age',
                                        'Participant_Type':'diagnosis'})
elif 'Demographics' in demoFile:
    # Where a baseline CRISIS date exists, use the baseline score/date as the
    # "follow-up" values so that one pair of columns covers both cases.
    # Assign column by column: passing a DataFrame with different column names
    # to .loc makes pandas align on column labels and write NaN instead.
    isBase = pd.notna(dfData.s_crisis_base_date)
    for fuCol,baseCol in [('s_crisis_fu_tot','s_crisis_base_tot'),
                          ('s_crisis_fu_date','s_crisis_base_date')]:
        dfData.loc[isBase,fuCol] = dfData.loc[isBase,baseCol]

    # extract info for each subject (DOB is absent from the de-ID'ed file)
    cols = ['participant','SEX','Participant_Type','s_crisis_fu_tot','s_mfq_tot','s_scaredshort_tot','s_crisis_fu_date','age']
    if 'DOB' in dfData.columns:
        cols.insert(2,'DOB')
    dfSurvey = dfData.loc[:,cols].drop_duplicates(cols[:-1]).reset_index(drop=True) # drop based on everything except age

    # adjust to match MTurk names/values
    dfSurvey.loc[dfSurvey.SEX=='MALE','SEX'] = 'Male'
    dfSurvey.loc[dfSurvey.SEX=='FEMALE','SEX'] = 'Female'
    dfSurvey = dfSurvey.rename(columns={'SEX':'gender',
                                        'Participant_Type':'diagnosis',
                                        's_crisis_fu_tot':'CRISIS',
                                        's_mfq_tot':'MFQ',
                                        's_scaredshort_tot':'SCARED',
                                        's_crisis_fu_date': 'DateOfSurvey'})

    # The steps below need the CRISIS/MFQ/SCARED/DateOfSurvey columns, which
    # only this branch creates.
    # Crop to COMPLETE measurements only
    nSubj_orig = np.unique(dfSurvey.participant).size
    isOk = pd.notna(dfSurvey.CRISIS) & pd.notna(dfSurvey.MFQ) & pd.notna(dfSurvey.SCARED)
    dfSurvey = dfSurvey.loc[isOk,:]

    # Crop to the ***FIRST measurement*** for each subject
    # NOTE(review): DateOfSurvey is compared exactly as read from the CSV; if
    # the dates are MM/DD/YY strings this ordering is lexicographic - confirm.
    participants = np.unique(dfSurvey.participant)
    nDupes = 0
    for participant in participants:
        isThis = dfSurvey.participant==participant
        if np.sum(isThis)>1:
            earliestDate = np.min(dfSurvey.loc[isThis,'DateOfSurvey'])
            isNotEarliest = isThis & (dfSurvey.DateOfSurvey>earliestDate)
            nDupes = nDupes + np.sum(isNotEarliest)
            dfSurvey = dfSurvey.drop(isNotEarliest[isNotEarliest].index,axis=0)
            # print(' - subj %d: %d duplicates.'%(participant,np.sum(isNotEarliest)))
    dfSurvey = dfSurvey.reset_index(drop=True)
    print('Deleted %d duplicate lines.'%(nDupes))

    participants = np.unique(dfSurvey.participant)
    nSubj = participants.size
    assert (dfSurvey.shape[0]==nSubj) # make sure all participants are unique
    print('%d of %d subjects had complete survey data.'%(nSubj,nSubj_orig))
else:
    # Neither known file layout matched, so dfSurvey cannot be built
    raise ValueError("demographics file name must contain 'AllData' or 'Demographics': %s"%demoFile)
# Save results
#outFile = '%s/Mmi-%s_Survey.csv'%(procDataDir,batchName)
#print('Saving to %s...'%outFile)
#if os.path.exists(outFile) and not overwrite:
# print('Not overwriting existing file.')
#else:
# dfSurvey.to_csv(outFile)
# print('Done!')
# %% Load data from Pavlovia
# Upper bound on the run number; the import loop reads columns taskFile_run1..taskFile_run<maxNRuns>
maxNRuns = 3
# The data check file is distributed with the shared data, so it is read rather than rebuilt
dataCheckFile = '%s/%s_DataCheck.csv' % (dataCheckDir, batchName)
dfDataCheck = pd.read_csv(dataCheckFile)
# Old code to create datacheck file
#dfDataCheck = pd.DataFrame(participants,columns=['participant'])
##dfDataCheck['taskFile'] = '';
#maxNRuns = 0;
#for iSubj,participant in enumerate(participants):
# # get list of files from this participant
# files = np.array(glob.glob('%s/%s/%s*.csv'%(rawDataDir,batchName,participant)))
# # get completion indicator and date
# fileDate = np.zeros(len(files),datetime.datetime)
# isComplete = np.zeros(len(files),bool)
# for iFile,thisFile in enumerate(files):
# dfIn = pd.read_csv(thisFile);
# fileDate[iFile] = dparser.parse(dfIn['date'][0].split('_')[0],fuzzy=True);
# isComplete[iFile] = ('cashBonus' in dfIn.columns)
# # crop
# files = files[isComplete]
# fileDate = fileDate[isComplete]
# # sort by date
# iSorted = np.argsort(fileDate)
# files = files[iSorted]
#
# # add files to dfDataCheck dataframe
# for iFile,thisFile in enumerate(files):
# dfDataCheck.loc[iSubj,'taskFile_run%d'%(iFile+1)] = files[iFile]
# maxNRuns = max(maxNRuns,len(files))
# if len(files)==0:
# print('***WARNING: participant %d has %d complete data files!'%(participant,len(files)))
#
#outFile = '%s/%s_DataCheck.csv'%(dataCheckDir,batchName)
#print('Saving to %s...'%outFile)
#if os.path.exists(outFile) and not overwrite:
# print('Not overwriting existing file.')
#else:
# dfDataCheck.to_csv(outFile)
# print('Done!')
# %% Import data
# Every participant listed in the data check file is treated as complete
dfDataCheck['isComplete'] = True
nComplete = np.sum(dfDataCheck['isComplete'])
print('=== %d/%d subjects complete. ==='%(nComplete,dfDataCheck.shape[0]))
dfDataCheck_complete = dfDataCheck.loc[dfDataCheck.isComplete,:].reset_index(drop=True)

# One table per (participant, run) is collected in each list
trialList = []
ratingList = []
lifeHappyList = []
nSubj = dfDataCheck_complete.shape[0]
for iLine in range(nSubj):
    # Print status (1-based so the last subject reads N/N)
    print('=== Importing Subject %d/%d... ==='%(iLine+1,nSubj))
    # Look up the participant in the same re-indexed table used for the files,
    # so the row number always refers to the same subject
    participant = dfDataCheck_complete.loc[iLine,'participant']
    for run in range(1,maxNRuns+1):
        taskFile = dfDataCheck_complete.loc[iLine,'taskFile_run%d'%run]
        if not isinstance(taskFile,str):
            continue # no file recorded for this run (cell is not a string)
        # The date is taken from the third underscore-separated field of the path
        date = taskFile.split('_')[2]
        # Task: replace relative path from data check file with relative path from here
        inFile = taskFile.replace('../PilotData',rawDataDir)
        dfTrial,dfRating,dfLifeHappy,dfProbe = GetMmiRatingsAndTimes(inFile)
        # Add run/date info (dfProbe is not used here)
        for df in [dfTrial,dfRating,dfLifeHappy]:
            df['run'] = run
            df['date'] = date
        # append to lists
        trialList.append(dfTrial)
        ratingList.append(dfRating)
        lifeHappyList.append(dfLifeHappy)

        # Plot task data
        if plotEveryParticipant:
            plt.figure(1,figsize=(10,4),dpi=180, facecolor='w', edgecolor='k')
            plt.clf()
            ax1 = plt.subplot(2,1,1)
            pmd.PlotMmiRatings(dfTrial,dfRating,'line')
            plt.title('MMI participant %d, run %d'%(participant,run))
            ax2 = plt.subplot(2,1,2)
            plt.xlim(ax1.get_xlim()) # share the x range of the ratings plot
            pmd.PlotMmiRPEs(dfTrial,dfRating)
            plt.tight_layout()
            # Save figure
            outFig = '%s/Mmi-%s-%s-run%d.png'%(outFigDir,batchName,participant,run)
            print('Saving figure as %s...'%outFig)
            plt.savefig(outFig)
print('=== Done! ===')
# %% Append across lists
# Stack the per-participant, per-run tables into one table of each kind
dfTrial = pd.concat(trialList)
dfRating = pd.concat(ratingList)
dfLifeHappy = pd.concat(lifeHappyList)

# %% Save?
# File-name suffix for each table kind
suffixes = {'trial': 'Trial',
            'ratings': 'Ratings',
            'lifeHappy': 'LifeHappy'}
files = {item: '%s/Mmi-%s_%s.csv'%(procDataDir,batchName,suffix)
         for item,suffix in suffixes.items()}
tables = {'trial': dfTrial,
          'ratings': dfRating,
          'lifeHappy': dfLifeHappy}
for item,suffix in suffixes.items():
    if os.path.exists(files[item]) and not overwrite:
        # Keep the file on disk and use its contents for the per-run split below
        print('==== Reading %s from %s...'%(item,files[item]))
        tables[item] = pd.read_csv(files[item],index_col=0)
    else:
        # Save csv file
        print('==== Saving %s as %s...'%(item,files[item]))
        tables[item].to_csv(files[item])

    # save csv file for single run
    for run in range(1,maxNRuns+1):
        dfRun = tables[item].loc[tables[item].run==run,:]
        # Build the name directly rather than with str.replace, which would also
        # rewrite a directory component that happened to contain batchName
        outFile = '%s/Mmi-%s-run%d_%s.csv'%(procDataDir,batchName,run,suffix)
        print('==== Saving %s run %d as %s...'%(item,run,outFile))
        # Honor the overwrite flag here too, as the survey files do
        if os.path.exists(outFile) and not overwrite:
            print('Not overwriting existing file.')
        else:
            dfRun.to_csv(outFile)
# %% Calculate age with fraction-of-years, calculated from date of task and DOB.
# Adapted from GetRecoveryNimhAgeWithFractions.py.
# Read the run-1 ratings written above; use batchName so the name always
# matches the file that was just saved
dfAll = pd.read_csv('%s/Mmi-%s-run1_Ratings.csv'%(procDataDir,batchName))
#dfSurvey = pd.read_csv('%s/Mmi-RecoveryNimh_Survey.csv'%procDataDir)
if ('DOB' in dfSurvey) and (dfSurvey['DOB'].dtype=='object'): # try to calculate age from DOB
    print('==== Adjusting survey age to be in fraction of years at date of first task =====')
    dfSurvey['age'] = np.nan
    for iLine in range(dfSurvey.shape[0]):
        isThis = dfAll['participant']==dfSurvey.loc[iLine,'participant'] # find this participant in ratings table
        if np.any(isThis):
            # the date on the participant's first run-1 ratings row is the reference date
            taskDate = dfAll.loc[isThis,'date'].values[0]
            dfSurvey.loc[iLine,'age'] = AgeFromDateStrings(taskDate, dfSurvey.loc[iLine,'DOB']) # add to survey table
else:
    print('DOB not present in demographics file - using reported age instead.')

# Make sure all ages have been calculated (participants with no run-1 ratings stay NaN)
assert not np.any(np.isnan(dfSurvey['age'].values)), 'age is missing for at least one participant'

# Save file for all runs together
outFile = '%s/Mmi-%s_Survey.csv'%(procDataDir,batchName)
print('==== Saving Survey results as %s...'%(outFile))
if os.path.exists(outFile) and not overwrite:
    print('Not overwriting existing file.')
else:
    dfSurvey.to_csv(outFile)

# Save files for each run separately (the same survey table is written for every run)
for run in range(1,maxNRuns+1):
    outFile = '%s/Mmi-%s-run%d_Survey.csv'%(procDataDir,batchName,run)
    print('==== Saving Survey run %d as %s...'%(run,outFile))
    if os.path.exists(outFile) and not overwrite:
        print('Not overwriting existing file.')
    else:
        dfSurvey.to_csv(outFile)
print('Done!')