-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy patharchive_to_folder_reaper.py
executable file
·412 lines (334 loc) · 15.3 KB
/
archive_to_folder_reaper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
#TODO: Better documentation
"""
SciTran NIMS and SDM archive to folder reaper conversion utility.

This code converts a NIMS v1.0 or an SDM tar file (including the DICOMs) into a folder
tree that the SciTran folder_reaper can ingest.

Users can optionally pass in group, project, and subject arguments. If these
arguments are not passed in, group and project are gleaned from the folder structure
within the NIMS archive and the subject is read from a DICOM header when possible.

Example usage:
    archive_to_folder_reaper.py /path/to/sometar.tar /path/to/place/the/output
"""
import os
import sys
import time
import glob
import gzip
import dicom
import shutil
import zipfile
import tarfile
import logging
import argparse
import subprocess
from distutils.dir_util import copy_tree
# Configure the root logger once at import time: timestamped lines with the
# level name padded/truncated to exactly eight characters.
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(levelname)8.8s %(message)s',
)

# Every function in this module logs through the root logger.
log = logging.getLogger()
def extract_subject_id(root_path, args):
    '''
    Work out a subject ID for the session folder root_path.

    The first file found under the first directory whose path ends with 'dicom' is read
    with dicom.read_file and the ID is taken from the first usable source, in order:

      1. the header field named by args.subject_id_field (when given and non-empty);
      2. PatientID - the text before any '@'; skipped when it equals args.group or
         still contains a '/';
      3. PatientName - '^' replaced by spaces and one leading space dropped; skipped
         when it still contains a space (it could be a proper name);
      4. 'ex' + StudyID.

    If there is no dicom file, or none of the above yields a value, the ID is
    'sub_' + the session folder name with spaces turned into '_' and ':' removed.

    Returns the subject ID with every os.sep replaced by '_'.
    '''
    log.info('No subjectID provided - Attempting to extract subject ID from dicom...')
    subject_id = None
    (file_paths, dir_paths, _, _, _) = get_paths(root_path)
    dicom_dirs = [d for d in dir_paths if d.endswith('dicom')]

    # A dicom folder may exist but be empty; only read a header when a file is there.
    dicom_files = []
    if dicom_dirs:
        dicom_files = [f for f in file_paths if f.startswith(dicom_dirs[0])]

    if dicom_files:
        dcm = dicom.read_file(dicom_files[0])

        # Use the field that was passed in
        if args.subject_id_field and dcm.get(args.subject_id_field):
            subject_id = dcm.get(args.subject_id_field)
        else:
            # dcm.get() returns None for a missing tag instead of raising AttributeError.
            patient_id = dcm.get('PatientID')
            patient_name = dcm.get('PatientName')
            study_id = dcm.get('StudyID')

            # Use the PatientID field (some users put the group in this field)
            if patient_id and patient_id != args.group:
                subject_id = patient_id.split('@')[0]
                if '/' in subject_id:  # group/ is still in the name: no subjectID was entered
                    subject_id = None

            # Use the PatientName field
            if not subject_id and patient_name:
                subject_id = patient_name.replace('^', ' ')
                if subject_id.startswith(' '):  # Drop a single leading space
                    subject_id = subject_id[1:]
                if subject_id.find(' ') > 0:
                    # FIXME: This could be a proper name (remove it)
                    subject_id = None

            # Use StudyID
            if not subject_id and study_id:
                subject_id = 'ex' + study_id

    # No dicoms, or no usable header field - use the session folder name
    if not subject_id or subject_id.isspace():
        log.info('... subjectID could not be extraced from DICOM header - setting subjectID from session label')
        subject_id = 'sub_' + os.path.basename(root_path).replace(' ', '_').replace(':', '')

    # Sanitize subject_id so it can be used as a single folder name
    subject_id = subject_id.replace(os.sep, '_')
    log.info('... subjectID set to %s' % subject_id)

    return subject_id
def screen_save_montage(dirs):
    '''
    Build a montage for every screen-save folder and file it with its acquisition.

    For each directory in dirs whose path ends with 'Screen_Save', the png files in it
    are combined with the ImageMagick 'montage' command. The montage is then moved into
    the first directory in dirs whose basename starts with '<N>_', where N is the last
    two characters of the first '_'-separated token of the screen-save folder name
    (one leading zero dropped), and the screen-save folder is deleted.

    A folder with no png files, a failing montage command, or no matching acquisition
    directory is logged as a warning and left in place.
    '''
    screen_saves = [d for d in dirs if d.endswith('Screen_Save')]
    if not screen_saves:
        log.info('... 0 screen saves found')
        return

    log.info('... %s screen saves to process' % str(len(screen_saves)))
    for ss_dir in screen_saves:
        # Sorted so the montage layout and its name do not depend on directory order.
        pngs = sorted(glob.glob(ss_dir + '/*.png'))
        if not pngs:
            log.warning('... no png files in %s - skipping' % ss_dir)
            continue

        # Drop the last five characters of the first png (presumably '<digit>.png').
        montage_name = pngs[0][:-5] + 'montage.png'

        # Build the montage (requires imagemagick). An argument list avoids the shell,
        # so file names need no quoting.
        status = subprocess.call(['montage', '-geometry', '+4+4'] + pngs + [montage_name])
        if status != 0:
            log.warning('... montage failed (exit status %s) for %s - skipping' % (status, ss_dir))
            continue

        # This is the acquisition number we need
        ss_num = os.path.basename(ss_dir).split('_')[0][-2:]
        if ss_num.startswith('0'):  # Drop the leading zero if it's the first char
            ss_num = ss_num[1:]

        # Reset per folder so a previous folder's target is never reused.
        target_dir = None
        for candidate in dirs:
            if os.path.basename(candidate).startswith(ss_num + '_'):
                target_dir = candidate
                break
        if target_dir is None:
            log.warning('... no acquisition directory found for %s - leaving it in place' % ss_dir)
            continue

        # Move the montage to the acquisition directory and remove the screen save folder
        shutil.move(montage_name, target_dir)
        shutil.rmtree(ss_dir)
    log.info('... done')
def extract_dicoms(files):
    '''
    Unpack every dicom archive in files and rename the unpacked folder to 'dicom'.

    Archives are the paths ending with '_dicoms.tgz' or '_dicom.tgz'. Each one is
    extracted next to itself, the digest/metadata/'._*' files directly inside the
    unpacked folder are deleted, the folder is renamed to 'dicom' and the archive
    is removed.
    '''
    archives = [path for path in files if path.endswith(('_dicoms.tgz', '_dicom.tgz'))]
    if not archives:
        log.info('... 0 dicom archives found')
        return

    log.info('... %s dicom archives to extract' % str(len(archives)))
    junk_patterns = ('._*', 'DIGEST.txt', 'METADATA.json', 'metadata.json', 'digest.txt')
    for archive in archives:
        extracted = untar(archive, os.path.dirname(archive))

        # Remove the bookkeeping files that should not end up in the dicom folder
        for pattern in junk_patterns:
            for junk in glob.glob(extracted + '/' + pattern):
                os.remove(junk)

        log.debug('renaming %s' % extracted)
        # BUG:TODO: This can be an issue if there is more than one dicom archive per acquisition (see ex9407 on SNI-SDM)
        dicom_dir = os.path.join(os.path.dirname(extracted), 'dicom')
        os.rename(extracted, dicom_dir)

        os.remove(archive)
        log.debug('Removing %s' % archive)
    log.info('... done')
def extract_pfiles(files):
    '''
    Repackage every P-file archive in files as '<unpacked folder>.7.zip'.

    Archives are the paths ending with '_pfile.tgz'. Each one is extracted next to
    itself, the digest/metadata/'._*' files directly inside the unpacked folder are
    deleted, every '.7' file except '*_refscan.7' is gzipped in place (the
    uncompressed copy is removed), the folder is zipped with zipdir and finally the
    folder and the original archive are removed.
    '''
    pfile_arcs = [f for f in files if f.endswith('_pfile.tgz')]
    if not pfile_arcs:
        log.info('... 0 pfile archives found')
        return

    log.info('... %s pfile archives to extract' % str(len(pfile_arcs)))
    del_files = ['._*', 'DIGEST.txt', 'METADATA.json', 'metadata.json', 'digest.txt']
    for f in pfile_arcs:
        utd = untar(f, os.path.dirname(f))

        # Remove the files that should not be in the archive
        for df in del_files:
            for d in glob.glob(utd + '/' + df):
                os.remove(d)

        # List the folder only after the cleanup, so a deleted '._*.7' resource-fork
        # file is never picked up for gzipping.
        (_files, _, _, _, _) = get_paths(utd)

        # Gzip the P-file prior to adding to the archive. The paths from get_paths
        # already include utd, so the .gz is written right next to the P-file.
        for p in _files:
            if p.endswith('.7') and not p.endswith('_refscan.7'):
                create_gzip(p, p + '.gz')
                os.remove(p)

        # Zip the utd directory
        zipdir(utd, utd + '.7.zip', os.path.basename(utd))

        # Clean up the directory and files
        shutil.rmtree(utd)
        os.remove(f)
    log.info('... done')
def extract_and_zip_physio(files):
    '''
    Repackage every physio archive in files as '<unpacked folder>.gephysio.zip'.

    Archives are the paths ending with '_physio.tgz'. Each one is extracted next to
    itself, zipped with create_archive, the zip is renamed to end in '.gephysio.zip',
    and the unpacked folder and the original archive are removed.
    '''
    archives = [path for path in files if path.endswith('_physio.tgz')]
    if not archives:
        log.info('... 0 physio archives found')
        return

    log.info('... %s physio archives to extract' % str(len(archives)))
    for archive in archives:
        extracted = untar(archive, os.path.dirname(archive))
        create_archive(extracted, extracted)
        os.rename(extracted + '.zip', extracted + '.gephysio.zip')

        # Only the zip is kept
        shutil.rmtree(extracted)
        os.remove(archive)
    log.info('... done')
def extract_physio(files):
    '''
    Decompress every physio regressor file in files.

    Regressor files are the paths ending with '.csv.gz'. Each one is gunzipped to the
    same path without the '.gz' suffix and the compressed file is removed.
    '''
    physio_arcs = [f for f in files if f.endswith('.csv.gz')]
    if not physio_arcs:
        log.info('... 0 physio regressors found')
        return

    log.info('... %s physio regressor file(s) to extract' % str(len(physio_arcs)))
    for f in physio_arcs:
        # Copy bytes to bytes: gzip yields binary data, so the output is opened in
        # binary mode, and streaming avoids loading the whole file into memory.
        with gzip.open(f, 'rb') as in_file, open(f[:-3], 'wb') as out_file:
            shutil.copyfileobj(in_file, out_file)
        os.remove(f)
def prune_tree(files, args):
    '''
    Delete the files whose path ends with one of the suffixes in args.prune.

    files is the list of candidate paths; only paths in this list are considered.
    Paths that no longer exist as regular files are skipped. Nothing happens when
    args.prune is empty or None.
    '''
    if not args.prune:
        return

    log.debug('Pruning files that end with %s ' % args.prune)
    suffixes = tuple(args.prune)
    for f in files:
        # Test the candidate file itself (not the suffix) before removing it.
        if f.endswith(suffixes) and os.path.isfile(f):
            log.debug('Pruning file %s ' % f)
            os.remove(f)
###### UTILITIES ######
def shellquote(s):
    '''
    Return s wrapped in single quotes for use on a shell command line.

    Each embedded single quote is rewritten as '\'' (close quote, escaped quote,
    reopen quote).
    '''
    escaped = s.replace("'", "'\\''")
    return "'%s'" % escaped
def get_paths(root_path):
    '''
    Walk root_path and list what is in it.

    Returns a tuple (file_paths, dir_paths, groups, projects, sessions). The first two
    are every file and every directory found by os.walk, as paths joined onto
    root_path. When more than three directories were found, the depth of the second
    directory listed is taken as the group level; groups, projects and sessions are
    the directories at that depth, one level deeper and two levels deeper. Otherwise
    those three lists are empty.
    '''
    file_paths = []
    dir_paths = []
    for parent, subdirs, filenames in os.walk(root_path):
        file_paths.extend(os.path.join(parent, fname) for fname in filenames)
        dir_paths.extend(os.path.join(parent, dname) for dname in subdirs)

    groups, projects, sessions = [], [], []
    if len(dir_paths) > 3:
        # Depth is measured as the number of os.sep-separated path components.
        depths = [len(path.split(os.sep)) for path in dir_paths]
        group_depth = depths[1]
        for path, depth in zip(dir_paths, depths):
            if depth == group_depth:
                groups.append(path)
            elif depth == group_depth + 1:
                projects.append(path)
            elif depth == group_depth + 2:
                sessions.append(path)

    return (file_paths, dir_paths, groups, projects, sessions)
def untar(fname, path):
    '''
    Extract the tar archive fname into path and return the folder it unpacked to.

    The returned path is path joined with the directory part of the last member
    name in the archive (a leading './' is ignored). For an empty archive, or when
    the last member sits at the top level, this is path with a trailing separator.

    Raises ValueError, before anything is extracted, if a member name resolves to a
    location outside path (absolute names or '..' components).
    '''
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(fname) as tar:
        names = tar.getnames()

        # Refuse member names that would be written outside the destination.
        extract_root = os.path.join(os.path.realpath(path), '')
        for name in names:
            target = os.path.realpath(os.path.join(path, name))
            if not os.path.join(target, '').startswith(extract_root):
                raise ValueError('Refusing to extract %s: member %s is outside %s' % (fname, name, path))

        tar.extractall(path)

    untar_dir = os.path.dirname(names[-1]) if names else ''
    # Archives built with 'tar -c ./dir' carry a './' prefix on every name.
    while untar_dir.startswith('./'):
        untar_dir = untar_dir[2:]
    if untar_dir == '.':
        untar_dir = ''

    return os.path.join(path, untar_dir)
def create_archive(content_dir, arcname):
    '''
    Zip the entries directly inside content_dir into '<content_dir>.zip'.

    The zip contains one directory entry named after the basename of arcname and
    one entry per item listed in content_dir, stored as '<basename>/<item>' in
    sorted order. Sub-directories are added as entries only; their contents are
    not walked.

    Returns the path of the zip file.
    '''
    zipfilepath = content_dir + '.zip'
    # Use the same top-level name for the directory entry and for the files, so a
    # full path passed as arcname does not leak into the archive.
    arcbase = os.path.basename(arcname)
    with zipfile.ZipFile(zipfilepath, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zf:
        zf.write(content_dir, arcbase)
        for fn in sorted(os.listdir(content_dir)):
            zf.write(os.path.join(content_dir, fn), os.path.join(arcbase, fn))
    return zipfilepath
def zipdir(dirpath, zipname=None, arcbase=None):
    '''
    Zip every file found under dirpath into a single flat folder inside the zip.

    dirpath -- directory to walk (sub-directories included).
    zipname -- path of the zip to write; defaults to dirpath + '.zip'.
    arcbase -- folder name used inside the zip; defaults to the basename of dirpath.

    Every file is stored as '<arcbase>/<file name>'; the sub-directory a file came
    from is not kept in its archive name.

    Returns the path of the zip file.
    '''
    if not arcbase:
        arcbase = os.path.basename(dirpath)
    if not zipname:
        zipname = dirpath + '.zip'
    # The context manager closes the zip even if a write fails part-way.
    with zipfile.ZipFile(zipname, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zipf:
        for root, _dirs, files in os.walk(dirpath):
            for _file in files:
                zipf.write(os.path.join(root, _file), os.path.join(arcbase, _file))
    return zipname
def create_gzip(in_file, gz_file=None):
    '''
    Gzip in_file into gz_file and return the path of the compressed file.

    in_file -- path of the file to compress; it is left in place.
    gz_file -- path to write; when omitted or empty, in_file + '.gz' is used.
    '''
    if not gz_file:
        gz_file = in_file + '.gz'
    with open(in_file, 'rb') as f_in, gzip.open(gz_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    return gz_file
######################################################################################
def _process_session(session, output_path, args, get_subject_id):
    '''
    Repackage one session folder in place, then move it to
    output_path/<args.group>/<args.project>/<args.subject>.

    When get_subject_id is true, args.subject is set from extract_subject_id first.
    '''
    (file_paths, dir_paths, _, _, _) = get_paths(session)

    ## 5. Remove the 'qa.json' files (UI can't read them)
    for f in file_paths:
        if f.endswith('qa.json'):
            os.remove(f)

    ## 6. Rename: qa file to [...].qa.png and montage to .montage.zip
    # Only the suffix is rewritten, so a folder name containing the same text
    # is never touched.
    renames = (('_qa.png', '.qa.png'), ('_montage.zip', '.montage.zip'))
    for f in file_paths:
        for old_suffix, new_suffix in renames:
            if f.endswith(old_suffix):
                os.rename(f, f[:-len(old_suffix)] + new_suffix)

    ## 7. Extract physio regressors (_physio_regressors.csv.gz)
    log.info('Extracting physio regressors...')
    extract_physio(file_paths)

    ## 8. Move _physio.tgz files to gephsio and zip (removing digest .txt)
    log.info('Extracting and repackaging physio data...')
    extract_and_zip_physio(file_paths)

    ## 9. Extract pfiles and remove the digest and metadata files and gzip the file
    log.info('Extracting and repackaging pfiles...')
    extract_pfiles(file_paths)

    ## 10. Extract all the dicom archives and rename to 'dicom'
    log.info('Extracting dicom archives...')
    extract_dicoms(file_paths)

    ## 11. Create a montage of the screen saves and move them to the correct acquisition
    log.info('Processing screen saves...')
    screen_save_montage(dir_paths)

    ## 12. Get the subjectID (if not passed in)
    if get_subject_id:
        args.subject = extract_subject_id(session, args)

    ## 13. Prune tree to remove unwanted files
    # List the session again so files created by the steps above can be pruned too.
    (final_files, _, _, _, _) = get_paths(session)
    prune_tree(final_files, args)

    ## 14. Make the folder hierarchy and move the session to its right place
    log.info('Organizing final file structure...')
    target_path = os.path.join(output_path, args.group, args.project, args.subject)
    log.debug('Target Path: %s' % target_path)
    log.debug(session)
    if not os.path.isdir(target_path):
        os.makedirs(target_path)
    shutil.move(session, target_path)  # Move the session to the target


def main():
    '''
    Command-line entry point.

    Extracts the tar file into a new time-stamped folder under output_path, repackages
    every session found in it, moves each session to <group>/<project>/<subject> inside
    that folder, removes the extracted database root folder and prints the output folder.

    Group and project default to the folder names in the archive; the subject defaults
    to the value returned by extract_subject_id for each session.
    '''
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('tar_file', help='NIMS Tar File', type=str)
    arg_parser.add_argument('output_path', help='path for untar data', type=str)
    arg_parser.add_argument('-g', '--group', help='Group', type=str, default='')
    arg_parser.add_argument('-p', '--project', help='project', type=str, default='')
    arg_parser.add_argument('-s', '--subject', help='Subject Code', type=str, default='')
    arg_parser.add_argument('-i', '--subject_id_field', help='Look here for the subject id', type=str, default='')
    arg_parser.add_argument('-l', '--loglevel', default='info', help='log level [default=info]')
    arg_parser.add_argument('--prune', action='append', help='Files that end with this string will be pruned from final tree.')
    args = arg_parser.parse_args()

    log.setLevel(getattr(logging, args.loglevel.upper()))
    log.debug(args)

    # Output directory will be named with the current date and time
    output_path = os.path.join(os.path.realpath(args.output_path), time.strftime('%Y-%m-%d_%H_%M_%S'))

    ## 1. Make the output directory where the tar file will be extracted
    os.mkdir(output_path)

    ## 2. Extract the nims tar file
    log.info('Extracting %s to %s' % (args.tar_file, output_path))
    untar(args.tar_file, output_path)

    ## 3. Generate file paths and directory paths
    log.info('Extracting path and file info in %s' % output_path)
    (_, dir_paths, group_paths, project_paths, session_paths) = get_paths(output_path)
    if not dir_paths:
        log.error('No folders were extracted from %s' % args.tar_file)
        sys.exit(1)
    db_root_path = dir_paths[0]  # sdm or nims path (removed later)

    ## 4. Handle missing arguments: anything not passed in is taken from the archive
    get_group = not args.group
    get_project = not args.project
    get_subject_id = not args.subject

    # Go through groups/projects/sessions
    for group in group_paths:
        if get_group:
            args.group = os.path.basename(group)
        log.debug(group)
        log.debug(args)

        # Match on the parent folder: a prefix test would also pick up the children
        # of a sibling whose name merely starts with this one (e.g. 'proj1'/'proj10').
        projects = [p for p in project_paths if os.path.dirname(p) == group]
        for project in projects:
            if get_project:
                args.project = os.path.basename(project)
            log.debug(project)
            log.debug(args)

            sessions = [s for s in session_paths if os.path.dirname(s) == project]
            for session in sessions:
                log.debug(session)
                log.debug(project)
                log.debug(args)
                _process_session(session, output_path, args, get_subject_id)

    ## 15. Remove the db root folder
    shutil.rmtree(db_root_path)

    log.info("Done.")
    # print() with a single argument behaves the same on Python 2 and Python 3.
    print(output_path)


if __name__ == '__main__':
    main()