diff --git a/bin/oneoffs/cas_statistic.py b/bin/cas_statistic.py old mode 100644 new mode 100755 similarity index 98% rename from bin/oneoffs/cas_statistic.py rename to bin/cas_statistic.py index 38219184f..38c8a810e --- a/bin/oneoffs/cas_statistic.py +++ b/bin/cas_statistic.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + import os import pymongo from collections import Counter diff --git a/bin/dicom_doctype.py b/bin/dicom_doctype.py deleted file mode 100755 index a0d32b6d2..000000000 --- a/bin/dicom_doctype.py +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python - -import ast -import copy -import dateutil.parser -import dicom -import elasticsearch -import json -import logging - -from api import config -from api.web import encoder - -es = config.es -db = config.db - -# SHHH -logging.getLogger('urllib3').setLevel(logging.WARNING) - -DE_INDEX = 'data_explorer' - -ANALYSIS = { - 'analyzer' : { - 'str_search_analyzer' : { - 'tokenizer' : 'keyword', - 'filter' : ['lowercase'] - }, - - 'str_index_analyzer' : { - 'tokenizer' : 'keyword', - 'filter' : ['lowercase', 'substring'] - } - }, - 'filter' : { - 'substring': { - 'type': 'nGram', - 'min_gram': 2, - 'max_gram': 50, - 'token_chars': [] - } - } - } - -DYNAMIC_TEMPLATES = [ - { - '_id': { - 'match': '_id', - 'match_mapping_type' : 'string', - 'mapping': { - 'type': 'keyword', - 'index': 'not_analyzed' - } - } - }, - { - 'long_fields' : { - 'match_mapping_type' : 'long', - 'mapping' : { - 'ignore_malformed': True - } - } - }, - { - 'integer_fields' : { - 'match_mapping_type' : 'integer', - 'mapping' : { - 'ignore_malformed': True - } - } - }, - { - 'double_fields' : { - 'match_mapping_type' : 'double', - 'mapping' : { - 'ignore_malformed': True - } - } - }, - { - 'float_fields' : { - 'match_mapping_type' : 'float', - 'mapping' : { - 'ignore_malformed': True - } - } - }, - { - 'short_fields' : { - 'match_mapping_type' : 'short', - 'mapping' : { - 'ignore_malformed': True - } - } - }, - { - 'byte_fields' : { - 'match_mapping_type' : 'byte', - 'mapping' : { - 'ignore_malformed': True - } - } - }, - { - 'hash': { - 'match': 'hash', - 'match_mapping_type' : 'string', - 'mapping': { - 'type': 'text', - 'index': 'not_analyzed' - } - } - }, - { - 'string_fields' : { - 'match': '*', - 'match_mapping_type' : 'string', - 'mapping' : { - 'type': 'text', - 'analyzer': 'str_search_analyzer', - "fields": { - "raw": { - "type": "keyword", - "index": "not_analyzed", - "ignore_above": 256 - } - } - } - } - } -] - -def datetime(str_datetime): - pass - -def age(str_age): - pass - -BLACKLIST_KEYS = ['template', 'roles', 'permissions', 'analyses', 'files', 'collections'] - - -SKIPPED = [] -SKIPPED2ELECTRICBOOGALOO = ['PixelSpacing', 'ImageOrientationPatient', 'PatientAge', 'ImagePositionPatient'] - -# TODO: Choose integer/long and float/double where appropriate -VR_TYPES = { - 'AE': ['string', str], - 'AS': ['long', int], - 'AT': ['long', int], - 'CS': ['string', str], - 'DA': ['date', datetime], - 'DS': ['float', float], - 'DT': ['date', datetime], - 'FD': ['float', float], - 'FL': ['float', float], - 'IS': ['long', int], - 'LO': ['string', str], - 'LT': ['string', str], - 'NONE': None, - 'OB': None, - 'OB or OW': None, - 'OF': None, - 'OW': None, - 'PN': ['string', str], - 'SH': ['string', str], - 'SL': ['long', int], - 'SQ': None, - 'SS': ['long', int], - 'ST': ['string', str], - 'TM': ['time', datetime], - 'UI': ['string', str], - 'UL': ['long', int], - 'US': ['long', int], - 'US or OW': ['long', int], - 'US or SS': ['long', int], - 'US or SS or OW': ['long', int], - 'UT': ['string', str] - } - -def create_mappings(): - public_dict = dicom._dicom_dict.DicomDictionary - field_mappings = {} - for k,v in public_dict.iteritems(): - vr_type = v[0] - field_name = v[4] - vr_mapping = VR_TYPES.get(vr_type) - if vr_mapping: - field_type = vr_mapping[0] - if field_type == 'string' and vr_type not in ['UT', 'LT', 'ST']: - field_mappings[field_name+'_term'] = {'type': 'string', 'index': 'not_analyzed'} - field_mappings[field_name] = {'type': field_type} - if field_type == 'time': - # Actually store as date, format as time: - field_mappings[field_name] = {'type': 'date', 'format': 'time'} - else: - pass - #logging.warn('Skipping field {} of VR type {}'.format(field_name, vr_type)) - - return field_mappings - -def cast_date(dcm_date): - """ - Cast DICOM date string (YYYYMMDD) into ElasticSearch pre-defined strict_date format (yyyy-MM-dd) - """ - return dcm_date[:4] + '-' + dcm_date[4:6] + '-' + dcm_date[6:] - -def cast_time(dcm_time): - """ - Cast DICOM time string (HHMMSS.FRAC) - into ElasticSearch pre-defined strict_time format (HH:mm:ss.SSSZZ) - """ - # TODO: this fxn needs to be tested on real data - if len(dcm_time) < 6: - return None - hours = dcm_time[:2] - minutes = dcm_time[2:4] - seconds = dcm_time[4:6] - if len(dcm_time) > 7: - fraction_str = dcm_time[7:] - fraction = float(dcm_time[7:])/10^(len(fraction_str)) - fraction = int(fraction*1000) - else: - fraction = 0 - return '%s:%s:%s.%03d00' % (hours, minutes, seconds, fraction) - -def cast_datetime(dcm_datetime): - """ - Cast DICOM datetime string (YYYYMMDDHHMMSS.FFFFFF) - into ElasticSearch pre-defined basic_date_time format (yyyyMMdd'T'HHmmss.SSSZ) - """ - # TODO: this fxn needs to be tested on real data - year = dcm_datetime[:4] - month = dcm_datetime[4:6] - day = dcm_datetime[6:8] - if len(dcm_datetime) > 8: - hours = dcm_datetime[8:10] - minutes = dcm_datetime[10:12] - seconds = dcm_datetime[12:14] - else: - hours = '00' - minutes = '00' - seconds = '00' - if len(dcm_datetime) > 15: - fraction_str = dcm_datetime[15:] - fraction = float(dcm_datetime[15:])/10^(len(fraction_str)) - fraction = int(fraction*1000) - else: - fraction = 0 - return '%s%s%sT%s%s%s.%03d0' % (year, month, day, hours, minutes, seconds, fraction) - -def cast_age(dcm_age): - """ Cast DICOM age string into seconds""" - # TODO: this fxn needs to be tested on real data - unit = dcm_age[-1] - if unit not in ['D', 'W', 'M', 'Y']: - return None - multipliers = dict(D=60*60*24, - W=60*60*24*7, - M=60*60*24*30, - Y=60*60*24*365) - value = int(dcm_age[:-1]) - seconds = multipliers[unit]*value - return seconds - -def value_is_array(value): - if type(value) != unicode: - return False - if len(value) < 2: - return False - if value[0] == '[' and value[-1] == ']': - return True - return False - -def cast_array_from_string(string): - array = None - try: - array = ast.literal_eval(string) - except: - config.log.warn('Tried to cast string {} as array, failed.'.format(string)) - - if array: - new_array = [] - for element in array: - try: - element = int(element) - except: - try: - element = float(element) - except: - pass - new_array.append(element) - return new_array - else: - return string - -def remove_blacklisted_keys(obj): - for key in BLACKLIST_KEYS: - obj.pop(key, None) - -def handle_files(parent, parent_type, files, dicom_mappings, permissions, doc): - doc['container_type'] = 'file' - for f in files: - # f.pop('info', None) - doc['file'] = f - # doc = { - # 'file': f, - # 'permissions': permissions - # } - # if f.get('type', '') == 'dicom' and f.get('info'): - # dicom_data = f.pop('info') - # term_fields = {} - # modified_data = {} - # for skipped in SKIPPED: - # dicom_data.pop(skipped, None) - # for k,v in dicom_data.iteritems(): - - # try: - - # # Arrays are saved as strings in - # if value_is_array(v): - # config.log.debug('calling array for {} and value {}'.format(k, v)) - # v = cast_array_from_string(v) - # if 'datetime' in k.lower(): - # config.log.debug('called datetime for {} and value {}'.format(k, v)) - # v = cast_datetime(str(v)) - # elif 'date' in k.lower(): - # config.log.debug('called date for {} and value {}'.format(k, v)) - # v = cast_date(str(v)) - # # elif 'time' in k.lower(): - # # # config.log.debug('called time for {} and value {}'.format(k, v)) - # # # v = cast_time(str(v)) - # elif 'Age' in k: - # config.log.debug('called age for {} and value {}'.format(k, v)) - # v = cast_age(str(v)) - # except: - # pass - - # term_field_name = k+'_term' - # if term_field_name in dicom_mappings and type(v) in [unicode, str]: - # term_fields[k+'_term'] = str(v) - # modified_data[k] = v - - # modified_data.update(term_fields) - # doc['dicom_header'] = modified_data - - generated_id = str(parent['_id']) + '_' + f['name'] - - doc['parent'] = { - '_id': parent['_id'], - 'type': parent_type - } - - doc_s = json.dumps(doc, default=encoder.custom_json_serializer) - try: - # es.index(index=DE_INDEX, id=generated_id, parent=str(parent['_id']), doc_type='file', body=doc) - es.index(index=DE_INDEX, id=generated_id, doc_type='flywheel', body=doc_s) - except: - return - - -if __name__ == '__main__': - - if es.indices.exists(DE_INDEX): - print 'Removing existing data explorer index...' - res = es.indices.delete(index=DE_INDEX) - print 'response: {}'.format(res) - - # mappings = create_mappings() - - request = { - 'settings': { - "index.mapping.total_fields.limit": 4000, - 'number_of_shards': 1, - 'number_of_replicas': 0, - 'analysis' : ANALYSIS - }, - 'mappings': { - '_default_' : { - '_all' : {'enabled' : True}, - 'dynamic_templates': DYNAMIC_TEMPLATES - }, - 'flywheel': {} - } - } - - print 'creating {} index ...'.format(DE_INDEX) - res = es.indices.create(index=DE_INDEX, body=request) - print 'response: {}'.format(res) - - # mappings = es.indices.get_mapping(index=DE_INDEX, doc_type='flywheel') - - # dicom_mappings = mappings[DE_INDEX]['mappings']['file']['properties']['dicom_header']['properties'] - dicom_mappings = None - - permissions = [] - - groups = db.groups.find({}) - print 'STARTING THE GROUPS' - print '' - print '' - print '' - count = 1 - group_count_total = groups.count() - for g in groups: - print 'Loading group {} ({} of {})'.format(g['name'], count, group_count_total) - count += 1 - - remove_blacklisted_keys(g) - - projects = db.projects.find({'group': g['_id']}) - for p in projects: - - files = p.pop('files', []) - # Set permissions for documents - permissions = p.pop('permissions', []) - remove_blacklisted_keys(p) - - doc = { - 'project': p, - 'group': g, - 'permissions': permissions, - 'container_type': 'project' - - } - - doc_s = json.dumps(doc, default=encoder.custom_json_serializer) - es.index(index=DE_INDEX, id=str(p['_id']), doc_type='flywheel', body=doc_s) - - handle_files(p, 'project', files, dicom_mappings, permissions, doc) - - - sessions = db.sessions.find({'project': p['_id']}) - for s in sessions: - subject = s.pop('subject', {}) - - analyses = s.pop('analyses', []) - files = s.pop('files', []) - remove_blacklisted_keys(s) - - doc = { - 'project': p, - 'group': g, - 'session': s, - 'subject': subject, - 'permissions': permissions, - 'container_type': 'session' - - } - - doc_s = json.dumps(doc, default=encoder.custom_json_serializer) - es.index(index=DE_INDEX, id=str(s['_id']), doc_type='flywheel', body=doc_s) - - handle_files(s, 'session', files, dicom_mappings, permissions, doc) - - for an in analyses: - files = an.pop('files', []) - doc = { - 'analysis': an, - 'session': s, - 'subject': subject, - 'project': p, - 'group': g, - 'permissions': permissions, - 'container_type': 'analysis' - - } - - doc_s = json.dumps(doc, default=encoder.custom_json_serializer) - es.index(index=DE_INDEX, id=str(an['_id']), doc_type='flywheel', body=doc_s) - - files = [f for f in files if f.get('output')] - - handle_files(an, 'analysis', files, dicom_mappings, permissions, doc) - - - - acquisitions = db.acquisitions.find({'session': s['_id']}) - for a in acquisitions: - a.pop('info', None) - files = a.pop('files', []) - remove_blacklisted_keys(a) - - doc = { - 'acquisition': a, - 'session': s, - 'subject': subject, - 'project': p, - 'group': g, - 'permissions': permissions, - 'container_type': 'acquisition' - - } - - doc_s = json.dumps(doc, default=encoder.custom_json_serializer) - es.index(index=DE_INDEX, id=str(a['_id']), doc_type='flywheel', body=doc_s) - - - handle_files(a, 'acquisition', files, dicom_mappings, permissions, doc) - - - diff --git a/bin/integrity_check.py b/bin/integrity_check.py index c8ee53a2b..6a4c3c19c 100755 --- a/bin/integrity_check.py +++ b/bin/integrity_check.py @@ -92,8 +92,3 @@ def session_length(): except Exception as e: logging.exception('Main method failed...') sys.exit(1) - - - - - diff --git a/bin/load_users_drone_secret.py b/bin/load_users_drone_secret.py deleted file mode 100755 index 471a73669..000000000 --- a/bin/load_users_drone_secret.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python - -"""This script helps bootstrap users and data""" - -import os -import sys -import json -import logging -import argparse -import datetime -import requests - -logging.basicConfig( - format='%(asctime)s %(levelname)8.8s %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.DEBUG, -) -log = logging.getLogger('scitran.bootstrap') - -logging.getLogger('requests').setLevel(logging.WARNING) # silence Requests library - - -def _upsert_user(request_session, api_url, user_doc): - """ - Insert user, or update if insert fails due to user already existing. - - Returns: - requests.Response: API response. - - Args: - request_session (requests.Session): Session to use for the request. - api_url (str): Base url for the API eg. 'https://localhost:8443/api' - user_doc (dict): Valid user doc defined in user input schema. - """ - new_user_resp = request_session.post(api_url + '/users', json=user_doc) - if new_user_resp.status_code != 409: - return new_user_resp - - # Already exists, update instead - return request_session.put(api_url + '/users/' + user_doc['_id'], json=user_doc) - - -def _upsert_permission(request_session, api_url, permission_doc, group_id): - """ - Insert group permission, or update if insert fails due to group permission already existing. - - Returns: - requests.Response: API response. - - Args: - request_session (requests.Session): Session to use for the request. - api_url -- (str): Base url for the API eg. 'https://localhost:8443/api' - permission_doc -- (dict) Valid permission doc defined in permission input schema. - """ - base_permission_url = "{0}/groups/{1}/permissions".format(api_url, group_id) - new_permission_resp = request_session.post(base_permission_url , json=permission_doc) - if new_permission_resp.status_code != 409: - return new_permission_resp - - # Already exists, update instead - - full_permission_url = "{0}/{1}".format(base_permission_url, permission_doc['_id']) - return request_session.put(full_permission_url, json=permission_doc) - -def users(filepath, api_url, http_headers, insecure): - """ - Upserts the users/groups/permissions defined in filepath parameter. - - Raises: - requests.HTTPError: Upsert failed. - """ - now = datetime.datetime.utcnow() - with open(filepath) as fd: - input_data = json.load(fd) - with requests.Session() as rs: - log.info('bootstrapping users...') - rs.verify = not insecure - rs.headers = http_headers - for u in input_data.get('users', []): - log.info(' {0}'.format(u['_id'])) - r = _upsert_user(request_session=rs, api_url=api_url, user_doc=u) - r.raise_for_status() - - log.info('bootstrapping groups...') - r = rs.get(api_url + '/config') - r.raise_for_status() - for g in input_data.get('groups', []): - permissions = g.pop('permissions') - log.info(' {0}'.format(g['_id'])) - r = rs.post(api_url + '/groups' , json=g) - r.raise_for_status() - for permission in permissions: - r = _upsert_permission(request_session=rs, api_url=api_url, permission_doc=permission, group_id=g['_id']) - r.raise_for_status() - - log.info('bootstrapping projects...') - for p in input_data.get('projects', []): - r = rs.post(api_url + '/projects?inherit=true' , json=p) - r.raise_for_status() - - project_id = r.json()['_id'] - project_name = p['label'] - - for stanza in input_data.get('gear_rules', []): - - desired_projects = stanza.get('projects', []) - rule = stanza.get('rule', None) - - if project_name in desired_projects and rule: - log.info('Adding rule...') - r = rs.post(api_url + '/projects/' + project_id + '/rules', json=rule) - r.raise_for_status() - - log.info('bootstrapping complete') - - -ap = argparse.ArgumentParser() -ap.description = 'Bootstrap SciTran users and groups' -ap.add_argument('url', help='API URL') -ap.add_argument('json', help='JSON file containing users and groups') -ap.add_argument('--insecure', action='store_true', help='do not verify SSL connections') -ap.add_argument('--secret', help='shared API secret') -args = ap.parse_args() - -if args.insecure: - requests.packages.urllib3.disable_warnings() - -http_headers = { - 'X-SciTran-Method': 'bootstrapper', - 'X-SciTran-Name': 'Bootstrapper', -} -if args.secret: - http_headers['X-SciTran-Auth'] = args.secret -# TODO: extend this to support oauth tokens - -try: - users(args.json, args.url, http_headers, args.insecure) -except requests.HTTPError as ex: - log.error(ex) - log.error("request_body={0}".format(ex.response.request.body)) - sys.exit(1) -except Exception as ex: - log.error('Unexpected error:') - log.error(ex) - sys.exit(1) diff --git a/bin/log_csv.py b/bin/log_export.py old mode 100644 new mode 100755 similarity index 99% rename from bin/log_csv.py rename to bin/log_export.py index 8ee287666..56217a608 --- a/bin/log_csv.py +++ b/bin/log_export.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + # This implementation as of July 19 2017 has these resource utilizations of the mongodb container: # - 2 million entries: 1.50 Gb # - 3 million entries: 2.05 Gb @@ -57,7 +59,7 @@ def download_large_csv(params): csv_file.flush() params['end_date'] = end_date - + print "Encountered unicode errors and skipped {} entries".format(unicode_err_count) csv_file.close() diff --git a/bin/oneoffs/load_external_data.py b/bin/oneoffs/load_external_data.py deleted file mode 100755 index 50635d991..000000000 --- a/bin/oneoffs/load_external_data.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python - -import bson -import copy -import datetime -import dateutil.parser -import json - -from api import config - -## DEFAULTS ## - -USER_ID = "meganhenning@flywheel.io" -SAFE_FILE_HASH = "v0-sha384-a8d0d1bd9368e5385f31d3582db07f9bc257537d5e1f207d36a91fdd3d2f188fff56616c0874bb3535c37fdf761a446c" -PROJECT_ID = "5a26e049c6fa4a00161e4a1a" -GROUP_ID = 'scitran' - -# Some day maybe this can use the SDK/API calls to get the proper test data -# For now, paste it in - -SESSIONS = [] - -ACQUISITIONS = [] - -def handle_permissions(obj): - obj['permissions'] = [{ - "access": "admin", - "_id": USER_ID - }] - -def handle_dates(obj): - if obj.get('timestamp'): - obj['timestamp'] = dateutil.parser.parse(obj['timestamp']) - if obj.get('created'): - obj['created'] = dateutil.parser.parse(obj['created']) - if obj.get('modified'): - obj['modified'] = dateutil.parser.parse(obj['modified']) - -def handle_file(f): - handle_dates(f) - f.pop('info_exists', None) - f.pop('join_origin', None) - f['hash'] = SAFE_FILE_HASH - - -for i, s in enumerate(SESSIONS): - print "Processing session {} of {} sessions".format(i+1, len(SESSIONS)) - - s.pop('join-origin', None) - - s['_id'] = bson.ObjectId(s['_id']) - s['project'] = bson.ObjectId(str(PROJECT_ID)) - s['group'] = GROUP_ID - handle_dates(s) - handle_permissions(s) - - for f in s.get('files', []): - handle_file(f) - - - config.db.sessions.delete_many({'_id': s['_id']}) - config.db.sessions.insert(s) - -for i, a in enumerate(ACQUISITIONS): - print "Processing acquisition {} of {} acquisitions".format(i+1, len(ACQUISITIONS)) - - a['_id'] = bson.ObjectId(a['_id']) - a['session'] = bson.ObjectId(a['session']) - - a.pop('join-origin', None) - - handle_dates(a) - handle_permissions(a) - - for f in a.get('files', []): - handle_file(f) - - config.db.acquisitions.delete_many({'_id': a['_id']}) - config.db.acquisitions.insert(a) diff --git a/bin/oneoffs/timezone_shift.py b/bin/timezone_shift.py similarity index 100% rename from bin/oneoffs/timezone_shift.py rename to bin/timezone_shift.py