diff --git a/chord_metadata_service/package.cfg b/chord_metadata_service/package.cfg
index e198f718d..0c1156e2b 100644
--- a/chord_metadata_service/package.cfg
+++ b/chord_metadata_service/package.cfg
@@ -1,4 +1,4 @@
 [package]
 name = katsu
-version = 1.1.1
+version = 1.2.0
 authors = Ksenia Zaytseva, David Lougheed, Simon Chénard, Romain Grégoire
diff --git a/chord_metadata_service/patients/api_views.py b/chord_metadata_service/patients/api_views.py
index fc11c0a7d..d45a1ad90 100644
--- a/chord_metadata_service/patients/api_views.py
+++ b/chord_metadata_service/patients/api_views.py
@@ -5,7 +5,7 @@
 from .models import Individual
 from .filters import IndividualFilter
 from chord_metadata_service.phenopackets.api_views import BIOSAMPLE_PREFETCH, PHENOPACKET_PREFETCH
-from chord_metadata_service.restapi.api_renderers import FHIRRenderer, PhenopacketsRenderer
+from chord_metadata_service.restapi.api_renderers import FHIRRenderer, PhenopacketsRenderer, IndividualCSVRenderer
 from chord_metadata_service.restapi.pagination import LargeResultsSetPagination
 
 
@@ -24,7 +24,8 @@ class IndividualViewSet(viewsets.ModelViewSet):
     ).order_by("id")
     serializer_class = IndividualSerializer
     pagination_class = LargeResultsSetPagination
-    renderer_classes = (*api_settings.DEFAULT_RENDERER_CLASSES, FHIRRenderer, PhenopacketsRenderer)
+    renderer_classes = (*api_settings.DEFAULT_RENDERER_CLASSES, FHIRRenderer,
+                        PhenopacketsRenderer, IndividualCSVRenderer)
     filter_backends = [DjangoFilterBackend, filters.SearchFilter, filters.OrderingFilter]
     filter_class = IndividualFilter
     ordering_fields = ["id"]
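With IndividualCSVRenderer registered alongside the default renderers, DRF's content negotiation serves CSV whenever a client asks for it. A minimal client-side sketch follows; the base URL is taken from the sample ingest.conf.json below, and the /api/individuals route and the standard DRF `format` query parameter are assumptions about the deployment:

    import requests

    # Assumptions: katsu is reachable at this base URL and IndividualViewSet
    # is routed at /api/individuals; authentication is omitted.
    BASE_URL = "http://example.com:4000"

    # DRF picks the renderer whose `format` attribute matches the query
    # parameter, so format=csv selects IndividualCSVRenderer
    # (an Accept: text/csv header works as well).
    response = requests.get(BASE_URL + "/api/individuals", params={"format": "csv"})
    response.raise_for_status()

    # The renderer sets Content-Disposition: attachment; filename="export.csv".
    with open("export.csv", "wb") as f:
        f.write(response.content)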
diff --git a/chord_metadata_service/restapi/api_renderers.py b/chord_metadata_service/restapi/api_renderers.py
index 97a7efe97..80769f096 100644
--- a/chord_metadata_service/restapi/api_renderers.py
+++ b/chord_metadata_service/restapi/api_renderers.py
@@ -1,8 +1,11 @@
 import json
+import csv
 from djangorestframework_camel_case.render import CamelCaseJSONRenderer
+from rest_framework.renderers import JSONRenderer
 from rdflib import Graph
 from rdflib.plugin import register, Serializer
-from rest_framework.renderers import JSONRenderer
+from django.http import HttpResponse
+
 from uuid import UUID
 from .jsonld_utils import dataset_to_jsonld
 
@@ -79,3 +82,60 @@ def render(self, data, media_type=None, renderer_context=None):
         g = Graph().parse(data=json.dumps(ld_context_data, cls=UUIDEncoder), format='json-ld')
         rdf_data = g.serialize(format='pretty-xml')
         return rdf_data
+
+
+class IndividualCSVRenderer(JSONRenderer):
+    media_type = 'text/csv'
+    format = 'csv'
+
+    def render(self, data, media_type=None, renderer_context=None):
+        # guard against missing or empty 'results' to avoid indexing an empty list below
+        if data.get('results'):
+            individuals = []
+            for individual in data['results']:
+                ind_obj = {
+                    'id': individual['id'],
+                    'sex': individual.get('sex', None),
+                    'date_of_birth': individual.get('date_of_birth', None),
+                    'taxonomy': None,
+                    'karyotypic_sex': individual['karyotypic_sex'],
+                    'race': individual.get('race', None),
+                    'ethnicity': individual.get('ethnicity', None),
+                    'age': None,
+                    'diseases': None,
+                    'created': individual['created'],
+                    'updated': individual['updated']
+                }
+                if 'taxonomy' in individual:
+                    ind_obj['taxonomy'] = individual['taxonomy'].get('label', None)
+                if 'age' in individual:
+                    if 'age' in individual['age']:
+                        ind_obj['age'] = individual['age'].get('age', None)
+                    elif 'start' in individual['age'] and 'end' in individual['age']:
+                        ind_obj['age'] = (
+                            individual['age']['start'].get('age', "NA")
+                            + ' - '
+                            + individual['age']['end'].get('age', "NA")
+                        )
+                    else:
+                        ind_obj['age'] = None
+                if 'phenopackets' in individual:
+                    all_diseases = []
+                    for phenopacket in individual['phenopackets']:
+                        if 'diseases' in phenopacket:
+                            # use ; because some disease terms might contain , in their label
+                            single_phenopacket_diseases = '; '.join(
+                                [d['term']['label'] for d in phenopacket['diseases']]
+                            )
+                            all_diseases.append(single_phenopacket_diseases)
+                    if all_diseases:
+                        ind_obj['diseases'] = '; '.join(all_diseases)
+                individuals.append(ind_obj)
+            columns = individuals[0].keys()
+            # remove underscore and capitalize column names
+            headers = {key: key.replace('_', ' ').capitalize() for key in columns}
+            response = HttpResponse(content_type='text/csv')
+            response['Content-Disposition'] = 'attachment; filename="export.csv"'
+            dict_writer = csv.DictWriter(response, fieldnames=columns)
+            dict_writer.writerow(headers)
+            dict_writer.writerows(individuals)
+            return response
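Given the ind_obj keys above, the header row written by DictWriter strips underscores and capitalizes only the first letter of each column name. A hypothetical two-line export for illustration (the data row is invented; the "; " separator in Diseases comes from the join above):

    Id,Sex,Date of birth,Taxonomy,Karyotypic sex,Race,Ethnicity,Age,Diseases,Created,Updated
    ind:0001,FEMALE,1960-01-01,Homo sapiens,XX,,,P60Y,Carcinoma of lung; Essential hypertension,2021-01-01T00:00:00Z,2021-01-01T00:00:00Z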
+ """ + dataset_request = { + "project": project_uuid, + "title": dataset_title, + "data_use": { + "consent_code": { + "primary_category": {"code": "GRU"}, + "secondary_categories": [{"code": "GSO"}], + }, + "data_use_requirements": [{"code": "COL"}, {"code": "PUB"}], + }, + } + + r2 = requests.post(katsu_server_url + "/api/datasets", json=dataset_request) + + if r2.status_code == 201: + dataset_uuid = r2.json()["identifier"] + print( + "Dataset {} with uuid {} has been created!".format( + dataset_title, dataset_uuid + ) + ) + return dataset_uuid + elif r2.status_code == 400: + print( + "A dataset of title '{}' exists, please choose a different title, or delete this dataset.".format( + dataset_title + ) + ) + sys.exit() + else: + print(r2.json()) + sys.exit() + + +def create_table(katsu_server_url, dataset_uuid, table_name): + """ + Create a new katsu table. + + Return the uuid of the newly-created table. + """ + + table_request = { + "name": table_name, + "data_type": "phenopacket", + "dataset": dataset_uuid + } + + r3 = requests.post(katsu_server_url + "/tables", json=table_request) + + if r3.status_code == 200 or r3.status_code == 201: + table_id = r3.json()["id"] + print("Table {} with uuid {} has been created!".format(table_name, table_id)) + return table_id + else: + print("Something went wrong...") + sys.exit() + + +def ingest_phenopackets(katsu_server_url, table_id, phenopackets_json_location): + """ + Ingest the phenopackets json + """ + + private_ingest_request = { + "table_id": table_id, + "workflow_id": "phenopackets_json", + "workflow_params": {"phenopackets_json.json_document": phenopackets_json_location}, + "workflow_outputs": {"json_document": phenopackets_json_location}, + } + + print("Ingesting phenopackets, this may take a while...") + + r4 = requests.post( + katsu_server_url + "/private/ingest", json=private_ingest_request + ) + + if r4.status_code == 200 or r4.status_code == 201 or r4.status_code == 204: + print( + "Phenopackets have been ingested from source at {}".format( + phenopackets_json_location + ) + ) + elif r4.status_code == 400: + print(r4.text) + sys.exit() + else: + print( + "Something else went wrong when ingesting phenopackets, possibly due to duplications." + ) + print( + "Double check phenopackets_json_location config, or remove duplicated individuals from the database and try again." + ) + sys.exit() + + +def main(): + config = load_config() + + print("Initializing...") + print( + "Warning: this script is only designed to handle the initial data ingestion of katsu service." 
diff --git a/scripts/remove_from_db.py b/scripts/remove_from_db.py
new file mode 100644
index 000000000..f35d47630
--- /dev/null
+++ b/scripts/remove_from_db.py
@@ -0,0 +1,43 @@
+import sys
+import subprocess
+
+"""
+A script that automates deleting the data
+in the tables passed as arguments.
+
+Usage (under active katsu virtualenv):
+python remove_from_db.py Project Individual Dataset
+"""
+
+
+def main():
+    if len(sys.argv) <= 1:
+        print("At least one table name must be passed as an argument")
+        return
+
+    script = (
+        "from chord_metadata_service.chord.models import *;"
+        "from chord_metadata_service.experiments.models import *;"
+        "from chord_metadata_service.mcode.models import *;"
+        "from chord_metadata_service.patients.models import *;"
+        "from chord_metadata_service.phenopackets.models import *;"
+        "from chord_metadata_service.resources.models import *;"
+        "from chord_metadata_service.restapi.models import *;"
+    )
+
+    for table in sys.argv[1:]:
+        response = subprocess.run(
+            'python ../manage.py shell --command="{}"'.format(
+                script + "{}.objects.all().delete();".format(table)
+            ),
+            shell=True,
+            stderr=subprocess.PIPE,
+        )
+        if response.returncode:
+            print('"{}" does not seem to be a valid table'.format(table))
+        else:
+            print("Deleted data from table {}".format(table))
+
+
+if __name__ == "__main__":
+    main()
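The script shells out to manage.py with the wildcard imports above, so each argument must be the class name of a model exported by one of those modules (e.g. Project, Dataset, Individual, Phenopacket). Example invocations, run from the scripts/ directory as the relative ../manage.py path assumes:

    # Remove all ingested phenopackets and individuals:
    python remove_from_db.py Phenopacket Individual

    # Remove projects; related datasets and tables are cascade-deleted only
    # where the foreign keys use on_delete=CASCADE (an assumption about the
    # katsu models, not something this script enforces):
    python remove_from_db.py Project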