Merge pull request #175 from bento-platform/develop
Version 1.2.0
zxenia authored Nov 30, 2020
2 parents 33251f3 + d16f1d8 commit ea60d73
Showing 6 changed files with 324 additions and 4 deletions.
2 changes: 1 addition & 1 deletion chord_metadata_service/package.cfg
@@ -1,4 +1,4 @@
[package]
name = katsu
-version = 1.1.1
+version = 1.2.0
authors = Ksenia Zaytseva, David Lougheed, Simon Chénard, Romain Grégoire
5 changes: 3 additions & 2 deletions chord_metadata_service/patients/api_views.py
@@ -5,7 +5,7 @@
from .models import Individual
from .filters import IndividualFilter
from chord_metadata_service.phenopackets.api_views import BIOSAMPLE_PREFETCH, PHENOPACKET_PREFETCH
-from chord_metadata_service.restapi.api_renderers import FHIRRenderer, PhenopacketsRenderer
+from chord_metadata_service.restapi.api_renderers import FHIRRenderer, PhenopacketsRenderer, IndividualCSVRenderer
from chord_metadata_service.restapi.pagination import LargeResultsSetPagination


@@ -24,7 +24,8 @@ class IndividualViewSet(viewsets.ModelViewSet):
).order_by("id")
serializer_class = IndividualSerializer
pagination_class = LargeResultsSetPagination
-renderer_classes = (*api_settings.DEFAULT_RENDERER_CLASSES, FHIRRenderer, PhenopacketsRenderer)
+renderer_classes = (*api_settings.DEFAULT_RENDERER_CLASSES, FHIRRenderer,
+                    PhenopacketsRenderer, IndividualCSVRenderer)
filter_backends = [DjangoFilterBackend, filters.SearchFilter, filters.OrderingFilter]
filter_class = IndividualFilter
ordering_fields = ["id"]
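With IndividualCSVRenderer registered on the viewset, clients can request the CSV export through DRF's standard content negotiation. A minimal sketch of a client call, assuming a local katsu instance and the /api/individuals route (the host, port, and output filename are assumptions, not part of this diff):

import requests

# Hypothetical local katsu instance; adjust host and port as needed.
KATSU_URL = "http://localhost:8000"

# DRF selects the renderer from the ?format= query parameter (or the Accept header).
resp = requests.get(KATSU_URL + "/api/individuals", params={"format": "csv"})
resp.raise_for_status()

# Save the rendered CSV export to a local file.
with open("individuals.csv", "wb") as f:
    f.write(resp.content)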
62 changes: 61 additions & 1 deletion chord_metadata_service/restapi/api_renderers.py
@@ -1,8 +1,11 @@
import json
+import csv
from djangorestframework_camel_case.render import CamelCaseJSONRenderer
-from rest_framework.renderers import JSONRenderer
from rdflib import Graph
from rdflib.plugin import register, Serializer
+from rest_framework.renderers import JSONRenderer
+from django.http import HttpResponse

from uuid import UUID

from .jsonld_utils import dataset_to_jsonld
@@ -79,3 +82,60 @@ def render(self, data, media_type=None, renderer_context=None):
g = Graph().parse(data=json.dumps(ld_context_data, cls=UUIDEncoder), format='json-ld')
rdf_data = g.serialize(format='pretty-xml')
return rdf_data


class IndividualCSVRenderer(JSONRenderer):
media_type = 'text/csv'
format = 'csv'

def render(self, data, media_type=None, renderer_context=None):
if 'results' in data:
individuals = []
for individual in data['results']:
ind_obj = {
'id': individual['id'],
'sex': individual.get('sex', None),
'date_of_birth': individual.get('date_of_birth', None),
'taxonomy': None,
'karyotypic_sex': individual['karyotypic_sex'],
'race': individual.get('race', None),
'ethnicity': individual.get('ethnicity', None),
'age': None,
'diseases': None,
'created': individual['created'],
'updated': individual['updated']
}
if 'taxonomy' in individual:
ind_obj['taxonomy'] = individual['taxonomy'].get('label', None)
if 'age' in individual:
if 'age' in individual['age']:
ind_obj['age'] = individual['age'].get('age', None)
                    elif 'start' in individual['age'] and 'end' in individual['age']:
ind_obj['age'] = str(
individual['age']['start'].get('age', "NA")
+ ' - ' +
individual['age']['end'].get('age', "NA")
)
else:
ind_obj['age'] = None
if 'phenopackets' in individual:
all_diseases = []
for phenopacket in individual['phenopackets']:
if 'diseases' in phenopacket:
# use ; because some disease terms might contain , in their label
single_phenopacket_diseases = '; '.join(
[d['term']['label'] for d in phenopacket['diseases']]
)
all_diseases.append(single_phenopacket_diseases)
if all_diseases:
ind_obj['diseases'] = '; '.join(all_diseases)
individuals.append(ind_obj)
            if not individuals:
                # No records to export; return an empty CSV response.
                return HttpResponse(content_type='text/csv')
            # Derive the CSV columns from the first record (all records share the same keys).
            columns = individuals[0].keys()
            # remove underscore and capitalize column names for the header row
            headers = {key: key.replace('_', ' ').capitalize() for key in columns}
response = HttpResponse(content_type='text/csv')
            response['Content-Disposition'] = 'attachment; filename="export.csv"'
dict_writer = csv.DictWriter(response, fieldnames=columns)
dict_writer.writerow(headers)
dict_writer.writerows(individuals)
return response
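The age handling above covers the two encodings Phenopackets allows for an individual's age: an exact Age carrying an ISO 8601 duration string, and an AgeRange with nested start/end Age objects. Illustrative inputs for the two branches (the duration values are made up):

# Exact age: Age element with an ISO 8601 duration -> ind_obj['age'] == 'P25Y3M'
age_exact = {"age": "P25Y3M"}

# Age range: AgeRange element with start/end -> ind_obj['age'] == 'P20Y - P30Y'
age_range = {"start": {"age": "P20Y"}, "end": {"age": "P30Y"}}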
7 changes: 7 additions & 0 deletions scripts/ingest.conf.json
@@ -0,0 +1,7 @@
{
"project_title": "project_1",
"dataset_title": "dataset_1",
"table_name": "table_1",
"katsu_server_url": "http://example.com:4000",
"phenopackets_json_location": "/home/user/v2/CanCOGen_synthetic_data/cancogen_phenopackets.json"
}
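A small pre-flight check can catch configuration mistakes before any requests are made. A sketch (not part of the shipped scripts) that verifies the config parses and carries the keys ingest.py expects:

import json

REQUIRED_KEYS = {"project_title", "dataset_title", "table_name",
                 "katsu_server_url", "phenopackets_json_location"}

with open("ingest.conf.json") as f:
    conf = json.load(f)

# Fail early with a clear message if any expected key is absent.
missing = REQUIRED_KEYS - conf.keys()
if missing:
    raise SystemExit("ingest.conf.json is missing keys: {}".format(sorted(missing)))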
208 changes: 208 additions & 0 deletions scripts/ingest.py
@@ -0,0 +1,208 @@
import sys
import json
import requests

"""
An ingest script that automates the initial data ingest for the katsu service.
Make sure you have a config file named ingest.conf.json in the same directory as this script.
A sample ingest.conf.json is given below.
{
"project_title": "project_1",
"dataset_title": "dataset_1",
"table_name": "table_1",
"katsu_server_url": "http://example.com:4000",
"phenopackets_json_location": "/home/user/v2/CanCOGen_synthetic_data/cancogen_phenopackets.json"
}
Usage (under active katsu virtualenv):
python ingest.py
"""


def load_config():
"""
Load and return the config from ingest.conf.json
"""

try:
with open("ingest.conf.json") as f:
return json.load(f)
except FileNotFoundError:
print(
"The config file ingest.conf.json is missing. You must have it in the same dir as this script."
)
sys.exit()


def create_project(katsu_server_url, project_title):
"""
Create a new Katsu project.
Return the uuid of the newly-created project.
"""

project_request = {
"title": project_title,
"description": "A new project."
}

try:
r = requests.post(katsu_server_url + "/api/projects", json=project_request)
except requests.exceptions.ConnectionError:
print(
"Connection to the API server {} cannot be established.".format(
katsu_server_url
)
)
sys.exit()

if r.status_code == 201:
project_uuid = r.json()["identifier"]
print(
"Project {} with uuid {} has been created!".format(
project_title, project_uuid
)
)
return project_uuid
elif r.status_code == 400:
print(
"A project of title '{}' exists, please choose a different title, or delete this project.".format(
project_title
)
)
sys.exit()
else:
print(r.json())
sys.exit()
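For reference, the success branch above reads only the identifier field of the 201 response; a response body might look roughly like the following (every field other than identifier is an assumption):

# Sketch of a 201 response from POST /api/projects:
# {
#     "identifier": "<project uuid>",
#     "title": "project_1",
#     "description": "A new project.",
#     ...
# }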


def create_dataset(katsu_server_url, project_uuid, dataset_title):
"""
Create a new dataset.
    Return the uuid of the newly-created dataset.
"""
dataset_request = {
"project": project_uuid,
"title": dataset_title,
"data_use": {
"consent_code": {
"primary_category": {"code": "GRU"},
"secondary_categories": [{"code": "GSO"}],
},
"data_use_requirements": [{"code": "COL"}, {"code": "PUB"}],
},
}

r2 = requests.post(katsu_server_url + "/api/datasets", json=dataset_request)

if r2.status_code == 201:
dataset_uuid = r2.json()["identifier"]
print(
"Dataset {} with uuid {} has been created!".format(
dataset_title, dataset_uuid
)
)
return dataset_uuid
elif r2.status_code == 400:
print(
"A dataset of title '{}' exists, please choose a different title, or delete this dataset.".format(
dataset_title
)
)
sys.exit()
else:
print(r2.json())
sys.exit()


def create_table(katsu_server_url, dataset_uuid, table_name):
"""
Create a new katsu table.
Return the uuid of the newly-created table.
"""

table_request = {
"name": table_name,
"data_type": "phenopacket",
"dataset": dataset_uuid
}

r3 = requests.post(katsu_server_url + "/tables", json=table_request)

    if r3.status_code in (200, 201):
table_id = r3.json()["id"]
print("Table {} with uuid {} has been created!".format(table_name, table_id))
return table_id
else:
print("Something went wrong...")
sys.exit()


def ingest_phenopackets(katsu_server_url, table_id, phenopackets_json_location):
"""
    Ingest the phenopackets JSON document.
"""

private_ingest_request = {
"table_id": table_id,
"workflow_id": "phenopackets_json",
"workflow_params": {"phenopackets_json.json_document": phenopackets_json_location},
"workflow_outputs": {"json_document": phenopackets_json_location},
}

print("Ingesting phenopackets, this may take a while...")

r4 = requests.post(
katsu_server_url + "/private/ingest", json=private_ingest_request
)

    if r4.status_code in (200, 201, 204):
print(
"Phenopackets have been ingested from source at {}".format(
phenopackets_json_location
)
)
elif r4.status_code == 400:
print(r4.text)
sys.exit()
else:
        print(
            "Something else went wrong when ingesting phenopackets, possibly due to duplicate records."
        )
        print(
            "Double-check the phenopackets_json_location config, or remove duplicated individuals from the database and try again."
        )
sys.exit()


def main():
config = load_config()

print("Initializing...")
print(
"Warning: this script is only designed to handle the initial data ingestion of katsu service."
)

try:
project_title = config["project_title"]
dataset_title = config["dataset_title"]
table_name = config["table_name"]
katsu_server_url = config["katsu_server_url"]
phenopackets_json_location = config["phenopackets_json_location"]
except KeyError as e:
print("Config file corrupted: missing key {}".format(str(e)))
sys.exit()

project_uuid = create_project(katsu_server_url, project_title)
dataset_uuid = create_dataset(katsu_server_url, project_uuid, dataset_title)
table_uuid = create_table(katsu_server_url, dataset_uuid, table_name)
ingest_phenopackets(katsu_server_url, table_uuid, phenopackets_json_location)


if __name__ == "__main__":
main()
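Assembled from the script's own print statements, a successful run prints roughly the following (the uuids are placeholders):

Initializing...
Warning: this script is only designed to handle the initial data ingestion of the katsu service.
Project project_1 with uuid <uuid-1> has been created!
Dataset dataset_1 with uuid <uuid-2> has been created!
Table table_1 with uuid <uuid-3> has been created!
Ingesting phenopackets, this may take a while...
Phenopackets have been ingested from source at /home/user/v2/CanCOGen_synthetic_data/cancogen_phenopackets.json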
44 changes: 44 additions & 0 deletions scripts/remove_from_db.py
@@ -0,0 +1,44 @@
import os
import sys
import subprocess

"""
A script that automates deleting all data in the tables passed as arguments.
Usage (under active katsu virtualenv):
python remove_from_db.py Project Individual Dataset
"""


def main():
if len(sys.argv) <= 1:
print("At least one table name should be passed as an argument")
return

script = (
"from chord_metadata_service.chord.models import *;"
"from chord_metadata_service.experiments.models import *;"
"from chord_metadata_service.mcode.models import *;"
"from chord_metadata_service.patients.models import *;"
"from chord_metadata_service.phenopackets.models import *;"
"from chord_metadata_service.resources.models import *;"
"from chord_metadata_service.restapi.models import *;"
)

for table in sys.argv[1:]:
response = subprocess.run(
'python ../manage.py shell --command="{}"'.format(
script + "{}.objects.all().delete();".format(table)
),
shell=True,
stderr=subprocess.PIPE,
)
if response.returncode:
print('"{}" does not seem to be a valid table'.format(table))
else:
print("Deleted data on table {}".format(table))


if __name__ == "__main__":
main()
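For a single table, the command the script assembles is equivalent to running the following directly (abbreviated here to the one relevant import; the script itself prepends imports for every app's models):

python ../manage.py shell --command="from chord_metadata_service.patients.models import *; Individual.objects.all().delete();"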
