diff --git a/chord_metadata_service/chord/export.py b/chord_metadata_service/chord/export.py
new file mode 100644
index 000000000..78644bca5
--- /dev/null
+++ b/chord_metadata_service/chord/export.py
@@ -0,0 +1,32 @@
+import logging
+from chord_metadata_service.chord.ingest import WORKFLOW_CBIOPORTAL
+from chord_metadata_service.chord.models import Dataset, Project, Table
+from .export_cbio import study_export as export_cbioportal_workflow
+
+logger = logging.getLogger(__name__)
+
+OBJECT_TYPE_PROJECT = "project"
+OBJECT_TYPE_DATASET = "dataset"
+OBJECT_TYPE_TABLE = "table"
+
+EXPORT_OBJECT_TYPE = {
+    OBJECT_TYPE_PROJECT: {
+        "model": Project
+    },
+    OBJECT_TYPE_DATASET: {
+        "model": Dataset
+    },
+    OBJECT_TYPE_TABLE: {
+        "model": Table
+    },
+}
+
+EXPORT_FORMATS = {WORKFLOW_CBIOPORTAL}
+
+EXPORT_FORMAT_FUNCTION_MAP = {
+    WORKFLOW_CBIOPORTAL: export_cbioportal_workflow
+}
+
+EXPORT_FORMAT_OBJECT_TYPE_MAP = {
+    WORKFLOW_CBIOPORTAL: {OBJECT_TYPE_DATASET}
+}
diff --git a/chord_metadata_service/chord/export_cbio.py b/chord_metadata_service/chord/export_cbio.py
new file mode 100644
index 000000000..0413078af
--- /dev/null
+++ b/chord_metadata_service/chord/export_cbio.py
@@ -0,0 +1,286 @@
+import logging
+import csv
+from typing import TextIO, Callable
+from django.db.models import F
+
+from .export_utils import ExportError
+
+from chord_metadata_service.chord.models import Dataset
+from chord_metadata_service.patients.models import Individual
+from chord_metadata_service.phenopackets import models as pm
+
+__all__ = [
+    "study_export",
+]
+
+logger = logging.getLogger(__name__)
+
+# predefined filenames recognized by cBioPortal
+STUDY_FILENAME = "meta_study.txt"
+SAMPLE_DATA_FILENAME = "data_clinical_sample.txt"
+SAMPLE_META_FILENAME = "meta_clinical_sample.txt"
+PATIENT_DATA_FILENAME = "data_clinical_patient.txt"
+PATIENT_META_FILENAME = "meta_clinical_patient.txt"
+
+CBIO_FILES_SET = frozenset({
+    STUDY_FILENAME,
+    SAMPLE_DATA_FILENAME,
+    SAMPLE_META_FILENAME,
+    PATIENT_DATA_FILENAME,
+    PATIENT_META_FILENAME
+})
+
+PATIENT_DATATYPE = 'PATIENT'
+SAMPLE_DATATYPE = 'SAMPLE'
+
+
+def study_export(getPath: Callable[[str], str], dataset_id: str):
+    """Export a given Dataset as a cBioPortal study"""
+    # TODO: a Dataset is a Study (associated with a publication), not a Project!
+    if Dataset.objects.count() == 0:
+        raise ExportError("No Dataset to export")
+    dataset = Dataset.objects.get(identifier=dataset_id)
+    cbio_study_id = str(dataset.identifier)
+
+    # Export study file
+    with open(getPath(STUDY_FILENAME), 'w') as file_study:
+        study_export_meta(dataset, file_study)
+
+    # Export patients.
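+    # Each clinical data file written below is paired with a meta file that
+    # declares its datatype and points at the data filename, as cBioPortal
+    # expects; both go through the same getPath() callable so they land in the
+    # same export directory.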
+    with open(getPath(PATIENT_DATA_FILENAME), 'w') as file_patient:
+        # Note: plural in `phenopackets` is intentional (related_name property in model)
+        indiv = Individual.objects.filter(phenopackets__table__ownership_record__dataset_id=dataset.identifier)
+        individual_export(indiv, file_patient)
+
+    with open(getPath(PATIENT_META_FILENAME), 'w') as file_patient_meta:
+        clinical_meta_export(cbio_study_id, PATIENT_DATATYPE, file_patient_meta)
+
+    # Export samples
+    with open(getPath(SAMPLE_DATA_FILENAME), 'w') as file_sample:
+        sampl = pm.Biosample.objects.filter(phenopacket__table__ownership_record__dataset_id=dataset.identifier)\
+            .annotate(phenopacket_subject_id=F("phenopacket__subject"))
+        sample_export(sampl, file_sample)
+
+    with open(getPath(SAMPLE_META_FILENAME), 'w') as file_sample_meta:
+        clinical_meta_export(cbio_study_id, SAMPLE_DATATYPE, file_sample_meta)
+
+
+def study_export_meta(dataset: Dataset, file_handle: TextIO):
+    """
+    Study metadata file generation
+    """
+    lines = dict()
+    lines['type_of_cancer'] = "mixed"  # TODO: find if this information is available. !IMPORTANT! uses Oncotree codes
+    lines['cancer_study_identifier'] = str(dataset.identifier)
+    lines['name'] = dataset.title
+    lines['description'] = dataset.description
+
+    # optional fields
+    if len(dataset.primary_publications):
+        lines['citation'] = dataset.primary_publications[0]
+    # pmid: unavailable
+    # groups: unused for authentication
+    # add_global_case_list: ?
+    # tags_file: ?
+    # reference_genome: ?
+
+    for field, value in lines.items():
+        file_handle.write(f"{field}: {value}\n")
+
+
+def clinical_meta_export(study_id: str, datatype: str, file_handle: TextIO):
+    """
+    Clinical metadata file generation (samples or patients)
+    """
+    lines = dict()
+    lines['cancer_study_identifier'] = study_id
+    lines['genetic_alteration_type'] = 'CLINICAL'
+    if datatype == SAMPLE_DATATYPE:
+        lines['datatype'] = 'SAMPLE_ATTRIBUTES'
+        lines['data_filename'] = SAMPLE_DATA_FILENAME
+    else:
+        lines['datatype'] = 'PATIENT_ATTRIBUTES'
+        lines['data_filename'] = PATIENT_DATA_FILENAME
+
+    for field, value in lines.items():
+        file_handle.write(f"{field}: {value}\n")
+
+
+def individual_export(results, file_handle: TextIO):
+    """
+    Renders Individuals as a clinical_patient text file suitable for
+    importing by cBioPortal.
+
+    cBioPortal Patients fields specs:
+    ---------------------------------
+    Required:
+    - PATIENT_ID
+    Special columns:
+    - OS_STATUS, OS_MONTHS overall survival. Status can be 1:DECEASED, 0:LIVING
+    - DFS_STATUS, DFS_MONTHS disease free
+    - PATIENT_DISPLAY_NAME
+    - GENDER or SEX
+    - AGE
+    - TUMOR_SITE
+    """
+
+    individuals = []
+    for individual in results:
+        ind_obj = {
+            'id': individual.id,
+            'sex': individual.sex,
+        }
+        individuals.append(ind_obj)
+
+    columns = individuals[0].keys()
+    headers = individual_to_patient_header(columns)
+
+    file_handle.writelines([line + '\n' for line in headers])
+    dict_writer = csv.DictWriter(file_handle, fieldnames=columns, delimiter='\t')
+    dict_writer.writerows(individuals)
+
+
+def sample_export(results, file_handle: TextIO):
+    """
+    Renders Biosamples as a clinical_sample text file suitable for
+    importing by cBioPortal.
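+    One row is written per Biosample. PATIENT_ID is taken from the sample's
+    direct `individual` reference when present, falling back to the owning
+    Phenopacket's subject; samples with neither are skipped.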
+
+    cBioPortal Sample fields specs:
+    ---------------------------------
+    Required:
+    - PATIENT_ID
+    - SAMPLE_ID
+
+    Special columns:
+    - For pan-cancer summary statistics tab:
+        - CANCER_TYPE as an Oncotree code
+        - CANCER_TYPE_DETAILED
+    - SAMPLE_DISPLAY_NAME
+    - SAMPLE_CLASS
+    - METASTATIC_SITE / PRIMARY_SITE overrides the patient-level attribute TUMOR_SITE
+    - SAMPLE_TYPE, TUMOR_TISSUE_SITE, TUMOR_TYPE can have the following values
+      (are displayed with a distinct color in the timelines):
+        - "recurrence", "recurred", "progression"
+        - "metastatic", "metastasis"
+        - "primary" or any other value
+    - KNOWN_MOLECULAR_CLASSIFIER
+    - GLEASON_SCORE (prostate cancer)
+    - HISTOLOGY
+    - TUMOR_STAGE_2009
+    - TUMOR_GRADE
+    - ETS_RAF_SPINK1_STATUS
+    - TMPRSS2_ERG_FUSION_STATUS
+    - ERG_FUSION_ACGH
+    - SERUM_PSA
+    - DRIVER_MUTATIONS
+    """
+
+    samples = []
+    for sample in results:
+
+        # sample.individual may be null: use the Phenopacket model's subject
+        # field instead if available, or skip the sample.
+        subject_id = None
+        if sample.individual is not None:
+            subject_id = sample.individual
+        elif sample.phenopacket_subject_id is not None:
+            subject_id = sample.phenopacket_subject_id
+        else:
+            continue
+
+        sample_obj = {
+            'individual_id': subject_id,
+            'id': sample.id
+        }
+        if sample.sampled_tissue:
+            sample_obj['tissue_label'] = sample.sampled_tissue.get('label', '')
+
+        samples.append(sample_obj)
+
+    columns = samples[0].keys()
+    headers = biosample_to_sample_header(columns)
+
+    file_handle.writelines([line + '\n' for line in headers])
+    dict_writer = csv.DictWriter(file_handle, fieldnames=columns, delimiter='\t')
+    dict_writer.writerows(samples)
+
+
+class CbioportalClinicalHeaderGenerator:
+    """
+    Generates cBioPortal data file headers based on field names from katsu models.
+    """
+
+    fields_mapping = {}
+
+    def __init__(self, mappings={}):
+        self.fields_mapping = mappings
+
+    def make_header(self, fields: list):
+        """
+        Maps a list of field names to a 5-row header
+        suitable for cBioPortal clinical data files.
+        """
+
+        field_properties = []
+        for field in fields:
+            if field in self.fields_mapping:
+                field_properties.append(self.fields_mapping[field])
+            else:
+                fieldname = field.replace('_', ' ').capitalize()
+                prop = (
+                    fieldname,      # display name
+                    fieldname,      # description
+                    'STRING',       # type !!!TODO: TYPE DETECTION!!!
+                    '1',            # priority (note: string here for use in join())
+                    field.upper()   # DB suitable identifier
+                )
+                field_properties.append(prop)
+
+        # Transpose the list of properties tuples per field to tuples of
+        # field properties per property.
+        rows = list(zip(*field_properties))
+
+        # The first 4 rows are considered metadata, prefixed by '#'.
+        # The 5th row (DB field names) is a canonical TSV header.
+        cbio_header = [
+            '#' + '\t'.join(rows[0]),
+            '#' + '\t'.join(rows[1]),
+            '#' + '\t'.join(rows[2]),
+            '#' + '\t'.join(rows[3]),
+            '\t'.join(rows[4])
+        ]
+
+        return cbio_header
+
+
+def individual_to_patient_header(fields: list):
+    """
+    Maps a list of Individual field names to a 5-row header
+    suitable for the cBioPortal data_clinical_patient.txt file.
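+
+    For example, with the mappings below, the fields ['id', 'sex'] produce the
+    following 5 rows (columns are tab-separated; shown here with spaces):
+
+        #Patient Identifier  Sex
+        #Patient Identifier  Sex
+        #STRING  STRING
+        #1  1
+        PATIENT_ID  SEX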
+ """ + + # predefined mappings from Individual keys to cBioPortal field properties + fields_mapping = { + 'id': ('Patient Identifier', 'Patient Identifier', 'STRING', '1', 'PATIENT_ID'), + 'sex': ('Sex', 'Sex', 'STRING', '1', 'SEX'), + } + + cbio_header = CbioportalClinicalHeaderGenerator(fields_mapping) + return cbio_header.make_header(fields) + + +def biosample_to_sample_header(fields: list): + """ + Maps a list of biosamples field names to a 5 rows header + suitable for cBioPortal data_sample_patient.txt file. + """ + + # predefined mappings from Samples keys to cBioPortal field properties + fields_mapping = { + 'individual_id': ('Patient Identifier', 'Patient Identifier', 'STRING', '1', 'PATIENT_ID'), + 'id': ('Sample Identifier', 'Sample Identifier', 'STRING', '1', 'SAMPLE_ID'), + 'tissue_label': ('Sampled Tissue', 'Sampled Tissue', 'STRING', '1', 'TISSUE_LABEL') + } + + cbio_header = CbioportalClinicalHeaderGenerator(fields_mapping) + return cbio_header.make_header(fields) diff --git a/chord_metadata_service/chord/export_utils.py b/chord_metadata_service/chord/export_utils.py new file mode 100644 index 000000000..189da1b51 --- /dev/null +++ b/chord_metadata_service/chord/export_utils.py @@ -0,0 +1,108 @@ +import logging +import os +import shutil +import tarfile +import tempfile + +from django.conf import settings + +__all__ = [ + "ExportError", + "ExportFileContext" +] + +logger = logging.getLogger(__name__) + +EXPORT_DIR = 'export' + + +class ExportError(Exception): + pass + + +class ExportFileContext: + """ + File context manager around a tmp export directory for a given study + identifier. + When no temp directory is provided, this context takes care of removing the + temp directories created with their contents. + + Attributes: + tmp_dir: path to the directory where the exported files are written. + Can be None. In that case the files are written to a tmp directory + on the system and cleaned once the context manager finishes. + project_id: name that will be used to namespace the export directory. + This is also used for the archive filename by the writeTar() method + """ + path = "" + should_del = False + base_path = "" + project_id = '' + + def __init__(self, tmp_dir: str, project_id: str): + tmp_dir = tmp_dir or settings.SERVICE_TEMP + + if tmp_dir is None: + tmp_dir = tempfile.mkdtemp() + self.should_del = True + + if not os.access(tmp_dir, os.W_OK): + raise ExportError(f"Directory does not exist or is not writable: {tmp_dir}") + + self.base_path = tmp_dir + self.project_id = project_id + + try: + self.path = os.path.join(tmp_dir, EXPORT_DIR, project_id) + + # clean pre-existing export dir + isExistant = os.path.exists(self.path) + if isExistant: + shutil.rmtree(self.path) + + original_umask = os.umask(0) # fix issue with non-writable dir due to OS based mask + os.makedirs(self.path, 0o777) + + except OSError: + raise ExportError + + finally: + os.umask(original_umask) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + if self.should_del and self.path: + shutil.rmtree(self.path) + + def get_path(self, filename: str = ''): + """Returns a path within the export directory + + Attributes: + filename: optional filename to use + """ + return os.path.join(self.path, filename) + + def write_tar(self): + """Creates a tar gzipped archive from the export directory content + + Note that the tar file is created inside the context of this ExportFileContext + class. 
+        If no path was provided at the time of the context creation,
+        then the generated tar file will be deleted along with the tmp directory.
+
+        Return: path to the generated tar file
+        """
+        tar_path = os.path.join(self.base_path, EXPORT_DIR, self.project_id + '.tar.gz')
+        with tarfile.open(tar_path, 'w:gz') as tar:
+            output_dir = self.get_path()
+            tar.add(output_dir, filter=reset_tar_info)
+        return tar_path
+
+
+def reset_tar_info(info: tarfile.TarInfo) -> tarfile.TarInfo:
+    info.gid = 0
+    info.uid = 0
+    info.uname = 'root'
+    info.gname = 'root'
+    return info
diff --git a/chord_metadata_service/chord/ingest.py b/chord_metadata_service/chord/ingest.py
index 305e2ee6d..b86a5869d 100644
--- a/chord_metadata_service/chord/ingest.py
+++ b/chord_metadata_service/chord/ingest.py
@@ -57,6 +57,7 @@
 WORKFLOW_MCODE_FHIR_JSON = "mcode_fhir_json"
 WORKFLOW_MCODE_JSON = "mcode_json"
 WORKFLOW_READSET = "readset"
+WORKFLOW_CBIOPORTAL = "cbioportal"
 
 METADATA_WORKFLOWS = {
     "ingestion": {
@@ -240,7 +241,30 @@
             ]
         },
     },
-    "analysis": {}
+    "analysis": {},
+    "export": {
+        WORKFLOW_CBIOPORTAL: {
+            "name": "cBioPortal",
+            "description": "This workflow creates a bundle for cBioPortal ingestion.",
+            "data_type": None,
+            "file": "cbioportal_export.wdl",
+            "inputs": [
+                {
+                    "id": "dataset_id",
+                    "type": "string",
+                    "required": True,
+                }
+            ],
+            "outputs": [
+                {
+                    "id": "cbioportal_archive",
+                    "type": "file",
+                    "map_from_input": "dataset_id",
+                    "value": "{}.tar"
+                }
+            ]
+        }
+    }
 }
 
 WORKFLOWS_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "workflows")
diff --git a/chord_metadata_service/chord/schemas.py b/chord_metadata_service/chord/schemas.py
index cb12e570c..e5539cdbd 100644
--- a/chord_metadata_service/chord/schemas.py
+++ b/chord_metadata_service/chord/schemas.py
@@ -31,3 +31,22 @@
         "additionalProperties": False
     }
 }
+
+EXPORT_SCHEMA = {
+    "description": "Export endpoint",
+    "type": "object",
+    "properties": {
+        "object_type": {
+            "type": "string",
+            "enum": ["project", "dataset", "table"]
+        },
+        "object_id": {"type": "string"},
+        "format": {
+            "type": "string",
+            "enum": ["cbioportal"]
+        },
+        "output_path": {"type": "string"}
+    },
+    "required": ["object_type", "object_id", "format"],
+    "additionalProperties": False
+}
diff --git a/chord_metadata_service/chord/tests/test_api_export.py b/chord_metadata_service/chord/tests/test_api_export.py
new file mode 100644
index 000000000..374eb7b8d
--- /dev/null
+++ b/chord_metadata_service/chord/tests/test_api_export.py
@@ -0,0 +1,98 @@
+import json
+import os
+import shutil
+import tempfile
+import uuid
+
+from django.test import override_settings
+from django.urls import reverse
+from chord_metadata_service.chord.export_cbio import CBIO_FILES_SET
+from chord_metadata_service.chord.export_utils import EXPORT_DIR
+from rest_framework import status
+from rest_framework.test import APITestCase
+
+from ..views_ingest import METADATA_WORKFLOWS
+from chord_metadata_service.chord.data_types import DATA_TYPE_PHENOPACKET, DATA_TYPE_EXPERIMENT
+from chord_metadata_service.chord.models import Project, Dataset, TableOwnership, Table
+# noinspection PyProtectedMember
+from chord_metadata_service.chord.ingest import (
+    WORKFLOW_PHENOPACKETS_JSON,
+    WORKFLOW_INGEST_FUNCTION_MAP,
+)
+
+from .constants import VALID_DATA_USE_1
+from .example_ingest import (
+    EXAMPLE_INGEST_OUTPUTS,
+)
+
+
+def generate_phenopackets_ingest(table_id):
+    return {
+        "table_id": table_id,
+        "workflow_id": "phenopackets_json",
+        "workflow_metadata": METADATA_WORKFLOWS["ingestion"]["phenopackets_json"],
"workflow_outputs": { + "json_document": "" # TODO + }, + "workflow_params": { + "json_document": "" # TODO + } + } + + +class ExportTest(APITestCase): + def setUp(self) -> None: + # Creates a test database and populate with a phenopacket test file + + p = Project.objects.create(title="Project 1", description="") + self.d = Dataset.objects.create(title="Dataset 1", description="Some dataset", data_use=VALID_DATA_USE_1, + project=p) + self.study_id = str(self.d.identifier) + + # TODO: Real service ID + # table for phenopackets + to = TableOwnership.objects.create(table_id=uuid.uuid4(), service_id=uuid.uuid4(), service_artifact="metadata", + dataset=self.d) + self.t = Table.objects.create(ownership_record=to, name="Table 1", data_type=DATA_TYPE_PHENOPACKET) + + # table for experiments metadata + to_exp = TableOwnership.objects.create(table_id=uuid.uuid4(), service_id=uuid.uuid4(), + service_artifact="experiments", dataset=self.d) + self.t_exp = Table.objects.create(ownership_record=to_exp, name="Table 2", data_type=DATA_TYPE_EXPERIMENT) + + self.p = WORKFLOW_INGEST_FUNCTION_MAP[WORKFLOW_PHENOPACKETS_JSON](EXAMPLE_INGEST_OUTPUTS, self.t.identifier) + + @override_settings(AUTH_OVERRIDE=True) # For permissions + def test_export_cbio(self): + # Test with no export body + r = self.client.post(reverse("export"), content_type="application/json") + self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) + + try: + tmp_dir = tempfile.mkdtemp() + + export_payload = { + "format": "cbioportal", + "object_type": "dataset", + "object_id": self.study_id, + } + + # Test with no output_path: expect a tar archive to be returned + r = self.client.post(reverse("export"), data=json.dumps(export_payload), content_type="application/json") + self.assertEquals(r.get('Content-Disposition'), f"attachment; filename=\"{self.study_id}.tar.gz\"") + + # Test with output_path provided: expect files created in this directory + export_payload["output_path"] = tmp_dir + + r = self.client.post(reverse("export"), data=json.dumps(export_payload), content_type="application/json") + self.assertEqual(r.status_code, status.HTTP_204_NO_CONTENT) + # TODO: just write within the directory that has been provided + export_path = os.path.join(tmp_dir, EXPORT_DIR, self.study_id) + self.assertTrue(os.path.exists(export_path)) + for export_file in CBIO_FILES_SET: + self.assertTrue(os.path.exists(os.path.join(export_path, export_file))) + + finally: + shutil.rmtree(tmp_dir) + + # TODO: More diff --git a/chord_metadata_service/chord/tests/test_export_cbio.py b/chord_metadata_service/chord/tests/test_export_cbio.py new file mode 100644 index 000000000..2e2909d58 --- /dev/null +++ b/chord_metadata_service/chord/tests/test_export_cbio.py @@ -0,0 +1,205 @@ +import uuid +import io +from typing import Dict, TextIO +from os import walk, path + +from django.db.models import F +from django.test import TestCase + +from chord_metadata_service.chord.export_cbio import ( + CBIO_FILES_SET, + PATIENT_DATA_FILENAME, + PATIENT_DATATYPE, + SAMPLE_DATA_FILENAME, + SAMPLE_DATATYPE, + clinical_meta_export, + individual_export, + sample_export, + study_export, + study_export_meta +) +from chord_metadata_service.chord.data_types import DATA_TYPE_PHENOPACKET, DATA_TYPE_EXPERIMENT +from chord_metadata_service.chord.export_utils import ExportFileContext +from chord_metadata_service.chord.models import Project, Dataset, TableOwnership, Table +# noinspection PyProtectedMember +from chord_metadata_service.chord.ingest import ( + WORKFLOW_PHENOPACKETS_JSON, + 
+    WORKFLOW_INGEST_FUNCTION_MAP,
+)
+from chord_metadata_service.patients.models import Individual
+from chord_metadata_service.phenopackets import models as PhModel
+
+
+from .constants import VALID_DATA_USE_1
+from .example_ingest import (
+    EXAMPLE_INGEST_PHENOPACKET,
+    EXAMPLE_INGEST_OUTPUTS,
+)
+
+
+class ExportCBioTest(TestCase):
+    def setUp(self) -> None:
+        # Creates a test database and populates it with a phenopacket test file
+
+        p = Project.objects.create(title="Project 1", description="")
+        self.d = Dataset.objects.create(title="Dataset 1", description="Some dataset", data_use=VALID_DATA_USE_1,
+                                        project=p)
+        self.study_id = str(self.d.identifier)
+
+        # TODO: Real service ID
+        # table for phenopackets
+        to = TableOwnership.objects.create(table_id=uuid.uuid4(), service_id=uuid.uuid4(), service_artifact="metadata",
+                                           dataset=self.d)
+        self.t = Table.objects.create(ownership_record=to, name="Table 1", data_type=DATA_TYPE_PHENOPACKET)
+
+        # table for experiments metadata
+        to_exp = TableOwnership.objects.create(table_id=uuid.uuid4(), service_id=uuid.uuid4(),
+                                               service_artifact="experiments", dataset=self.d)
+        self.t_exp = Table.objects.create(ownership_record=to_exp, name="Table 2", data_type=DATA_TYPE_EXPERIMENT)
+
+        self.p = WORKFLOW_INGEST_FUNCTION_MAP[WORKFLOW_PHENOPACKETS_JSON](EXAMPLE_INGEST_OUTPUTS, self.t.identifier)
+
+        # Update the last sample to remove direct reference to any individual.
+        # In that case, Sample and Individual are cross-referenced through the
+        # Phenopacket model.
+        PhModel.Biosample.objects.filter(
+            id=EXAMPLE_INGEST_PHENOPACKET["biosamples"][-1]["id"]
+        ).update(individual=None)
+
+    def stream_to_dict(self, output: TextIO) -> Dict[str, str]:
+        """
+        Utility function. Parses cBioPortal metadata text files (lines of
+        key/value pairs separated by `: `) into a dictionary structure.
+        """
+        output.seek(0)
+        content = dict()
+        for line in output:
+            key, value = line.rstrip().split(': ')
+            content[key] = value
+        return content
+
+    def test_file_creation(self):
+        """
+        Check file creation.
+        File contents are tested subsequently with each file-generating function.
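+        A throwaway ExportFileContext is used, so the generated files are
+        cleaned up automatically when the context exits.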
+ """ + + with ExportFileContext(None, self.study_id) as file_export: + study_export(file_export.get_path, self.study_id) + export_dir = file_export.get_path() + self.assertTrue(path.exists(export_dir)) + for (dirpath, dirnames, filenames) in walk(export_dir): + filesSet = {*filenames} + self.assertTrue(CBIO_FILES_SET.issubset(filesSet)) + break # do not recurse the directory tree + + def test_export_cbio_study_meta(self): + with io.StringIO() as output: + study_export_meta(self.d, output) + content = self.stream_to_dict(output) + + self.assertIn('type_of_cancer', content) + self.assertEqual(content['cancer_study_identifier'], self.study_id) + self.assertEqual(content['name'], self.d.title) + self.assertEqual(content['description'], self.d.description) + + def test_export_cbio_sample_meta(self): + with io.StringIO() as output: + clinical_meta_export(self.study_id, SAMPLE_DATATYPE, output) + content = self.stream_to_dict(output) + + self.assertEqual(content['cancer_study_identifier'], self.study_id) + self.assertEqual(content['genetic_alteration_type'], 'CLINICAL') + self.assertEqual(content['datatype'], 'SAMPLE_ATTRIBUTES') + self.assertEqual(content['data_filename'], SAMPLE_DATA_FILENAME) + + def test_export_cbio_patient_meta(self): + with io.StringIO() as output: + clinical_meta_export(self.study_id, PATIENT_DATATYPE, output) + content = self.stream_to_dict(output) + + self.assertEqual(content['cancer_study_identifier'], self.study_id) + self.assertEqual(content['genetic_alteration_type'], 'CLINICAL') + self.assertEqual(content['datatype'], 'PATIENT_ATTRIBUTES') + self.assertEqual(content['data_filename'], PATIENT_DATA_FILENAME) + + def test_export_cbio_patient_data(self): + indiv = Individual.objects.filter(phenopackets=self.p) + with io.StringIO() as output: + individual_export(indiv, output) + # Check header + output.seek(0) + field_count = None + field_names = [] + for i, line in enumerate(output): + # 4 first header lines begin with `#` + if i < 4: + self.assertEqual(line[0], '#') + continue + + # Following lines are regular TSV formatted lines + pieces = line.rstrip().split('\t') + + # 5th line is a header with predefined field names + if i == 4: + field_count = len(pieces) + field_names = pieces + + # At least PATIENT_ID and SEX + self.assertGreaterEqual(field_count, 2) + self.assertIn('PATIENT_ID', pieces) + continue + + # TSV body. 
+                self.assertEqual(field_count, len(pieces))
+                record = dict(zip(field_names, pieces))
+
+                self.assertEqual(record["PATIENT_ID"], EXAMPLE_INGEST_PHENOPACKET["subject"]["id"])
+                self.assertEqual(record["SEX"], EXAMPLE_INGEST_PHENOPACKET["subject"]["sex"])
+                break
+
+    def test_export_cbio_sample_data(self):
+        samples = PhModel.Biosample.objects.filter(phenopacket=self.p)\
+            .annotate(phenopacket_subject_id=F("phenopacket__subject"))
+        with io.StringIO() as output:
+            sample_export(samples, output)
+            # Check header
+            output.seek(0)
+            field_count = None
+            field_names = []
+            sample_count = 0
+            for i, line in enumerate(output):
+                # First 4 header lines begin with `#`
+                if i < 4:
+                    self.assertEqual(line[0], '#')
+                    continue
+
+                # Following lines are regular TSV-formatted lines
+                pieces = line.rstrip().split('\t')
+
+                # 5th line is a header with predefined field names
+                if i == 4:
+                    field_count = len(pieces)
+                    field_names = pieces
+
+                    # At least PATIENT_ID and SAMPLE_ID
+                    self.assertGreaterEqual(field_count, 2)
+                    self.assertIn('PATIENT_ID', pieces)
+                    self.assertIn('SAMPLE_ID', pieces)
+                    continue
+
+                # TSV body: 1 row per sample
+                self.assertEqual(field_count, len(pieces))
+                record = dict(zip(field_names, pieces))
+
+                self.assertEqual(
+                    record["PATIENT_ID"],
+                    EXAMPLE_INGEST_PHENOPACKET["biosamples"][sample_count]["individual_id"]
+                )
+                self.assertEqual(
+                    record["SAMPLE_ID"],
+                    EXAMPLE_INGEST_PHENOPACKET["biosamples"][sample_count]["id"]
+                )
+                sample_count += 1
+
+            self.assertEqual(sample_count, samples.count())
diff --git a/chord_metadata_service/chord/urls.py b/chord_metadata_service/chord/urls.py
index 64190a594..7bbb25edc 100644
--- a/chord_metadata_service/chord/urls.py
+++ b/chord_metadata_service/chord/urls.py
@@ -1,6 +1,6 @@
 from django.urls import path
 
-from . import views_ingest, views_search
+from . import views_ingest, views_search, views_export
 
 urlpatterns = [
     path('workflows', views_ingest.workflow_list, name="workflows"),
@@ -8,6 +8,7 @@
     path('workflows/<slug:workflow_id>.wdl', views_ingest.workflow_file, name="workflow-file"),
 
     path('private/ingest', views_ingest.ingest, name="ingest"),
+    path('private/export', views_export.export, name="export"),
 
     path('data-types', views_search.data_type_list, name="data-type-list"),
     path('data-types/<str:data_type>', views_search.data_type_detail, name="data-type-detail"),
diff --git a/chord_metadata_service/chord/views_export.py b/chord_metadata_service/chord/views_export.py
new file mode 100644
index 000000000..6ba5b2464
--- /dev/null
+++ b/chord_metadata_service/chord/views_export.py
@@ -0,0 +1,107 @@
+import json
+import logging
+import traceback
+
+from django.http import FileResponse
+
+from jsonschema import Draft7Validator
+from rest_framework.decorators import api_view, permission_classes
+from rest_framework.permissions import AllowAny
+from rest_framework.response import Response
+from rest_framework.request import Request
+
+
+from chord_metadata_service.chord.schemas import EXPORT_SCHEMA
+from bento_lib.responses import errors
+
+from .export import EXPORT_FORMAT_FUNCTION_MAP, EXPORT_FORMAT_OBJECT_TYPE_MAP, EXPORT_FORMATS, EXPORT_OBJECT_TYPE
+from .export_utils import ExportError, ExportFileContext
+
+
+BENTO_EXPORT_SCHEMA_VALIDATOR = Draft7Validator(EXPORT_SCHEMA)
+
+logger = logging.getLogger(__name__)
+
+
+# Mounted on /private/, so will get protected anyway; this allows for access from WES
+# TODO: Ugly and misleading permissions
+@api_view(["POST"])
+@permission_classes([AllowAny])
+def export(request: Request):
+    """Export data from Katsu
+
+    Exports the requested data object (e.g. a Dataset or a Project) in the given
+    format.
+    Note that the generated files will either be written locally if a path is
+    provided, or downloaded as a gzipped tar attachment otherwise.
+
+    Args:
+        request: Django Rest Framework request object. The data property contains
+            the payload as a JSON document following the export schema.
+    """
+    # Private endpoints are protected by URL namespace, not by Django permissions.
+
+    # TODO: Schema for OpenAPI doc
+
+    logger.info(f"Received export request: {json.dumps(request.data)}")
+
+    if not BENTO_EXPORT_SCHEMA_VALIDATOR.is_valid(request.data):
+        msg_list = [err.message for err in BENTO_EXPORT_SCHEMA_VALIDATOR.iter_errors(request.data)]
+        return Response(errors.bad_request_error(
+            "Invalid export request body: " + "\n".join(msg_list)),
+            status=400  # TODO: Validation errors
+        )
+
+    object_id = request.data["object_id"]
+    object_type: str = request.data["object_type"]  # 'dataset', 'table',...
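+    # At this point the payload has passed schema validation; object_type and
+    # object_id still have to be checked against the database and the format map.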
+
+    model = EXPORT_OBJECT_TYPE[object_type]["model"]
+    if not model.objects.filter(identifier=object_id).exists():
+        return Response(errors.bad_request_error(
+            f"{object_type.capitalize()} with ID {object_id} does not exist"),
+            status=400
+        )
+
+    format = request.data["format"].strip()
+    output_path = request.data.get("output_path")  # optional parameter
+
+    if format not in EXPORT_FORMATS:  # Check that the workflow exists
+        return Response(errors.bad_request_error(
+            f"Export in format {format} is not implemented"),
+            status=400
+        )
+
+    if object_type not in EXPORT_FORMAT_OBJECT_TYPE_MAP[format]:
+        return Response(errors.bad_request_error(
+            f"Exporting entities of type {object_type} in format {format} is not implemented"),
+            status=400
+        )
+
+    # TODO: secure the output_path value
+
+    try:
+        with ExportFileContext(output_path, object_id) as file_export:
+            # Pass a callable to generate the proper file paths within the export context.
+            EXPORT_FORMAT_FUNCTION_MAP[format](file_export.get_path, object_id)
+
+            # If no output path parameter has been provided, the generated export
+            # is returned as an attachment to the Response and everything will
+            # be cleaned up afterwards.
+            # Otherwise, the provided local path is under the responsibility of
+            # the caller.
+            if not output_path:
+                tarfile = file_export.write_tar()
+                return FileResponse(open(tarfile, "rb"), as_attachment=True)
+
+    except ExportError as e:
+        return Response(errors.bad_request_error(f"Encountered export error: {e}"), status=400)
+
+    except Exception as e:
+        # Encountered some other error from the export attempt, return a somewhat detailed message
+        logger.error(f"Encountered an exception while processing an export attempt:\n{traceback.format_exc()}")
+        return Response(errors.internal_server_error(
+            f"Encountered an exception while processing an export attempt (error: {repr(e)})"),
+            status=500
+        )
+
+    return Response(status=204)
diff --git a/chord_metadata_service/package.cfg b/chord_metadata_service/package.cfg
index 66666ad57..d450c87af 100644
--- a/chord_metadata_service/package.cfg
+++ b/chord_metadata_service/package.cfg
@@ -1,4 +1,4 @@
 [package]
 name = katsu
-version = 2.9.1
+version = 2.10.0
 authors = Ksenia Zaytseva, David Lougheed, Simon Chénard, Romain Grégoire
diff --git a/chord_metadata_service/restapi/api_views.py b/chord_metadata_service/restapi/api_views.py
index 9a17b0c5e..2a0d4d3db 100644
--- a/chord_metadata_service/restapi/api_views.py
+++ b/chord_metadata_service/restapi/api_views.py
@@ -1,4 +1,6 @@
 import math
+import logging
+
 from collections import Counter
 from django.conf import settings
 from django.views.decorators.cache import cache_page
@@ -17,6 +19,10 @@
 from chord_metadata_service.mcode.api_views import MCODEPACKET_PREFETCH, MCODEPACKET_SELECT
 
 
+logger = logging.getLogger("restapi_api_views")
+logger.setLevel(logging.INFO)
+
+
 @api_view()
 @permission_classes([AllowAny])
 def service_info(_request):
@@ -339,7 +345,12 @@ def public_overview(_request):
             # add new Counter()
             if key not in extra_properties:
                 extra_properties[key] = Counter()
-            extra_properties[key].update((individual.extra_properties[key],))
+            try:
+                extra_properties[key].update((individual.extra_properties[key],))
+            except TypeError:
+                logger.error(f"The extra_properties {key} value is not of type string or number.")
+                pass
+
+            individuals_extra_properties[key] = dict(extra_properties[key])
     # Experiments
     for experiment in experiments:
@@ -367,13 +378,14 @@
                     if "bin_size" in settings.CONFIG_FIELDS["extra_properties"][key] else None
                 # retrieve the values from extra_properties counter
                 values = individuals_extra_properties[key]
-                kwargs = dict(values=values, bin_size=field_bin_size)
-                # sort into bins and remove numeric values where count <= threshold
-                extra_prop_values_in_bins = sort_numeric_values_into_bins(
-                    **{k: v for k, v in kwargs.items() if v is not None}
-                )
-                # rewrite with sorted values
-                individuals_extra_properties[key] = extra_prop_values_in_bins
+                if values:
+                    kwargs = dict(values=values, bin_size=field_bin_size)
+                    # sort into bins and remove numeric values where count <= threshold
+                    extra_prop_values_in_bins = sort_numeric_values_into_bins(
+                        **{k: v for k, v in kwargs.items() if v is not None}
+                    )
+                    # rewrite with sorted values
+                    individuals_extra_properties[key] = extra_prop_values_in_bins
                 # add missing value count
                 individuals_extra_properties[key][missing] = len(individuals_set) - sum(v for v in value.values())
             else:
diff --git a/chord_metadata_service/restapi/tests/constants.py b/chord_metadata_service/restapi/tests/constants.py
index 8f8fa903d..d02a3243a 100644
--- a/chord_metadata_service/restapi/tests/constants.py
+++ b/chord_metadata_service/restapi/tests/constants.py
@@ -1,3 +1,6 @@
+from copy import deepcopy
+
+
 INVALID_FHIR_BUNDLE_1 = {
     "resourceType": "NotBundle",
     "entry": [
@@ -209,6 +212,40 @@
 VALID_INDIVIDUALS = [VALID_INDIVIDUAL_1, VALID_INDIVIDUAL_2, VALID_INDIVIDUAL_3, VALID_INDIVIDUAL_4,
                      VALID_INDIVIDUAL_5, VALID_INDIVIDUAL_6, VALID_INDIVIDUAL_7, VALID_INDIVIDUAL_8]
 
+
+extra_properties_with_list = {
+    "smoking": "Former smoker",
+    "covidstatus": "Positive",
+    "death_dc": "Alive",
+    "mobility": "I have slight problems in walking about",
+    "date_of_consent": "2021-03-03",
+    "lab_test_result_value": 699.86,
+    "baseline_creatinine": [100, 120]
+}
+
+extra_properties_with_dict = {
+    "smoking": "Former smoker",
+    "covidstatus": "Positive",
+    "death_dc": "Alive",
+    "mobility": "I have slight problems in walking about",
+    "date_of_consent": "2021-03-03",
+    "lab_test_result_value": 699.86,
+    "baseline_creatinine": {
+        "test_key_1": 120,
+        "test_key_2": "test_value_2"
+    }
+}
+
+
+INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_LIST = [
+    {**item, "extra_properties": extra_properties_with_list} for item in deepcopy(VALID_INDIVIDUALS)
+]
+
+INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_DICT = [
+    {**item, "extra_properties": extra_properties_with_dict} for item in deepcopy(VALID_INDIVIDUALS)
+]
+
+
 CONFIG_FIELDS_TEST = {
     "sex": {
         "type": "string",
diff --git a/chord_metadata_service/restapi/tests/test_api.py b/chord_metadata_service/restapi/tests/test_api.py
index e4a96e13b..8a699c17e 100644
--- a/chord_metadata_service/restapi/tests/test_api.py
+++ b/chord_metadata_service/restapi/tests/test_api.py
@@ -1,4 +1,5 @@
 from copy import deepcopy
+
 from django.conf import settings
 from django.urls import reverse
 from django.test import override_settings
@@ -12,9 +13,13 @@
 from chord_metadata_service.experiments.tests import constants as exp_c
 from chord_metadata_service.mcode import models as mcode_m
 from chord_metadata_service.mcode.tests import constants as mcode_c
-from .constants import CONFIG_FIELDS_TEST
-from .constants import VALID_INDIVIDUALS
+from .constants import (
+    CONFIG_FIELDS_TEST,
+    VALID_INDIVIDUALS,
+    INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_LIST,
+    INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_DICT
+)
 
 
 class ServiceInfoTest(APITestCase):
@@ -227,3 +232,48 @@ def test_overview_response_no_config(self):
         response_obj = response.json()
         self.assertIsInstance(response_obj, dict)
         self.assertEqual(response_obj, settings.NO_PUBLIC_DATA_AVAILABLE)
+
+
+class PublicOverviewNotSupportedDataTypesListTest(APITestCase):
+    # individuals (count 8)
+    def setUp(self) -> None:
+        # create individuals, including some with extra_properties values of unsupported types
+        for ind in INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_LIST:
+            ph_m.Individual.objects.create(**ind)
+
+    @override_settings(CONFIG_FIELDS=CONFIG_FIELDS_TEST)
+    def test_overview_response(self):
+        # the overview should swallow the TypeError and still return a response
+        response = self.client.get('/api/public_overview')
+        response_obj = response.json()
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertIsInstance(response_obj, dict)
+        # the field name is present, but the keys are not (except 'missing')
+        self.assertIn("baseline_creatinine", response_obj["extra_properties"])
+        self.assertIn("missing", response_obj["extra_properties"]["baseline_creatinine"])
+        self.assertEqual(8, response_obj["extra_properties"]["baseline_creatinine"]["missing"])
+        # if support for array values is ever added to the public_overview,
+        # this assertion will fail; so far there is no support for it
+        self.assertNotIn(100, response_obj["extra_properties"]["baseline_creatinine"])
+
+
+class PublicOverviewNotSupportedDataTypesDictTest(APITestCase):
+    # individuals (count 8)
+    def setUp(self) -> None:
+        # create individuals, including some with extra_properties values of unsupported types
+        for ind in INDIVIDUALS_NOT_ACCEPTED_DATA_TYPES_DICT:
+            ph_m.Individual.objects.create(**ind)
+
+    @override_settings(CONFIG_FIELDS=CONFIG_FIELDS_TEST)
+    def test_overview_response(self):
+        # the overview should swallow the TypeError and still return a response
+        response = self.client.get('/api/public_overview')
+        response_obj = response.json()
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertIsInstance(response_obj, dict)
+        # the field name is present, but the keys are not (except 'missing')
+        self.assertIn("baseline_creatinine", response_obj["extra_properties"])
+        self.assertIn("missing", response_obj["extra_properties"]["baseline_creatinine"])
+        self.assertEqual(8, response_obj["extra_properties"]["baseline_creatinine"]["missing"])
diff --git a/requirements.txt b/requirements.txt
index 95ea516be..826f43ed7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,7 +14,7 @@
 coreschema==0.0.4
 coverage==5.4
 cryptography==3.4.6
 distlib==0.3.1
-Django==2.2.27
+Django==2.2.28
 django-autocomplete-light==3.8.2
 django-cors-headers==3.7.0
 django-filter==2.4.0
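# Usage sketch (assumes Katsu is reachable locally and the chord URLs are mounted at
# the application root; adjust the base URL to the actual deployment):
#
#     import requests
#
#     payload = {
#         "format": "cbioportal",
#         "object_type": "dataset",
#         "object_id": "<dataset UUID>",
#     }
#     r = requests.post("http://localhost:8000/private/export", json=payload)
#     # Without "output_path", the study comes back as a gzipped tar attachment:
#     with open("cbioportal_study.tar.gz", "wb") as fh:
#         fh.write(r.content)
#     # With "output_path": "<dir>" in the payload, files are written server-side
#     # and a 204 response is returned instead.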