From e9ac5af8162e00e473cb54298e72c732ff103fef Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Thu, 24 Feb 2022 11:24:16 -0500 Subject: [PATCH 01/26] export patients clinical file prototype --- chord_metadata_service/patients/api_views.py | 3 +- .../restapi/api_renderers.py | 78 +++++++++++++++++++ .../restapi/cbioportal_export_mapping.py | 48 ++++++++++++ 3 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 chord_metadata_service/restapi/cbioportal_export_mapping.py diff --git a/chord_metadata_service/patients/api_views.py b/chord_metadata_service/patients/api_views.py index db2cd9c7a..a4a705831 100644 --- a/chord_metadata_service/patients/api_views.py +++ b/chord_metadata_service/patients/api_views.py @@ -15,6 +15,7 @@ PhenopacketsRenderer, IndividualCSVRenderer, ARGORenderer, + IndividualCBioPortalPatientRenderer, ) from chord_metadata_service.restapi.pagination import LargeResultsSetPagination @@ -35,7 +36,7 @@ class IndividualViewSet(viewsets.ModelViewSet): serializer_class = IndividualSerializer pagination_class = LargeResultsSetPagination renderer_classes = (*api_settings.DEFAULT_RENDERER_CLASSES, FHIRRenderer, - PhenopacketsRenderer, IndividualCSVRenderer, ARGORenderer) + PhenopacketsRenderer, IndividualCSVRenderer, ARGORenderer, IndividualCBioPortalPatientRenderer) filter_backends = [DjangoFilterBackend, filters.OrderingFilter] filter_class = IndividualFilter ordering_fields = ["id"] diff --git a/chord_metadata_service/restapi/api_renderers.py b/chord_metadata_service/restapi/api_renderers.py index f7bc6284b..e9ff159bf 100644 --- a/chord_metadata_service/restapi/api_renderers.py +++ b/chord_metadata_service/restapi/api_renderers.py @@ -10,6 +10,7 @@ from .jsonld_utils import dataset_to_jsonld from .utils import parse_onset +from .cbioportal_export_mapping import individual_to_patient_header register('json-ld', Serializer, 'rdflib_jsonld.serializer', 'JsonLDSerializer') @@ -163,3 +164,80 @@ def render(self, data, media_type=None, 
renderer_context=None): dict_writer.writerow(headers) dict_writer.writerows(individuals) return response + +class IndividualCBioPortalPatientRenderer(JSONRenderer): + """ + Renders Individuals as a clinical_patient text file suitable for + import by cBioPortal. + + cBioPortal Patients fields specs: + --------------------------------- + Required: + - PATIENT_ID + Special columns: + - OS_STATUS, OS_MONTHS overall survivall. Status can be 1:DECEASED, 0:LIVING + - DFS_STATUS, DFS_MONTHS disease free + - PATIENT_DISPLAY_NAME + - GENDER or SEX + - AGE + - TUMOR_SITE + """ + + media_type = 'text/plain' + format = 'cbioportal_export' + + def render(self, data, media_type=None, renderer_context=None): + if 'results' in data: + individuals = [] + for individual in data['results']: + ind_obj = { + 'id': individual['id'], + 'sex': individual.get('sex', None), + # 'date_of_birth': individual.get('date_of_birth', None), + # 'taxonomy': None, + # 'karyotypic_sex': individual['karyotypic_sex'], + # 'race': individual.get('race', None), + # 'ethnicity': individual.get('ethnicity', None), + # 'age': None, + # 'diseases': None, + # 'created': individual['created'], + # 'updated': individual['updated'] + } + # if 'taxonomy' in individual: + # ind_obj['taxonomy'] = individual['taxonomy'].get('label', None) + # if 'age' in individual: + # if 'age' in individual['age']: + # ind_obj['age'] = individual['age'].get('age', None) + # elif 'start' and 'end' in individual['age']: + # ind_obj['age'] = str( + # individual['age']['start'].get('age', "NA") + # + ' - ' + + # individual['age']['end'].get('age', "NA") + # ) + # else: + # ind_obj['age'] = None + # if 'phenopackets' in individual: + # all_diseases = [] + # for phenopacket in individual['phenopackets']: + # if 'diseases' in phenopacket: + # # use ; because some disease terms might contain , in their label + # single_phenopacket_diseases = '; '.join( + # [ + # f"{d['term']['label']} ({parse_onset(d['onset'])})" + # if 'onset' in d else 
d['term']['label'] for d in phenopacket['diseases'] + # ] + # ) + # all_diseases.append(single_phenopacket_diseases) + # if all_diseases: + # ind_obj['diseases'] = '; '.join(all_diseases) + individuals.append(ind_obj) + columns = individuals[0].keys() + headers = individual_to_patient_header(columns) + + response = HttpResponse(content_type=self.media_type) + response['Content-Disposition'] = "attachment; filename='data_clinical_patient.txt'" + + response.writelines([line + '\n' for line in headers]) + dict_writer = csv.DictWriter(response, fieldnames=columns, delimiter='\t') + dict_writer.writerows(individuals) + return response diff --git a/chord_metadata_service/restapi/cbioportal_export_mapping.py b/chord_metadata_service/restapi/cbioportal_export_mapping.py new file mode 100644 index 000000000..ba1540bf2 --- /dev/null +++ b/chord_metadata_service/restapi/cbioportal_export_mapping.py @@ -0,0 +1,48 @@ +"""Katsu models fields to cBioportal fields declarations + +This module contains utilities to generate cBioPortal data files headers +based on field names from katsu models. +""" + +def individual_to_patient_header(fields: list): + """ + Maps a list of Individual field names to a 5 rows header + suitable for cBioPortal data_clinical_patient.txt file. + """ + + # predefined mappings from Individual keys to cBioPortal field properties + fields_mapping = { + 'id': ('Patient Identifier', 'Patient Identifier', 'STRING', '1', 'PATIENT_ID'), + 'sex': ('Sex', 'Sex', 'STRING', '1', 'SEX'), + } + + field_properties = [] + for field in fields: + if field in fields_mapping: + field_properties.append(fields_mapping[field]) + else: + fieldname = field.replace('_', ' ').capitalize() + prop = ( + fieldname, # display name + fieldname, # description + 'STRING', # type !!!TODO: TYPE DETECTION!!! 
+ '1', # priority (note: string here for use in join()) + field.upper() # DB suitable identifier + ) + field_properties.append(prop) + + # Transpose list of properties tuples per field to tuples of + # field properties per property. + rows = list(zip(*field_properties)) + + # The 4 first rows are considered meta datas, prefixed by '#'. + # The 5th row (DB field names) is a canonical TSV header. + cbio_header = [ + '#' + '\t'.join(rows[0]), + '#' + '\t'.join(rows[1]), + '#' + '\t'.join(rows[2]), + '#' + '\t'.join(rows[3]), + '\t'.join(rows[4]) + ] + + return cbio_header From 3acd5ebcb95daa04d588eb30062e6820a12c002a Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Fri, 25 Feb 2022 12:45:41 -0500 Subject: [PATCH 02/26] white space --- chord_metadata_service/patients/api_views.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/chord_metadata_service/patients/api_views.py b/chord_metadata_service/patients/api_views.py index a4a705831..00fd9fa0a 100644 --- a/chord_metadata_service/patients/api_views.py +++ b/chord_metadata_service/patients/api_views.py @@ -36,7 +36,8 @@ class IndividualViewSet(viewsets.ModelViewSet): serializer_class = IndividualSerializer pagination_class = LargeResultsSetPagination renderer_classes = (*api_settings.DEFAULT_RENDERER_CLASSES, FHIRRenderer, - PhenopacketsRenderer, IndividualCSVRenderer, ARGORenderer, IndividualCBioPortalPatientRenderer) + PhenopacketsRenderer, IndividualCSVRenderer, ARGORenderer, + IndividualCBioPortalPatientRenderer) filter_backends = [DjangoFilterBackend, filters.OrderingFilter] filter_class = IndividualFilter ordering_fields = ["id"] From 8114c3ed76d08a938853d23531955ca0c7dcdc7e Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Fri, 25 Feb 2022 12:49:02 -0500 Subject: [PATCH 03/26] add biosamples export to cBioPortal clinical_samples Refactor header generation to class (DRY code). Add biosamples based export. 
--- .../phenopackets/api_views.py | 6 +- .../restapi/api_renderers.py | 64 +++++++++++- .../restapi/cbioportal_export_mapping.py | 97 +++++++++++++------ 3 files changed, 133 insertions(+), 34 deletions(-) diff --git a/chord_metadata_service/phenopackets/api_views.py b/chord_metadata_service/phenopackets/api_views.py index f5e67cecd..dd232e480 100644 --- a/chord_metadata_service/phenopackets/api_views.py +++ b/chord_metadata_service/phenopackets/api_views.py @@ -8,7 +8,10 @@ from django.views.decorators.cache import cache_page from django_filters.rest_framework import DjangoFilterBackend -from chord_metadata_service.restapi.api_renderers import PhenopacketsRenderer, FHIRRenderer +from chord_metadata_service.restapi.api_renderers import ( + PhenopacketsRenderer, FHIRRenderer, + BiosampleCBioPortalSampleRenderer +) from chord_metadata_service.restapi.pagination import LargeResultsSetPagination from chord_metadata_service.phenopackets.schemas import PHENOPACKET_SCHEMA from . import models as m, serializers as s, filters as f @@ -166,6 +169,7 @@ class BiosampleViewSet(ExtendedPhenopacketsModelViewSet): serializer_class = s.BiosampleSerializer filter_backends = [DjangoFilterBackend] filter_class = f.BiosampleFilter + renderer_classes = (*api_settings.DEFAULT_RENDERER_CLASSES, BiosampleCBioPortalSampleRenderer) PHENOPACKET_PREFETCH = ( diff --git a/chord_metadata_service/restapi/api_renderers.py b/chord_metadata_service/restapi/api_renderers.py index e9ff159bf..26384a876 100644 --- a/chord_metadata_service/restapi/api_renderers.py +++ b/chord_metadata_service/restapi/api_renderers.py @@ -10,7 +10,7 @@ from .jsonld_utils import dataset_to_jsonld from .utils import parse_onset -from .cbioportal_export_mapping import individual_to_patient_header +from .cbioportal_export_mapping import individual_to_patient_header, biosample_to_sample_header register('json-ld', Serializer, 'rdflib_jsonld.serializer', 'JsonLDSerializer') @@ -168,7 +168,7 @@ def render(self, data, 
media_type=None, renderer_context=None): class IndividualCBioPortalPatientRenderer(JSONRenderer): """ Renders Individuals as a clinical_patient text file suitable for - import by cBioPortal. + importing by cBioPortal. cBioPortal Patients fields specs: --------------------------------- @@ -241,3 +241,63 @@ def render(self, data, media_type=None, renderer_context=None): dict_writer = csv.DictWriter(response, fieldnames=columns, delimiter='\t') dict_writer.writerows(individuals) return response + +class BiosampleCBioPortalSampleRenderer(JSONRenderer): + """ + Renders Biosamples as a clinical_sample text file suitable for + importing by cBioPortal. + + cBioPortal Sample fields specs: + --------------------------------- + Required: + - PATIENT_ID + - SAMPLE_ID + Special columns: + - For pan-cancer summary statistics tab: + - CANCER_TYPE as an Oncotree code + - CANCER_TYPE_DETAILED + - SAMPLE_DISPLAY_NAME + - SAMPLE_CLASS + - METASTATIC_SITE / PRIMARY_SITE overrides the patients level attribute TUMOR_SITE + - SAMPLE_TYPE, TUMOR_TISSUE_SITE, TUMOR_TYPE can have the following values (are displayed with a distinct color in the timelines): + - "recurrence", "recurred", "progression" + - "metastatic", "metastasis" + - "primary" or any other value + - KNOWN_MOLECULAR_CLASSIFIER + - GLEASON_SCORE (prostate cancer) + - HISTOLOGY + - TUMOR_STAGE_2009 + - TUMOR_GRADE + - ETS_RAF_SPINK1_STATUS + - TMPRSS2_ERG_FUSION_STATUS + - ERG_FUSION_ACGH + - SERUM_PSA + - DRIVER_MUTATIONS + """ + + media_type = 'text/plain' + format = 'cbioportal_export' + + def render(self, data, media_type=None, renderer_context=None): + if 'results' in data: + samples = [] + for sample in data['results']: + sample_obj = { + 'individual_id': sample['individual'], + 'id': sample['id'] + } + if 'sampled_tissue' in sample: + sample_obj['tissue_label'] = sample['sampled_tissue'].get('label', '') + + samples.append(sample_obj) + + columns = samples[0].keys() + headers = biosample_to_sample_header(columns) + + 
response = HttpResponse(content_type=self.media_type) + response['Content-Disposition'] = "attachment; filename='data_clinical_sample.txt'" + + response.writelines([line + '\n' for line in headers]) + dict_writer = csv.DictWriter(response, fieldnames=columns, delimiter='\t') + dict_writer.writerows(samples) + return response \ No newline at end of file diff --git a/chord_metadata_service/restapi/cbioportal_export_mapping.py b/chord_metadata_service/restapi/cbioportal_export_mapping.py index ba1540bf2..651cd1a7c 100644 --- a/chord_metadata_service/restapi/cbioportal_export_mapping.py +++ b/chord_metadata_service/restapi/cbioportal_export_mapping.py @@ -4,7 +4,53 @@ based on field names from katsu models. """ -def individual_to_patient_header(fields: list): +class cbioportal_clinical_header_generator (): + + fields_mapping = {} + + def __init__(self, mappings = {}): + self.fields_mapping = mappings + + def make_header (self, fields: list): + """ + Maps a list of field names to a 5 rows header + suitable for cBioPortal clinical data files. + """ + + field_properties = [] + for field in fields: + if field in self.fields_mapping: + field_properties.append(self.fields_mapping[field]) + else: + fieldname = field.replace('_', ' ').capitalize() + prop = ( + fieldname, # display name + fieldname, # description + 'STRING', # type !!!TODO: TYPE DETECTION!!! + '1', # priority (note: string here for use in join()) + field.upper() # DB suitable identifier + ) + field_properties.append(prop) + + # Transpose list of properties tuples per field to tuples of + # field properties per property. + rows = list(zip(*field_properties)) + + # The 4 first rows are considered meta datas, prefixed by '#'. + # The 5th row (DB field names) is a canonical TSV header. 
+ cbio_header = [ + '#' + '\t'.join(rows[0]), + '#' + '\t'.join(rows[1]), + '#' + '\t'.join(rows[2]), + '#' + '\t'.join(rows[3]), + '\t'.join(rows[4]) + ] + + return cbio_header + + + +def individual_to_patient_header (fields: list): """ Maps a list of Individual field names to a 5 rows header suitable for cBioPortal data_clinical_patient.txt file. @@ -16,33 +62,22 @@ def individual_to_patient_header(fields: list): 'sex': ('Sex', 'Sex', 'STRING', '1', 'SEX'), } - field_properties = [] - for field in fields: - if field in fields_mapping: - field_properties.append(fields_mapping[field]) - else: - fieldname = field.replace('_', ' ').capitalize() - prop = ( - fieldname, # display name - fieldname, # description - 'STRING', # type !!!TODO: TYPE DETECTION!!! - '1', # priority (note: string here for use in join()) - field.upper() # DB suitable identifier - ) - field_properties.append(prop) - - # Transpose list of properties tuples per field to tuples of - # field properties per property. - rows = list(zip(*field_properties)) - - # The 4 first rows are considered meta datas, prefixed by '#'. - # The 5th row (DB field names) is a canonical TSV header. - cbio_header = [ - '#' + '\t'.join(rows[0]), - '#' + '\t'.join(rows[1]), - '#' + '\t'.join(rows[2]), - '#' + '\t'.join(rows[3]), - '\t'.join(rows[4]) - ] - - return cbio_header + cbio_header = cbioportal_clinical_header_generator(fields_mapping); + return cbio_header.make_header(fields) + +def biosample_to_sample_header (fields: list): + """ + Maps a list of biosamples field names to a 5 rows header + suitable for cBioPortal data_sample_patient.txt file. 
+ """ + + # predefined mappings from Samples keys to cBioPortal field properties + fields_mapping = { + 'individual_id': ('Patient Identifier', 'Patient Identifier', 'STRING', '1', 'PATIENT_ID'), + 'id': ('Sample Identifier', 'Sample Identifier', 'STRING', '1', 'SAMPLE_ID'), + 'tissue_label': ('Sampled Tissue', 'Sampled Tissue', 'STRING', '1', 'TISSUE_LABEL') + } + + cbio_header = cbioportal_clinical_header_generator(fields_mapping); + return cbio_header.make_header(fields) + From 45e26aca4c131c54065890b7422cc3f1af27b37c Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Fri, 4 Mar 2022 17:18:48 -0500 Subject: [PATCH 04/26] Add cBioPortal export script --- chord_metadata_service/chord/export_cbio.py | 258 ++++++++++++++++++++ 1 file changed, 258 insertions(+) create mode 100644 chord_metadata_service/chord/export_cbio.py diff --git a/chord_metadata_service/chord/export_cbio.py b/chord_metadata_service/chord/export_cbio.py new file mode 100644 index 000000000..4771f318b --- /dev/null +++ b/chord_metadata_service/chord/export_cbio.py @@ -0,0 +1,258 @@ +import logging +import os +import re +import csv +from chord_metadata_service.restapi.cbioportal_export_mapping import biosample_to_sample_header, individual_to_patient_header +import shutil +import tempfile + +from typing import TextIO + +from django.conf import settings + + +from chord_metadata_service.chord.models import Dataset, Table, TableOwnership +from chord_metadata_service.patients.models import Individual +from chord_metadata_service.experiments import models as em +from chord_metadata_service.phenopackets import models as pm +from chord_metadata_service.resources import models as rm, utils as ru + + + +logger = logging.getLogger(__name__) + +WORKFLOWS_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "workflows") + + +class ExportError(Exception): + pass + + +DRS_URI_SCHEME = "drs" +FILE_URI_SCHEME = "file" +HTTP_URI_SCHEME = "http" +HTTPS_URI_SCHEME = "https" + +WINDOWS_DRIVE_SCHEME = 
re.compile(r"^[a-zA-Z]$") + +STUDY_FILENAME = "meta_study.txt" +SAMPLE_DATA_FILENAME = "data_clinical_sample.txt" +SAMPLE_META_FILENAME = "meta_clinical_sample.txt" +PATIENT_DATA_FILENAME = "data_clinical_patient.txt" +PATIENT_META_FILENAME = "meta_clinical_patient.txt" + +PATIENT_DATATYPE = 'PATIENT' +SAMPLE_DATATYPE = 'SAMPLE' + +class ExportError(Exception): + pass + +class CBioExportFileContext: + """ + Context manager around the tmp export directory for a given study + identifier. + """ + path = "" + should_del = False + + def __init__(self, project_id: str): + tmp_dir = settings.SERVICE_TEMP + + if tmp_dir is None: + tmp_dir = tempfile.mkdtemp() + self.should_del = True + + if not os.access(tmp_dir, os.W_OK): + raise ExportError(f"Directory does not exist or is not writable: {tmp_dir}") + + try: + tmp_dir = tmp_dir.rstrip("/") + "/cbio_export/" + self.path = os.path.join(tmp_dir, project_id) + + #clean pre-existing export dir + isExistant = os.path.exists(self.path) + if isExistant: + shutil.rmtree(self.path) + + original_umask = os.umask(0) # fix issue with non-writable dir due to OS based mask + os.makedirs(self.path, 0o777) + + except OSError: + raise ExportError + + finally: + os.umask(original_umask) + + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + if self.should_del and self.path: + shutil.rmtree(self.path) + + def getPath (self, filename: str): + return os.path.join(self.path, filename) + + +def StudyExport (): + """Export a given Project as a cBioPortal study""" + #TODO: a Dataset is a Study (associated with a publication), not a Project! 
+ if Dataset.objects.count == 0: + raise ExportError("No Project to export") + dataset = Dataset.objects.first() # TODO: for now export first project + project_id = str(dataset.identifier) + + # create a context wrapping a tmp folder for export + with CBioExportFileContext(project_id) as file_export: + + # Export study file + with open(file_export.getPath(STUDY_FILENAME), 'w') as file_study: + StudyExportMeta(dataset, file_study) + + # export patients. + with open(file_export.getPath(PATIENT_DATA_FILENAME), 'w') as file_patient: + # Note: plural in `phenopackets` is intentional (related_name property in model) + indiv = Individual.objects.filter(phenopackets__table__ownership_record__dataset_id=dataset.identifier) + IndividualExport(indiv, file_patient) + + with open(file_export.getPath(PATIENT_META_FILENAME), 'w') as file_patient_meta: + ClinicalMetaExport(project_id, PATIENT_DATATYPE, file_patient_meta) + + # export samples + with open(file_export.getPath(SAMPLE_DATA_FILENAME), 'w') as file_sample: + sampl = pm.Biosample.objects.filter(phenopacket__table__ownership_record__dataset_id=dataset.identifier) + SampleExport(sampl, file_sample) + + with open(file_export.getPath(SAMPLE_META_FILENAME), 'w') as file_sample_meta: + ClinicalMetaExport(project_id, SAMPLE_DATATYPE, file_sample_meta) + + + + +def StudyExportMeta (dataset: Dataset, file_handle: TextIO): + """ + Study meta data file generation + """ + lines = dict() + lines['type_of_cancer'] = "mixed" #TODO: find if this information is available. !IMPORTANT! uses Oncotree codes + lines['cancer_study_identifier'] = str(dataset.identifier) + lines['name'] = dataset.title + lines['description'] = dataset.description + + # optional fields + if len(dataset.primary_publications): + lines['citation'] = dataset.primary_publications[0] + # pmid: unvailable + # groups: unused for authentication + # add_global_case_list: ? + # tags_file: ? + # reference_genome: ? 
+ + for field, value in lines.items(): + file_handle.write(f"{field}: {value}\n") + + +def ClinicalMetaExport (study_id: str, datatype: str, file_handle: TextIO): + """ + Clinical Metadata files generation (samples or patients) + """ + lines = dict() + lines['cancer_study_identifier'] = study_id + lines['genetic_alteration_type'] = 'CLINICAL' + if datatype == SAMPLE_DATATYPE: + lines['datatype'] = 'SAMPLE_ATTRIBUTES' + lines['data_filename'] = SAMPLE_DATA_FILENAME + else: + lines['datatype'] = 'PATIENT_ATTRIBUTES' + lines['data_filename'] = PATIENT_DATA_FILENAME + + for field, value in lines.items(): + file_handle.write(f"{field}: {value}\n") + + +def IndividualExport(results, file_handle: TextIO): + """ + Renders Individuals as a clinical_patient text file suitable for + importing by cBioPortal. + + cBioPortal Patients fields specs: + --------------------------------- + Required: + - PATIENT_ID + Special columns: + - OS_STATUS, OS_MONTHS overall survivall. Status can be 1:DECEASED, 0:LIVING + - DFS_STATUS, DFS_MONTHS disease free + - PATIENT_DISPLAY_NAME + - GENDER or SEX + - AGE + - TUMOR_SITE + """ + + individuals = [] + for individual in results: + ind_obj = { + 'id': individual.id, + 'sex': individual.sex, + } + individuals.append(ind_obj) + + columns = individuals[0].keys() + headers = individual_to_patient_header(columns) + + file_handle.writelines([line + '\n' for line in headers]) + dict_writer = csv.DictWriter(file_handle, fieldnames=columns, delimiter='\t') + dict_writer.writerows(individuals) + + +def SampleExport (results, file_handle: TextIO): + """ + Renders Biosamples as a clinical_sample text file suitable for + importing by cBioPortal. 
+ + cBioPortal Sample fields specs: + --------------------------------- + Required: + - PATIENT_ID + - SAMPLE_ID + + Special columns: + - For pan-cancer summary statistics tab: + - CANCER_TYPE as an Oncotree code + - CANCER_TYPE_DETAILED + - SAMPLE_DISPLAY_NAME + - SAMPLE_CLASS + - METASTATIC_SITE / PRIMARY_SITE overrides the patients level attribute TUMOR_SITE + - SAMPLE_TYPE, TUMOR_TISSUE_SITE, TUMOR_TYPE can have the following values (are displayed with a distinct color in the timelines): + - "recurrence", "recurred", "progression" + - "metastatic", "metastasis" + - "primary" or any other value + - KNOWN_MOLECULAR_CLASSIFIER + - GLEASON_SCORE (prostate cancer) + - HISTOLOGY + - TUMOR_STAGE_2009 + - TUMOR_GRADE + - ETS_RAF_SPINK1_STATUS + - TMPRSS2_ERG_FUSION_STATUS + - ERG_FUSION_ACGH + - SERUM_PSA + - DRIVER_MUTATIONS + """ + + samples = [] + for sample in results: + sample_obj = { + 'individual_id': sample.individual.id, + 'id': sample.id + } + if sample.sampled_tissue: + sample_obj['tissue_label'] = sample.sampled_tissue.get('label', '') + + samples.append(sample_obj) + + columns = samples[0].keys() + headers = biosample_to_sample_header(columns) + + file_handle.writelines([line + '\n' for line in headers]) + dict_writer = csv.DictWriter(file_handle, fieldnames=columns, delimiter='\t') + dict_writer.writerows(samples) From d641e34142817fe7ac5d2b9a65e6efb46c704e98 Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Fri, 1 Apr 2022 11:17:03 -0400 Subject: [PATCH 05/26] renamed file --- .../{cbioportal_export_mapping.py => cbioportal_export_utils.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename chord_metadata_service/restapi/{cbioportal_export_mapping.py => cbioportal_export_utils.py} (100%) diff --git a/chord_metadata_service/restapi/cbioportal_export_mapping.py b/chord_metadata_service/restapi/cbioportal_export_utils.py similarity index 100% rename from chord_metadata_service/restapi/cbioportal_export_mapping.py rename to 
chord_metadata_service/restapi/cbioportal_export_utils.py From 29b0466509f8e254909b301b403f76f5b1ede5a1 Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Fri, 1 Apr 2022 11:18:40 -0400 Subject: [PATCH 06/26] split export code btw cbio specific vs export generic --- chord_metadata_service/chord/export.py | 71 ++++++++++++++++++ chord_metadata_service/chord/export_cbio.py | 82 ++------------------- 2 files changed, 79 insertions(+), 74 deletions(-) create mode 100644 chord_metadata_service/chord/export.py diff --git a/chord_metadata_service/chord/export.py b/chord_metadata_service/chord/export.py new file mode 100644 index 000000000..ecb861454 --- /dev/null +++ b/chord_metadata_service/chord/export.py @@ -0,0 +1,71 @@ +import logging +import os +from chord_metadata_service.chord.ingest import WORKFLOW_CBIOPORTAL +from .export_cbio import StudyExport as export_cbioportal_workflow +import shutil +import tempfile + +from django.conf import settings + +__all__ = [ + "WORKFLOW_EXPORT_FUNCTION_MAP", +] + +logger = logging.getLogger(__name__) + + +class ExportError(Exception): + pass + +class ExportFileContext: + """ + Context manager around the tmp export directory for a given study + identifier. 
+ """ + path = "" + should_del = False + + def __init__(self, tmp_dir: str, project_id: str): + tmp_dir = tmp_dir or settings.SERVICE_TEMP + + if tmp_dir is None: + tmp_dir = tempfile.mkdtemp() + self.should_del = True + + if not os.access(tmp_dir, os.W_OK): + raise ExportError(f"Directory does not exist or is not writable: {tmp_dir}") + + try: + tmp_dir = tmp_dir.rstrip("/") + "/cbio_export/" + self.path = os.path.join(tmp_dir, project_id) + + #clean pre-existing export dir + isExistant = os.path.exists(self.path) + if isExistant: + shutil.rmtree(self.path) + + original_umask = os.umask(0) # fix issue with non-writable dir due to OS based mask + os.makedirs(self.path, 0o777) + + except OSError: + raise ExportError + + finally: + os.umask(original_umask) + + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + if self.should_del and self.path: + shutil.rmtree(self.path) + + def getPath (self, filename: str): + return os.path.join(self.path, filename) + + + +WORKFLOW_EXPORT_FUNCTION_MAP = { + WORKFLOW_CBIOPORTAL: export_cbioportal_workflow, +} \ No newline at end of file diff --git a/chord_metadata_service/chord/export_cbio.py b/chord_metadata_service/chord/export_cbio.py index 4771f318b..c802abd32 100644 --- a/chord_metadata_service/chord/export_cbio.py +++ b/chord_metadata_service/chord/export_cbio.py @@ -1,39 +1,20 @@ import logging -import os -import re import csv -from chord_metadata_service.restapi.cbioportal_export_mapping import biosample_to_sample_header, individual_to_patient_header -import shutil -import tempfile - from typing import TextIO -from django.conf import settings - +from .export import ExportFileContext, ExportError -from chord_metadata_service.chord.models import Dataset, Table, TableOwnership +from chord_metadata_service.restapi.cbioportal_export_utils import biosample_to_sample_header, individual_to_patient_header +from chord_metadata_service.chord.models import Dataset from 
chord_metadata_service.patients.models import Individual -from chord_metadata_service.experiments import models as em from chord_metadata_service.phenopackets import models as pm -from chord_metadata_service.resources import models as rm, utils as ru - +__all__ = [ + "StudyExport", +] logger = logging.getLogger(__name__) -WORKFLOWS_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "workflows") - - -class ExportError(Exception): - pass - - -DRS_URI_SCHEME = "drs" -FILE_URI_SCHEME = "file" -HTTP_URI_SCHEME = "http" -HTTPS_URI_SCHEME = "https" - -WINDOWS_DRIVE_SCHEME = re.compile(r"^[a-zA-Z]$") STUDY_FILENAME = "meta_study.txt" SAMPLE_DATA_FILENAME = "data_clinical_sample.txt" @@ -47,55 +28,8 @@ class ExportError(Exception): class ExportError(Exception): pass -class CBioExportFileContext: - """ - Context manager around the tmp export directory for a given study - identifier. - """ - path = "" - should_del = False - - def __init__(self, project_id: str): - tmp_dir = settings.SERVICE_TEMP - - if tmp_dir is None: - tmp_dir = tempfile.mkdtemp() - self.should_del = True - - if not os.access(tmp_dir, os.W_OK): - raise ExportError(f"Directory does not exist or is not writable: {tmp_dir}") - - try: - tmp_dir = tmp_dir.rstrip("/") + "/cbio_export/" - self.path = os.path.join(tmp_dir, project_id) - - #clean pre-existing export dir - isExistant = os.path.exists(self.path) - if isExistant: - shutil.rmtree(self.path) - - original_umask = os.umask(0) # fix issue with non-writable dir due to OS based mask - os.makedirs(self.path, 0o777) - - except OSError: - raise ExportError - - finally: - os.umask(original_umask) - - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - if self.should_del and self.path: - shutil.rmtree(self.path) - - def getPath (self, filename: str): - return os.path.join(self.path, filename) - -def StudyExport (): +def StudyExport (tmp_path: str, project_id: str): """Export a given Project as a cBioPortal 
study""" #TODO: a Dataset is a Study (associated with a publication), not a Project! if Dataset.objects.count == 0: @@ -104,7 +38,7 @@ def StudyExport (): project_id = str(dataset.identifier) # create a context wrapping a tmp folder for export - with CBioExportFileContext(project_id) as file_export: + with ExportFileContext(tmp_path, project_id) as file_export: # Export study file with open(file_export.getPath(STUDY_FILENAME), 'w') as file_study: From baa9823a379f5dfe036dee97db8b6184000db353 Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Fri, 1 Apr 2022 11:22:00 -0400 Subject: [PATCH 07/26] revert using renderers for cbio export Now all the export is done through generated files. --- chord_metadata_service/patients/api_views.py | 4 +- .../phenopackets/api_views.py | 6 +- .../restapi/api_renderers.py | 138 ------------------ 3 files changed, 2 insertions(+), 146 deletions(-) diff --git a/chord_metadata_service/patients/api_views.py b/chord_metadata_service/patients/api_views.py index 00fd9fa0a..db2cd9c7a 100644 --- a/chord_metadata_service/patients/api_views.py +++ b/chord_metadata_service/patients/api_views.py @@ -15,7 +15,6 @@ PhenopacketsRenderer, IndividualCSVRenderer, ARGORenderer, - IndividualCBioPortalPatientRenderer, ) from chord_metadata_service.restapi.pagination import LargeResultsSetPagination @@ -36,8 +35,7 @@ class IndividualViewSet(viewsets.ModelViewSet): serializer_class = IndividualSerializer pagination_class = LargeResultsSetPagination renderer_classes = (*api_settings.DEFAULT_RENDERER_CLASSES, FHIRRenderer, - PhenopacketsRenderer, IndividualCSVRenderer, ARGORenderer, - IndividualCBioPortalPatientRenderer) + PhenopacketsRenderer, IndividualCSVRenderer, ARGORenderer) filter_backends = [DjangoFilterBackend, filters.OrderingFilter] filter_class = IndividualFilter ordering_fields = ["id"] diff --git a/chord_metadata_service/phenopackets/api_views.py b/chord_metadata_service/phenopackets/api_views.py index dd232e480..f5e67cecd 100644 --- 
a/chord_metadata_service/phenopackets/api_views.py +++ b/chord_metadata_service/phenopackets/api_views.py @@ -8,10 +8,7 @@ from django.views.decorators.cache import cache_page from django_filters.rest_framework import DjangoFilterBackend -from chord_metadata_service.restapi.api_renderers import ( - PhenopacketsRenderer, FHIRRenderer, - BiosampleCBioPortalSampleRenderer -) +from chord_metadata_service.restapi.api_renderers import PhenopacketsRenderer, FHIRRenderer from chord_metadata_service.restapi.pagination import LargeResultsSetPagination from chord_metadata_service.phenopackets.schemas import PHENOPACKET_SCHEMA from . import models as m, serializers as s, filters as f @@ -169,7 +166,6 @@ class BiosampleViewSet(ExtendedPhenopacketsModelViewSet): serializer_class = s.BiosampleSerializer filter_backends = [DjangoFilterBackend] filter_class = f.BiosampleFilter - renderer_classes = (*api_settings.DEFAULT_RENDERER_CLASSES, BiosampleCBioPortalSampleRenderer) PHENOPACKET_PREFETCH = ( diff --git a/chord_metadata_service/restapi/api_renderers.py b/chord_metadata_service/restapi/api_renderers.py index 26384a876..f7bc6284b 100644 --- a/chord_metadata_service/restapi/api_renderers.py +++ b/chord_metadata_service/restapi/api_renderers.py @@ -10,7 +10,6 @@ from .jsonld_utils import dataset_to_jsonld from .utils import parse_onset -from .cbioportal_export_mapping import individual_to_patient_header, biosample_to_sample_header register('json-ld', Serializer, 'rdflib_jsonld.serializer', 'JsonLDSerializer') @@ -164,140 +163,3 @@ def render(self, data, media_type=None, renderer_context=None): dict_writer.writerow(headers) dict_writer.writerows(individuals) return response - -class IndividualCBioPortalPatientRenderer(JSONRenderer): - """ - Renders Individuals as a clinical_patient text file suitable for - importing by cBioPortal. 
- - cBioPortal Patients fields specs: - --------------------------------- - Required: - - PATIENT_ID - Special columns: - - OS_STATUS, OS_MONTHS overall survivall. Status can be 1:DECEASED, 0:LIVING - - DFS_STATUS, DFS_MONTHS disease free - - PATIENT_DISPLAY_NAME - - GENDER or SEX - - AGE - - TUMOR_SITE - """ - - media_type = 'text/plain' - format = 'cbioportal_export' - - def render(self, data, media_type=None, renderer_context=None): - if 'results' in data: - individuals = [] - for individual in data['results']: - ind_obj = { - 'id': individual['id'], - 'sex': individual.get('sex', None), - # 'date_of_birth': individual.get('date_of_birth', None), - # 'taxonomy': None, - # 'karyotypic_sex': individual['karyotypic_sex'], - # 'race': individual.get('race', None), - # 'ethnicity': individual.get('ethnicity', None), - # 'age': None, - # 'diseases': None, - # 'created': individual['created'], - # 'updated': individual['updated'] - } - # if 'taxonomy' in individual: - # ind_obj['taxonomy'] = individual['taxonomy'].get('label', None) - # if 'age' in individual: - # if 'age' in individual['age']: - # ind_obj['age'] = individual['age'].get('age', None) - # elif 'start' and 'end' in individual['age']: - # ind_obj['age'] = str( - # individual['age']['start'].get('age', "NA") - # + ' - ' + - # individual['age']['end'].get('age', "NA") - # ) - # else: - # ind_obj['age'] = None - # if 'phenopackets' in individual: - # all_diseases = [] - # for phenopacket in individual['phenopackets']: - # if 'diseases' in phenopacket: - # # use ; because some disease terms might contain , in their label - # single_phenopacket_diseases = '; '.join( - # [ - # f"{d['term']['label']} ({parse_onset(d['onset'])})" - # if 'onset' in d else d['term']['label'] for d in phenopacket['diseases'] - # ] - # ) - # all_diseases.append(single_phenopacket_diseases) - # if all_diseases: - # ind_obj['diseases'] = '; '.join(all_diseases) - individuals.append(ind_obj) - columns = individuals[0].keys() - headers = 
individual_to_patient_header(columns) - - response = HttpResponse(content_type=self.media_type) - response['Content-Disposition'] = "attachment; filename='data_clinical_patient.txt'" - - response.writelines([line + '\n' for line in headers]) - dict_writer = csv.DictWriter(response, fieldnames=columns, delimiter='\t') - dict_writer.writerows(individuals) - return response - -class BiosampleCBioPortalSampleRenderer(JSONRenderer): - """ - Renders Biosamples as a clinical_sample text file suitable for - importing by cBioPortal. - - cBioPortal Sample fields specs: - --------------------------------- - Required: - - PATIENT_ID - - SAMPLE_ID - Special columns: - - For pan-cancer summary statistics tab: - - CANCER_TYPE as an Oncotree code - - CANCER_TYPE_DETAILED - - SAMPLE_DISPLAY_NAME - - SAMPLE_CLASS - - METASTATIC_SITE / PRIMARY_SITE overrides the patients level attribute TUMOR_SITE - - SAMPLE_TYPE, TUMOR_TISSUE_SITE, TUMOR_TYPE can have the following values (are displayed with a distinct color in the timelines): - - "recurrence", "recurred", "progression" - - "metastatic", "metastasis" - - "primary" or any other value - - KNOWN_MOLECULAR_CLASSIFIER - - GLEASON_SCORE (prostate cancer) - - HISTOLOGY - - TUMOR_STAGE_2009 - - TUMOR_GRADE - - ETS_RAF_SPINK1_STATUS - - TMPRSS2_ERG_FUSION_STATUS - - ERG_FUSION_ACGH - - SERUM_PSA - - DRIVER_MUTATIONS - """ - - media_type = 'text/plain' - format = 'cbioportal_export' - - def render(self, data, media_type=None, renderer_context=None): - if 'results' in data: - samples = [] - for sample in data['results']: - sample_obj = { - 'individual_id': sample['individual'], - 'id': sample['id'] - } - if 'sampled_tissue' in sample: - sample_obj['tissue_label'] = sample['sampled_tissue'].get('label', '') - - samples.append(sample_obj) - - columns = samples[0].keys() - headers = biosample_to_sample_header(columns) - - response = HttpResponse(content_type=self.media_type) - response['Content-Disposition'] = "attachment; 
filename='data_clinical_sample.txt'" - - response.writelines([line + '\n' for line in headers]) - dict_writer = csv.DictWriter(response, fieldnames=columns, delimiter='\t') - dict_writer.writerows(samples) - return response \ No newline at end of file From 3fed8791e815379f47f94019f43b5ede90053872 Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Fri, 1 Apr 2022 11:28:43 -0400 Subject: [PATCH 08/26] add workflow metadata for cbioportal export --- chord_metadata_service/chord/ingest.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/chord_metadata_service/chord/ingest.py b/chord_metadata_service/chord/ingest.py index 305e2ee6d..b86a5869d 100644 --- a/chord_metadata_service/chord/ingest.py +++ b/chord_metadata_service/chord/ingest.py @@ -57,6 +57,7 @@ WORKFLOW_MCODE_FHIR_JSON = "mcode_fhir_json" WORKFLOW_MCODE_JSON = "mcode_json" WORKFLOW_READSET = "readset" +WORKFLOW_CBIOPORTAL = "cbioportal" METADATA_WORKFLOWS = { "ingestion": { @@ -240,7 +241,30 @@ ] }, }, - "analysis": {} + "analysis": {}, + "export": { + WORKFLOW_CBIOPORTAL: { + "name": "cBioPortal", + "description": "This workflow creates a bundle for cBioPortal ingestion.", + "data_type": None, + "file": "cbioportal_export.wdl", + "inputs": [ + { + "id": "dataset_id", + "type": "string", + "required": True, + } + ], + "outputs": [ + { + "id": "cbioportal_archive", + "type": "file", + "map_from_input": "dataset_id", + "value": "{}.tar" + } + ] + } + } } WORKFLOWS_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "workflows") From de35e86c896bb05ec71a4e2fb7fe13229c551cfa Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Fri, 1 Apr 2022 11:34:40 -0400 Subject: [PATCH 09/26] add `export` to routes --- chord_metadata_service/chord/urls.py | 3 +- chord_metadata_service/chord/views_export.py | 78 ++++++++++++++++++++ 2 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 chord_metadata_service/chord/views_export.py diff --git 
a/chord_metadata_service/chord/urls.py b/chord_metadata_service/chord/urls.py index 64190a594..7bbb25edc 100644 --- a/chord_metadata_service/chord/urls.py +++ b/chord_metadata_service/chord/urls.py @@ -1,6 +1,6 @@ from django.urls import path -from . import views_ingest, views_search +from . import views_ingest, views_search, views_export urlpatterns = [ path('workflows', views_ingest.workflow_list, name="workflows"), @@ -8,6 +8,7 @@ path('workflows/.wdl', views_ingest.workflow_file, name="workflow-file"), path('private/ingest', views_ingest.ingest, name="ingest"), + path('private/export', views_export.export, name="export"), path('data-types', views_search.data_type_list, name="data-type-list"), path('data-types/', views_search.data_type_detail, name="data-type-detail"), diff --git a/chord_metadata_service/chord/views_export.py b/chord_metadata_service/chord/views_export.py new file mode 100644 index 000000000..40ec26ed3 --- /dev/null +++ b/chord_metadata_service/chord/views_export.py @@ -0,0 +1,78 @@ +import json +import logging +import traceback +import uuid + +# Can't because code expects `ingestion` namespace +#from jsonschema import Draft7Validator +from rest_framework.decorators import api_view, permission_classes +from rest_framework.permissions import AllowAny +from rest_framework.response import Response + + +# Can't because code expects `ingestion` namespace +#from bento_lib.schemas.bento import BENTO_INGEST_SCHEMA +from bento_lib.responses import errors +#from bento_lib.workflows import get_workflow, get_workflow_resource, workflow_exists + +from .export import WORKFLOW_EXPORT_FUNCTION_MAP, ExportError + +from .ingest import METADATA_WORKFLOWS +from .models import Dataset, Table + + +# Can't because code expects `ingestion` namespace +#BENTO_INGEST_SCHEMA_VALIDATOR = Draft7Validator(BENTO_INGEST_SCHEMA) + +logger = logging.getLogger(__name__) + + +# Mounted on /private/, so will get protected anyway; this allows for access from WES +# TODO: Ugly and 
misleading permissions +@api_view(["POST"]) +@permission_classes([AllowAny]) +def export(request): + # Export data from Katsu. + # Private endpoints are protected by URL namespace, not by Django permissions. + + # TODO: Schema for OpenAPI doc + + logger.info(f"Received export request: {json.dumps(request.data)}") + + # Can't because code expects `ingestion` namespace + # if not BENTO_INGEST_SCHEMA_VALIDATOR.is_valid(request.data): + # return Response(errors.bad_request_error("Invalid ingest request body"), status=400) # TODO: Validation errors + + object_id = request.data["object_id"] + object_type = request.data["object_type"] # 'dataset', 'table',... + + if (object_type == 'table' + and not Table.objects.filter(ownership_record_id=object_id).exists()): + return Response(errors.bad_request_error(f"Table with ID {object_id} does not exist"), status=400) + elif (object_type == 'dataset' + and not Dataset.objects.filter(identifier=object_id).exists()): + return Response(errors.bad_request_error(f"Dataset with ID {object_id} does not exist"), status=400) + + + object_id = str(uuid.UUID(object_id)) # Normalize ID to UUID's str format. 
+ + workflow_id = request.data["workflow_id"].strip() + workflow_exportpath = request.data["workflow_exportpath"] + + if not workflow_id in METADATA_WORKFLOWS.export: # Check that the workflow exists + return Response(errors.bad_request_error(f"Workflow with ID {workflow_id} does not exist"), status=400) + + try: + WORKFLOW_EXPORT_FUNCTION_MAP[workflow_id](workflow_exportpath, object_id) + + except ExportError as e: + return Response(errors.bad_request_error(f"Encountered export error: {e}"), status=400) + + + except Exception as e: + # Encountered some other error from the export attempt, return a somewhat detailed message + logger.error(f"Encountered an exception while processing an export attempt:\n{traceback.format_exc()}") + return Response(errors.internal_server_error(f"Encountered an exception while processing an export attempt " + f"(error: {repr(e)}"), status=500) + + return Response(status=204) From 97cc777f503207cd910e64e2694abf31b98cbba2 Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Mon, 4 Apr 2022 09:15:05 -0400 Subject: [PATCH 10/26] fix: circular imports --- chord_metadata_service/chord/export.py | 56 ----------------- chord_metadata_service/chord/export_cbio.py | 5 +- chord_metadata_service/chord/export_utils.py | 64 ++++++++++++++++++++ chord_metadata_service/chord/views_export.py | 3 +- 4 files changed, 67 insertions(+), 61 deletions(-) create mode 100644 chord_metadata_service/chord/export_utils.py diff --git a/chord_metadata_service/chord/export.py b/chord_metadata_service/chord/export.py index ecb861454..c12213f80 100644 --- a/chord_metadata_service/chord/export.py +++ b/chord_metadata_service/chord/export.py @@ -1,11 +1,6 @@ import logging -import os from chord_metadata_service.chord.ingest import WORKFLOW_CBIOPORTAL from .export_cbio import StudyExport as export_cbioportal_workflow -import shutil -import tempfile - -from django.conf import settings __all__ = [ "WORKFLOW_EXPORT_FUNCTION_MAP", @@ -14,57 +9,6 @@ logger = 
logging.getLogger(__name__) -class ExportError(Exception): - pass - -class ExportFileContext: - """ - Context manager around the tmp export directory for a given study - identifier. - """ - path = "" - should_del = False - - def __init__(self, tmp_dir: str, project_id: str): - tmp_dir = tmp_dir or settings.SERVICE_TEMP - - if tmp_dir is None: - tmp_dir = tempfile.mkdtemp() - self.should_del = True - - if not os.access(tmp_dir, os.W_OK): - raise ExportError(f"Directory does not exist or is not writable: {tmp_dir}") - - try: - tmp_dir = tmp_dir.rstrip("/") + "/cbio_export/" - self.path = os.path.join(tmp_dir, project_id) - - #clean pre-existing export dir - isExistant = os.path.exists(self.path) - if isExistant: - shutil.rmtree(self.path) - - original_umask = os.umask(0) # fix issue with non-writable dir due to OS based mask - os.makedirs(self.path, 0o777) - - except OSError: - raise ExportError - - finally: - os.umask(original_umask) - - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - if self.should_del and self.path: - shutil.rmtree(self.path) - - def getPath (self, filename: str): - return os.path.join(self.path, filename) - - WORKFLOW_EXPORT_FUNCTION_MAP = { WORKFLOW_CBIOPORTAL: export_cbioportal_workflow, diff --git a/chord_metadata_service/chord/export_cbio.py b/chord_metadata_service/chord/export_cbio.py index c802abd32..f2b706d23 100644 --- a/chord_metadata_service/chord/export_cbio.py +++ b/chord_metadata_service/chord/export_cbio.py @@ -2,7 +2,7 @@ import csv from typing import TextIO -from .export import ExportFileContext, ExportError +from .export_utils import ExportFileContext, ExportError from chord_metadata_service.restapi.cbioportal_export_utils import biosample_to_sample_header, individual_to_patient_header from chord_metadata_service.chord.models import Dataset @@ -25,9 +25,6 @@ PATIENT_DATATYPE = 'PATIENT' SAMPLE_DATATYPE = 'SAMPLE' -class ExportError(Exception): - pass - def StudyExport (tmp_path: 
str, project_id: str): """Export a given Project as a cBioPortal study""" diff --git a/chord_metadata_service/chord/export_utils.py b/chord_metadata_service/chord/export_utils.py new file mode 100644 index 000000000..0120a0e8d --- /dev/null +++ b/chord_metadata_service/chord/export_utils.py @@ -0,0 +1,64 @@ +import logging +import os +import shutil +import tempfile + +from django.conf import settings + +__all__ = [ + "ExportError", + "ExportFileContext" +] + +logger = logging.getLogger(__name__) + + +class ExportError(Exception): + pass + +class ExportFileContext: + """ + Context manager around the tmp export directory for a given study + identifier. + """ + path = "" + should_del = False + + def __init__(self, tmp_dir: str, project_id: str): + tmp_dir = tmp_dir or settings.SERVICE_TEMP + + if tmp_dir is None: + tmp_dir = tempfile.mkdtemp() + self.should_del = True + + if not os.access(tmp_dir, os.W_OK): + raise ExportError(f"Directory does not exist or is not writable: {tmp_dir}") + + try: + tmp_dir = tmp_dir.rstrip("/") + "/cbio_export/" + self.path = os.path.join(tmp_dir, project_id) + + #clean pre-existing export dir + isExistant = os.path.exists(self.path) + if isExistant: + shutil.rmtree(self.path) + + original_umask = os.umask(0) # fix issue with non-writable dir due to OS based mask + os.makedirs(self.path, 0o777) + + except OSError: + raise ExportError + + finally: + os.umask(original_umask) + + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + if self.should_del and self.path: + shutil.rmtree(self.path) + + def getPath (self, filename: str): + return os.path.join(self.path, filename) \ No newline at end of file diff --git a/chord_metadata_service/chord/views_export.py b/chord_metadata_service/chord/views_export.py index 40ec26ed3..bd9d58cd7 100644 --- a/chord_metadata_service/chord/views_export.py +++ b/chord_metadata_service/chord/views_export.py @@ -15,7 +15,8 @@ from bento_lib.responses import errors #from 
bento_lib.workflows import get_workflow, get_workflow_resource, workflow_exists -from .export import WORKFLOW_EXPORT_FUNCTION_MAP, ExportError +from .export import WORKFLOW_EXPORT_FUNCTION_MAP +from .export_utils import ExportError from .ingest import METADATA_WORKFLOWS from .models import Dataset, Table From a2b726ae5785902b8301ba2c0a4c4ca92e8154ff Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Mon, 4 Apr 2022 18:21:13 +0000 Subject: [PATCH 11/26] refactor: remove dependency from restapi dir. --- chord_metadata_service/chord/export_cbio.py | 82 +++++++++++++++++- .../restapi/cbioportal_export_utils.py | 83 ------------------- 2 files changed, 81 insertions(+), 84 deletions(-) delete mode 100644 chord_metadata_service/restapi/cbioportal_export_utils.py diff --git a/chord_metadata_service/chord/export_cbio.py b/chord_metadata_service/chord/export_cbio.py index f2b706d23..ef8567ef7 100644 --- a/chord_metadata_service/chord/export_cbio.py +++ b/chord_metadata_service/chord/export_cbio.py @@ -4,7 +4,6 @@ from .export_utils import ExportFileContext, ExportError -from chord_metadata_service.restapi.cbioportal_export_utils import biosample_to_sample_header, individual_to_patient_header from chord_metadata_service.chord.models import Dataset from chord_metadata_service.patients.models import Individual from chord_metadata_service.phenopackets import models as pm @@ -187,3 +186,84 @@ def SampleExport (results, file_handle: TextIO): file_handle.writelines([line + '\n' for line in headers]) dict_writer = csv.DictWriter(file_handle, fieldnames=columns, delimiter='\t') dict_writer.writerows(samples) + + +class cbioportal_clinical_header_generator (): + """ + Generates cBioPortal data files headers based on field names from katsu models. 
+ """ + + fields_mapping = {} + + def __init__(self, mappings = {}): + self.fields_mapping = mappings + + def make_header (self, fields: list): + """ + Maps a list of field names to a 5 rows header + suitable for cBioPortal clinical data files. + """ + + field_properties = [] + for field in fields: + if field in self.fields_mapping: + field_properties.append(self.fields_mapping[field]) + else: + fieldname = field.replace('_', ' ').capitalize() + prop = ( + fieldname, # display name + fieldname, # description + 'STRING', # type !!!TODO: TYPE DETECTION!!! + '1', # priority (note: string here for use in join()) + field.upper() # DB suitable identifier + ) + field_properties.append(prop) + + # Transpose list of properties tuples per field to tuples of + # field properties per property. + rows = list(zip(*field_properties)) + + # The 4 first rows are considered meta datas, prefixed by '#'. + # The 5th row (DB field names) is a canonical TSV header. + cbio_header = [ + '#' + '\t'.join(rows[0]), + '#' + '\t'.join(rows[1]), + '#' + '\t'.join(rows[2]), + '#' + '\t'.join(rows[3]), + '\t'.join(rows[4]) + ] + + return cbio_header + + + +def individual_to_patient_header (fields: list): + """ + Maps a list of Individual field names to a 5 rows header + suitable for cBioPortal data_clinical_patient.txt file. + """ + + # predefined mappings from Individual keys to cBioPortal field properties + fields_mapping = { + 'id': ('Patient Identifier', 'Patient Identifier', 'STRING', '1', 'PATIENT_ID'), + 'sex': ('Sex', 'Sex', 'STRING', '1', 'SEX'), + } + + cbio_header = cbioportal_clinical_header_generator(fields_mapping); + return cbio_header.make_header(fields) + +def biosample_to_sample_header (fields: list): + """ + Maps a list of biosamples field names to a 5 rows header + suitable for cBioPortal data_sample_patient.txt file. 
+ """ + + # predefined mappings from Samples keys to cBioPortal field properties + fields_mapping = { + 'individual_id': ('Patient Identifier', 'Patient Identifier', 'STRING', '1', 'PATIENT_ID'), + 'id': ('Sample Identifier', 'Sample Identifier', 'STRING', '1', 'SAMPLE_ID'), + 'tissue_label': ('Sampled Tissue', 'Sampled Tissue', 'STRING', '1', 'TISSUE_LABEL') + } + + cbio_header = cbioportal_clinical_header_generator(fields_mapping); + return cbio_header.make_header(fields) diff --git a/chord_metadata_service/restapi/cbioportal_export_utils.py b/chord_metadata_service/restapi/cbioportal_export_utils.py deleted file mode 100644 index 651cd1a7c..000000000 --- a/chord_metadata_service/restapi/cbioportal_export_utils.py +++ /dev/null @@ -1,83 +0,0 @@ -"""Katsu models fields to cBioportal fields declarations - -This module contains utilities to generate cBioPortal data files headers -based on field names from katsu models. -""" - -class cbioportal_clinical_header_generator (): - - fields_mapping = {} - - def __init__(self, mappings = {}): - self.fields_mapping = mappings - - def make_header (self, fields: list): - """ - Maps a list of field names to a 5 rows header - suitable for cBioPortal clinical data files. - """ - - field_properties = [] - for field in fields: - if field in self.fields_mapping: - field_properties.append(self.fields_mapping[field]) - else: - fieldname = field.replace('_', ' ').capitalize() - prop = ( - fieldname, # display name - fieldname, # description - 'STRING', # type !!!TODO: TYPE DETECTION!!! - '1', # priority (note: string here for use in join()) - field.upper() # DB suitable identifier - ) - field_properties.append(prop) - - # Transpose list of properties tuples per field to tuples of - # field properties per property. - rows = list(zip(*field_properties)) - - # The 4 first rows are considered meta datas, prefixed by '#'. - # The 5th row (DB field names) is a canonical TSV header. 
- cbio_header = [ - '#' + '\t'.join(rows[0]), - '#' + '\t'.join(rows[1]), - '#' + '\t'.join(rows[2]), - '#' + '\t'.join(rows[3]), - '\t'.join(rows[4]) - ] - - return cbio_header - - - -def individual_to_patient_header (fields: list): - """ - Maps a list of Individual field names to a 5 rows header - suitable for cBioPortal data_clinical_patient.txt file. - """ - - # predefined mappings from Individual keys to cBioPortal field properties - fields_mapping = { - 'id': ('Patient Identifier', 'Patient Identifier', 'STRING', '1', 'PATIENT_ID'), - 'sex': ('Sex', 'Sex', 'STRING', '1', 'SEX'), - } - - cbio_header = cbioportal_clinical_header_generator(fields_mapping); - return cbio_header.make_header(fields) - -def biosample_to_sample_header (fields: list): - """ - Maps a list of biosamples field names to a 5 rows header - suitable for cBioPortal data_sample_patient.txt file. - """ - - # predefined mappings from Samples keys to cBioPortal field properties - fields_mapping = { - 'individual_id': ('Patient Identifier', 'Patient Identifier', 'STRING', '1', 'PATIENT_ID'), - 'id': ('Sample Identifier', 'Sample Identifier', 'STRING', '1', 'SAMPLE_ID'), - 'tissue_label': ('Sampled Tissue', 'Sampled Tissue', 'STRING', '1', 'TISSUE_LABEL') - } - - cbio_header = cbioportal_clinical_header_generator(fields_mapping); - return cbio_header.make_header(fields) - From 37a29e8ff923e4de7853b82cc57f5782835dff9e Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Tue, 5 Apr 2022 17:11:32 +0000 Subject: [PATCH 12/26] use dataset_id for export --- chord_metadata_service/chord/export_cbio.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/chord_metadata_service/chord/export_cbio.py b/chord_metadata_service/chord/export_cbio.py index ef8567ef7..867494f6e 100644 --- a/chord_metadata_service/chord/export_cbio.py +++ b/chord_metadata_service/chord/export_cbio.py @@ -14,7 +14,7 @@ logger = logging.getLogger(__name__) - +# predefined filenames recognized by cBioPortal 
STUDY_FILENAME = "meta_study.txt" SAMPLE_DATA_FILENAME = "data_clinical_sample.txt" SAMPLE_META_FILENAME = "meta_clinical_sample.txt" @@ -25,16 +25,16 @@ SAMPLE_DATATYPE = 'SAMPLE' -def StudyExport (tmp_path: str, project_id: str): +def StudyExport (tmp_path: str, dataset_id: str): """Export a given Project as a cBioPortal study""" #TODO: a Dataset is a Study (associated with a publication), not a Project! if Dataset.objects.count == 0: - raise ExportError("No Project to export") - dataset = Dataset.objects.first() # TODO: for now export first project - project_id = str(dataset.identifier) + raise ExportError("No Dataset to export") + dataset = Dataset.objects.get(identifier=dataset_id) + cbio_study_id = str(dataset.identifier) # create a context wrapping a tmp folder for export - with ExportFileContext(tmp_path, project_id) as file_export: + with ExportFileContext(tmp_path, cbio_study_id) as file_export: # Export study file with open(file_export.getPath(STUDY_FILENAME), 'w') as file_study: @@ -47,7 +47,7 @@ def StudyExport (tmp_path: str, project_id: str): IndividualExport(indiv, file_patient) with open(file_export.getPath(PATIENT_META_FILENAME), 'w') as file_patient_meta: - ClinicalMetaExport(project_id, PATIENT_DATATYPE, file_patient_meta) + ClinicalMetaExport(cbio_study_id, PATIENT_DATATYPE, file_patient_meta) # export samples with open(file_export.getPath(SAMPLE_DATA_FILENAME), 'w') as file_sample: @@ -55,7 +55,7 @@ def StudyExport (tmp_path: str, project_id: str): SampleExport(sampl, file_sample) with open(file_export.getPath(SAMPLE_META_FILENAME), 'w') as file_sample_meta: - ClinicalMetaExport(project_id, SAMPLE_DATATYPE, file_sample_meta) + ClinicalMetaExport(cbio_study_id, SAMPLE_DATATYPE, file_sample_meta) From 1cfc8a3a649237d5a0b420cc11a64c4383a53e7e Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Tue, 5 Apr 2022 17:14:47 +0000 Subject: [PATCH 13/26] Add testing suite for cbio export module --- .../chord/tests/test_export_cbio.py | 189 
++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100644 chord_metadata_service/chord/tests/test_export_cbio.py diff --git a/chord_metadata_service/chord/tests/test_export_cbio.py b/chord_metadata_service/chord/tests/test_export_cbio.py new file mode 100644 index 000000000..fc9822fe3 --- /dev/null +++ b/chord_metadata_service/chord/tests/test_export_cbio.py @@ -0,0 +1,189 @@ +import shutil +import tempfile +import uuid +import io +from typing import Dict, TextIO +from os import walk, path + +from django.test import TestCase + +from chord_metadata_service.chord.export_cbio import PATIENT_DATA_FILENAME, PATIENT_DATATYPE, PATIENT_META_FILENAME, SAMPLE_DATA_FILENAME, SAMPLE_DATATYPE, SAMPLE_META_FILENAME, STUDY_FILENAME, ClinicalMetaExport, IndividualExport, SampleExport, StudyExport, StudyExportMeta +from chord_metadata_service.chord.data_types import DATA_TYPE_PHENOPACKET, DATA_TYPE_EXPERIMENT +from chord_metadata_service.chord.models import Project, Dataset, TableOwnership, Table +# noinspection PyProtectedMember +from chord_metadata_service.chord.ingest import ( + WORKFLOW_PHENOPACKETS_JSON, + WORKFLOW_INGEST_FUNCTION_MAP, +) +from chord_metadata_service.patients.models import Individual +from chord_metadata_service.phenopackets import models as PhModel + + +from .constants import VALID_DATA_USE_1 +from .example_ingest import ( + EXAMPLE_INGEST_PHENOPACKET, + EXAMPLE_INGEST_OUTPUTS, +) + + +class ExportCBioTest(TestCase): + def setUp(self) -> None: + # Creates a test database and populate with a phenopacket test file + + p = Project.objects.create(title="Project 1", description="") + self.d = Dataset.objects.create(title="Dataset 1", description="Some dataset", data_use=VALID_DATA_USE_1, + project=p) + self.study_id = str(self.d.identifier) + + # TODO: Real service ID + # table for phenopackets + to = TableOwnership.objects.create(table_id=uuid.uuid4(), service_id=uuid.uuid4(), service_artifact="metadata", + dataset=self.d) + self.t = 
Table.objects.create(ownership_record=to, name="Table 1", data_type=DATA_TYPE_PHENOPACKET) + + # table for experiments metadata + to_exp = TableOwnership.objects.create(table_id=uuid.uuid4(), service_id=uuid.uuid4(), + service_artifact="experiments", dataset=self.d) + self.t_exp = Table.objects.create(ownership_record=to_exp, name="Table 2", data_type=DATA_TYPE_EXPERIMENT) + + self.p = WORKFLOW_INGEST_FUNCTION_MAP[WORKFLOW_PHENOPACKETS_JSON](EXAMPLE_INGEST_OUTPUTS, self.t.identifier) + + def streamToDict(self, output: TextIO) -> Dict[str, str]: + """ + Utility function. Parses cBioPortal meta data text files (lines of + key/value pairs separated by `: `) in a dictionary structure. + """ + output.seek(0) + content = dict() + for line in output: + key, value = line.rstrip().split(': ') + content[key] = value + return content + + def testFileCreation(self): + """ + Check files creation. + Files content is tested subsequently with each file generating function. + """ + tmp_dir = tempfile.mkdtemp() + try: + StudyExport(tmp_dir, self.study_id) + export_dir = path.join(tmp_dir, 'cbio_export', self.study_id) + self.assertTrue(path.exists(export_dir)) + for (dirpath, dirnames, filenames) in walk(export_dir): + filesSet = {*filenames} + expectedSet = { + STUDY_FILENAME, + SAMPLE_DATA_FILENAME, + SAMPLE_META_FILENAME, + PATIENT_META_FILENAME, + PATIENT_META_FILENAME + } + self.assertTrue(expectedSet.issubset(filesSet)) + break # do not recurse the directory tree + + # clean + finally: + shutil.rmtree(tmp_dir) + + + def test_export_cbio_study_meta(self): + with io.StringIO() as output: + StudyExportMeta(self.d, output) + content = self.streamToDict(output) + + self.assertIn('type_of_cancer', content) + self.assertEqual(content['cancer_study_identifier'], self.study_id) + self.assertEqual(content['name'], self.d.title) + self.assertEqual(content['description'], self.d.description) + + def test_export_cbio_sample_meta(self): + with io.StringIO() as output: + 
ClinicalMetaExport(self.study_id, SAMPLE_DATATYPE, output) + content = self.streamToDict(output) + + self.assertEqual(content['cancer_study_identifier'], self.study_id) + self.assertEqual(content['genetic_alteration_type'], 'CLINICAL') + self.assertEqual(content['datatype'], 'SAMPLE_ATTRIBUTES') + self.assertEqual(content['data_filename'], SAMPLE_DATA_FILENAME) + + def test_export_cbio_patient_meta(self): + with io.StringIO() as output: + ClinicalMetaExport(self.study_id, PATIENT_DATATYPE, output) + content = self.streamToDict(output) + + self.assertEqual(content['cancer_study_identifier'], self.study_id) + self.assertEqual(content['genetic_alteration_type'], 'CLINICAL') + self.assertEqual(content['datatype'], 'PATIENT_ATTRIBUTES') + self.assertEqual(content['data_filename'], PATIENT_DATA_FILENAME) + + def test_export_cbio_patient_data(self): + indiv = Individual.objects.filter(phenopackets=self.p) + with io.StringIO() as output: + IndividualExport(indiv, output) + # Check header + output.seek(0) + field_count = None + field_names = [] + for i, line in enumerate(output): + # 4 first header lines begin with `#` + if i < 4: + self.assertEqual(line[0], '#') + continue + + # Following lines are regular TSV formatted lines + pieces = line.rstrip().split('\t') + + # 5th line is a header with predefined field names + if i == 4: + field_count = len(pieces) + field_names = pieces + + # At least PATIENT_ID and SEX + self.assertGreaterEqual(field_count, 2) + self.assertIn('PATIENT_ID', pieces) + continue + + # TSV body. 
Inspect first line and break + self.assertEqual(field_count, len(pieces)) + record = dict(zip(field_names, pieces)) + + self.assertEqual(record["PATIENT_ID"], EXAMPLE_INGEST_PHENOPACKET["subject"]["id"]) + self.assertEqual(record["SEX"], EXAMPLE_INGEST_PHENOPACKET["subject"]["sex"]) + break + + def test_export_cbio_sample_data(self): + samples = PhModel.Biosample.objects.filter(phenopacket=self.p) + with io.StringIO() as output: + SampleExport(samples, output) + # Check header + output.seek(0) + field_count = None + field_names = [] + for i, line in enumerate(output): + # 4 first header lines begin with `#` + if i < 4: + self.assertEqual(line[0], '#') + continue + + # Following lines are regular TSV formatted lines + pieces = line.rstrip().split('\t') + + # 5th line is a header with predefined field names + if i == 4: + field_count = len(pieces) + field_names = pieces + + # At least PATIENT_ID and SAMPLE_ID + self.assertGreaterEqual(field_count, 2) + self.assertIn('PATIENT_ID', pieces) + self.assertIn('SAMPLE_ID', pieces) + continue + + # TSV body. 
Inspect first line and break + self.assertEqual(field_count, len(pieces)) + record = dict(zip(field_names, pieces)) + + self.assertEqual(record["PATIENT_ID"], EXAMPLE_INGEST_PHENOPACKET["subject"]["id"]) + self.assertEqual(record["SAMPLE_ID"], EXAMPLE_INGEST_PHENOPACKET["biosamples"][0]["id"]) + break From adbd946d4aa5dc36c2faa559e1aa5621f0a542c8 Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Wed, 6 Apr 2022 19:18:13 +0000 Subject: [PATCH 14/26] refactor: move ctx creation outside of export func --- chord_metadata_service/chord/export_cbio.py | 41 +++++++++---------- chord_metadata_service/chord/export_utils.py | 2 +- .../chord/tests/test_export_cbio.py | 15 +++---- 3 files changed, 25 insertions(+), 33 deletions(-) diff --git a/chord_metadata_service/chord/export_cbio.py b/chord_metadata_service/chord/export_cbio.py index 867494f6e..fae344cfd 100644 --- a/chord_metadata_service/chord/export_cbio.py +++ b/chord_metadata_service/chord/export_cbio.py @@ -1,8 +1,8 @@ import logging import csv -from typing import TextIO +from typing import TextIO, Callable -from .export_utils import ExportFileContext, ExportError +from .export_utils import ExportError from chord_metadata_service.chord.models import Dataset from chord_metadata_service.patients.models import Individual @@ -25,7 +25,7 @@ SAMPLE_DATATYPE = 'SAMPLE' -def StudyExport (tmp_path: str, dataset_id: str): +def StudyExport (getPath: Callable[[str], str], dataset_id: str): """Export a given Project as a cBioPortal study""" #TODO: a Dataset is a Study (associated with a publication), not a Project! 
if Dataset.objects.count == 0: @@ -33,29 +33,26 @@ def StudyExport (tmp_path: str, dataset_id: str): dataset = Dataset.objects.get(identifier=dataset_id) cbio_study_id = str(dataset.identifier) - # create a context wrapping a tmp folder for export - with ExportFileContext(tmp_path, cbio_study_id) as file_export: + # Export study file + with open(getPath(STUDY_FILENAME), 'w') as file_study: + StudyExportMeta(dataset, file_study) - # Export study file - with open(file_export.getPath(STUDY_FILENAME), 'w') as file_study: - StudyExportMeta(dataset, file_study) + # Export patients. + with open(getPath(PATIENT_DATA_FILENAME), 'w') as file_patient: + # Note: plural in `phenopackets` is intentional (related_name property in model) + indiv = Individual.objects.filter(phenopackets__table__ownership_record__dataset_id=dataset.identifier) + IndividualExport(indiv, file_patient) - # export patients. - with open(file_export.getPath(PATIENT_DATA_FILENAME), 'w') as file_patient: - # Note: plural in `phenopackets` is intentional (related_name property in model) - indiv = Individual.objects.filter(phenopackets__table__ownership_record__dataset_id=dataset.identifier) - IndividualExport(indiv, file_patient) + with open(getPath(PATIENT_META_FILENAME), 'w') as file_patient_meta: + ClinicalMetaExport(cbio_study_id, PATIENT_DATATYPE, file_patient_meta) - with open(file_export.getPath(PATIENT_META_FILENAME), 'w') as file_patient_meta: - ClinicalMetaExport(cbio_study_id, PATIENT_DATATYPE, file_patient_meta) + # Export samples + with open(getPath(SAMPLE_DATA_FILENAME), 'w') as file_sample: + sampl = pm.Biosample.objects.filter(phenopacket__table__ownership_record__dataset_id=dataset.identifier) + SampleExport(sampl, file_sample) - # export samples - with open(file_export.getPath(SAMPLE_DATA_FILENAME), 'w') as file_sample: - sampl = pm.Biosample.objects.filter(phenopacket__table__ownership_record__dataset_id=dataset.identifier) - SampleExport(sampl, file_sample) - - with 
open(file_export.getPath(SAMPLE_META_FILENAME), 'w') as file_sample_meta: - ClinicalMetaExport(cbio_study_id, SAMPLE_DATATYPE, file_sample_meta) + with open(getPath(SAMPLE_META_FILENAME), 'w') as file_sample_meta: + ClinicalMetaExport(cbio_study_id, SAMPLE_DATATYPE, file_sample_meta) diff --git a/chord_metadata_service/chord/export_utils.py b/chord_metadata_service/chord/export_utils.py index 0120a0e8d..ad4dae83e 100644 --- a/chord_metadata_service/chord/export_utils.py +++ b/chord_metadata_service/chord/export_utils.py @@ -60,5 +60,5 @@ def __exit__(self, exc_type, exc_value, traceback): if self.should_del and self.path: shutil.rmtree(self.path) - def getPath (self, filename: str): + def getPath (self, filename: str = ''): return os.path.join(self.path, filename) \ No newline at end of file diff --git a/chord_metadata_service/chord/tests/test_export_cbio.py b/chord_metadata_service/chord/tests/test_export_cbio.py index fc9822fe3..eb4a2dd38 100644 --- a/chord_metadata_service/chord/tests/test_export_cbio.py +++ b/chord_metadata_service/chord/tests/test_export_cbio.py @@ -1,5 +1,3 @@ -import shutil -import tempfile import uuid import io from typing import Dict, TextIO @@ -9,6 +7,7 @@ from chord_metadata_service.chord.export_cbio import PATIENT_DATA_FILENAME, PATIENT_DATATYPE, PATIENT_META_FILENAME, SAMPLE_DATA_FILENAME, SAMPLE_DATATYPE, SAMPLE_META_FILENAME, STUDY_FILENAME, ClinicalMetaExport, IndividualExport, SampleExport, StudyExport, StudyExportMeta from chord_metadata_service.chord.data_types import DATA_TYPE_PHENOPACKET, DATA_TYPE_EXPERIMENT +from chord_metadata_service.chord.export_utils import ExportFileContext from chord_metadata_service.chord.models import Project, Dataset, TableOwnership, Table # noinspection PyProtectedMember from chord_metadata_service.chord.ingest import ( @@ -65,10 +64,10 @@ def testFileCreation(self): Check files creation. Files content is tested subsequently with each file generating function. 
""" - tmp_dir = tempfile.mkdtemp() - try: - StudyExport(tmp_dir, self.study_id) - export_dir = path.join(tmp_dir, 'cbio_export', self.study_id) + + with ExportFileContext(None, self.study_id) as file_export: + StudyExport(file_export.getPath, self.study_id) + export_dir = file_export.getPath() self.assertTrue(path.exists(export_dir)) for (dirpath, dirnames, filenames) in walk(export_dir): filesSet = {*filenames} @@ -82,10 +81,6 @@ def testFileCreation(self): self.assertTrue(expectedSet.issubset(filesSet)) break # do not recurse the directory tree - # clean - finally: - shutil.rmtree(tmp_dir) - def test_export_cbio_study_meta(self): with io.StringIO() as output: From 24941fafb70159cfd7eedd5aaad3b5457712cc05 Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Wed, 6 Apr 2022 19:22:01 +0000 Subject: [PATCH 15/26] implement export request schema validation --- chord_metadata_service/chord/schemas.py | 19 +++++++++++++++++++ chord_metadata_service/chord/views_export.py | 18 +++++++----------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/chord_metadata_service/chord/schemas.py b/chord_metadata_service/chord/schemas.py index cb12e570c..e5539cdbd 100644 --- a/chord_metadata_service/chord/schemas.py +++ b/chord_metadata_service/chord/schemas.py @@ -31,3 +31,22 @@ "additionalProperties": False } } + +EXPORT_SCHEMA = { + "description": "Export endpoint", + "type": "object", + "properties": { + "object_type": { + "type": "string", + "enum": ["project", "dataset", "table"] + }, + "object_id": {"type": "string"}, + "format": { + "type": "string", + "enum": ["cbioportal"] + }, + "output_path": {"type": "string"} + }, + "required": ["object_type", "object_id", "format"], + "additionalProperties": False +} diff --git a/chord_metadata_service/chord/views_export.py b/chord_metadata_service/chord/views_export.py index bd9d58cd7..7ed288317 100644 --- a/chord_metadata_service/chord/views_export.py +++ b/chord_metadata_service/chord/views_export.py @@ -3,15 +3,13 @@ 
import traceback import uuid -# Can't because code expects `ingestion` namespace -#from jsonschema import Draft7Validator +from jsonschema import Draft7Validator from rest_framework.decorators import api_view, permission_classes from rest_framework.permissions import AllowAny from rest_framework.response import Response -# Can't because code expects `ingestion` namespace -#from bento_lib.schemas.bento import BENTO_INGEST_SCHEMA +from chord_metadata_service.chord.schemas import EXPORT_SCHEMA from bento_lib.responses import errors #from bento_lib.workflows import get_workflow, get_workflow_resource, workflow_exists @@ -21,9 +19,7 @@ from .ingest import METADATA_WORKFLOWS from .models import Dataset, Table - -# Can't because code expects `ingestion` namespace -#BENTO_INGEST_SCHEMA_VALIDATOR = Draft7Validator(BENTO_INGEST_SCHEMA) +BENTO_EXPORT_SCHEMA_VALIDATOR = Draft7Validator(EXPORT_SCHEMA) logger = logging.getLogger(__name__) @@ -32,7 +28,7 @@ # TODO: Ugly and misleading permissions @api_view(["POST"]) @permission_classes([AllowAny]) -def export(request): +def export(request: Request): # Export data from Katsu. # Private endpoints are protected by URL namespace, not by Django permissions. @@ -40,9 +36,9 @@ def export(request): logger.info(f"Received export request: {json.dumps(request.data)}") - # Can't because code expects `ingestion` namespace - # if not BENTO_INGEST_SCHEMA_VALIDATOR.is_valid(request.data): - # return Response(errors.bad_request_error("Invalid ingest request body"), status=400) # TODO: Validation errors + if not BENTO_EXPORT_SCHEMA_VALIDATOR.is_valid(request.data): + msg_list = [err.message for err in BENTO_EXPORT_SCHEMA_VALIDATOR.iter_errors(request.data)] + return Response(errors.bad_request_error("Invalid ingest request body: " + "\n".join(msg_list)), status=400) # TODO: Validation errors object_id = request.data["object_id"] object_type = request.data["object_type"] # 'dataset', 'table',... 
From 34453068c4067174bc7f1c95b6c405d47409b085 Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Wed, 6 Apr 2022 19:25:46 +0000 Subject: [PATCH 16/26] refactor export constants data structures --- chord_metadata_service/chord/export.py | 29 +++++++++++---- chord_metadata_service/chord/views_export.py | 38 ++++++++++---------- 2 files changed, 43 insertions(+), 24 deletions(-) diff --git a/chord_metadata_service/chord/export.py b/chord_metadata_service/chord/export.py index c12213f80..a6918e73a 100644 --- a/chord_metadata_service/chord/export.py +++ b/chord_metadata_service/chord/export.py @@ -1,15 +1,32 @@ import logging from chord_metadata_service.chord.ingest import WORKFLOW_CBIOPORTAL +from chord_metadata_service.chord.models import Dataset, Project, Table from .export_cbio import StudyExport as export_cbioportal_workflow -__all__ = [ - "WORKFLOW_EXPORT_FUNCTION_MAP", -] - logger = logging.getLogger(__name__) +OBJECT_TYPE_PROJECT = "project" +OBJECT_TYPE_DATASET = "dataset" +OBJECT_TYPE_TABLE = "table" + +EXPORT_OBJECT_TYPE = { + OBJECT_TYPE_PROJECT: { + "model": Project + }, + OBJECT_TYPE_DATASET: { + "model": Dataset + }, + OBJECT_TYPE_TABLE: { + "model": Table + }, +} + +EXPORT_FORMATS = { WORKFLOW_CBIOPORTAL } +EXPORT_FORMAT_FUNCTION_MAP = { + WORKFLOW_CBIOPORTAL: export_cbioportal_workflow +} -WORKFLOW_EXPORT_FUNCTION_MAP = { - WORKFLOW_CBIOPORTAL: export_cbioportal_workflow, +EXPORT_FORMAT_OBJECT_TYPE_MAP = { + WORKFLOW_CBIOPORTAL: { OBJECT_TYPE_DATASET } } \ No newline at end of file diff --git a/chord_metadata_service/chord/views_export.py b/chord_metadata_service/chord/views_export.py index 7ed288317..cb23a8fb6 100644 --- a/chord_metadata_service/chord/views_export.py +++ b/chord_metadata_service/chord/views_export.py @@ -7,17 +7,16 @@ from rest_framework.decorators import api_view, permission_classes from rest_framework.permissions import AllowAny from rest_framework.response import Response +from rest_framework.request import Request from 
chord_metadata_service.chord.schemas import EXPORT_SCHEMA from bento_lib.responses import errors #from bento_lib.workflows import get_workflow, get_workflow_resource, workflow_exists -from .export import WORKFLOW_EXPORT_FUNCTION_MAP -from .export_utils import ExportError +from .export import EXPORT_FORMAT_FUNCTION_MAP, EXPORT_FORMAT_OBJECT_TYPE_MAP, EXPORT_FORMATS, EXPORT_OBJECT_TYPE +from .export_utils import ExportError, ExportFileContext -from .ingest import METADATA_WORKFLOWS -from .models import Dataset, Table BENTO_EXPORT_SCHEMA_VALIDATOR = Draft7Validator(EXPORT_SCHEMA) @@ -41,26 +40,29 @@ def export(request: Request): return Response(errors.bad_request_error("Invalid ingest request body: " + "\n".join(msg_list)), status=400) # TODO: Validation errors object_id = request.data["object_id"] - object_type = request.data["object_type"] # 'dataset', 'table',... - - if (object_type == 'table' - and not Table.objects.filter(ownership_record_id=object_id).exists()): - return Response(errors.bad_request_error(f"Table with ID {object_id} does not exist"), status=400) - elif (object_type == 'dataset' - and not Dataset.objects.filter(identifier=object_id).exists()): - return Response(errors.bad_request_error(f"Dataset with ID {object_id} does not exist"), status=400) + object_type: str = request.data["object_type"] # 'dataset', 'table',... + + model = EXPORT_OBJECT_TYPE[object_type]["model"] + if not model.objects.filter(identifier=object_id).exists(): + return Response(errors.bad_request_error(f"{object_type.capitalize()} with ID {object_id} does not exist"), status=400) + #object_id = str(uuid.UUID(object_id)) # Normalize ID to UUID's str format. + + format = request.data["format"].strip() + output_path = request.data.get("output_path") # optional parameter - object_id = str(uuid.UUID(object_id)) # Normalize ID to UUID's str format. 
+ if not format in EXPORT_FORMATS: # Check that the workflow exists + return Response(errors.bad_request_error(f"Export in format {format} is not implemented"), status=400) - workflow_id = request.data["workflow_id"].strip() - workflow_exportpath = request.data["workflow_exportpath"] + if not object_type in EXPORT_FORMAT_OBJECT_TYPE_MAP[format]: + return Response(errors.bad_request_error(f"Exporting entities of type {object_type} in format {format} is not implemented"), status=400) - if not workflow_id in METADATA_WORKFLOWS.export: # Check that the workflow exists - return Response(errors.bad_request_error(f"Workflow with ID {workflow_id} does not exist"), status=400) + # TODO: secure the output_path value try: - WORKFLOW_EXPORT_FUNCTION_MAP[workflow_id](workflow_exportpath, object_id) + with ExportFileContext(output_path, object_id) as file_export: + # Pass a callable to generate the proper file paths within the export context. + EXPORT_FORMAT_FUNCTION_MAP[format](file_export.getPath, object_id) except ExportError as e: return Response(errors.bad_request_error(f"Encountered export error: {e}"), status=400) From 220deaf6939c9fb4b337fedf23e23efba2485681 Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Wed, 6 Apr 2022 19:26:21 +0000 Subject: [PATCH 17/26] implement export as streamed tar file --- chord_metadata_service/chord/export_utils.py | 26 +++++++++++++++++--- chord_metadata_service/chord/views_export.py | 16 ++++++++++-- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/chord_metadata_service/chord/export_utils.py b/chord_metadata_service/chord/export_utils.py index ad4dae83e..45264d49a 100644 --- a/chord_metadata_service/chord/export_utils.py +++ b/chord_metadata_service/chord/export_utils.py @@ -1,6 +1,7 @@ import logging import os import shutil +import tarfile import tempfile from django.conf import settings @@ -12,6 +13,7 @@ logger = logging.getLogger(__name__) +EXPORT_DIR = 'export' class ExportError(Exception): pass @@ -23,6 +25,8 @@ class 
ExportFileContext: """ path = "" should_del = False + base_path = "" + project_id = '' def __init__(self, tmp_dir: str, project_id: str): tmp_dir = tmp_dir or settings.SERVICE_TEMP @@ -34,9 +38,11 @@ def __init__(self, tmp_dir: str, project_id: str): if not os.access(tmp_dir, os.W_OK): raise ExportError(f"Directory does not exist or is not writable: {tmp_dir}") + self.base_path = tmp_dir + self.project_id = project_id + try: - tmp_dir = tmp_dir.rstrip("/") + "/cbio_export/" - self.path = os.path.join(tmp_dir, project_id) + self.path = os.path.join(tmp_dir, EXPORT_DIR, project_id) #clean pre-existing export dir isExistant = os.path.exists(self.path) @@ -61,4 +67,18 @@ def __exit__(self, exc_type, exc_value, traceback): shutil.rmtree(self.path) def getPath (self, filename: str = ''): - return os.path.join(self.path, filename) \ No newline at end of file + return os.path.join(self.path, filename) + + def writeTar (self): + tar_path = os.path.join(self.base_path, EXPORT_DIR, self.project_id + '.tar.gz') + with tarfile.open(tar_path, 'w:gz') as tar: + output_dir = self.getPath() + tar.add(output_dir, filter=resetTarInfo) + return tar_path + +def resetTarInfo(info: tarfile.TarInfo) -> tarfile.TarInfo: + info.gid = 0 + info.uid = 0 + info.uname = 'root' + info.gname = 'root' + return info \ No newline at end of file diff --git a/chord_metadata_service/chord/views_export.py b/chord_metadata_service/chord/views_export.py index cb23a8fb6..50ba487bd 100644 --- a/chord_metadata_service/chord/views_export.py +++ b/chord_metadata_service/chord/views_export.py @@ -1,7 +1,9 @@ import json import logging import traceback -import uuid +#import uuid + +from django.http import FileResponse from jsonschema import Draft7Validator from rest_framework.decorators import api_view, permission_classes @@ -45,7 +47,7 @@ def export(request: Request): model = EXPORT_OBJECT_TYPE[object_type]["model"] if not model.objects.filter(identifier=object_id).exists(): return 
Response(errors.bad_request_error(f"{object_type.capitalize()} with ID {object_id} does not exist"), status=400) - + #object_id = str(uuid.UUID(object_id)) # Normalize ID to UUID's str format. format = request.data["format"].strip() @@ -64,6 +66,16 @@ def export(request: Request): # Pass a callable to generate the proper file paths within the export context. EXPORT_FORMAT_FUNCTION_MAP[format](file_export.getPath, object_id) + # If no output path parameter has been provided, the generated export + # is returned as an attachment to the Response and everything will + # be cleaned afterwards. + # Otherwise, the provided local path is under the responsibility of + # the caller + if not output_path: + tarfile = file_export.writeTar() + return FileResponse(open(tarfile, "rb"), as_attachment=True) + + except ExportError as e: return Response(errors.bad_request_error(f"Encountered export error: {e}"), status=400) From 61eee2d6ce9412a04fe9dded60d4d15bad4a26b7 Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Thu, 7 Apr 2022 19:17:45 +0000 Subject: [PATCH 18/26] define a set of all exported files --- chord_metadata_service/chord/export_cbio.py | 8 ++++++++ .../chord/tests/test_export_cbio.py | 11 ++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/chord_metadata_service/chord/export_cbio.py b/chord_metadata_service/chord/export_cbio.py index fae344cfd..3091d168e 100644 --- a/chord_metadata_service/chord/export_cbio.py +++ b/chord_metadata_service/chord/export_cbio.py @@ -21,6 +21,14 @@ PATIENT_DATA_FILENAME = "data_clinical_patient.txt" PATIENT_META_FILENAME = "meta_clinical_patient.txt" +CBIO_FILES_SET = frozenset({ + STUDY_FILENAME, + SAMPLE_DATA_FILENAME, + SAMPLE_META_FILENAME, + PATIENT_DATA_FILENAME, + PATIENT_META_FILENAME +}) + PATIENT_DATATYPE = 'PATIENT' SAMPLE_DATATYPE = 'SAMPLE' diff --git a/chord_metadata_service/chord/tests/test_export_cbio.py b/chord_metadata_service/chord/tests/test_export_cbio.py index eb4a2dd38..b455d9a84 100644 ---
a/chord_metadata_service/chord/tests/test_export_cbio.py +++ b/chord_metadata_service/chord/tests/test_export_cbio.py @@ -5,7 +5,7 @@ from django.test import TestCase -from chord_metadata_service.chord.export_cbio import PATIENT_DATA_FILENAME, PATIENT_DATATYPE, PATIENT_META_FILENAME, SAMPLE_DATA_FILENAME, SAMPLE_DATATYPE, SAMPLE_META_FILENAME, STUDY_FILENAME, ClinicalMetaExport, IndividualExport, SampleExport, StudyExport, StudyExportMeta +from chord_metadata_service.chord.export_cbio import CBIO_FILES_SET, PATIENT_DATA_FILENAME, PATIENT_DATATYPE, PATIENT_META_FILENAME, SAMPLE_DATA_FILENAME, SAMPLE_DATATYPE, SAMPLE_META_FILENAME, STUDY_FILENAME, ClinicalMetaExport, IndividualExport, SampleExport, StudyExport, StudyExportMeta from chord_metadata_service.chord.data_types import DATA_TYPE_PHENOPACKET, DATA_TYPE_EXPERIMENT from chord_metadata_service.chord.export_utils import ExportFileContext from chord_metadata_service.chord.models import Project, Dataset, TableOwnership, Table @@ -71,14 +71,7 @@ def testFileCreation(self): self.assertTrue(path.exists(export_dir)) for (dirpath, dirnames, filenames) in walk(export_dir): filesSet = {*filenames} - expectedSet = { - STUDY_FILENAME, - SAMPLE_DATA_FILENAME, - SAMPLE_META_FILENAME, - PATIENT_META_FILENAME, - PATIENT_META_FILENAME - } - self.assertTrue(expectedSet.issubset(filesSet)) + self.assertTrue(CBIO_FILES_SET.issubset(filesSet)) break # do not recurse the directory tree From d959daa783270282640360ba64c10f1892a3623e Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Thu, 7 Apr 2022 19:19:04 +0000 Subject: [PATCH 19/26] Export API tests --- .../chord/tests/test_api_export.py | 99 +++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 chord_metadata_service/chord/tests/test_api_export.py diff --git a/chord_metadata_service/chord/tests/test_api_export.py b/chord_metadata_service/chord/tests/test_api_export.py new file mode 100644 index 000000000..320bfc82c --- /dev/null +++ 
b/chord_metadata_service/chord/tests/test_api_export.py @@ -0,0 +1,99 @@ +import json +import os +import shutil +import tempfile +import uuid + +from django.test import override_settings +from django.urls import reverse +from chord_metadata_service.chord.export_cbio import CBIO_FILES_SET +from chord_metadata_service.chord.export_utils import EXPORT_DIR +from rest_framework import status +from rest_framework.test import APITestCase + +from ..views_ingest import METADATA_WORKFLOWS +from chord_metadata_service.chord.data_types import DATA_TYPE_PHENOPACKET, DATA_TYPE_EXPERIMENT +from chord_metadata_service.chord.models import Project, Dataset, TableOwnership, Table +# noinspection PyProtectedMember +from chord_metadata_service.chord.ingest import ( + WORKFLOW_PHENOPACKETS_JSON, + WORKFLOW_INGEST_FUNCTION_MAP, +) + +from .constants import VALID_DATA_USE_1 +from .example_ingest import ( + EXAMPLE_INGEST_OUTPUTS, +) + + +def generate_phenopackets_ingest(table_id): + return { + "table_id": table_id, + "workflow_id": "phenopackets_json", + "workflow_metadata": METADATA_WORKFLOWS["ingestion"]["phenopackets_json"], + "workflow_outputs": { + "json_document": "" # TODO + }, + "workflow_params": { + "json_document": "" # TODO + } + } + + +class ExportTest(APITestCase): + def setUp(self) -> None: + # Creates a test database and populate with a phenopacket test file + + p = Project.objects.create(title="Project 1", description="") + self.d = Dataset.objects.create(title="Dataset 1", description="Some dataset", data_use=VALID_DATA_USE_1, + project=p) + self.study_id = str(self.d.identifier) + + # TODO: Real service ID + # table for phenopackets + to = TableOwnership.objects.create(table_id=uuid.uuid4(), service_id=uuid.uuid4(), service_artifact="metadata", + dataset=self.d) + self.t = Table.objects.create(ownership_record=to, name="Table 1", data_type=DATA_TYPE_PHENOPACKET) + + # table for experiments metadata + to_exp = TableOwnership.objects.create(table_id=uuid.uuid4(), 
service_id=uuid.uuid4(), + service_artifact="experiments", dataset=self.d) + self.t_exp = Table.objects.create(ownership_record=to_exp, name="Table 2", data_type=DATA_TYPE_EXPERIMENT) + + self.p = WORKFLOW_INGEST_FUNCTION_MAP[WORKFLOW_PHENOPACKETS_JSON](EXAMPLE_INGEST_OUTPUTS, self.t.identifier) + + + @override_settings(AUTH_OVERRIDE=True) # For permissions + def test_export_cbio(self): + # Test with no export body + r = self.client.post(reverse("export"), content_type="application/json") + self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) + + try: + tmp_dir = tempfile.mkdtemp() + + export_payload = { + "format": "cbioportal", + "object_type": "dataset", + "object_id": self.study_id, + } + + # Test with no output_path: expect a tar archive to be returned + r = self.client.post(reverse("export"), data=json.dumps(export_payload), content_type="application/json") + self.assertEquals(r.get('Content-Disposition'), f"attachment; filename=\"{self.study_id}.tar.gz\"") + + # Test with output_path provided: expect files created in this directory + export_payload["output_path"] = tmp_dir + + r = self.client.post(reverse("export"), data=json.dumps(export_payload), content_type="application/json") + self.assertEqual(r.status_code, status.HTTP_204_NO_CONTENT) + # TODO: just write within the directory that has been provided + export_path = os.path.join(tmp_dir, EXPORT_DIR, self.study_id) + self.assertTrue(os.path.exists(export_path)) + for export_file in CBIO_FILES_SET: + self.assertTrue(os.path.exists(os.path.join(export_path, export_file))) + + finally: + shutil.rmtree(tmp_dir) + + # TODO: More From 4ffbf59cc1c6a78a3254594a53079e2da414f8fd Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Thu, 7 Apr 2022 20:19:55 +0000 Subject: [PATCH 20/26] add comments --- chord_metadata_service/chord/export_utils.py | 24 +++++++++++++++++++- chord_metadata_service/chord/views_export.py | 12 +++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git 
a/chord_metadata_service/chord/export_utils.py b/chord_metadata_service/chord/export_utils.py index 45264d49a..ac521323e 100644 --- a/chord_metadata_service/chord/export_utils.py +++ b/chord_metadata_service/chord/export_utils.py @@ -20,8 +20,17 @@ class ExportError(Exception): class ExportFileContext: """ - Context manager around the tmp export directory for a given study + File context manager around a tmp export directory for a given study identifier. + When no temp directory is provided, this context takes care of removing the + temp directories created with their contents. + + Attributes: + tmp_dir: path to the directory where the exported files are written. + Can be None. In that case the files are written to a tmp directory + on the system and cleaned once the context manager finishes. + project_id: name that will be used to namespace the export directory. + This is also used for the archive filename by the writeTar() method """ path = "" should_del = False @@ -67,9 +76,22 @@ def __exit__(self, exc_type, exc_value, traceback): shutil.rmtree(self.path) def getPath (self, filename: str = ''): + """Returns a path within the export directory + + Attributes: + filename: optional filename to use + """ return os.path.join(self.path, filename) def writeTar (self): + """Creates a tar gzipped archive from the export directory content + + Note that the tar file is created inside the context of this ExportFileContext + class. 
If no path was provided at the time of the context creation, + then the generated tar file will be deleted along with the tmp directory + + Return: path to the generated tar file + """ tar_path = os.path.join(self.base_path, EXPORT_DIR, self.project_id + '.tar.gz') with tarfile.open(tar_path, 'w:gz') as tar: output_dir = self.getPath() diff --git a/chord_metadata_service/chord/views_export.py b/chord_metadata_service/chord/views_export.py index 50ba487bd..1d75cdf67 100644 --- a/chord_metadata_service/chord/views_export.py +++ b/chord_metadata_service/chord/views_export.py @@ -30,7 +30,17 @@ @api_view(["POST"]) @permission_classes([AllowAny]) def export(request: Request): - # Export data from Katsu. + """Export data from Katsu + + Exports the requested data object (e.g. a Dataset or a Project) in the given + format. + Note that the generated files will be either written locally if a path is + provided, or downloaded as a tar gzipped attachment otherwise. + + Args: + request: Django Rest Framework request object. The data property contains + the payload as a JSON following the export schema. + """ # Private endpoints are protected by URL namespace, not by Django permissions. 
# TODO: Schema for OpenAPI doc From 168d4584faa91309a6835fade7247e36962522f1 Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Fri, 8 Apr 2022 08:46:39 -0400 Subject: [PATCH 21/26] commented out code cleaning --- chord_metadata_service/chord/views_export.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/chord_metadata_service/chord/views_export.py b/chord_metadata_service/chord/views_export.py index 1d75cdf67..ea9650095 100644 --- a/chord_metadata_service/chord/views_export.py +++ b/chord_metadata_service/chord/views_export.py @@ -1,7 +1,6 @@ import json import logging import traceback -#import uuid from django.http import FileResponse @@ -14,7 +13,6 @@ from chord_metadata_service.chord.schemas import EXPORT_SCHEMA from bento_lib.responses import errors -#from bento_lib.workflows import get_workflow, get_workflow_resource, workflow_exists from .export import EXPORT_FORMAT_FUNCTION_MAP, EXPORT_FORMAT_OBJECT_TYPE_MAP, EXPORT_FORMATS, EXPORT_OBJECT_TYPE from .export_utils import ExportError, ExportFileContext @@ -58,8 +56,6 @@ def export(request: Request): if not model.objects.filter(identifier=object_id).exists(): return Response(errors.bad_request_error(f"{object_type.capitalize()} with ID {object_id} does not exist"), status=400) - #object_id = str(uuid.UUID(object_id)) # Normalize ID to UUID's str format. 
- format = request.data["format"].strip() output_path = request.data.get("output_path") # optional parameter From 0cb209809248fd7a195125371cb6a863d2a2ae3d Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Fri, 8 Apr 2022 10:15:03 -0400 Subject: [PATCH 22/26] code linting --- chord_metadata_service/chord/export.py | 6 +-- chord_metadata_service/chord/export_cbio.py | 45 +++++++++---------- chord_metadata_service/chord/export_utils.py | 18 ++++---- .../chord/tests/test_api_export.py | 3 +- .../chord/tests/test_export_cbio.py | 14 +++++- chord_metadata_service/chord/views_export.py | 32 ++++++++----- 6 files changed, 70 insertions(+), 48 deletions(-) diff --git a/chord_metadata_service/chord/export.py b/chord_metadata_service/chord/export.py index a6918e73a..a9768cc4b 100644 --- a/chord_metadata_service/chord/export.py +++ b/chord_metadata_service/chord/export.py @@ -21,12 +21,12 @@ }, } -EXPORT_FORMATS = { WORKFLOW_CBIOPORTAL } +EXPORT_FORMATS = {WORKFLOW_CBIOPORTAL} EXPORT_FORMAT_FUNCTION_MAP = { WORKFLOW_CBIOPORTAL: export_cbioportal_workflow } EXPORT_FORMAT_OBJECT_TYPE_MAP = { - WORKFLOW_CBIOPORTAL: { OBJECT_TYPE_DATASET } -} \ No newline at end of file + WORKFLOW_CBIOPORTAL: {OBJECT_TYPE_DATASET} +} diff --git a/chord_metadata_service/chord/export_cbio.py b/chord_metadata_service/chord/export_cbio.py index 3091d168e..a11200469 100644 --- a/chord_metadata_service/chord/export_cbio.py +++ b/chord_metadata_service/chord/export_cbio.py @@ -15,9 +15,9 @@ logger = logging.getLogger(__name__) # predefined filenames recognized by cBioPortal -STUDY_FILENAME = "meta_study.txt" -SAMPLE_DATA_FILENAME = "data_clinical_sample.txt" -SAMPLE_META_FILENAME = "meta_clinical_sample.txt" +STUDY_FILENAME = "meta_study.txt" +SAMPLE_DATA_FILENAME = "data_clinical_sample.txt" +SAMPLE_META_FILENAME = "meta_clinical_sample.txt" PATIENT_DATA_FILENAME = "data_clinical_patient.txt" PATIENT_META_FILENAME = "meta_clinical_patient.txt" @@ -30,12 +30,12 @@ }) PATIENT_DATATYPE = 'PATIENT' 
-SAMPLE_DATATYPE = 'SAMPLE' +SAMPLE_DATATYPE = 'SAMPLE' -def StudyExport (getPath: Callable[[str], str], dataset_id: str): +def StudyExport(getPath: Callable[[str], str], dataset_id: str): """Export a given Project as a cBioPortal study""" - #TODO: a Dataset is a Study (associated with a publication), not a Project! + # TODO: a Dataset is a Study (associated with a publication), not a Project! if Dataset.objects.count == 0: raise ExportError("No Dataset to export") dataset = Dataset.objects.get(identifier=dataset_id) @@ -63,21 +63,19 @@ def StudyExport (getPath: Callable[[str], str], dataset_id: str): ClinicalMetaExport(cbio_study_id, SAMPLE_DATATYPE, file_sample_meta) - - -def StudyExportMeta (dataset: Dataset, file_handle: TextIO): +def StudyExportMeta(dataset: Dataset, file_handle: TextIO): """ Study meta data file generation """ lines = dict() - lines['type_of_cancer'] = "mixed" #TODO: find if this information is available. !IMPORTANT! uses Oncotree codes + lines['type_of_cancer'] = "mixed" # TODO: find if this information is available. !IMPORTANT! uses Oncotree codes lines['cancer_study_identifier'] = str(dataset.identifier) - lines['name'] = dataset.title - lines['description'] = dataset.description + lines['name'] = dataset.title + lines['description'] = dataset.description # optional fields if len(dataset.primary_publications): - lines['citation'] = dataset.primary_publications[0] + lines['citation'] = dataset.primary_publications[0] # pmid: unvailable # groups: unused for authentication # add_global_case_list: ? 
@@ -88,7 +86,7 @@ def StudyExportMeta (dataset: Dataset, file_handle: TextIO): file_handle.write(f"{field}: {value}\n") -def ClinicalMetaExport (study_id: str, datatype: str, file_handle: TextIO): +def ClinicalMetaExport(study_id: str, datatype: str, file_handle: TextIO): """ Clinical Metadata files generation (samples or patients) """ @@ -140,7 +138,7 @@ def IndividualExport(results, file_handle: TextIO): dict_writer.writerows(individuals) -def SampleExport (results, file_handle: TextIO): +def SampleExport(results, file_handle: TextIO): """ Renders Biosamples as a clinical_sample text file suitable for importing by cBioPortal. @@ -158,7 +156,8 @@ def SampleExport (results, file_handle: TextIO): - SAMPLE_DISPLAY_NAME - SAMPLE_CLASS - METASTATIC_SITE / PRIMARY_SITE overrides the patients level attribute TUMOR_SITE - - SAMPLE_TYPE, TUMOR_TISSUE_SITE, TUMOR_TYPE can have the following values (are displayed with a distinct color in the timelines): + - SAMPLE_TYPE, TUMOR_TISSUE_SITE, TUMOR_TYPE can have the following values + (are displayed with a distinct color in the timelines): - "recurrence", "recurred", "progression" - "metastatic", "metastasis" - "primary" or any other value @@ -200,10 +199,10 @@ class cbioportal_clinical_header_generator (): fields_mapping = {} - def __init__(self, mappings = {}): + def __init__(self, mappings={}): self.fields_mapping = mappings - def make_header (self, fields: list): + def make_header(self, fields: list): """ Maps a list of field names to a 5 rows header suitable for cBioPortal clinical data files. @@ -241,8 +240,7 @@ def make_header (self, fields: list): return cbio_header - -def individual_to_patient_header (fields: list): +def individual_to_patient_header(fields: list): """ Maps a list of Individual field names to a 5 rows header suitable for cBioPortal data_clinical_patient.txt file. 
@@ -254,10 +252,11 @@ def individual_to_patient_header (fields: list): 'sex': ('Sex', 'Sex', 'STRING', '1', 'SEX'), } - cbio_header = cbioportal_clinical_header_generator(fields_mapping); + cbio_header = cbioportal_clinical_header_generator(fields_mapping) return cbio_header.make_header(fields) -def biosample_to_sample_header (fields: list): + +def biosample_to_sample_header(fields: list): """ Maps a list of biosamples field names to a 5 rows header suitable for cBioPortal data_sample_patient.txt file. @@ -270,5 +269,5 @@ def biosample_to_sample_header (fields: list): 'tissue_label': ('Sampled Tissue', 'Sampled Tissue', 'STRING', '1', 'TISSUE_LABEL') } - cbio_header = cbioportal_clinical_header_generator(fields_mapping); + cbio_header = cbioportal_clinical_header_generator(fields_mapping) return cbio_header.make_header(fields) diff --git a/chord_metadata_service/chord/export_utils.py b/chord_metadata_service/chord/export_utils.py index ac521323e..795172003 100644 --- a/chord_metadata_service/chord/export_utils.py +++ b/chord_metadata_service/chord/export_utils.py @@ -15,9 +15,11 @@ EXPORT_DIR = 'export' + class ExportError(Exception): pass + class ExportFileContext: """ File context manager around a tmp export directory for a given study @@ -49,11 +51,11 @@ def __init__(self, tmp_dir: str, project_id: str): self.base_path = tmp_dir self.project_id = project_id - + try: self.path = os.path.join(tmp_dir, EXPORT_DIR, project_id) - #clean pre-existing export dir + # clean pre-existing export dir isExistant = os.path.exists(self.path) if isExistant: shutil.rmtree(self.path) @@ -67,7 +69,6 @@ def __init__(self, tmp_dir: str, project_id: str): finally: os.umask(original_umask) - def __enter__(self): return self @@ -75,17 +76,17 @@ def __exit__(self, exc_type, exc_value, traceback): if self.should_del and self.path: shutil.rmtree(self.path) - def getPath (self, filename: str = ''): + def getPath(self, filename: str = ''): """Returns a path within the export directory - + 
Attributes: filename: optional filename to use """ return os.path.join(self.path, filename) - def writeTar (self): + def writeTar(self): """Creates a tar gzipped archive from the export directory content - + Note that the tar file is created inside the context of this ExportFileContext class. If no path was provided at the time of the context creation, then the generated tar file will be deleted along with the tmp directory @@ -98,9 +99,10 @@ def writeTar (self): tar.add(output_dir, filter=resetTarInfo) return tar_path + def resetTarInfo(info: tarfile.TarInfo) -> tarfile.TarInfo: info.gid = 0 info.uid = 0 info.uname = 'root' info.gname = 'root' - return info \ No newline at end of file + return info diff --git a/chord_metadata_service/chord/tests/test_api_export.py b/chord_metadata_service/chord/tests/test_api_export.py index 320bfc82c..374eb7b8d 100644 --- a/chord_metadata_service/chord/tests/test_api_export.py +++ b/chord_metadata_service/chord/tests/test_api_export.py @@ -42,7 +42,7 @@ def generate_phenopackets_ingest(table_id): class ExportTest(APITestCase): def setUp(self) -> None: - # Creates a test database and populate with a phenopacket test file + # Creates a test database and populate with a phenopacket test file p = Project.objects.create(title="Project 1", description="") self.d = Dataset.objects.create(title="Dataset 1", description="Some dataset", data_use=VALID_DATA_USE_1, @@ -62,7 +62,6 @@ def setUp(self) -> None: self.p = WORKFLOW_INGEST_FUNCTION_MAP[WORKFLOW_PHENOPACKETS_JSON](EXAMPLE_INGEST_OUTPUTS, self.t.identifier) - @override_settings(AUTH_OVERRIDE=True) # For permissions def test_export_cbio(self): # Test with no export body diff --git a/chord_metadata_service/chord/tests/test_export_cbio.py b/chord_metadata_service/chord/tests/test_export_cbio.py index b455d9a84..60f29191f 100644 --- a/chord_metadata_service/chord/tests/test_export_cbio.py +++ b/chord_metadata_service/chord/tests/test_export_cbio.py @@ -5,7 +5,18 @@ from django.test import 
TestCase -from chord_metadata_service.chord.export_cbio import CBIO_FILES_SET, PATIENT_DATA_FILENAME, PATIENT_DATATYPE, PATIENT_META_FILENAME, SAMPLE_DATA_FILENAME, SAMPLE_DATATYPE, SAMPLE_META_FILENAME, STUDY_FILENAME, ClinicalMetaExport, IndividualExport, SampleExport, StudyExport, StudyExportMeta +from chord_metadata_service.chord.export_cbio import ( + CBIO_FILES_SET, + PATIENT_DATA_FILENAME, + PATIENT_DATATYPE, + SAMPLE_DATA_FILENAME, + SAMPLE_DATATYPE, + ClinicalMetaExport, + IndividualExport, + SampleExport, + StudyExport, + StudyExportMeta +) from chord_metadata_service.chord.data_types import DATA_TYPE_PHENOPACKET, DATA_TYPE_EXPERIMENT from chord_metadata_service.chord.export_utils import ExportFileContext from chord_metadata_service.chord.models import Project, Dataset, TableOwnership, Table @@ -74,7 +85,6 @@ def testFileCreation(self): self.assertTrue(CBIO_FILES_SET.issubset(filesSet)) break # do not recurse the directory tree - def test_export_cbio_study_meta(self): with io.StringIO() as output: StudyExportMeta(self.d, output) diff --git a/chord_metadata_service/chord/views_export.py b/chord_metadata_service/chord/views_export.py index ea9650095..e9415d229 100644 --- a/chord_metadata_service/chord/views_export.py +++ b/chord_metadata_service/chord/views_export.py @@ -47,23 +47,35 @@ def export(request: Request): if not BENTO_EXPORT_SCHEMA_VALIDATOR.is_valid(request.data): msg_list = [err.message for err in BENTO_EXPORT_SCHEMA_VALIDATOR.iter_errors(request.data)] - return Response(errors.bad_request_error("Invalid ingest request body: " + "\n".join(msg_list)), status=400) # TODO: Validation errors + return Response(errors.bad_request_error( + "Invalid ingest request body: " + "\n".join(msg_list)), + status=400 # TODO: Validation errors + ) object_id = request.data["object_id"] object_type: str = request.data["object_type"] # 'dataset', 'table',... 
model = EXPORT_OBJECT_TYPE[object_type]["model"] if not model.objects.filter(identifier=object_id).exists(): - return Response(errors.bad_request_error(f"{object_type.capitalize()} with ID {object_id} does not exist"), status=400) + return Response(errors.bad_request_error( + f"{object_type.capitalize()} with ID {object_id} does not exist"), + status=400 + ) format = request.data["format"].strip() output_path = request.data.get("output_path") # optional parameter - if not format in EXPORT_FORMATS: # Check that the workflow exists - return Response(errors.bad_request_error(f"Export in format {format} is not implemented"), status=400) + if format not in EXPORT_FORMATS: # Check that the workflow exists + return Response(errors.bad_request_error( + f"Export in format {format} is not implemented"), + status=400 + ) - if not object_type in EXPORT_FORMAT_OBJECT_TYPE_MAP[format]: - return Response(errors.bad_request_error(f"Exporting entities of type {object_type} in format {format} is not implemented"), status=400) + if object_type not in EXPORT_FORMAT_OBJECT_TYPE_MAP[format]: + return Response(errors.bad_request_error( + f"Exporting entities of type {object_type} in format {format} is not implemented"), + status=400 + ) # TODO: secure the output_path value @@ -81,15 +93,15 @@ def export(request: Request): tarfile = file_export.writeTar() return FileResponse(open(tarfile, "rb"), as_attachment=True) - except ExportError as e: return Response(errors.bad_request_error(f"Encountered export error: {e}"), status=400) - except Exception as e: # Encountered some other error from the export attempt, return a somewhat detailed message logger.error(f"Encountered an exception while processing an export attempt:\n{traceback.format_exc()}") - return Response(errors.internal_server_error(f"Encountered an exception while processing an export attempt " - f"(error: {repr(e)}"), status=500) + return Response(errors.internal_server_error( + f"Encountered an exception while processing an 
export attempt (error: {repr(e)}"), + status=500 + ) return Response(status=204) From fc423482f14655cd2b92ae32f61bf7ad1c6e338d Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Tue, 19 Apr 2022 08:19:26 -0400 Subject: [PATCH 23/26] code styling --- chord_metadata_service/chord/export.py | 2 +- chord_metadata_service/chord/export_cbio.py | 28 +++++++-------- chord_metadata_service/chord/export_utils.py | 10 +++--- .../chord/tests/test_export_cbio.py | 34 +++++++++---------- chord_metadata_service/chord/views_export.py | 4 +-- 5 files changed, 39 insertions(+), 39 deletions(-) diff --git a/chord_metadata_service/chord/export.py b/chord_metadata_service/chord/export.py index a9768cc4b..78644bca5 100644 --- a/chord_metadata_service/chord/export.py +++ b/chord_metadata_service/chord/export.py @@ -1,7 +1,7 @@ import logging from chord_metadata_service.chord.ingest import WORKFLOW_CBIOPORTAL from chord_metadata_service.chord.models import Dataset, Project, Table -from .export_cbio import StudyExport as export_cbioportal_workflow +from .export_cbio import study_export as export_cbioportal_workflow logger = logging.getLogger(__name__) diff --git a/chord_metadata_service/chord/export_cbio.py b/chord_metadata_service/chord/export_cbio.py index a11200469..95920b055 100644 --- a/chord_metadata_service/chord/export_cbio.py +++ b/chord_metadata_service/chord/export_cbio.py @@ -9,7 +9,7 @@ from chord_metadata_service.phenopackets import models as pm __all__ = [ - "StudyExport", + "study_export", ] logger = logging.getLogger(__name__) @@ -33,7 +33,7 @@ SAMPLE_DATATYPE = 'SAMPLE' -def StudyExport(getPath: Callable[[str], str], dataset_id: str): +def study_export(getPath: Callable[[str], str], dataset_id: str): """Export a given Project as a cBioPortal study""" # TODO: a Dataset is a Study (associated with a publication), not a Project! 
if Dataset.objects.count == 0: @@ -43,27 +43,27 @@ def StudyExport(getPath: Callable[[str], str], dataset_id: str): # Export study file with open(getPath(STUDY_FILENAME), 'w') as file_study: - StudyExportMeta(dataset, file_study) + study_export_meta(dataset, file_study) # Export patients. with open(getPath(PATIENT_DATA_FILENAME), 'w') as file_patient: # Note: plural in `phenopackets` is intentional (related_name property in model) indiv = Individual.objects.filter(phenopackets__table__ownership_record__dataset_id=dataset.identifier) - IndividualExport(indiv, file_patient) + individual_export(indiv, file_patient) with open(getPath(PATIENT_META_FILENAME), 'w') as file_patient_meta: - ClinicalMetaExport(cbio_study_id, PATIENT_DATATYPE, file_patient_meta) + clinical_meta_export(cbio_study_id, PATIENT_DATATYPE, file_patient_meta) # Export samples with open(getPath(SAMPLE_DATA_FILENAME), 'w') as file_sample: sampl = pm.Biosample.objects.filter(phenopacket__table__ownership_record__dataset_id=dataset.identifier) - SampleExport(sampl, file_sample) + sample_export(sampl, file_sample) with open(getPath(SAMPLE_META_FILENAME), 'w') as file_sample_meta: - ClinicalMetaExport(cbio_study_id, SAMPLE_DATATYPE, file_sample_meta) + clinical_meta_export(cbio_study_id, SAMPLE_DATATYPE, file_sample_meta) -def StudyExportMeta(dataset: Dataset, file_handle: TextIO): +def study_export_meta(dataset: Dataset, file_handle: TextIO): """ Study meta data file generation """ @@ -86,7 +86,7 @@ def StudyExportMeta(dataset: Dataset, file_handle: TextIO): file_handle.write(f"{field}: {value}\n") -def ClinicalMetaExport(study_id: str, datatype: str, file_handle: TextIO): +def clinical_meta_export(study_id: str, datatype: str, file_handle: TextIO): """ Clinical Metadata files generation (samples or patients) """ @@ -104,7 +104,7 @@ def ClinicalMetaExport(study_id: str, datatype: str, file_handle: TextIO): file_handle.write(f"{field}: {value}\n") -def IndividualExport(results, file_handle: TextIO): +def 
individual_export(results, file_handle: TextIO): """ Renders Individuals as a clinical_patient text file suitable for importing by cBioPortal. @@ -138,7 +138,7 @@ def IndividualExport(results, file_handle: TextIO): dict_writer.writerows(individuals) -def SampleExport(results, file_handle: TextIO): +def sample_export(results, file_handle: TextIO): """ Renders Biosamples as a clinical_sample text file suitable for importing by cBioPortal. @@ -192,7 +192,7 @@ def SampleExport(results, file_handle: TextIO): dict_writer.writerows(samples) -class cbioportal_clinical_header_generator (): +class CbioportalClinicalHeaderGenerator(): """ Generates cBioPortal data files headers based on field names from katsu models. """ @@ -252,7 +252,7 @@ def individual_to_patient_header(fields: list): 'sex': ('Sex', 'Sex', 'STRING', '1', 'SEX'), } - cbio_header = cbioportal_clinical_header_generator(fields_mapping) + cbio_header = CbioportalClinicalHeaderGenerator(fields_mapping) return cbio_header.make_header(fields) @@ -269,5 +269,5 @@ def biosample_to_sample_header(fields: list): 'tissue_label': ('Sampled Tissue', 'Sampled Tissue', 'STRING', '1', 'TISSUE_LABEL') } - cbio_header = cbioportal_clinical_header_generator(fields_mapping) + cbio_header = CbioportalClinicalHeaderGenerator(fields_mapping) return cbio_header.make_header(fields) diff --git a/chord_metadata_service/chord/export_utils.py b/chord_metadata_service/chord/export_utils.py index 795172003..189da1b51 100644 --- a/chord_metadata_service/chord/export_utils.py +++ b/chord_metadata_service/chord/export_utils.py @@ -76,7 +76,7 @@ def __exit__(self, exc_type, exc_value, traceback): if self.should_del and self.path: shutil.rmtree(self.path) - def getPath(self, filename: str = ''): + def get_path(self, filename: str = ''): """Returns a path within the export directory Attributes: @@ -84,7 +84,7 @@ def getPath(self, filename: str = ''): """ return os.path.join(self.path, filename) - def writeTar(self): + def write_tar(self): 
"""Creates a tar gzipped archive from the export directory content Note that the tar file is created inside the context of this ExportFileContext @@ -95,12 +95,12 @@ def writeTar(self): """ tar_path = os.path.join(self.base_path, EXPORT_DIR, self.project_id + '.tar.gz') with tarfile.open(tar_path, 'w:gz') as tar: - output_dir = self.getPath() - tar.add(output_dir, filter=resetTarInfo) + output_dir = self.get_path() + tar.add(output_dir, filter=reset_tar_info) return tar_path -def resetTarInfo(info: tarfile.TarInfo) -> tarfile.TarInfo: +def reset_tar_info(info: tarfile.TarInfo) -> tarfile.TarInfo: info.gid = 0 info.uid = 0 info.uname = 'root' diff --git a/chord_metadata_service/chord/tests/test_export_cbio.py b/chord_metadata_service/chord/tests/test_export_cbio.py index 60f29191f..b3c74ca6d 100644 --- a/chord_metadata_service/chord/tests/test_export_cbio.py +++ b/chord_metadata_service/chord/tests/test_export_cbio.py @@ -11,11 +11,11 @@ PATIENT_DATATYPE, SAMPLE_DATA_FILENAME, SAMPLE_DATATYPE, - ClinicalMetaExport, - IndividualExport, - SampleExport, - StudyExport, - StudyExportMeta + clinical_meta_export, + individual_export, + sample_export, + study_export, + study_export_meta ) from chord_metadata_service.chord.data_types import DATA_TYPE_PHENOPACKET, DATA_TYPE_EXPERIMENT from chord_metadata_service.chord.export_utils import ExportFileContext @@ -58,7 +58,7 @@ def setUp(self) -> None: self.p = WORKFLOW_INGEST_FUNCTION_MAP[WORKFLOW_PHENOPACKETS_JSON](EXAMPLE_INGEST_OUTPUTS, self.t.identifier) - def streamToDict(self, output: TextIO) -> Dict[str, str]: + def stream_to_dict(self, output: TextIO) -> Dict[str, str]: """ Utility function. Parses cBioPortal meta data text files (lines of key/value pairs separated by `: `) in a dictionary structure. @@ -70,15 +70,15 @@ def streamToDict(self, output: TextIO) -> Dict[str, str]: content[key] = value return content - def testFileCreation(self): + def test_file_creation(self): """ Check files creation. 
Files content is tested subsequently with each file generating function. """ with ExportFileContext(None, self.study_id) as file_export: - StudyExport(file_export.getPath, self.study_id) - export_dir = file_export.getPath() + study_export(file_export.get_path, self.study_id) + export_dir = file_export.get_path() self.assertTrue(path.exists(export_dir)) for (dirpath, dirnames, filenames) in walk(export_dir): filesSet = {*filenames} @@ -87,8 +87,8 @@ def testFileCreation(self): def test_export_cbio_study_meta(self): with io.StringIO() as output: - StudyExportMeta(self.d, output) - content = self.streamToDict(output) + study_export_meta(self.d, output) + content = self.stream_to_dict(output) self.assertIn('type_of_cancer', content) self.assertEqual(content['cancer_study_identifier'], self.study_id) @@ -97,8 +97,8 @@ def test_export_cbio_study_meta(self): def test_export_cbio_sample_meta(self): with io.StringIO() as output: - ClinicalMetaExport(self.study_id, SAMPLE_DATATYPE, output) - content = self.streamToDict(output) + clinical_meta_export(self.study_id, SAMPLE_DATATYPE, output) + content = self.stream_to_dict(output) self.assertEqual(content['cancer_study_identifier'], self.study_id) self.assertEqual(content['genetic_alteration_type'], 'CLINICAL') @@ -107,8 +107,8 @@ def test_export_cbio_sample_meta(self): def test_export_cbio_patient_meta(self): with io.StringIO() as output: - ClinicalMetaExport(self.study_id, PATIENT_DATATYPE, output) - content = self.streamToDict(output) + clinical_meta_export(self.study_id, PATIENT_DATATYPE, output) + content = self.stream_to_dict(output) self.assertEqual(content['cancer_study_identifier'], self.study_id) self.assertEqual(content['genetic_alteration_type'], 'CLINICAL') @@ -118,7 +118,7 @@ def test_export_cbio_patient_meta(self): def test_export_cbio_patient_data(self): indiv = Individual.objects.filter(phenopackets=self.p) with io.StringIO() as output: - IndividualExport(indiv, output) + individual_export(indiv, output) # 
Check header output.seek(0) field_count = None @@ -153,7 +153,7 @@ def test_export_cbio_patient_data(self): def test_export_cbio_sample_data(self): samples = PhModel.Biosample.objects.filter(phenopacket=self.p) with io.StringIO() as output: - SampleExport(samples, output) + sample_export(samples, output) # Check header output.seek(0) field_count = None diff --git a/chord_metadata_service/chord/views_export.py b/chord_metadata_service/chord/views_export.py index e9415d229..6ba5b2464 100644 --- a/chord_metadata_service/chord/views_export.py +++ b/chord_metadata_service/chord/views_export.py @@ -82,7 +82,7 @@ def export(request: Request): try: with ExportFileContext(output_path, object_id) as file_export: # Pass a callable to generate the proper file paths within the export context. - EXPORT_FORMAT_FUNCTION_MAP[format](file_export.getPath, object_id) + EXPORT_FORMAT_FUNCTION_MAP[format](file_export.get_path, object_id) # If no output path parameter has been provided, the generated export # is returned as an attachment to the Response and everything will @@ -90,7 +90,7 @@ def export(request: Request): # Otherwise, the provided local path is under the responsability of # the caller if not output_path: - tarfile = file_export.writeTar() + tarfile = file_export.write_tar() return FileResponse(open(tarfile, "rb"), as_attachment=True) except ExportError as e: From a30a41a33425d0a9f8873e745b925b219441aaa7 Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Tue, 19 Apr 2022 10:49:12 -0400 Subject: [PATCH 24/26] skip samples with no individual --- chord_metadata_service/chord/export_cbio.py | 4 ++++ .../chord/tests/test_export_cbio.py | 23 +++++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/chord_metadata_service/chord/export_cbio.py b/chord_metadata_service/chord/export_cbio.py index 95920b055..20788298f 100644 --- a/chord_metadata_service/chord/export_cbio.py +++ b/chord_metadata_service/chord/export_cbio.py @@ -175,6 +175,10 @@ def 
sample_export(results, file_handle: TextIO): samples = [] for sample in results: + # sample.inidividual can be null. Skip the sample in that case. + if sample.individual is None: + continue + sample_obj = { 'individual_id': sample.individual.id, 'id': sample.id diff --git a/chord_metadata_service/chord/tests/test_export_cbio.py b/chord_metadata_service/chord/tests/test_export_cbio.py index b3c74ca6d..4e9fddf78 100644 --- a/chord_metadata_service/chord/tests/test_export_cbio.py +++ b/chord_metadata_service/chord/tests/test_export_cbio.py @@ -58,6 +58,11 @@ def setUp(self) -> None: self.p = WORKFLOW_INGEST_FUNCTION_MAP[WORKFLOW_PHENOPACKETS_JSON](EXAMPLE_INGEST_OUTPUTS, self.t.identifier) + # Update the last sample to remove reference to any individual. + PhModel.Biosample.objects.filter( + id=EXAMPLE_INGEST_PHENOPACKET["biosamples"][-1]["id"] + ).update(individual=None) + def stream_to_dict(self, output: TextIO) -> Dict[str, str]: """ Utility function. Parses cBioPortal meta data text files (lines of @@ -158,6 +163,7 @@ def test_export_cbio_sample_data(self): output.seek(0) field_count = None field_names = [] + sample_count = 0 for i, line in enumerate(output): # 4 first header lines begin with `#` if i < 4: @@ -178,10 +184,19 @@ def test_export_cbio_sample_data(self): self.assertIn('SAMPLE_ID', pieces) continue - # TSV body. Inspect first line and break + # TSV body. 
self.assertEqual(field_count, len(pieces)) record = dict(zip(field_names, pieces)) - self.assertEqual(record["PATIENT_ID"], EXAMPLE_INGEST_PHENOPACKET["subject"]["id"]) - self.assertEqual(record["SAMPLE_ID"], EXAMPLE_INGEST_PHENOPACKET["biosamples"][0]["id"]) - break + self.assertEqual( + record["PATIENT_ID"], + EXAMPLE_INGEST_PHENOPACKET["biosamples"][sample_count]["individual_id"] + ) + self.assertEqual( + record["SAMPLE_ID"], + EXAMPLE_INGEST_PHENOPACKET["biosamples"][sample_count]["id"] + ) + sample_count += 1 + + # samples not attached to an individual are not exported + self.assertEqual(sample_count, samples.filter(individual_id__isnull=False).count()) From f0e2b8ac0f8bbdd0edec28c6f4e96522a4b8a03b Mon Sep 17 00:00:00 2001 From: Paul Pillot Date: Tue, 19 Apr 2022 15:49:31 -0400 Subject: [PATCH 25/26] fallback: get patient id from phenopackets --- chord_metadata_service/chord/export_cbio.py | 17 +++++++++++++---- .../chord/tests/test_export_cbio.py | 9 +++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/chord_metadata_service/chord/export_cbio.py b/chord_metadata_service/chord/export_cbio.py index 20788298f..eb158ded1 100644 --- a/chord_metadata_service/chord/export_cbio.py +++ b/chord_metadata_service/chord/export_cbio.py @@ -175,12 +175,21 @@ def sample_export(results, file_handle: TextIO): samples = [] for sample in results: - # sample.inidividual can be null. Skip the sample in that case. - if sample.individual is None: - continue + + # sample.inidividual may be null: use Phenopacket model Subject field + # instead if available or skip. 
+ subject_id = None + if sample.individual is not None: + subject_id = sample.individual + else: + phnpkt = pm.Phenopacket.objects.filter(biosamples=sample).first() + if phnpkt.subject is not None: + subject_id = phnpkt.subject.id + else: + continue sample_obj = { - 'individual_id': sample.individual.id, + 'individual_id': subject_id, 'id': sample.id } if sample.sampled_tissue: diff --git a/chord_metadata_service/chord/tests/test_export_cbio.py b/chord_metadata_service/chord/tests/test_export_cbio.py index 4e9fddf78..f1b30a100 100644 --- a/chord_metadata_service/chord/tests/test_export_cbio.py +++ b/chord_metadata_service/chord/tests/test_export_cbio.py @@ -58,7 +58,9 @@ def setUp(self) -> None: self.p = WORKFLOW_INGEST_FUNCTION_MAP[WORKFLOW_PHENOPACKETS_JSON](EXAMPLE_INGEST_OUTPUTS, self.t.identifier) - # Update the last sample to remove reference to any individual. + # Update the last sample to remove direct reference to any individual. + # In that case, Sample and Individual are cross referenced through the + # Phenopacket model. PhModel.Biosample.objects.filter( id=EXAMPLE_INGEST_PHENOPACKET["biosamples"][-1]["id"] ).update(individual=None) @@ -184,7 +186,7 @@ def test_export_cbio_sample_data(self): self.assertIn('SAMPLE_ID', pieces) continue - # TSV body. 
+ # TSV body: 1 row per sample self.assertEqual(field_count, len(pieces)) record = dict(zip(field_names, pieces)) @@ -198,5 +200,4 @@ def test_export_cbio_sample_data(self): ) sample_count += 1 - # samples not attached to an individual are not exported - self.assertEqual(sample_count, samples.filter(individual_id__isnull=False).count()) + self.assertEqual(sample_count, samples.count()) From 2130e4b6a48ce14d98058c94a6c6adb8bbda343b Mon Sep 17 00:00:00 2001 From: zxenia Date: Tue, 19 Apr 2022 23:52:08 -0400 Subject: [PATCH 26/26] annotate biosample queryset with phenopacket subject id --- chord_metadata_service/chord/export_cbio.py | 12 ++++++------ .../chord/tests/test_export_cbio.py | 4 +++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/chord_metadata_service/chord/export_cbio.py b/chord_metadata_service/chord/export_cbio.py index eb158ded1..0413078af 100644 --- a/chord_metadata_service/chord/export_cbio.py +++ b/chord_metadata_service/chord/export_cbio.py @@ -1,6 +1,7 @@ import logging import csv from typing import TextIO, Callable +from django.db.models import F from .export_utils import ExportError @@ -56,7 +57,8 @@ def study_export(getPath: Callable[[str], str], dataset_id: str): # Export samples with open(getPath(SAMPLE_DATA_FILENAME), 'w') as file_sample: - sampl = pm.Biosample.objects.filter(phenopacket__table__ownership_record__dataset_id=dataset.identifier) + sampl = pm.Biosample.objects.filter(phenopacket__table__ownership_record__dataset_id=dataset.identifier)\ + .annotate(phenopacket_subject_id=F("phenopacket__subject")) sample_export(sampl, file_sample) with open(getPath(SAMPLE_META_FILENAME), 'w') as file_sample_meta: @@ -181,12 +183,10 @@ def sample_export(results, file_handle: TextIO): subject_id = None if sample.individual is not None: subject_id = sample.individual + elif sample.phenopacket_subject_id is not None: + subject_id = sample.phenopacket_subject_id else: - phnpkt = pm.Phenopacket.objects.filter(biosamples=sample).first() 
- if phnpkt.subject is not None: - subject_id = phnpkt.subject.id - else: - continue + continue sample_obj = { 'individual_id': subject_id, diff --git a/chord_metadata_service/chord/tests/test_export_cbio.py b/chord_metadata_service/chord/tests/test_export_cbio.py index f1b30a100..2e2909d58 100644 --- a/chord_metadata_service/chord/tests/test_export_cbio.py +++ b/chord_metadata_service/chord/tests/test_export_cbio.py @@ -3,6 +3,7 @@ from typing import Dict, TextIO from os import walk, path +from django.db.models import F from django.test import TestCase from chord_metadata_service.chord.export_cbio import ( @@ -158,7 +159,8 @@ def test_export_cbio_patient_data(self): break def test_export_cbio_sample_data(self): - samples = PhModel.Biosample.objects.filter(phenopacket=self.p) + samples = PhModel.Biosample.objects.filter(phenopacket=self.p)\ + .annotate(phenopacket_subject_id=F("phenopacket__subject")) with io.StringIO() as output: sample_export(samples, output) # Check header