Merge pull request #311 from bento-platform/cbio_export

Export Datasets to cBioPortal format
bento-platform · Apr 20, 2022 · 5640da3 · 5640da3
2 parents 955a948 + 2130e4b
commit 5640da3
Show file tree

Hide file tree

Showing 9 changed files with 882 additions and 2 deletions.
diff --git a/chord_metadata_service/chord/export.py b/chord_metadata_service/chord/export.py
@@ -0,0 +1,32 @@
+import logging
+from chord_metadata_service.chord.ingest import WORKFLOW_CBIOPORTAL
+from chord_metadata_service.chord.models import Dataset, Project, Table
+from .export_cbio import study_export as export_cbioportal_workflow
+
+logger = logging.getLogger(__name__)
+
+OBJECT_TYPE_PROJECT = "project"
+OBJECT_TYPE_DATASET = "dataset"
+OBJECT_TYPE_TABLE = "table"
+
+EXPORT_OBJECT_TYPE = {
+    OBJECT_TYPE_PROJECT: {
+        "model": Project
+    },
+    OBJECT_TYPE_DATASET: {
+        "model": Dataset
+    },
+    OBJECT_TYPE_TABLE: {
+        "model": Table
+    },
+}
+
+EXPORT_FORMATS = {WORKFLOW_CBIOPORTAL}
+
+EXPORT_FORMAT_FUNCTION_MAP = {
+    WORKFLOW_CBIOPORTAL: export_cbioportal_workflow
+}
+
+EXPORT_FORMAT_OBJECT_TYPE_MAP = {
+    WORKFLOW_CBIOPORTAL: {OBJECT_TYPE_DATASET}
+}
diff --git a/chord_metadata_service/chord/export_cbio.py b/chord_metadata_service/chord/export_cbio.py
@@ -0,0 +1,286 @@
+import logging
+import csv
+from typing import TextIO, Callable
+from django.db.models import F
+
+from .export_utils import ExportError
+
+from chord_metadata_service.chord.models import Dataset
+from chord_metadata_service.patients.models import Individual
+from chord_metadata_service.phenopackets import models as pm
+
+__all__ = [
+    "study_export",
+]
+
+logger = logging.getLogger(__name__)
+
+# predefined filenames recognized by cBioPortal
+STUDY_FILENAME = "meta_study.txt"
+SAMPLE_DATA_FILENAME = "data_clinical_sample.txt"
+SAMPLE_META_FILENAME = "meta_clinical_sample.txt"
+PATIENT_DATA_FILENAME = "data_clinical_patient.txt"
+PATIENT_META_FILENAME = "meta_clinical_patient.txt"
+
+CBIO_FILES_SET = frozenset({
+    STUDY_FILENAME,
+    SAMPLE_DATA_FILENAME,
+    SAMPLE_META_FILENAME,
+    PATIENT_DATA_FILENAME,
+    PATIENT_META_FILENAME
+})
+
+PATIENT_DATATYPE = 'PATIENT'
+SAMPLE_DATATYPE = 'SAMPLE'
+
+
+def study_export(getPath: Callable[[str], str], dataset_id: str):
+    """Export a given Project as a cBioPortal study"""
+    # TODO: a Dataset is a Study (associated with a publication), not a Project!
+    if Dataset.objects.count == 0:
+        raise ExportError("No Dataset to export")
+    dataset = Dataset.objects.get(identifier=dataset_id)
+    cbio_study_id = str(dataset.identifier)
+
+    # Export study file
+    with open(getPath(STUDY_FILENAME), 'w') as file_study:
+        study_export_meta(dataset, file_study)
+
+    # Export patients.
+    with open(getPath(PATIENT_DATA_FILENAME), 'w') as file_patient:
+        # Note: plural in `phenopackets` is intentional (related_name property in model)
+        indiv = Individual.objects.filter(phenopackets__table__ownership_record__dataset_id=dataset.identifier)
+        individual_export(indiv, file_patient)
+
+    with open(getPath(PATIENT_META_FILENAME), 'w') as file_patient_meta:
+        clinical_meta_export(cbio_study_id, PATIENT_DATATYPE, file_patient_meta)
+
+    # Export samples
+    with open(getPath(SAMPLE_DATA_FILENAME), 'w') as file_sample:
+        sampl = pm.Biosample.objects.filter(phenopacket__table__ownership_record__dataset_id=dataset.identifier)\
+            .annotate(phenopacket_subject_id=F("phenopacket__subject"))
+        sample_export(sampl, file_sample)
+
+    with open(getPath(SAMPLE_META_FILENAME), 'w') as file_sample_meta:
+        clinical_meta_export(cbio_study_id, SAMPLE_DATATYPE, file_sample_meta)
+
+
+def study_export_meta(dataset: Dataset, file_handle: TextIO):
+    """
+    Study meta data file generation
+    """
+    lines = dict()
+    lines['type_of_cancer'] = "mixed"   # TODO: find if this information is available. !IMPORTANT! uses Oncotree codes
+    lines['cancer_study_identifier'] = str(dataset.identifier)
+    lines['name'] = dataset.title
+    lines['description'] = dataset.description
+
+    # optional fields
+    if len(dataset.primary_publications):
+        lines['citation'] = dataset.primary_publications[0]
+    # pmid: unvailable
+    # groups: unused for authentication
+    # add_global_case_list: ?
+    # tags_file: ?
+    # reference_genome: ?
+
+    for field, value in lines.items():
+        file_handle.write(f"{field}: {value}\n")
+
+
+def clinical_meta_export(study_id: str, datatype: str, file_handle: TextIO):
+    """
+    Clinical Metadata files generation (samples or patients)
+    """
+    lines = dict()
+    lines['cancer_study_identifier'] = study_id
+    lines['genetic_alteration_type'] = 'CLINICAL'
+    if datatype == SAMPLE_DATATYPE:
+        lines['datatype'] = 'SAMPLE_ATTRIBUTES'
+        lines['data_filename'] = SAMPLE_DATA_FILENAME
+    else:
+        lines['datatype'] = 'PATIENT_ATTRIBUTES'
+        lines['data_filename'] = PATIENT_DATA_FILENAME
+
+    for field, value in lines.items():
+        file_handle.write(f"{field}: {value}\n")
+
+
+def individual_export(results, file_handle: TextIO):
+    """
+    Renders Individuals as a clinical_patient text file suitable for
+    importing by cBioPortal.
+
+    cBioPortal Patients fields specs:
+    ---------------------------------
+    Required:
+    - PATIENT_ID
+    Special columns:
+    - OS_STATUS, OS_MONTHS overall survivall. Status can be 1:DECEASED, 0:LIVING
+    - DFS_STATUS, DFS_MONTHS disease free
+    - PATIENT_DISPLAY_NAME
+    - GENDER or SEX
+    - AGE
+    - TUMOR_SITE
+    """
+
+    individuals = []
+    for individual in results:
+        ind_obj = {
+            'id': individual.id,
+            'sex': individual.sex,
+        }
+        individuals.append(ind_obj)
+
+    columns = individuals[0].keys()
+    headers = individual_to_patient_header(columns)
+
+    file_handle.writelines([line + '\n' for line in headers])
+    dict_writer = csv.DictWriter(file_handle, fieldnames=columns, delimiter='\t')
+    dict_writer.writerows(individuals)
+
+
+def sample_export(results, file_handle: TextIO):
+    """
+    Renders Biosamples as a clinical_sample text file suitable for
+    importing by cBioPortal.
+
+    cBioPortal Sample fields specs:
+    ---------------------------------
+    Required:
+    - PATIENT_ID
+    - SAMPLE_ID
+
+    Special columns:
+    - For pan-cancer summary statistics tab:
+        - CANCER_TYPE as an Oncotree code
+        - CANCER_TYPE_DETAILED
+    - SAMPLE_DISPLAY_NAME
+    - SAMPLE_CLASS
+    - METASTATIC_SITE / PRIMARY_SITE overrides the patients level attribute TUMOR_SITE
+    - SAMPLE_TYPE, TUMOR_TISSUE_SITE, TUMOR_TYPE can have the following values
+        (are displayed with a distinct color in the timelines):
+        - "recurrence", "recurred", "progression"
+        - "metastatic", "metastasis"
+        - "primary" or any other value
+    - KNOWN_MOLECULAR_CLASSIFIER
+    - GLEASON_SCORE (prostate cancer)
+    - HISTOLOGY
+    - TUMOR_STAGE_2009
+    - TUMOR_GRADE
+    - ETS_RAF_SPINK1_STATUS
+    - TMPRSS2_ERG_FUSION_STATUS
+    - ERG_FUSION_ACGH
+    - SERUM_PSA
+    - DRIVER_MUTATIONS
+    """
+
+    samples = []
+    for sample in results:
+
+        # sample.inidividual may be null: use Phenopacket model Subject field
+        # instead if available or skip.
+        subject_id = None
+        if sample.individual is not None:
+            subject_id = sample.individual
+        elif sample.phenopacket_subject_id is not None:
+            subject_id = sample.phenopacket_subject_id
+        else:
+            continue
+
+        sample_obj = {
+            'individual_id': subject_id,
+            'id': sample.id
+        }
+        if sample.sampled_tissue:
+            sample_obj['tissue_label'] = sample.sampled_tissue.get('label', '')
+
+        samples.append(sample_obj)
+
+    columns = samples[0].keys()
+    headers = biosample_to_sample_header(columns)
+
+    file_handle.writelines([line + '\n' for line in headers])
+    dict_writer = csv.DictWriter(file_handle, fieldnames=columns, delimiter='\t')
+    dict_writer.writerows(samples)
+
+
+class CbioportalClinicalHeaderGenerator():
+    """
+    Generates cBioPortal data files headers based on field names from katsu models.
+    """
+
+    fields_mapping = {}
+
+    def __init__(self, mappings={}):
+        self.fields_mapping = mappings
+
+    def make_header(self, fields: list):
+        """
+        Maps a list of field names to a 5 rows header
+        suitable for cBioPortal clinical data files.
+        """
+
+        field_properties = []
+        for field in fields:
+            if field in self.fields_mapping:
+                field_properties.append(self.fields_mapping[field])
+            else:
+                fieldname = field.replace('_', ' ').capitalize()
+                prop = (
+                    fieldname,  # display name
+                    fieldname,  # description
+                    'STRING',   # type !!!TODO: TYPE DETECTION!!!
+                    '1',        # priority (note: string here for use in join())
+                    field.upper()   # DB suitable identifier
+                )
+                field_properties.append(prop)
+
+        # Transpose list of properties tuples per field to tuples of
+        # field properties per property.
+        rows = list(zip(*field_properties))
+
+        # The 4 first rows are considered meta datas, prefixed by '#'.
+        # The 5th row (DB field names) is a canonical TSV header.
+        cbio_header = [
+            '#' + '\t'.join(rows[0]),
+            '#' + '\t'.join(rows[1]),
+            '#' + '\t'.join(rows[2]),
+            '#' + '\t'.join(rows[3]),
+            '\t'.join(rows[4])
+        ]
+
+        return cbio_header
+
+
+def individual_to_patient_header(fields: list):
+    """
+    Maps a list of Individual field names to a 5 rows header
+    suitable for cBioPortal data_clinical_patient.txt file.
+    """
+
+    # predefined mappings from Individual keys to cBioPortal field properties
+    fields_mapping = {
+        'id': ('Patient Identifier', 'Patient Identifier', 'STRING', '1', 'PATIENT_ID'),
+        'sex': ('Sex', 'Sex', 'STRING', '1', 'SEX'),
+    }
+
+    cbio_header = CbioportalClinicalHeaderGenerator(fields_mapping)
+    return cbio_header.make_header(fields)
+
+
+def biosample_to_sample_header(fields: list):
+    """
+    Maps a list of biosamples field names to a 5 rows header
+    suitable for cBioPortal data_sample_patient.txt file.
+    """
+
+    # predefined mappings from Samples keys to cBioPortal field properties
+    fields_mapping = {
+        'individual_id': ('Patient Identifier', 'Patient Identifier', 'STRING', '1', 'PATIENT_ID'),
+        'id': ('Sample Identifier', 'Sample Identifier', 'STRING', '1', 'SAMPLE_ID'),
+        'tissue_label': ('Sampled Tissue', 'Sampled Tissue', 'STRING', '1', 'TISSUE_LABEL')
+    }
+
+    cbio_header = CbioportalClinicalHeaderGenerator(fields_mapping)
+    return cbio_header.make_header(fields)