Skip to content

Commit

Permalink
Merge pull request #311 from bento-platform/cbio_export
Browse files Browse the repository at this point in the history
Export Datasets to cBioPortal format
  • Loading branch information
zxenia authored Apr 20, 2022
2 parents 955a948 + 2130e4b commit 5640da3
Show file tree
Hide file tree
Showing 9 changed files with 882 additions and 2 deletions.
32 changes: 32 additions & 0 deletions chord_metadata_service/chord/export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import logging
from chord_metadata_service.chord.ingest import WORKFLOW_CBIOPORTAL
from chord_metadata_service.chord.models import Dataset, Project, Table
from .export_cbio import study_export as export_cbioportal_workflow

logger = logging.getLogger(__name__)

# Identifiers for the kinds of objects an export can target.
OBJECT_TYPE_PROJECT = "project"
OBJECT_TYPE_DATASET = "dataset"
OBJECT_TYPE_TABLE = "table"

# Maps each object type identifier to the model class that backs it.
EXPORT_OBJECT_TYPE = {
    OBJECT_TYPE_PROJECT: {"model": Project},
    OBJECT_TYPE_DATASET: {"model": Dataset},
    OBJECT_TYPE_TABLE: {"model": Table},
}

# Set of known export format identifiers.
EXPORT_FORMATS = {WORKFLOW_CBIOPORTAL}

# Maps each export format to the function that performs the export.
EXPORT_FORMAT_FUNCTION_MAP = {WORKFLOW_CBIOPORTAL: export_cbioportal_workflow}

# Maps each export format to the set of object types it accepts.
EXPORT_FORMAT_OBJECT_TYPE_MAP = {WORKFLOW_CBIOPORTAL: {OBJECT_TYPE_DATASET}}
286 changes: 286 additions & 0 deletions chord_metadata_service/chord/export_cbio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,286 @@
import logging
import csv
from typing import TextIO, Callable
from django.db.models import F

from .export_utils import ExportError

from chord_metadata_service.chord.models import Dataset
from chord_metadata_service.patients.models import Individual
from chord_metadata_service.phenopackets import models as pm

__all__ = [
"study_export",
]

logger = logging.getLogger(__name__)

# Predefined file names recognized by cBioPortal.
STUDY_FILENAME = "meta_study.txt"
SAMPLE_DATA_FILENAME = "data_clinical_sample.txt"
SAMPLE_META_FILENAME = "meta_clinical_sample.txt"
PATIENT_DATA_FILENAME = "data_clinical_patient.txt"
PATIENT_META_FILENAME = "meta_clinical_patient.txt"

# Immutable collection of every file name listed above.
CBIO_FILES_SET = frozenset((
    STUDY_FILENAME,
    SAMPLE_DATA_FILENAME,
    SAMPLE_META_FILENAME,
    PATIENT_DATA_FILENAME,
    PATIENT_META_FILENAME,
))

# Selectors telling the clinical meta export which kind of file to describe.
PATIENT_DATATYPE = "PATIENT"
SAMPLE_DATATYPE = "SAMPLE"


def study_export(getPath: Callable[[str], str], dataset_id: str):
    """Export a given Dataset as a cBioPortal study.

    Writes five files: the study meta file, the patient data and meta files,
    and the sample data and meta files. Each file is opened for writing at the
    location returned by `getPath` for its predefined cBioPortal file name.

    Args:
        getPath: callable receiving a file name and returning the path at
            which that file must be written.
        dataset_id: identifier of the Dataset to export.

    Raises:
        ExportError: when there is no Dataset at all to export.
        Dataset.DoesNotExist: presumably raised by the model manager's `get()`
            when no Dataset matches `dataset_id` (standard Django behaviour).
    """
    # NOTE: a Dataset (associated with a publication) is what maps to a
    # cBioPortal study, not a Project.
    # The manager method must actually be called: comparing the bound method
    # itself to 0 is always False, so the guard would never trigger.
    if not Dataset.objects.exists():
        raise ExportError("No Dataset to export")
    dataset = Dataset.objects.get(identifier=dataset_id)
    cbio_study_id = str(dataset.identifier)

    # Export study file
    with open(getPath(STUDY_FILENAME), 'w') as file_study:
        study_export_meta(dataset, file_study)

    # Export patients.
    with open(getPath(PATIENT_DATA_FILENAME), 'w') as file_patient:
        # Note: plural in `phenopackets` is intentional (related_name property in model)
        indiv = Individual.objects.filter(
            phenopackets__table__ownership_record__dataset_id=dataset.identifier
        )
        individual_export(indiv, file_patient)

    with open(getPath(PATIENT_META_FILENAME), 'w') as file_patient_meta:
        clinical_meta_export(cbio_study_id, PATIENT_DATATYPE, file_patient_meta)

    # Export samples
    with open(getPath(SAMPLE_DATA_FILENAME), 'w') as file_sample:
        # The annotation exposes the owning phenopacket's subject on each
        # sample as `phenopacket_subject_id`, which sample_export reads.
        sampl = pm.Biosample.objects.filter(
            phenopacket__table__ownership_record__dataset_id=dataset.identifier
        ).annotate(phenopacket_subject_id=F("phenopacket__subject"))
        sample_export(sampl, file_sample)

    with open(getPath(SAMPLE_META_FILENAME), 'w') as file_sample_meta:
        clinical_meta_export(cbio_study_id, SAMPLE_DATATYPE, file_sample_meta)


def study_export_meta(dataset: Dataset, file_handle: TextIO):
    """
    Study meta data file generation.

    Writes one ``field: value`` line per entry to `file_handle`, in this
    order: type_of_cancer, cancer_study_identifier, name, description and,
    when the dataset has at least one primary publication, citation (the
    first publication).

    Line breaks embedded in a value are replaced by single spaces so that
    each entry stays on exactly one line of the meta file.
    """
    lines = {
        # TODO: find if this information is available. !IMPORTANT! uses Oncotree codes
        'type_of_cancer': "mixed",
        'cancer_study_identifier': str(dataset.identifier),
        'name': dataset.title,
        'description': dataset.description,
    }

    # optional fields
    # Truthiness (rather than len()) also tolerates a missing/None list.
    if dataset.primary_publications:
        lines['citation'] = dataset.primary_publications[0]
    # pmid: unavailable
    # groups: unused for authentication
    # add_global_case_list: ?
    # tags_file: ?
    # reference_genome: ?

    for field, value in lines.items():
        # A multi-line value (e.g. a free-text description) would otherwise
        # spill onto lines that do not have the `field: value` shape.
        flat_value = " ".join(str(value).splitlines())
        file_handle.write(f"{field}: {flat_value}\n")


def clinical_meta_export(study_id: str, datatype: str, file_handle: TextIO):
    """
    Clinical Metadata files generation (samples or patients).

    Writes four ``field: value`` lines to `file_handle`: the study
    identifier, the genetic alteration type, the datatype and the name of the
    matching data file. The sample variant is emitted when `datatype` equals
    SAMPLE_DATATYPE; any other value yields the patient variant.
    """
    is_sample = datatype == SAMPLE_DATATYPE
    entries = {
        'cancer_study_identifier': study_id,
        'genetic_alteration_type': 'CLINICAL',
        'datatype': 'SAMPLE_ATTRIBUTES' if is_sample else 'PATIENT_ATTRIBUTES',
        'data_filename': SAMPLE_DATA_FILENAME if is_sample else PATIENT_DATA_FILENAME,
    }

    for key, val in entries.items():
        file_handle.write(f"{key}: {val}\n")


def individual_export(results, file_handle: TextIO):
    """
    Renders Individuals as a clinical_patient text file suitable for
    importing by cBioPortal.

    `results` is an iterable of objects exposing `id` and `sex` attributes.
    The header rows are written first, followed by one tab-separated row per
    individual. When `results` is empty, only the header rows are written.

    cBioPortal Patients fields specs:
    ---------------------------------
    Required:
    - PATIENT_ID
    Special columns:
    - OS_STATUS, OS_MONTHS overall survival. Status can be 1:DECEASED, 0:LIVING
    - DFS_STATUS, DFS_MONTHS disease free
    - PATIENT_DISPLAY_NAME
    - GENDER or SEX
    - AGE
    - TUMOR_SITE
    """

    # The exported columns are fixed, so they are declared up front instead
    # of being read from the first row (which fails on an empty result set).
    columns = ['id', 'sex']
    individuals = [
        {'id': individual.id, 'sex': individual.sex}
        for individual in results
    ]

    headers = individual_to_patient_header(columns)

    file_handle.writelines([line + '\n' for line in headers])
    dict_writer = csv.DictWriter(file_handle, fieldnames=columns, delimiter='\t')
    dict_writer.writerows(individuals)


def sample_export(results, file_handle: TextIO):
    """
    Renders Biosamples as a clinical_sample text file suitable for
    importing by cBioPortal.

    `results` is an iterable of objects exposing `id`, `individual`,
    `phenopacket_subject_id` and `sampled_tissue` attributes. Samples for
    which no subject can be determined are skipped. The header rows are
    written first, followed by one tab-separated row per retained sample.
    The `tissue_label` column is present as soon as one retained sample has
    a sampled tissue; samples without one get an empty value in that column.
    When no sample is retained, only the header rows (for the two required
    columns) are written.

    cBioPortal Sample fields specs:
    ---------------------------------
    Required:
    - PATIENT_ID
    - SAMPLE_ID
    Special columns:
    - For pan-cancer summary statistics tab:
    - CANCER_TYPE as an Oncotree code
    - CANCER_TYPE_DETAILED
    - SAMPLE_DISPLAY_NAME
    - SAMPLE_CLASS
    - METASTATIC_SITE / PRIMARY_SITE overrides the patients level attribute TUMOR_SITE
    - SAMPLE_TYPE, TUMOR_TISSUE_SITE, TUMOR_TYPE can have the following values
    (are displayed with a distinct color in the timelines):
    - "recurrence", "recurred", "progression"
    - "metastatic", "metastasis"
    - "primary" or any other value
    - KNOWN_MOLECULAR_CLASSIFIER
    - GLEASON_SCORE (prostate cancer)
    - HISTOLOGY
    - TUMOR_STAGE_2009
    - TUMOR_GRADE
    - ETS_RAF_SPINK1_STATUS
    - TMPRSS2_ERG_FUSION_STATUS
    - ERG_FUSION_ACGH
    - SERUM_PSA
    - DRIVER_MUTATIONS
    """

    # Required columns are always present; optional ones are appended the
    # first time a sample provides them. Deriving the columns from the first
    # row only would fail on empty input and would make DictWriter reject
    # later rows carrying a key the first row lacked.
    columns = ['individual_id', 'id']
    samples = []
    for sample in results:

        # sample.individual may be null: use Phenopacket model Subject field
        # instead if available or skip.
        if sample.individual is not None:
            subject_id = sample.individual
        elif sample.phenopacket_subject_id is not None:
            subject_id = sample.phenopacket_subject_id
        else:
            continue

        sample_obj = {
            'individual_id': subject_id,
            'id': sample.id
        }
        if sample.sampled_tissue:
            sample_obj['tissue_label'] = sample.sampled_tissue.get('label', '')
            if 'tissue_label' not in columns:
                columns.append('tissue_label')

        samples.append(sample_obj)

    headers = biosample_to_sample_header(columns)

    file_handle.writelines([line + '\n' for line in headers])
    # Rows missing an optional column are filled with DictWriter's default
    # `restval` (an empty string).
    dict_writer = csv.DictWriter(file_handle, fieldnames=columns, delimiter='\t')
    dict_writer.writerows(samples)


class CbioportalClinicalHeaderGenerator():
    """
    Generates cBioPortal data files headers based on field names from katsu models.

    Each field is described by a 5-tuple of strings:
    (display name, description, type, priority, DB identifier).
    Fields found in `fields_mapping` use the tuple stored there; any other
    field gets a tuple derived from its name.
    """

    # Class-level default kept for backward compatibility; __init__ always
    # assigns a per-instance attribute that shadows it.
    fields_mapping = {}

    def __init__(self, mappings=None):
        """
        :param mappings: optional dict mapping a field name to its 5-tuple of
            header properties. The dict is stored as is (not copied).
        """
        # `None` as default avoids one mutable dict being shared by every
        # instance created without an explicit mapping.
        self.fields_mapping = {} if mappings is None else mappings

    @staticmethod
    def _default_properties(field: str):
        """Derives the 5 header properties of a field from its name alone."""
        fieldname = field.replace('_', ' ').capitalize()
        return (
            fieldname,      # display name
            fieldname,      # description
            'STRING',       # type !!!TODO: TYPE DETECTION!!!
            '1',            # priority (note: string here for use in join())
            field.upper()   # DB suitable identifier
        )

    def make_header(self, fields: list):
        """
        Maps a list of field names to a 5 rows header
        suitable for cBioPortal clinical data files.

        Returns a list of 5 strings (without line terminators). With no
        field at all, the rows are reduced to their prefix:
        ['#', '#', '#', '#', ''].
        """

        field_properties = [
            self.fields_mapping[field] if field in self.fields_mapping
            else self._default_properties(field)
            for field in fields
        ]

        # Transpose list of properties tuples per field to tuples of
        # field properties per property.
        if field_properties:
            rows = list(zip(*field_properties))
        else:
            # zip() over nothing yields no row at all; keep 5 (empty) rows
            # so that the indexing below stays valid.
            rows = [()] * 5

        # The 4 first rows are considered meta datas, prefixed by '#'.
        # The 5th row (DB field names) is a canonical TSV header.
        cbio_header = [
            '#' + '\t'.join(rows[0]),
            '#' + '\t'.join(rows[1]),
            '#' + '\t'.join(rows[2]),
            '#' + '\t'.join(rows[3]),
            '\t'.join(rows[4])
        ]

        return cbio_header


def individual_to_patient_header(fields: list):
    """
    Builds the 5 rows header of a cBioPortal data_clinical_patient.txt file
    from a list of Individual field names.
    """

    # Predefined cBioPortal field properties for the known Individual keys:
    # (display name, description, type, priority, DB identifier)
    patient_id_props = ('Patient Identifier', 'Patient Identifier', 'STRING', '1', 'PATIENT_ID')
    sex_props = ('Sex', 'Sex', 'STRING', '1', 'SEX')

    generator = CbioportalClinicalHeaderGenerator({
        'id': patient_id_props,
        'sex': sex_props,
    })
    return generator.make_header(fields)


def biosample_to_sample_header(fields: list):
    """
    Builds the 5 rows header of a cBioPortal data_clinical_sample.txt file
    from a list of biosample field names.
    """

    # Predefined cBioPortal field properties for the known sample keys:
    # (display name, description, type, priority, DB identifier)
    patient_id_props = ('Patient Identifier', 'Patient Identifier', 'STRING', '1', 'PATIENT_ID')
    sample_id_props = ('Sample Identifier', 'Sample Identifier', 'STRING', '1', 'SAMPLE_ID')
    tissue_props = ('Sampled Tissue', 'Sampled Tissue', 'STRING', '1', 'TISSUE_LABEL')

    generator = CbioportalClinicalHeaderGenerator({
        'individual_id': patient_id_props,
        'id': sample_id_props,
        'tissue_label': tissue_props,
    })
    return generator.make_header(fields)
Loading

0 comments on commit 5640da3

Please sign in to comment.