diff --git a/chord_metadata_service/chord/ingest/phenopackets.py b/chord_metadata_service/chord/ingest/phenopackets.py index 4c70f86b0..03353dcd8 100644 --- a/chord_metadata_service/chord/ingest/phenopackets.py +++ b/chord_metadata_service/chord/ingest/phenopackets.py @@ -10,7 +10,7 @@ from chord_metadata_service.patients.values import KaryotypicSex from chord_metadata_service.restapi.schema_utils import patch_project_schemas from chord_metadata_service.restapi.types import ExtensionSchemaDict -from chord_metadata_service.restapi.utils import time_element_to_years +from chord_metadata_service.restapi.utils import COMPUTED_PROPERTY_PREFIX, time_element_to_years from .exceptions import IngestError from .resources import ingest_resource @@ -34,6 +34,16 @@ def _get_or_create_opt(key: str, data: dict, create_func: Callable[..., T]) -> O return obj +def _clean_extra_properties(extra_properties: dict) -> Dict: + """ + Removes computed properties from an extra_properties dictionary. + Computed extra_properties start with "__" and should never be ingested. + """ + if extra_properties: + return {k: v for k, v in extra_properties.items() if not k.startswith(COMPUTED_PROPERTY_PREFIX)} + return extra_properties + + def get_or_create_phenotypic_feature(pf: dict) -> pm.PhenotypicFeature: # Below is code for if we want to re-use phenotypic features in the future # For now, the lack of a many-to-many relationship doesn't let us do that. @@ -64,7 +74,7 @@ def get_or_create_phenotypic_feature(pf: dict) -> pm.PhenotypicFeature: severity=pf.get("severity"), onset=pf.get("onset"), evidence=pf.get("evidence"), # TODO: Separate class for evidence? - extra_properties=pf.get("extra_properties", {}), + extra_properties=_clean_extra_properties(pf.get("extra_properties", {})), ) pf_obj.save() return pf_obj @@ -84,7 +94,7 @@ def validate_phenopacket(phenopacket_data: dict[str, Any], def update_or_create_subject(subject: dict) -> pm.Individual: - extra_properties: dict[str, Any] = subject.get("extra_properties", {}) + extra_properties: dict[str, Any] = _clean_extra_properties(subject.get("extra_properties", {})) # Pre-process subject data: --------------------------------------------------------------------------------- @@ -145,7 +155,7 @@ def get_or_create_biosample(bs: dict) -> pm.Biosample: pathological_stage=bs.get("pathological_stage", {}), is_control_sample=bs.get("is_control_sample", False), diagnostic_markers=bs.get("diagnostic_markers", []), - extra_properties=bs.get("extra_properties", {}), + extra_properties=_clean_extra_properties(bs.get("extra_properties", {})), **bs_query ) @@ -240,7 +250,7 @@ def get_or_create_genomic_interpretation(gen_interp: dict) -> pm.GenomicInterpre interpretation_status=gen_interp["interpretation_status"], gene_descriptor=gene_descriptor, variant_interpretation=variant_interpretation, - extra_properties=gen_interp.get("extra_properties", {}), + extra_properties=_clean_extra_properties(gen_interp.get("extra_properties", {})), ) if related_obj: @@ -256,38 +266,49 @@ def get_or_create_disease(disease) -> pm.Disease: term=disease["term"], disease_stage=disease.get("disease_stage", []), clinical_tnm_finding=disease.get("clinical_tnm_finding", []), - extra_properties=disease.get("extra_properties", {}), + extra_properties=_clean_extra_properties(disease.get("extra_properties", {})), **query_and_check_nulls(disease, "onset") ) return d_obj -def get_or_create_diagnosis(diagnosis: dict) -> pm.Diagnosis: - # Create GenomicInterpretation - genomic_interpretations_data = diagnosis.get("genomic_interpretations", []) - genomic_interpretations = [ - get_or_create_genomic_interpretation(gen_interp) - for gen_interp - in genomic_interpretations_data - ] - # disease = pm.Disease.objects.get_or_create(diagnosis["disease"]) - diag_obj, _ = pm.Diagnosis.objects.get_or_create( +def get_or_create_interpretation_diagnosis(interpretation: dict) -> pm.Diagnosis | None: + diagnosis = interpretation.get("diagnosis") + + if not diagnosis: + # No diagnosis to create + return + + # One-to-one relation between Interpretation and Diagnosis. + # If an Interpretation has a Diagnosis, the created Diagnosis row uses the interpretation's ID as its PK + # This ensures unique diagnoses if more than one share the same disease/extra_properties + id = interpretation.get("id") + diag_obj, created = pm.Diagnosis.objects.get_or_create( + id=id, disease=diagnosis.get("disease", {}), - extra_properties=diagnosis.get("extra_properties", {}) + extra_properties=_clean_extra_properties(diagnosis.get("extra_properties", {})), ) - # diag_obj.disease.set(disease) - diag_obj.genomic_interpretations.set(genomic_interpretations) + + if created: + # Create GenomicInterpretation + genomic_interpretations_data = diagnosis.get("genomic_interpretations", []) + genomic_interpretations = [ + get_or_create_genomic_interpretation(gen_interp) + for gen_interp + in genomic_interpretations_data + ] + diag_obj.genomic_interpretations.set(genomic_interpretations) return diag_obj def get_or_create_interpretation(interpretation: dict) -> pm.Interpretation: - diagnosis = get_or_create_diagnosis(interpretation["diagnosis"]) + diagnosis = get_or_create_interpretation_diagnosis(interpretation) interp_obj, _ = pm.Interpretation.objects.get_or_create( id=interpretation["id"], diagnosis=diagnosis, progress_status=interpretation["progress_status"], - summary=interpretation.get("summary", {}), - extra_properties=interpretation.get("extra_properties", {}) + summary=interpretation.get("summary", ""), + extra_properties=_clean_extra_properties(interpretation.get("extra_properties", {})) ) return interp_obj @@ -373,7 +394,7 @@ def ingest_phenopacket(phenopacket_data: dict[str, Any], submitted_by=meta_data.get("submitted_by"), phenopacket_schema_version=meta_data.get("phenopacket_schema_version"), external_references=meta_data.get("external_references", []), - extra_properties=meta_data.get("extra_properties", {}), + extra_properties=_clean_extra_properties(meta_data.get("extra_properties", {})), ) meta_data_obj.save() diff --git a/chord_metadata_service/chord/tests/example_phenopacket_v2.json b/chord_metadata_service/chord/tests/example_phenopacket_v2.json index 891cd8515..62cb11e88 100644 --- a/chord_metadata_service/chord/tests/example_phenopacket_v2.json +++ b/chord_metadata_service/chord/tests/example_phenopacket_v2.json @@ -7,7 +7,8 @@ "karyotypic_sex": "UNKNOWN_KARYOTYPE", "extra_properties": { "cool_guy": true, - "smoker": false + "smoker": false, + "__computed": "This extra_property was computed and should not be ingested." } }, "phenotypic_features": [ diff --git a/chord_metadata_service/chord/tests/test_ingest.py b/chord_metadata_service/chord/tests/test_ingest.py index 3977049da..468505475 100644 --- a/chord_metadata_service/chord/tests/test_ingest.py +++ b/chord_metadata_service/chord/tests/test_ingest.py @@ -119,6 +119,9 @@ def test_ingesting_phenopackets_json(self): self.assertEqual(p.subject.sex, EXAMPLE_INGEST_PHENOPACKET["subject"]["sex"]) self.assertEqual(p.subject.karyotypic_sex, EXAMPLE_INGEST_PHENOPACKET["subject"]["karyotypic_sex"]) + self.assertIn("__computed", EXAMPLE_INGEST_PHENOPACKET["subject"]["extra_properties"]) + self.assertNotIn("__computed", p.subject.extra_properties) + pfs = list(p.phenotypic_features.all().order_by("pftype__id")) self.assertEqual(len(pfs), 2) diff --git a/chord_metadata_service/phenopackets/migrations/0008_v6_0_0.py b/chord_metadata_service/phenopackets/migrations/0008_v6_0_0.py index 4d8f30cef..f9d76ce7d 100644 --- a/chord_metadata_service/phenopackets/migrations/0008_v6_0_0.py +++ b/chord_metadata_service/phenopackets/migrations/0008_v6_0_0.py @@ -476,7 +476,7 @@ class Migration(migrations.Migration): migrations.AddField( model_name='interpretation', name='diagnosis', - field=models.ForeignKey(blank=True, help_text='One or more diagnoses, if made.', null=True, on_delete=django.db.models.deletion.CASCADE, to='phenopackets.diagnosis'), + field=models.OneToOneField(blank=True, help_text='The diagnosis, if made.', null=True, on_delete=django.db.models.deletion.CASCADE, to='phenopackets.diagnosis'), ), migrations.DeleteModel( name='Gene', @@ -491,4 +491,9 @@ class Migration(migrations.Migration): name='subject', field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='genomic_interpretations', to='patients.individual'), ), + migrations.AlterField( + model_name='diagnosis', + name='id', + field=models.CharField(help_text="Unique ID, uses the parent Interpretation's id when ingesting.", max_length=200, primary_key=True, serialize=False), + ), ] diff --git a/chord_metadata_service/phenopackets/models.py b/chord_metadata_service/phenopackets/models.py index a50640655..17f65f77b 100644 --- a/chord_metadata_service/phenopackets/models.py +++ b/chord_metadata_service/phenopackets/models.py @@ -356,6 +356,8 @@ class Diagnosis(BaseTimeStamp): FHIR: Condition """ + id = models.CharField(primary_key=True, max_length=200, + help_text="Unique ID, uses the parent Interpretation's id when ingesting.") disease = models.JSONField(null=True, blank=True, validators=[ontology_validator]) genomic_interpretations = models.ManyToManyField( GenomicInterpretation, blank=True, @@ -389,8 +391,8 @@ class Interpretation(BaseTimeStamp): id = models.CharField(primary_key=True, max_length=200, help_text='An arbitrary identifier for the interpretation.') progress_status = models.CharField(choices=PROGRESS_STATUS, max_length=200, blank=True, help_text='The current status of work on the case.') - diagnosis = models.ForeignKey(Diagnosis, blank=True, null=True, on_delete=models.CASCADE, - help_text='One or more diagnoses, if made.') + diagnosis = models.OneToOneField(Diagnosis, blank=True, null=True, on_delete=models.CASCADE, + help_text='The diagnosis, if made.') summary = models.CharField(max_length=200, blank=True, help_text='Free text summary of the interpretation.') extra_properties = JSONField(blank=True, null=True, help_text='Extra properties that are not supported by current schema') diff --git a/chord_metadata_service/phenopackets/serializers.py b/chord_metadata_service/phenopackets/serializers.py index ed6f27314..04ad56408 100644 --- a/chord_metadata_service/phenopackets/serializers.py +++ b/chord_metadata_service/phenopackets/serializers.py @@ -1,4 +1,6 @@ from rest_framework import serializers + +from chord_metadata_service.restapi.utils import computed_property from .models import ( MetaData, PhenotypicFeature, @@ -159,12 +161,19 @@ def to_representation(self, instance): response["variant_interpretation"] = VariantInterpretationSerializer( instance.variant_interpretation, many=False, required=False).data - # subject_or_biosample_id is obtained from the referenced subject/biosample + # The 'subject_or_biosample_id' value is obtained from the referenced subject/biosample + # The '__related_type' property is added to extra_properties as a computed value ("__" prefix) + # This allows us to disambiguate on the client side for links + extra_properties = response.get("extra_properties", {}) + computed_related_type = computed_property("related_type") if instance.subject: response["subject_or_biosample_id"] = instance.subject.id + extra_properties[computed_related_type] = "subject" elif instance.biosample: response["subject_or_biosample_id"] = instance.biosample.id + extra_properties[computed_related_type] = "biosample" + response["extra_properties"] = extra_properties return response diff --git a/chord_metadata_service/phenopackets/tests/constants.py b/chord_metadata_service/phenopackets/tests/constants.py index fd7e25726..085637b2a 100644 --- a/chord_metadata_service/phenopackets/tests/constants.py +++ b/chord_metadata_service/phenopackets/tests/constants.py @@ -603,8 +603,9 @@ def valid_genomic_interpretation(gene_descriptor=None, variant_interpretation=No return base -def valid_diagnosis(disease): +def valid_diagnosis(disease, id="interpretation:1"): return dict( + id=id, disease=disease, extra_properties={ "comment": "test data" diff --git a/chord_metadata_service/phenopackets/tests/test_api.py b/chord_metadata_service/phenopackets/tests/test_api.py index bdd3ef8b3..592bb8853 100644 --- a/chord_metadata_service/phenopackets/tests/test_api.py +++ b/chord_metadata_service/phenopackets/tests/test_api.py @@ -260,11 +260,13 @@ class CreateDiagnosisTest(APITestCase): def setUp(self): self.disease_ontology = c.VALID_DISEASE_ONTOLOGY - self.diagnosis = c.valid_diagnosis(self.disease_ontology) + self.diagnosis = c.valid_diagnosis(self.disease_ontology, "interpretation:unique_id") def test_diagnosis(self): response = get_post_response('diagnoses-list', self.diagnosis) self.assertEqual(response.status_code, status.HTTP_201_CREATED) + + def test_serializer(self): serializer = s.DiagnosisSerializer(data=self.diagnosis) self.assertEqual(serializer.is_valid(), True) diff --git a/chord_metadata_service/restapi/utils.py b/chord_metadata_service/restapi/utils.py index 28be837a9..010b8fa0b 100644 --- a/chord_metadata_service/restapi/utils.py +++ b/chord_metadata_service/restapi/utils.py @@ -25,6 +25,8 @@ "biosample": pheno_models.Biosample, } +COMPUTED_PROPERTY_PREFIX = "__" + class BinWithValue(TypedDict): label: str @@ -691,3 +693,10 @@ def bento_public_format_count_and_stats_list(annotated_queryset) -> tuple[int, l stats_list.append({"label": label, "value": value}) return total, stats_list + + +def computed_property(name: str): + """ + Takes a name and returns it prefixed with "__" + """ + return COMPUTED_PROPERTY_PREFIX + name