chore: log not-found biosamples during experiments ingestion

bento-platform · Nov 15, 2022 · 580c8aa
1 parent 2a996dd
commit 580c8aa
Show file tree

Hide file tree

Showing 5 changed files with 123 additions and 11 deletions.
diff --git a/chord_metadata_service/chord/ingest.py b/chord_metadata_service/chord/ingest.py
@@ -11,7 +11,7 @@
 import jsonschema
 
 from dateutil.parser import isoparse
-from typing import Callable
+from typing import Callable, Optional
 from urllib.parse import urlparse
 
 from django.conf import settings
@@ -485,17 +485,27 @@ def ingest_experiment(experiment_data, table_id):
     extraction_protocol = experiment_data.get("extraction_protocol")
     reference_registry_id = experiment_data.get("reference_registry_id")
     qc_flags = experiment_data.get("qc_flags", [])
-    biosample = experiment_data.get("biosample")
+    biosample_id = experiment_data.get("biosample")
     experiment_results = experiment_data.get("experiment_results", [])
     instrument = experiment_data.get("instrument", {})
     extra_properties = experiment_data.get("extra_properties", {})
+
+    biosample: Optional[pm.Biosample] = None
+
     # get existing biosample id
-    if biosample is not None:
-        biosample = pm.Biosample.objects.get(id=biosample)  # TODO: Handle error nicer
+    if biosample_id is not None:
+        try:
+            biosample = pm.Biosample.objects.get(id=biosample_id)  # TODO: Handle error nicer
+        except pm.Biosample.DoesNotExist as e:
+            logger.error(f"Could not find biosample with ID: {biosample_id}")
+            raise e
+
     # create related experiment results
     experiment_results_db = [create_experiment_result(er) for er in experiment_results]
+
     # create related instrument
     instrument_db = create_instrument(instrument)
+
     # create new experiment
     new_experiment = em.Experiment.objects.create(
         id=new_experiment_id,

diff --git a/chord_metadata_service/chord/tests/example_experiment_bad_biosample.json b/chord_metadata_service/chord/tests/example_experiment_bad_biosample.json
@@ -0,0 +1,78 @@
+{
+  "experiments": [
+    {
+      "id": "experiment:1",
+      "biosample": "sample1ohno!",
+      "study_type": "Epigenomics",
+      "experiment_type": "Other",
+      "experiment_ontology": [
+        {
+          "id": "http://www.ebi.ac.uk/efo/EFO_0002692",
+          "label": "ChIP-seq"
+        }
+      ],
+      "library_strategy": "ChIP-Seq",
+      "library_source": "Genomic",
+      "library_selection": "Random",
+      "library_layout": "Single",
+      "extraction_protocol": "NGS",
+      "molecule": "genomic DNA",
+      "molecule_ontology": [
+        {
+          "id": "SO:0000991",
+          "label": "genomic DNA"
+        }
+      ],
+      "experiment_results": [
+        {
+          "identifier": "sample1_01",
+          "description": "test",
+          "filename": "sample1_01.vcf.gz",
+          "file_format": "VCF",
+          "data_output_type": "Derived data",
+          "usage": "Visualized",
+          "creation_date": "01-09-2021",
+          "created_by": "Admin",
+          "extra_properties": {
+            "test": "test"
+          }
+        },
+        {
+          "identifier": "sample1_02",
+          "description": "test2",
+          "filename": "sample1_02.vcf.gz",
+          "file_format": "CRAM",
+          "data_output_type": "Raw data",
+          "usage": "Visualized",
+          "creation_date": "01-09-2021",
+          "created_by": "Admin",
+          "extra_properties": {
+            "test": "test"
+          }
+        }
+      ],
+      "instrument": {
+        "identifier": "instrument:01",
+        "platform": "Illumina",
+        "description": "Test description",
+        "model": "Illumina HiSeq 4000",
+        "extra_properties": {
+          "date": "2021-06-21"
+        }
+      },
+      "extra_properties": {
+        "date_uploaded": "2021-03-16"
+      }
+    }
+  ],
+  "resources": [
+    {
+      "name": "Sequence types and features ontology",
+      "version": "2021-02-16",
+      "namespace_prefix": "SO",
+      "id": "SO:2021-02-16",
+      "iri_prefix": "http://purl.obolibrary.org/obo/so.owl#",
+      "url": "http://purl.obolibrary.org/obo/so.owl"
+    }
+  ]
+}
diff --git a/chord_metadata_service/chord/tests/example_ingest.py b/chord_metadata_service/chord/tests/example_ingest.py
@@ -2,10 +2,17 @@
 import os
 
 
-__all__ = ["EXAMPLE_INGEST_PHENOPACKET", "EXAMPLE_INGEST_OUTPUTS",
-           "EXAMPLE_INGEST_EXPERIMENT", "EXAMPLE_INGEST_OUTPUTS_EXPERIMENT",
-           "EXAMPLE_INGEST_INVALID_EXPERIMENT", "EXAMPLE_INGEST_INVALID_PHENOPACKET",
-           "EXAMPLE_INGEST_MULTIPLE_PHENOPACKETS", "EXAMPLE_INGEST_MULTIPLE_OUTPUTS"]
+__all__ = [
+    "EXAMPLE_INGEST_PHENOPACKET",
+    "EXAMPLE_INGEST_OUTPUTS",
+    "EXAMPLE_INGEST_EXPERIMENT",
+    "EXAMPLE_INGEST_OUTPUTS_EXPERIMENT",
+    "EXAMPLE_INGEST_OUTPUTS_EXPERIMENT_BAD_BIOSAMPLE",
+    "EXAMPLE_INGEST_INVALID_EXPERIMENT",
+    "EXAMPLE_INGEST_INVALID_PHENOPACKET",
+    "EXAMPLE_INGEST_MULTIPLE_PHENOPACKETS",
+    "EXAMPLE_INGEST_MULTIPLE_OUTPUTS",
+]
 
 with open(os.path.join(os.path.dirname(__file__), "example_phenopacket.json"), "r") as pf:
     EXAMPLE_INGEST_PHENOPACKET = json.load(pf)
@@ -22,6 +29,10 @@
     "json_document": os.path.join(os.path.dirname(__file__), "example_experiment.json"),
 }
 
+EXAMPLE_INGEST_OUTPUTS_EXPERIMENT_BAD_BIOSAMPLE = {
+    "json_document": os.path.join(os.path.dirname(__file__), "example_experiment_bad_biosample.json"),
+}
+
 
 with open(os.path.join(os.path.dirname(__file__), "example_invalid_experiment.json"), "r") as pf:
     EXAMPLE_INGEST_INVALID_EXPERIMENT = json.load(pf)

diff --git a/chord_metadata_service/chord/tests/test_ingest.py b/chord_metadata_service/chord/tests/test_ingest.py
@@ -15,7 +15,7 @@
     WORKFLOW_EXPERIMENTS_JSON,
     schema_validation
 )
-from chord_metadata_service.phenopackets.models import PhenotypicFeature, Phenopacket
+from chord_metadata_service.phenopackets.models import Biosample, PhenotypicFeature, Phenopacket
 from chord_metadata_service.phenopackets.schemas import PHENOPACKET_SCHEMA
 from chord_metadata_service.resources.models import Resource
 from chord_metadata_service.experiments.models import Experiment, ExperimentResult, Instrument
@@ -30,6 +30,7 @@
     EXAMPLE_INGEST_OUTPUTS,
     EXAMPLE_INGEST_EXPERIMENT,
     EXAMPLE_INGEST_OUTPUTS_EXPERIMENT,
+    EXAMPLE_INGEST_OUTPUTS_EXPERIMENT_BAD_BIOSAMPLE,
     EXAMPLE_INGEST_EXPERIMENT_RESULT,
     EXAMPLE_INGEST_INVALID_PHENOPACKET,
     EXAMPLE_INGEST_MULTIPLE_OUTPUTS,
@@ -118,28 +119,40 @@ def test_ingesting_experiments_json(self):
         # ingest phenopackets data in order to match to biosample ids
         p = WORKFLOW_INGEST_FUNCTION_MAP[WORKFLOW_PHENOPACKETS_JSON](EXAMPLE_INGEST_OUTPUTS, self.t.identifier)
         self.assertEqual(p.id, Phenopacket.objects.get(id=p.id).id)
+
         # ingest list of experiments
         experiments = WORKFLOW_INGEST_FUNCTION_MAP[WORKFLOW_EXPERIMENTS_JSON](
             EXAMPLE_INGEST_OUTPUTS_EXPERIMENT, self.t_exp.identifier
         )
+
         # experiments
         self.assertEqual(len(experiments), Experiment.objects.all().count())
         self.assertEqual(experiments[0].id, EXAMPLE_INGEST_EXPERIMENT["experiments"][0]["id"])
         self.assertEqual(experiments[0].biosample.id, EXAMPLE_INGEST_EXPERIMENT["experiments"][0]["biosample"])
         self.assertEqual(experiments[0].experiment_type, EXAMPLE_INGEST_EXPERIMENT["experiments"][0]["experiment_type"])
+
         # experiment results
         self.assertEqual(experiments[0].experiment_results.count(), ExperimentResult.objects.all().count())
+
         # instrument
         self.assertEqual(Instrument.objects.all().count(), 1)
+
         # resources for experiments
-        # check that experiments resource is in database
+        # - check that experiments resource is in database
         self.assertIn(EXAMPLE_INGEST_EXPERIMENT["resources"][0]["id"], [v["id"] for v in Resource.objects.values("id")])
 
+        # try ingesting the file with an invalid biosample ID
+        with self.assertRaises(Biosample.DoesNotExist):
+            WORKFLOW_INGEST_FUNCTION_MAP[WORKFLOW_EXPERIMENTS_JSON](
+                EXAMPLE_INGEST_OUTPUTS_EXPERIMENT_BAD_BIOSAMPLE, self.t_exp.identifier
+            )
+
     def test_ingesting_invalid_experiment_json(self):
         # check invalid experiment, must fail validation
         for exp in EXAMPLE_INGEST_INVALID_EXPERIMENT["experiments"]:
             validation = schema_validation(exp, EXPERIMENT_SCHEMA)
             self.assertEqual(validation, False)
+
         # check valid experiment, must pass validation
         for exp in EXAMPLE_INGEST_EXPERIMENT["experiments"]:
             validation_2 = schema_validation(exp, EXPERIMENT_SCHEMA)

diff --git a/chord_metadata_service/package.cfg b/chord_metadata_service/package.cfg
@@ -1,4 +1,4 @@
 [package]
 name = katsu
-version = 2.15.0
+version = 2.15.1
 authors = Ksenia Zaytseva, David Lougheed, Simon Chénard, Romain Grégoire, Paul Pillot, Son Chau