Skip to content

Commit

Permalink
chore: log not-found biosamples during experiments ingestion
Browse files: browse the repository at this point in the history
  • Loading branch information
davidlougheed committed Nov 15, 2022
1 parent 2a996dd commit 580c8aa
Show file tree
Hide file tree
Showing 5 changed files with 123 additions and 11 deletions.
18 changes: 14 additions & 4 deletions chord_metadata_service/chord/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import jsonschema

from dateutil.parser import isoparse
from typing import Callable
from typing import Callable, Optional
from urllib.parse import urlparse

from django.conf import settings
Expand Down Expand Up @@ -485,17 +485,27 @@ def ingest_experiment(experiment_data, table_id):
extraction_protocol = experiment_data.get("extraction_protocol")
reference_registry_id = experiment_data.get("reference_registry_id")
qc_flags = experiment_data.get("qc_flags", [])
biosample = experiment_data.get("biosample")
biosample_id = experiment_data.get("biosample")
experiment_results = experiment_data.get("experiment_results", [])
instrument = experiment_data.get("instrument", {})
extra_properties = experiment_data.get("extra_properties", {})

biosample: Optional[pm.Biosample] = None

# look up the existing biosample record by its ID
if biosample is not None:
biosample = pm.Biosample.objects.get(id=biosample) # TODO: Handle error nicer
if biosample_id is not None:
try:
biosample = pm.Biosample.objects.get(id=biosample_id) # TODO: Handle error nicer
except pm.Biosample.DoesNotExist as e:
logger.error(f"Could not find biosample with ID: {biosample_id}")
raise e

# create related experiment results
experiment_results_db = [create_experiment_result(er) for er in experiment_results]

# create related instrument
instrument_db = create_instrument(instrument)

# create new experiment
new_experiment = em.Experiment.objects.create(
id=new_experiment_id,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
{
"experiments": [
{
"id": "experiment:1",
"biosample": "sample1ohno!",
"study_type": "Epigenomics",
"experiment_type": "Other",
"experiment_ontology": [
{
"id": "http://www.ebi.ac.uk/efo/EFO_0002692",
"label": "ChIP-seq"
}
],
"library_strategy": "ChIP-Seq",
"library_source": "Genomic",
"library_selection": "Random",
"library_layout": "Single",
"extraction_protocol": "NGS",
"molecule": "genomic DNA",
"molecule_ontology": [
{
"id": "SO:0000991",
"label": "genomic DNA"
}
],
"experiment_results": [
{
"identifier": "sample1_01",
"description": "test",
"filename": "sample1_01.vcf.gz",
"file_format": "VCF",
"data_output_type": "Derived data",
"usage": "Visualized",
"creation_date": "01-09-2021",
"created_by": "Admin",
"extra_properties": {
"test": "test"
}
},
{
"identifier": "sample1_02",
"description": "test2",
"filename": "sample1_02.vcf.gz",
"file_format": "CRAM",
"data_output_type": "Raw data",
"usage": "Visualized",
"creation_date": "01-09-2021",
"created_by": "Admin",
"extra_properties": {
"test": "test"
}
}
],
"instrument": {
"identifier": "instrument:01",
"platform": "Illumina",
"description": "Test description",
"model": "Illumina HiSeq 4000",
"extra_properties": {
"date": "2021-06-21"
}
},
"extra_properties": {
"date_uploaded": "2021-03-16"
}
}
],
"resources": [
{
"name": "Sequence types and features ontology",
"version": "2021-02-16",
"namespace_prefix": "SO",
"id": "SO:2021-02-16",
"iri_prefix": "http://purl.obolibrary.org/obo/so.owl#",
"url": "http://purl.obolibrary.org/obo/so.owl"
}
]
}
19 changes: 15 additions & 4 deletions chord_metadata_service/chord/tests/example_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,17 @@
import os


__all__ = ["EXAMPLE_INGEST_PHENOPACKET", "EXAMPLE_INGEST_OUTPUTS",
"EXAMPLE_INGEST_EXPERIMENT", "EXAMPLE_INGEST_OUTPUTS_EXPERIMENT",
"EXAMPLE_INGEST_INVALID_EXPERIMENT", "EXAMPLE_INGEST_INVALID_PHENOPACKET",
"EXAMPLE_INGEST_MULTIPLE_PHENOPACKETS", "EXAMPLE_INGEST_MULTIPLE_OUTPUTS"]
# Public names of this fixtures module: the names a star-import
# (`from <module> import *`) exposes. One entry per line keeps diffs small
# when a fixture is added.
__all__ = [
"EXAMPLE_INGEST_PHENOPACKET",
"EXAMPLE_INGEST_OUTPUTS",
"EXAMPLE_INGEST_EXPERIMENT",
"EXAMPLE_INGEST_OUTPUTS_EXPERIMENT",
"EXAMPLE_INGEST_OUTPUTS_EXPERIMENT_BAD_BIOSAMPLE",
"EXAMPLE_INGEST_INVALID_EXPERIMENT",
"EXAMPLE_INGEST_INVALID_PHENOPACKET",
"EXAMPLE_INGEST_MULTIPLE_PHENOPACKETS",
"EXAMPLE_INGEST_MULTIPLE_OUTPUTS",
]

with open(os.path.join(os.path.dirname(__file__), "example_phenopacket.json"), "r") as pf:
EXAMPLE_INGEST_PHENOPACKET = json.load(pf)
Expand All @@ -22,6 +29,10 @@
"json_document": os.path.join(os.path.dirname(__file__), "example_experiment.json"),
}

# Workflow-outputs mapping whose "json_document" entry is the path of the
# bad-biosample experiments fixture, located next to this module.
EXAMPLE_INGEST_OUTPUTS_EXPERIMENT_BAD_BIOSAMPLE = dict(
    json_document=os.path.join(
        os.path.dirname(__file__),
        "example_experiment_bad_biosample.json",
    ),
)


with open(os.path.join(os.path.dirname(__file__), "example_invalid_experiment.json"), "r") as pf:
EXAMPLE_INGEST_INVALID_EXPERIMENT = json.load(pf)
Expand Down
17 changes: 15 additions & 2 deletions chord_metadata_service/chord/tests/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
WORKFLOW_EXPERIMENTS_JSON,
schema_validation
)
from chord_metadata_service.phenopackets.models import PhenotypicFeature, Phenopacket
from chord_metadata_service.phenopackets.models import Biosample, PhenotypicFeature, Phenopacket
from chord_metadata_service.phenopackets.schemas import PHENOPACKET_SCHEMA
from chord_metadata_service.resources.models import Resource
from chord_metadata_service.experiments.models import Experiment, ExperimentResult, Instrument
Expand All @@ -30,6 +30,7 @@
EXAMPLE_INGEST_OUTPUTS,
EXAMPLE_INGEST_EXPERIMENT,
EXAMPLE_INGEST_OUTPUTS_EXPERIMENT,
EXAMPLE_INGEST_OUTPUTS_EXPERIMENT_BAD_BIOSAMPLE,
EXAMPLE_INGEST_EXPERIMENT_RESULT,
EXAMPLE_INGEST_INVALID_PHENOPACKET,
EXAMPLE_INGEST_MULTIPLE_OUTPUTS,
Expand Down Expand Up @@ -118,28 +119,40 @@ def test_ingesting_experiments_json(self):
# ingest phenopackets data in order to match to biosample ids
p = WORKFLOW_INGEST_FUNCTION_MAP[WORKFLOW_PHENOPACKETS_JSON](EXAMPLE_INGEST_OUTPUTS, self.t.identifier)
self.assertEqual(p.id, Phenopacket.objects.get(id=p.id).id)

# ingest list of experiments
experiments = WORKFLOW_INGEST_FUNCTION_MAP[WORKFLOW_EXPERIMENTS_JSON](
EXAMPLE_INGEST_OUTPUTS_EXPERIMENT, self.t_exp.identifier
)

# experiments
self.assertEqual(len(experiments), Experiment.objects.all().count())
self.assertEqual(experiments[0].id, EXAMPLE_INGEST_EXPERIMENT["experiments"][0]["id"])
self.assertEqual(experiments[0].biosample.id, EXAMPLE_INGEST_EXPERIMENT["experiments"][0]["biosample"])
self.assertEqual(experiments[0].experiment_type, EXAMPLE_INGEST_EXPERIMENT["experiments"][0]["experiment_type"])

# experiment results
self.assertEqual(experiments[0].experiment_results.count(), ExperimentResult.objects.all().count())

# instrument
self.assertEqual(Instrument.objects.all().count(), 1)

# resources for experiments
# check that experiments resource is in database
# - check that experiments resource is in database
self.assertIn(EXAMPLE_INGEST_EXPERIMENT["resources"][0]["id"], [v["id"] for v in Resource.objects.values("id")])

# try ingesting the file with an invalid biosample ID
with self.assertRaises(Biosample.DoesNotExist):
WORKFLOW_INGEST_FUNCTION_MAP[WORKFLOW_EXPERIMENTS_JSON](
EXAMPLE_INGEST_OUTPUTS_EXPERIMENT_BAD_BIOSAMPLE, self.t_exp.identifier
)

def test_ingesting_invalid_experiment_json(self):
# check invalid experiment, must fail validation
for exp in EXAMPLE_INGEST_INVALID_EXPERIMENT["experiments"]:
validation = schema_validation(exp, EXPERIMENT_SCHEMA)
self.assertEqual(validation, False)

# check valid experiment, must pass validation
for exp in EXAMPLE_INGEST_EXPERIMENT["experiments"]:
validation_2 = schema_validation(exp, EXPERIMENT_SCHEMA)
Expand Down
2 changes: 1 addition & 1 deletion chord_metadata_service/package.cfg
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[package]
name = katsu
version = 2.15.0
version = 2.15.1
authors = Ksenia Zaytseva, David Lougheed, Simon Chénard, Romain Grégoire, Paul Pillot, Son Chau

0 comments on commit 580c8aa

Please sign in to comment.