
Commit

Merge pull request #67 from c3g/develop
Release v0.3.0
zxenia authored Feb 19, 2020
2 parents 6fcb2ce + 724f1f9 commit e63e202
Showing 43 changed files with 798 additions and 78 deletions.
2 changes: 1 addition & 1 deletion chord_metadata_service/chord/api_views.py
@@ -53,7 +53,7 @@ class DatasetViewSet(CHORDPublicModelViewSet):
renderer_classes = tuple(CHORDModelViewSet.renderer_classes) + (JSONLDDatasetRenderer, RDFDatasetRenderer,)


-class TableOwnershipViewSet(CHORDModelViewSet):  # TODO: Public?
+class TableOwnershipViewSet(CHORDPublicModelViewSet):
"""
get:
Return a list of table-(dataset|dataset,biosample) relationships
18 changes: 18 additions & 0 deletions chord_metadata_service/chord/migrations/0008_dataset_version.py
@@ -0,0 +1,18 @@
# Generated by Django 2.2.9 on 2020-02-17 21:48

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
('chord', '0007_auto_20200129_1537'),
]

operations = [
migrations.AddField(
model_name='dataset',
name='version',
field=models.CharField(blank=True, default='version_2020-02-17 16:47:59.425036', help_text='A release point for the dataset when applicable.', max_length=200),
),
]
19 changes: 19 additions & 0 deletions chord_metadata_service/chord/migrations/0009_auto_20200218_1615.py
@@ -0,0 +1,19 @@
# Generated by Django 2.2.9 on 2020-02-18 21:15

import chord_metadata_service.chord.models
from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
('chord', '0008_dataset_version'),
]

operations = [
migrations.AlterField(
model_name='dataset',
name='version',
field=models.CharField(blank=True, default=chord_metadata_service.chord.models.version_default, help_text='A release point for the dataset when applicable.', max_length=200),
),
]
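A note on why migration 0009 lands immediately after 0008: 0008 was generated with the timestamp function already called, baking the literal default 'version_2020-02-17 16:47:59.425036' into the field, so every new dataset would have received that same stale stamp. 0009 repoints the default at the version_default callable itself, which Django invokes for each new row. A minimal sketch of the distinction, using a hypothetical Example model:

from django.db import models
from django.utils import timezone


def version_default():
    # Called by Django for each new row, so every dataset gets a fresh stamp.
    return f"version_{timezone.now()}"


class Example(models.Model):
    # Buggy: version_default() is evaluated once at definition time, freezing
    # a constant into the default (this is what migration 0008 captured).
    # version = models.CharField(max_length=200, default=version_default())

    # Correct: pass the callable itself; Django calls it per instance.
    version = models.CharField(max_length=200, blank=True, default=version_default)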
6 changes: 6 additions & 0 deletions chord_metadata_service/chord/models.py
@@ -2,11 +2,15 @@

from django.contrib.postgres.fields import JSONField, ArrayField
from django.db import models
from django.utils import timezone


__all__ = ["Project", "Dataset", "TableOwnership"]


def version_default():
return f"version_{timezone.now()}"

#############################################################
#                                                           #
#                     Project Management                    #
@@ -116,6 +120,8 @@ def n_of_tables(self):
help_text="The grant(s) which funded and supported the work reported by the dataset.")
keywords = ArrayField(JSONField(null=True, blank=True), blank=True, null=True,
help_text="Tags associated with the dataset, which will help in its discovery.")
version = models.CharField(max_length=200, blank=True, default=version_default,
help_text="A release point for the dataset when applicable.")
extra_properties = JSONField(blank=True, null=True,
help_text="Extra properties that do not fit in the previous specified attributes.")

2 changes: 2 additions & 0 deletions chord_metadata_service/chord/tests/constants.py
@@ -7,6 +7,7 @@
"dats_dataset",
"TEST_SEARCH_QUERY_1",
"TEST_SEARCH_QUERY_2",
"TEST_FHIR_SEARCH_QUERY",
]


@@ -150,3 +151,4 @@ def dats_dataset(project_id, creators):

TEST_SEARCH_QUERY_1 = ["#eq", ["#resolve", "subject", "sex"], "FEMALE"]
TEST_SEARCH_QUERY_2 = ["#eq", ["#resolve", "subject", "sex"], "MALE"]
TEST_FHIR_SEARCH_QUERY = {"query": {"match": {"gender": "FEMALE"}}}
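Unlike the two CHORD query ASTs above it, TEST_FHIR_SEARCH_QUERY is a raw Elasticsearch query-DSL body. A minimal sketch of how such a body is submitted with elasticsearch-py, assuming a default local node and borrowing the fhir_metadata index name from the mock below:

from elasticsearch import Elasticsearch

es = Elasticsearch()  # assumption: a node reachable at localhost:9200
body = {"query": {"match": {"gender": "FEMALE"}}}
res = es.search(index="fhir_metadata", body=body)

for hit in res["hits"]["hits"]:
    # Each hit carries the indexed FHIR resource in _source.
    print(hit["_id"], hit["_source"]["resourceType"])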
56 changes: 56 additions & 0 deletions chord_metadata_service/chord/tests/es_mocks.py
@@ -0,0 +1,56 @@
SEARCH_SUCCESS = {
"_shards": {
"failed": 0,
"skipped": 0,
"successful": 1,
"total": 1
},
"hits": {
"hits": [
{
"_id": "Individual|patient:1",
"_index": "fhir_metadata",
"_score": 6.334576,
"_source": {
"birthDate": "1994-09-11",
"extension": [
{
"url": "http://ga4gh.org/fhir/phenopackets/StructureDefinition/individual-karyotypic-sex",
"valueCodeableConcept": {
"coding": [
{
"code": "XX",
"display": "XX",
"system": "http://ga4gh.org/fhir/phenopackets/CodeSystem/karyotypic-sex"
}
]
}
},
{
"url": "http://ga4gh.org/fhir/phenopackets/StructureDefinition/individual-taxonomy",
"valueCodeableConcept": {
"coding": [
{
"code": "FAKE_CODE",
"display": "Homo sapiens"
}
]
}
}
],
"gender": "FEMALE",
"id": "patient:1",
"resourceType": "Patient"
},
"_type": "_doc"
}
],
"max_score": 6.334576,
"total": {
"relation": "eq",
"value": 1
}
},
"timed_out": False,
"took": 5
}
38 changes: 37 additions & 1 deletion chord_metadata_service/chord/tests/test_api_search.py
@@ -1,4 +1,5 @@
import json
from unittest.mock import Mock, patch

from django.test import override_settings
from django.urls import reverse
@@ -8,6 +9,7 @@
from chord_metadata_service.phenopackets.tests.constants import *
from chord_metadata_service.phenopackets.models import *

from chord_metadata_service.chord.tests.es_mocks import SEARCH_SUCCESS
from .constants import *
from ..models import *
from ..views_search import PHENOPACKET_DATA_TYPE_ID, PHENOPACKET_SCHEMA, PHENOPACKET_METADATA_SCHEMA
@@ -90,7 +92,7 @@ def setUp(self) -> None:
# Set up a dummy phenopacket

self.individual, _ = Individual.objects.get_or_create(
-            id='patient:1', sex='FEMALE', age='P25Y3M2D')
+            id='patient:1', sex='FEMALE', age={"age": "P25Y3M2D"})

self.procedure = Procedure.objects.create(**VALID_PROCEDURE_1)

@@ -205,3 +207,37 @@ def test_private_table_search_4(self):
c = r.json()
self.assertEqual(len(c["results"]), 1)
self.assertEqual(self.phenopacket.id, c["results"][0]["id"])

@patch('chord_metadata_service.chord.views_search.es')
def test_fhir_search(self, mocked_es):
mocked_es.search.return_value = SEARCH_SUCCESS
# Valid search with result
r = self.client.post(reverse("fhir-search"), data=json.dumps({
"data_type": PHENOPACKET_DATA_TYPE_ID,
"query": TEST_FHIR_SEARCH_QUERY
}), content_type="application/json")

self.assertEqual(r.status_code, status.HTTP_200_OK)
c = r.json()

self.assertEqual(len(c["results"]), 1)
self.assertDictEqual(c["results"][0], {
"id": str(self.dataset.identifier),
"data_type": PHENOPACKET_DATA_TYPE_ID
})

@patch('chord_metadata_service.chord.views_search.es')
def test_private_fhir_search(self, mocked_es):
mocked_es.search.return_value = SEARCH_SUCCESS
# Valid search with result
r = self.client.post(reverse("fhir-private-search"), data=json.dumps({
"data_type": PHENOPACKET_DATA_TYPE_ID,
"query": TEST_FHIR_SEARCH_QUERY
}), content_type="application/json")

self.assertEqual(r.status_code, status.HTTP_200_OK)
c = r.json()

self.assertIn(str(self.dataset.identifier), c["results"])
self.assertEqual(c["results"][str(self.dataset.identifier)]["data_type"], PHENOPACKET_DATA_TYPE_ID)
self.assertEqual(self.phenopacket.id, c["results"][str(self.dataset.identifier)]["matches"][0]["id"])
13 changes: 9 additions & 4 deletions chord_metadata_service/chord/views_ingest.py
@@ -87,7 +87,7 @@ def workflow_file(_request, workflow_id):
def create_phenotypic_feature(pf):
pf_obj = PhenotypicFeature(
description=pf.get("description", ""),
-        pftype=pf["type"]["id"],
+        pftype=pf["type"],
negated=pf.get("negated", False),
severity=pf.get("severity", None),
modifier=pf.get("modifier", []), # TODO: Validate ontology term in schema...
@@ -206,7 +206,7 @@ def ingest_phenopacket(phenopacket_data, table_id):

g_obj, _ = Gene.objects.get_or_create(
id=g["id"],
-        alternate_id=g.get("alternate_ids", []),
+        alternate_ids=g.get("alternate_ids", []),
symbol=g["symbol"]
)

@@ -216,8 +216,13 @@
for d in diseases:
# TODO: Primary key, should this be a model?

-        d_obj = Disease(term=d["term"], onset=d.get("onset", None), disease_stage=d.get("disease_stage", []))
-        d_obj.save()
+        d_obj, _ = Disease.objects.get_or_create(
+            term=d["term"],
+            onset=d.get("onset", None),
+            disease_stage=d.get("disease_stage", []),
+            tnm_finding=d.get("tnm_finding", [])
+        )
+        diseases_db.append(d_obj.id)

resources_db = []
for rs in meta_data.get("resources", []):
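The switch from save() to get_or_create() in the disease loop above makes re-ingestion idempotent: get_or_create returns an (object, created) tuple, reusing the row that matches all supplied keyword arguments and inserting one otherwise. A small sketch of the pattern, with purely illustrative field values:

# Illustrative only: a second identical call would fetch, not duplicate.
d_obj, created = Disease.objects.get_or_create(
    term={"id": "FAKE:0000001", "label": "example disease"},
    onset=None,
    disease_stage=[],
    tnm_finding=[],
)
if created:
    print(f"inserted disease {d_obj.id}")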
122 changes: 115 additions & 7 deletions chord_metadata_service/chord/views_search.py
@@ -1,21 +1,24 @@
import itertools

from chord_lib.responses.errors import *
from chord_lib.search import build_search_response, postgres
from datetime import datetime
import json

from django.db import connection
from django.conf import settings
from psycopg2 import sql
from rest_framework.decorators import api_view, permission_classes
from rest_framework.permissions import AllowAny
from rest_framework.response import Response

from .models import Dataset
from .permissions import OverrideOrSuperUserOnly
from chord_lib.responses.errors import *
from chord_lib.search import build_search_response, postgres
from chord_metadata_service.metadata.settings import DEBUG
from chord_metadata_service.phenopackets.api_views import PHENOPACKET_PREFETCH
from chord_metadata_service.phenopackets.models import Phenopacket
from chord_metadata_service.phenopackets.schemas import PHENOPACKET_SCHEMA
from chord_metadata_service.phenopackets.serializers import PhenopacketSerializer
from chord_metadata_service.metadata.elastic import es
from .models import Dataset
from .permissions import OverrideOrSuperUserOnly

PHENOPACKET_DATA_TYPE_ID = "phenopacket"

@@ -105,8 +108,19 @@ def phenopacket_results(query, params, key="id"):

def phenopacket_query_results(query, params):
# TODO: possibly a quite inefficient way of doing things...
-    return Phenopacket.objects.filter(id__in=phenopacket_results(query, params, "id")).prefetch_related(
-        *PHENOPACKET_PREFETCH)
+    # To expand further on this query: the select_related call joins on
+    # tables we would hit anyway, saving two extra round trips to the DB,
+    # while prefetch_related works on M2M relationships and ensures that,
+    # for instance, querying diseases does not repeat requests for the
+    # same set of data.
+    return Phenopacket.objects.filter(
+        id__in=phenopacket_results(query, params, "id")
+    ).select_related(
+        'subject',
+        'meta_data'
+    ).prefetch_related(
+        *PHENOPACKET_PREFETCH
+    )


def search(request, internal_data=False):
@@ -160,6 +174,100 @@ def chord_private_search(request):
return search(request, internal_data=True)


def phenopacket_filter_results(subject_ids, htsfile_ids, disease_ids, biosample_ids,
phenotypicfeature_ids, phenopacket_ids, prefetch=False):

query = Phenopacket.objects.get_queryset()

if subject_ids:
query = query.filter(subject__id__in=subject_ids)

if htsfile_ids:
query = query.filter(htsfiles__id__in=htsfile_ids)

if disease_ids:
query = query.filter(diseases__id__in=disease_ids)

if biosample_ids:
query = query.filter(biosamples__id__in=biosample_ids)

if phenotypicfeature_ids:
query = query.filter(phenotypic_features__id__in=phenotypicfeature_ids)

if phenopacket_ids:
query = query.filter(id__in=phenopacket_ids)

res = query.prefetch_related(*PHENOPACKET_PREFETCH)

return res


# TODO: unsure why we chose POST for this endpoint; should it be GET?
def fhir_search(request, internal_data=False):
# TODO: not all that sure about the query format we'll want
# keep it simple for now
if "query" not in request.data:
return Response(bad_request_error("Missing query in request body"), status=400)

query = request.data["query"]
start = datetime.now()

if not es:
return Response(build_search_response([], start))

res = es.search(index=settings.FHIR_INDEX_NAME, body=query)

subject_ids = [hit['_id'].split('|')[1] for hit in res['hits']['hits'] if hit['_source']['resourceType'] == 'Patient']
htsfile_ids = [hit['_id'].split('|')[1] for hit in res['hits']['hits'] if hit['_source']['resourceType'] == 'DocumentReference']
disease_ids = [hit['_id'].split('|')[1] for hit in res['hits']['hits'] if hit['_source']['resourceType'] == 'Condition']
biosample_ids = [hit['_id'].split('|')[1] for hit in res['hits']['hits'] if hit['_source']['resourceType'] == 'Specimen']
phenotypicfeature_ids = [hit['_id'].split('|')[1] for hit in res['hits']['hits'] if hit['_source']['resourceType'] == 'Observation']
phenopacket_ids = [hit['_id'].split('|')[1] for hit in res['hits']['hits'] if hit['_source']['resourceType'] == 'Composition']

if (not subject_ids and not htsfile_ids and not disease_ids
and not biosample_ids and not phenotypicfeature_ids and not phenopacket_ids):
return Response(build_search_response([], start))
else:
phenopackets = phenopacket_filter_results(
subject_ids,
htsfile_ids,
disease_ids,
biosample_ids,
phenotypicfeature_ids,
phenopacket_ids
)

if not internal_data:
datasets = Dataset.objects.filter(
            identifier__in=[
p.dataset_id for p in phenopackets
]
) # TODO: Maybe can avoid hitting DB here
return Response(build_search_response([{"id": d.identifier, "data_type": PHENOPACKET_DATA_TYPE_ID}
for d in datasets], start))
return Response(build_search_response({
dataset_id: {
"data_type": PHENOPACKET_DATA_TYPE_ID,
"matches": list(PhenopacketSerializer(p).data for p in dataset_phenopackets)
} for dataset_id, dataset_phenopackets in itertools.groupby(
phenopackets,
key=lambda p: str(p.dataset_id)
)
}, start))


@api_view(["POST"])
@permission_classes([AllowAny])
def fhir_public_search(request):
return fhir_search(request)


@api_view(["POST"])
@permission_classes([AllowAny])
def fhir_private_search(request):
return fhir_search(request, internal_data=True)


@api_view(["POST"])
def chord_private_table_search(request, table_id):
# Search phenopacket data types in specific tables
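Worth spelling out is the convention fhir_search relies on, visible in the es_mocks payload: each Elasticsearch document _id has the form <ResourceType>|<local_id> (e.g. Individual|patient:1), so splitting _id on '|' and bucketing hits by _source.resourceType recovers the model-level IDs used to filter phenopackets. A compact sketch of that bucketing step, under the same assumed convention (bucket_hits is hypothetical, not part of the module):

# Maps FHIR resource types to the ID lists fhir_search filters on.
RESOURCE_BUCKETS = {
    "Patient": "subject_ids",
    "DocumentReference": "htsfile_ids",
    "Condition": "disease_ids",
    "Specimen": "biosample_ids",
    "Observation": "phenotypicfeature_ids",
    "Composition": "phenopacket_ids",
}


def bucket_hits(res):
    buckets = {name: [] for name in RESOURCE_BUCKETS.values()}
    for hit in res["hits"]["hits"]:
        bucket = RESOURCE_BUCKETS.get(hit["_source"]["resourceType"])
        if bucket:
            # _id looks like "Individual|patient:1"; keep the local part.
            buckets[bucket].append(hit["_id"].split("|", 1)[1])
    return buckets

One caveat: the private response is assembled with itertools.groupby keyed on dataset_id, and groupby only merges consecutive elements, so the grouping implicitly assumes the queryset returns phenopackets clustered by dataset; sorting by that key first would be the defensive choice.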
