From 18d5d95ecf8ae6c2ee4b56cbe7a279ef684a8498 Mon Sep 17 00:00:00 2001
From: Alyssa Dai
Date: Wed, 15 Nov 2023 14:32:20 -0500
Subject: [PATCH] [ENH] Return dataset sizes in query response (#223)

* fix docstring
* add key to CohortQueryResponse for total n in dataset
* add util for constructing query for size of specific datasets
* refactor out code to reformat http response
* update crud function for subject query endpoint to return matching dataset sizes
* update test data fixture
* test new httpx response unpacking util
---
 app/api/crud.py       | 29 ++++++++++++++------
 app/api/models.py     |  1 +
 app/api/utility.py    | 29 ++++++++++++++++++++
 tests/conftest.py     |  2 ++
 tests/test_query.py   |  2 +-
 tests/test_utility.py | 61 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 115 insertions(+), 9 deletions(-)
 create mode 100644 tests/test_utility.py

diff --git a/app/api/crud.py b/app/api/crud.py
index d204d8b..006d8c4 100644
--- a/app/api/crud.py
+++ b/app/api/crud.py
@@ -84,7 +84,8 @@ async def get(
     image_modal: str,
 ) -> list[CohortQueryResponse]:
     """
-    Makes a POST request to graph API using httpx where the payload is a SPARQL query generated by the create_query function.
+    Sends SPARQL queries to the graph API via httpx POST requests for subject-session or dataset metadata
+    matching the given query parameters, as well as the total number of subjects in each matching dataset.
 
     Parameters
     ----------
@@ -124,13 +125,22 @@
         # TODO: Revisit timeout value when query performance is improved
         timeout=30.0,
     )
-
-    # Reformat SPARQL results into more human-readable form
-    results_dicts = [
-        {k: v["value"] for k, v in res.items()}
-        for res in results["results"]["bindings"]
-    ]
-    results_df = pd.DataFrame(results_dicts).reindex(columns=ATTRIBUTES_ORDER)
+    results_df = pd.DataFrame(
+        util.unpack_http_response_json_to_dicts(results)
+    ).reindex(columns=ATTRIBUTES_ORDER)
+
+    # Get the total number of subjects in each dataset that matched the query
+    matching_dataset_size_results = post_query_to_graph(
+        util.create_multidataset_size_query(
+            results_df["dataset_uuid"].unique()
+        )
+    )
+    matching_dataset_sizes = {
+        ds["dataset_uuid"]: int(ds["total_subjects"])
+        for ds in util.unpack_http_response_json_to_dicts(
+            matching_dataset_size_results
+        )
+    }
 
     response_obj = []
     dataset_cols = ["dataset_uuid", "dataset_name"]
@@ -165,6 +175,9 @@
                 CohortQueryResponse(
                     dataset_uuid=dataset_uuid,
                     dataset_name=dataset_name,
+                    dataset_total_subjects=matching_dataset_sizes[
+                        dataset_uuid
+                    ],
                     dataset_portal_uri=group["dataset_portal_uri"].iloc[0]
                     if group["dataset_portal_uri"].notna().all()
                     else None,
diff --git a/app/api/models.py b/app/api/models.py
index a52d850..c4a9a18 100644
--- a/app/api/models.py
+++ b/app/api/models.py
@@ -58,6 +58,7 @@ class CohortQueryResponse(BaseModel):
     # dataset_file_path: str  # TODO: Revisit this field once we have datasets without imaging info/sessions.
     dataset_name: str
     dataset_portal_uri: Optional[str]
+    dataset_total_subjects: int
     records_protected: bool
     num_matching_subjects: int
     subject_data: Union[list[dict], str]
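The crud.py change above boils down to a two-step lookup: unpack the bindings returned by the size query, then key the counts by dataset UUID. A minimal standalone sketch of that flow (the rows below are hypothetical stand-ins for unpacked SPARQL bindings, not part of the patch):

    # Hypothetical rows standing in for the output of unpack_http_response_json_to_dicts()
    size_rows = [
        {"dataset_uuid": "http://neurobagel.org/vocab/12345", "total_subjects": "200"},
        {"dataset_uuid": "http://neurobagel.org/vocab/67890", "total_subjects": "3000"},
    ]
    # SPARQL literals come back as strings, so the counts are cast to int when
    # building the lookup, mirroring the dict comprehension added to crud.py above
    matching_dataset_sizes = {
        row["dataset_uuid"]: int(row["total_subjects"]) for row in size_rows
    }
    assert matching_dataset_sizes["http://neurobagel.org/vocab/12345"] == 200

Building the lookup once and indexing into it per dataset group avoids issuing a separate size query for each dataset in the grouped results.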
diff --git a/app/api/utility.py b/app/api/utility.py
index aa3b333..d8887a3 100644
--- a/app/api/utility.py
+++ b/app/api/utility.py
@@ -84,6 +84,18 @@ def create_context() -> str:
     )
 
 
+def unpack_http_response_json_to_dicts(response: dict) -> list[dict]:
+    """
+    Reformats a nested dictionary object from a SPARQL query response JSON into a more human-readable list of dictionaries,
+    where the keys are the variables selected in the SPARQL query and the values correspond to the variable values.
+    The number of dictionaries should correspond to the number of query matches.
+    """
+    return [
+        {k: v["value"] for k, v in res.items()}
+        for res in response["results"]["bindings"]
+    ]
+
+
 def create_query(
     return_agg: bool,
     age: Optional[tuple] = (None, None),
@@ -221,6 +233,23 @@ def create_query(
     return "\n".join([create_context(), query_string])
 
 
+def create_multidataset_size_query(dataset_uuids: list) -> str:
+    """Construct a SPARQL query to retrieve the number of subjects in each dataset in a list of dataset UUIDs."""
+    dataset_uuids_string = "\n".join([f"<{uuid}>" for uuid in dataset_uuids])
+    query_string = f"""
+    SELECT ?dataset_uuid (COUNT(DISTINCT ?subject) as ?total_subjects)
+    WHERE {{
+        VALUES ?dataset_uuid {{
+            {dataset_uuids_string}
+        }}
+        ?dataset_uuid nb:hasSamples ?subject.
+        ?subject a nb:Subject.
+    }} GROUP BY ?dataset_uuid
+    """
+
+    return "\n".join([create_context(), query_string])
+
+
 def create_terms_query(data_element_URI: str) -> str:
     """
     Creates a SPARQL query using a simple query template to retrieve term URLS for a given data element.
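Taken together, the two new utilities cover the round trip for dataset sizes: create_multidataset_size_query() builds the aggregation query, and unpack_http_response_json_to_dicts() flattens the bindings that come back. A hedged usage sketch (assumes the app package is importable; the UUIDs are hypothetical):

    from app.api import utility as util

    # Hypothetical dataset UUIDs; in crud.py these come from results_df["dataset_uuid"].unique()
    uuids = [
        "http://neurobagel.org/vocab/12345",
        "http://neurobagel.org/vocab/67890",
    ]
    query = util.create_multidataset_size_query(uuids)
    # Each UUID is embedded as <...> inside the VALUES block, and the graph
    # returns one (dataset_uuid, total_subjects) binding per dataset
    assert "<http://neurobagel.org/vocab/12345>" in query
    assert "GROUP BY ?dataset_uuid" in query

Using a VALUES block keeps this to a single query for all matching datasets, rather than one query per dataset UUID.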
"head": {"vars": ["dataset_uuid", "total_subjects"]}, + "results": { + "bindings": [ + { + "dataset_uuid": { + "type": "uri", + "value": "http://neurobagel.org/vocab/ds1234", + }, + "total_subjects": { + "datatype": "http://www.w3.org/2001/XMLSchema#integer", + "type": "literal", + "value": "70", + }, + }, + { + "dataset_uuid": { + "type": "uri", + "value": "http://neurobagel.org/vocab/ds2345", + }, + "total_subjects": { + "datatype": "http://www.w3.org/2001/XMLSchema#integer", + "type": "literal", + "value": "40", + }, + }, + { + "dataset_uuid": { + "type": "uri", + "value": "http://neurobagel.org/vocab/ds3456", + }, + "total_subjects": { + "datatype": "http://www.w3.org/2001/XMLSchema#integer", + "type": "literal", + "value": "84", + }, + }, + ] + }, + } + + assert util.unpack_http_response_json_to_dicts(mock_response_json) == [ + { + "dataset_uuid": "http://neurobagel.org/vocab/ds1234", + "total_subjects": "70", + }, + { + "dataset_uuid": "http://neurobagel.org/vocab/ds2345", + "total_subjects": "40", + }, + { + "dataset_uuid": "http://neurobagel.org/vocab/ds3456", + "total_subjects": "84", + }, + ]