[ENH] Return dataset sizes in query response (#223)
* fix docstring

* add key to CohortQueryResponse for total n in dataset

* add util for constructing query for size of specific datasets

* refactor out code to reformat http response

* update crud function for subject query endpoint to return matching dataset sizes

* update test data fixture

* test new httpx response unpacking util
alyssadai authored Nov 15, 2023
1 parent 3c44558 commit 18d5d95
Showing 6 changed files with 115 additions and 9 deletions.
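Taken together, these changes mean each dataset record returned by the /query endpoint now carries the dataset's total subject count alongside the number of matching subjects. As a rough sketch of the new response shape (values borrowed from the test fixture further down; any model fields not visible in these diffs are omitted):

```python
# Hypothetical entry in a /query response after this change; values mirror
# the QPN entry in the updated test fixture, other fields omitted.
matching_dataset = {
    "dataset_uuid": "http://neurobagel.org/vocab/12345",
    "dataset_name": "QPN",
    "dataset_total_subjects": 200,  # new: total subjects in the dataset
    "num_matching_subjects": 5,     # subjects matching the query
    "records_protected": True,
    "subject_data": "protected",
}
```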
29 changes: 21 additions & 8 deletions app/api/crud.py
@@ -84,7 +84,8 @@ async def get(
image_modal: str,
) -> list[CohortQueryResponse]:
"""
Makes a POST request to graph API using httpx where the payload is a SPARQL query generated by the create_query function.
Sends SPARQL queries to the graph API via httpx POST requests for subject-session or dataset metadata
matching the given query parameters, as well as the total number of subjects in each matching dataset.
Parameters
----------
@@ -124,13 +125,22 @@ async def get(
# TODO: Revisit timeout value when query performance is improved
timeout=30.0,
)

# Reformat SPARQL results into more human-readable form
results_dicts = [
{k: v["value"] for k, v in res.items()}
for res in results["results"]["bindings"]
]
results_df = pd.DataFrame(results_dicts).reindex(columns=ATTRIBUTES_ORDER)
results_df = pd.DataFrame(
util.unpack_http_response_json_to_dicts(results)
).reindex(columns=ATTRIBUTES_ORDER)

# Get the total number of subjects in each dataset that matched the query
matching_dataset_size_results = post_query_to_graph(
util.create_multidataset_size_query(
results_df["dataset_uuid"].unique()
)
)
matching_dataset_sizes = {
ds["dataset_uuid"]: int(ds["total_subjects"])
for ds in util.unpack_http_response_json_to_dicts(
matching_dataset_size_results
)
}

response_obj = []
dataset_cols = ["dataset_uuid", "dataset_name"]
@@ -165,6 +175,9 @@ async def get(
CohortQueryResponse(
dataset_uuid=dataset_uuid,
dataset_name=dataset_name,
dataset_total_subjects=matching_dataset_sizes[
dataset_uuid
],
dataset_portal_uri=group["dataset_portal_uri"].iloc[0]
if group["dataset_portal_uri"].notna().all()
else None,
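The heart of the change is the extra round trip above: after the main query returns, the unique dataset UUIDs are fed into a second COUNT query, and the unpacked rows are folded into a UUID-to-size mapping. A minimal sketch of that folding step, using hypothetical rows in the shape produced by util.unpack_http_response_json_to_dicts:

```python
# Hypothetical unpacked rows from the dataset-size query.
rows = [
    {"dataset_uuid": "http://neurobagel.org/vocab/12345", "total_subjects": "200"},
    {"dataset_uuid": "http://neurobagel.org/vocab/67890", "total_subjects": "3000"},
]

# SPARQL literals arrive as strings, hence the int() cast seen in the diff.
matching_dataset_sizes = {
    row["dataset_uuid"]: int(row["total_subjects"]) for row in rows
}
assert matching_dataset_sizes["http://neurobagel.org/vocab/12345"] == 200
```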
1 change: 1 addition & 0 deletions app/api/models.py
@@ -58,6 +58,7 @@ class CohortQueryResponse(BaseModel):
    # dataset_file_path: str # TODO: Revisit this field once we have datasets without imaging info/sessions.
    dataset_name: str
    dataset_portal_uri: Optional[str]
    dataset_total_subjects: int
    records_protected: bool
    num_matching_subjects: int
    subject_data: Union[list[dict], str]
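Since dataset_total_subjects is declared without a default, pydantic treats it as a required field. A minimal sketch of the updated contract (assuming only the fields visible in this hunk; the real model declares others, e.g. dataset_uuid):

```python
from typing import Optional, Union

from pydantic import BaseModel


# Sketch of the model as shown in the hunk; not the full class.
class CohortQueryResponseSketch(BaseModel):
    dataset_name: str
    dataset_portal_uri: Optional[str]
    dataset_total_subjects: int  # new in this commit
    records_protected: bool
    num_matching_subjects: int
    subject_data: Union[list[dict], str]


# Omitting dataset_total_subjects would now raise a ValidationError.
CohortQueryResponseSketch(
    dataset_name="QPN",
    dataset_portal_uri=None,
    dataset_total_subjects=200,
    records_protected=True,
    num_matching_subjects=5,
    subject_data="protected",
)
```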
29 changes: 29 additions & 0 deletions app/api/utility.py
@@ -84,6 +84,18 @@ def create_context() -> str:
)


def unpack_http_response_json_to_dicts(response: dict) -> list[dict]:
    """
    Reformat a nested dictionary from a SPARQL query response JSON into a more human-readable list of dictionaries,
    where the keys are the variables selected in the SPARQL query and the values are the corresponding variable values.
    The number of dictionaries corresponds to the number of query matches.
    """
    return [
        {k: v["value"] for k, v in res.items()}
        for res in response["results"]["bindings"]
    ]
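In other words, the helper flattens the graph's binding structure into plain rows. A minimal illustration (the full binding format, including type and datatype annotations, is exercised in tests/test_utility.py below):

```python
# Minimal sketch of the unpacking behavior.
response = {
    "results": {
        "bindings": [
            {"total_subjects": {"type": "literal", "value": "70"}},
        ]
    }
}
assert unpack_http_response_json_to_dicts(response) == [{"total_subjects": "70"}]
```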


def create_query(
return_agg: bool,
age: Optional[tuple] = (None, None),
@@ -221,6 +233,23 @@ def create_query(
return "\n".join([create_context(), query_string])


def create_multidataset_size_query(dataset_uuids: list) -> str:
    """Construct a SPARQL query to retrieve the number of subjects in each dataset in a list of dataset UUIDs."""
    dataset_uuids_string = "\n".join([f"<{uuid}>" for uuid in dataset_uuids])
    query_string = f"""
    SELECT ?dataset_uuid (COUNT(DISTINCT ?subject) as ?total_subjects)
    WHERE {{
        VALUES ?dataset_uuid {{
            {dataset_uuids_string}
        }}
        ?dataset_uuid nb:hasSamples ?subject.
        ?subject a nb:Subject.
    }} GROUP BY ?dataset_uuid
    """

    return "\n".join([create_context(), query_string])


def create_terms_query(data_element_URI: str) -> str:
"""
Creates a SPARQL query using a simple query template to retrieve term URLs for a given data element.
2 changes: 2 additions & 0 deletions tests/conftest.py
@@ -26,6 +26,7 @@ def test_data():
"dataset_uuid": "http://neurobagel.org/vocab/12345",
"dataset_name": "QPN",
"dataset_portal_uri": "https://rpq-qpn.ca/en/researchers-section/databases/",
"dataset_total_subjects": 200,
"num_matching_subjects": 5,
"records_protected": True,
"subject_data": "protected",
@@ -38,6 +39,7 @@ def test_data():
"dataset_uuid": "http://neurobagel.org/vocab/67890",
"dataset_name": "PPMI",
"dataset_portal_uri": "https://www.ppmi-info.org/access-data-specimens/download-data",
"dataset_total_subjects": 3000,
"num_matching_subjects": 3,
"records_protected": True,
"subject_data": "protected",
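The fixture values also encode the expected relationship between the two counts: a dataset's total size is an upper bound on how many of its subjects can match any query. A trivial check of the values above:

```python
# Fixture values from above: (dataset_total_subjects, num_matching_subjects).
for total, matching in [(200, 5), (3000, 3)]:
    assert matching <= total
```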
2 changes: 1 addition & 1 deletion tests/test_query.py
@@ -7,7 +7,7 @@


def test_get_all(test_app, mock_successful_get, monkeypatch):
"""Given no input for the sex parameter, returns a 200 status code and a non-empty list of results (should correspond to all subjects in graph)."""
"""Given no input for any query parameters, returns a 200 status code and a non-empty list of results (should correspond to all subjects in graph)."""

monkeypatch.setattr(crud, "get", mock_successful_get)
response = test_app.get("/query/")
61 changes: 61 additions & 0 deletions tests/test_utility.py
@@ -0,0 +1,61 @@
"""Test utility functions."""
from app.api import utility as util


def test_unpack_http_response_json_to_dicts():
"""Test that given a valid httpx JSON response, the function returns a simplified list of dicts with the correct keys and values."""
mock_response_json = {
"head": {"vars": ["dataset_uuid", "total_subjects"]},
"results": {
"bindings": [
{
"dataset_uuid": {
"type": "uri",
"value": "http://neurobagel.org/vocab/ds1234",
},
"total_subjects": {
"datatype": "http://www.w3.org/2001/XMLSchema#integer",
"type": "literal",
"value": "70",
},
},
{
"dataset_uuid": {
"type": "uri",
"value": "http://neurobagel.org/vocab/ds2345",
},
"total_subjects": {
"datatype": "http://www.w3.org/2001/XMLSchema#integer",
"type": "literal",
"value": "40",
},
},
{
"dataset_uuid": {
"type": "uri",
"value": "http://neurobagel.org/vocab/ds3456",
},
"total_subjects": {
"datatype": "http://www.w3.org/2001/XMLSchema#integer",
"type": "literal",
"value": "84",
},
},
]
},
}

assert util.unpack_http_response_json_to_dicts(mock_response_json) == [
{
"dataset_uuid": "http://neurobagel.org/vocab/ds1234",
"total_subjects": "70",
},
{
"dataset_uuid": "http://neurobagel.org/vocab/ds2345",
"total_subjects": "40",
},
{
"dataset_uuid": "http://neurobagel.org/vocab/ds3456",
"total_subjects": "84",
},
]
