From 18d5d95ecf8ae6c2ee4b56cbe7a279ef684a8498 Mon Sep 17 00:00:00 2001
From: Alyssa Dai
Date: Wed, 15 Nov 2023 14:32:20 -0500
Subject: [PATCH] [ENH] Return dataset sizes in query response (#223)

* fix docstring
* add key to CohortQueryResponse for total n in dataset
* add util for constructing query for size of specific datasets
* refactor out code to reformat http response
* update crud function for subject query endpoint to return matching dataset sizes
* update test data fixture
* test new httpx response unpacking util
---
 app/api/crud.py       | 29 ++++++++++++++------
 app/api/models.py     |  1 +
 app/api/utility.py    | 29 ++++++++++++++++++++
 tests/conftest.py     |  2 ++
 tests/test_query.py   |  2 +-
 tests/test_utility.py | 61 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 115 insertions(+), 9 deletions(-)
 create mode 100644 tests/test_utility.py

diff --git a/app/api/crud.py b/app/api/crud.py
index d204d8b..006d8c4 100644
--- a/app/api/crud.py
+++ b/app/api/crud.py
@@ -84,7 +84,8 @@ async def get(
     image_modal: str,
 ) -> list[CohortQueryResponse]:
     """
-    Makes a POST request to graph API using httpx where the payload is a SPARQL query generated by the create_query function.
+    Sends SPARQL queries to the graph API via httpx POST requests for subject-session or dataset metadata
+    matching the given query parameters, as well as the total number of subjects in each matching dataset.
 
     Parameters
     ----------
@@ -124,13 +125,22 @@
         # TODO: Revisit timeout value when query performance is improved
         timeout=30.0,
     )
-
-    # Reformat SPARQL results into more human-readable form
-    results_dicts = [
-        {k: v["value"] for k, v in res.items()}
-        for res in results["results"]["bindings"]
-    ]
-    results_df = pd.DataFrame(results_dicts).reindex(columns=ATTRIBUTES_ORDER)
+    results_df = pd.DataFrame(
+        util.unpack_http_response_json_to_dicts(results)
+    ).reindex(columns=ATTRIBUTES_ORDER)
+
+    # Get the total number of subjects in each dataset that matched the query
+    matching_dataset_size_results = post_query_to_graph(
+        util.create_multidataset_size_query(
+            results_df["dataset_uuid"].unique()
+        )
+    )
+    matching_dataset_sizes = {
+        ds["dataset_uuid"]: int(ds["total_subjects"])
+        for ds in util.unpack_http_response_json_to_dicts(
+            matching_dataset_size_results
+        )
+    }
 
     response_obj = []
     dataset_cols = ["dataset_uuid", "dataset_name"]
@@ -165,6 +175,9 @@
                 CohortQueryResponse(
                     dataset_uuid=dataset_uuid,
                     dataset_name=dataset_name,
+                    dataset_total_subjects=matching_dataset_sizes[
+                        dataset_uuid
+                    ],
                     dataset_portal_uri=group["dataset_portal_uri"].iloc[0]
                     if group["dataset_portal_uri"].notna().all()
                     else None,
diff --git a/app/api/models.py b/app/api/models.py
index a52d850..c4a9a18 100644
--- a/app/api/models.py
+++ b/app/api/models.py
@@ -58,6 +58,7 @@ class CohortQueryResponse(BaseModel):
     # dataset_file_path: str  # TODO: Revisit this field once we have datasets without imaging info/sessions.
     dataset_name: str
     dataset_portal_uri: Optional[str]
+    dataset_total_subjects: int
     records_protected: bool
     num_matching_subjects: int
     subject_data: Union[list[dict], str]
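The crud.py change above boils down to a two-step lookup: unpack the bindings returned by the size query, then key the counts by dataset UUID. A minimal standalone sketch of that flow (the rows below are hypothetical stand-ins for unpacked SPARQL bindings, not part of the patch):

    # Hypothetical rows standing in for the output of unpack_http_response_json_to_dicts()
    size_rows = [
        {"dataset_uuid": "http://neurobagel.org/vocab/12345", "total_subjects": "200"},
        {"dataset_uuid": "http://neurobagel.org/vocab/67890", "total_subjects": "3000"},
    ]
    # SPARQL literals come back as strings, so the counts are cast to int when
    # building the lookup, mirroring the dict comprehension added to crud.py above
    matching_dataset_sizes = {
        row["dataset_uuid"]: int(row["total_subjects"]) for row in size_rows
    }
    assert matching_dataset_sizes["http://neurobagel.org/vocab/12345"] == 200

Building the lookup once and indexing into it per dataset group avoids issuing a separate size query for each dataset in the grouped results.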
diff --git a/app/api/utility.py b/app/api/utility.py
index aa3b333..d8887a3 100644
--- a/app/api/utility.py
+++ b/app/api/utility.py
@@ -84,6 +84,18 @@ def create_context() -> str:
     )
 
 
+def unpack_http_response_json_to_dicts(response: dict) -> list[dict]:
+    """
+    Reformats a nested dictionary object from a SPARQL query response JSON into a more human-readable list of dictionaries,
+    where the keys are the variables selected in the SPARQL query and the values correspond to the variable values.
+    The number of dictionaries should correspond to the number of query matches.
+    """
+    return [
+        {k: v["value"] for k, v in res.items()}
+        for res in response["results"]["bindings"]
+    ]
+
+
 def create_query(
     return_agg: bool,
     age: Optional[tuple] = (None, None),
@@ -221,6 +233,23 @@ def create_query(
     return "\n".join([create_context(), query_string])
 
 
+def create_multidataset_size_query(dataset_uuids: list) -> str:
+    """Construct a SPARQL query to retrieve the number of subjects in each dataset in a list of dataset UUIDs."""
+    dataset_uuids_string = "\n".join([f"<{uuid}>" for uuid in dataset_uuids])
+    query_string = f"""
+    SELECT ?dataset_uuid (COUNT(DISTINCT ?subject) as ?total_subjects)
+    WHERE {{
+        VALUES ?dataset_uuid {{
+            {dataset_uuids_string}
+        }}
+        ?dataset_uuid nb:hasSamples ?subject.
+        ?subject a nb:Subject.
+    }} GROUP BY ?dataset_uuid
+    """
+
+    return "\n".join([create_context(), query_string])
+
+
 def create_terms_query(data_element_URI: str) -> str:
     """
     Creates a SPARQL query using a simple query template to retrieve term URLS for a given data element.
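Taken together, the two new utilities cover the round trip for dataset sizes: create_multidataset_size_query() builds the aggregation query, and unpack_http_response_json_to_dicts() flattens the bindings that come back. A hedged usage sketch (assumes the app package is importable; the UUIDs are hypothetical):

    from app.api import utility as util

    # Hypothetical dataset UUIDs; in crud.py these come from results_df["dataset_uuid"].unique()
    uuids = [
        "http://neurobagel.org/vocab/12345",
        "http://neurobagel.org/vocab/67890",
    ]
    query = util.create_multidataset_size_query(uuids)
    # Each UUID is embedded as <...> inside the VALUES block, and the graph
    # returns one (dataset_uuid, total_subjects) binding per dataset
    assert "<http://neurobagel.org/vocab/12345>" in query
    assert "GROUP BY ?dataset_uuid" in query

Using a VALUES block keeps this to a single query for all matching datasets, rather than one query per dataset UUID.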
"head": {"vars": ["dataset_uuid", "total_subjects"]}, + "results": { + "bindings": [ + { + "dataset_uuid": { + "type": "uri", + "value": "http://neurobagel.org/vocab/ds1234", + }, + "total_subjects": { + "datatype": "http://www.w3.org/2001/XMLSchema#integer", + "type": "literal", + "value": "70", + }, + }, + { + "dataset_uuid": { + "type": "uri", + "value": "http://neurobagel.org/vocab/ds2345", + }, + "total_subjects": { + "datatype": "http://www.w3.org/2001/XMLSchema#integer", + "type": "literal", + "value": "40", + }, + }, + { + "dataset_uuid": { + "type": "uri", + "value": "http://neurobagel.org/vocab/ds3456", + }, + "total_subjects": { + "datatype": "http://www.w3.org/2001/XMLSchema#integer", + "type": "literal", + "value": "84", + }, + }, + ] + }, + } + + assert util.unpack_http_response_json_to_dicts(mock_response_json) == [ + { + "dataset_uuid": "http://neurobagel.org/vocab/ds1234", + "total_subjects": "70", + }, + { + "dataset_uuid": "http://neurobagel.org/vocab/ds2345", + "total_subjects": "40", + }, + { + "dataset_uuid": "http://neurobagel.org/vocab/ds3456", + "total_subjects": "84", + }, + ]