From 1b7f9fe227d2ac0bc08afc9dfb0d7c36db1400e4 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Tue, 7 Jan 2025 11:46:40 -0500 Subject: [PATCH] [MNT] Deprecate Cognitive Atlas vocab namespace & add check for unsupported namespaces (#410) * update test data README * update type hint * add test for phenotypic TSV with unrecognized vocab namespace * add check for unrecognized namespaces in data dict * add global var and check for deprecated namespaces * test extraction of unsupported namespaces * test deprecated namespace extraction and move checks to data dict validation * fix outdated docs link in README * add script to regenerate JSONLDs in neurobagel_examples submodule * rework example5 to have unsupported vocabs in data dict - example5 previously wasn't used anywhere and was conceptually a duplicate of example9 * update neurobagel_examples submodule * update tests * update JSONLD regeneration script docstring Co-authored-by: Sebastian Urchs --------- Co-authored-by: Sebastian Urchs --- README.md | 2 +- bagel/mappings.py | 9 ++- bagel/utilities/model_utils.py | 4 +- bagel/utilities/pheno_utils.py | 75 ++++++++++++++++++++++++- generate_neurobagel_example_jsonlds.sh | 41 ++++++++++++++ tests/data/README.md | 4 +- tests/data/example10.json | 6 +- tests/data/example11.json | 6 +- tests/data/example13.json | 6 +- tests/data/example5.json | 45 ++++++++++++++- tests/data/example5.tsv | 12 ++-- tests/data/example6.json | 6 +- tests/data/example9.json | 6 +- tests/integration/test_cli_pheno.py | 16 +++++- tests/neurobagel_examples | 2 +- tests/unit/test_model_utils.py | 2 +- tests/unit/test_pheno_utils.py | 76 +++++++++++++++++++++++++- 17 files changed, 281 insertions(+), 37 deletions(-) create mode 100755 generate_neurobagel_example_jsonlds.sh diff --git a/README.md b/README.md index 1ba21c86..20c13c1b 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ The `bagel-cli` is a Python command-line tool to automatically parse and describe subject phenotypic and imaging attributes in an annotated dataset for integration into the Neurobagel graph. -**Please refer to our [official Neurobagel documentation](https://neurobagel.org/cli/) for information on how to install and use the CLI.** +**Please refer to our [official Neurobagel documentation](https://neurobagel.org/user_guide/cli/) for information on how to install and use the CLI.** ## Development environment diff --git a/bagel/mappings.py b/bagel/mappings.py index bd45b4ae..f7be70fb 100644 --- a/bagel/mappings.py +++ b/bagel/mappings.py @@ -16,8 +16,13 @@ NP = Namespace( "np", "https://github.com/nipoppy/pipeline-catalog/tree/main/processing/" ) -# Store all supported amespaces in a list for easy iteration & testing -ALL_NAMESPACES = [COGATLAS, NB, NCIT, NIDM, SNOMED, NP] + +# Store all supported and deprecated namespaces in a list for easy iteration & testing +SUPPORTED_NAMESPACES = [NB, NCIT, NIDM, SNOMED, NP] +SUPPORTED_NAMESPACE_PREFIXES = [ns.pf for ns in SUPPORTED_NAMESPACES] +# Keep deprecated namespaces for informative user messages +DEPRECATED_NAMESPACES = [COGATLAS] +DEPRECATED_NAMESPACE_PREFIXES = [ns.pf for ns in DEPRECATED_NAMESPACES] BIDS = { "anat": NIDM.pf + ":Anatomical", diff --git a/bagel/utilities/model_utils.py b/bagel/utilities/model_utils.py index 7e609eb5..8d34d769 100644 --- a/bagel/utilities/model_utils.py +++ b/bagel/utilities/model_utils.py @@ -7,7 +7,7 @@ from pydantic import ValidationError from bagel import models -from bagel.mappings import ALL_NAMESPACES, NB +from bagel.mappings import NB, SUPPORTED_NAMESPACES from bagel.utilities import file_utils @@ -15,7 +15,7 @@ def generate_context(): # Adapted from the dandi-schema context generation function # https://github.com/dandi/dandi-schema/blob/c616d87eaae8869770df0cb5405c24afdb9db096/dandischema/metadata.py field_preamble = { - namespace.pf: namespace.url for namespace in ALL_NAMESPACES + namespace.pf: namespace.url for namespace in SUPPORTED_NAMESPACES } fields = {} for klass_name, klass in inspect.getmembers(models): diff --git a/bagel/utilities/pheno_utils.py b/bagel/utilities/pheno_utils.py index d46ea1a6..2a9ef8a5 100644 --- a/bagel/utilities/pheno_utils.py +++ b/bagel/utilities/pheno_utils.py @@ -11,7 +11,11 @@ from typer import BadParameter from bagel import dictionary_models, mappings -from bagel.mappings import NB +from bagel.mappings import ( + DEPRECATED_NAMESPACE_PREFIXES, + NB, + SUPPORTED_NAMESPACE_PREFIXES, +) DICTIONARY_SCHEMA = dictionary_models.DataDictionary.model_json_schema() @@ -64,7 +68,7 @@ def get_columns_about(data_dict: dict, concept: str) -> list: ] -def get_annotated_columns(data_dict: dict) -> list(tuple[str, dict]): +def get_annotated_columns(data_dict: dict) -> list[tuple[str, dict]]: """ Return a list of all columns that have Neurobagel 'Annotations' in a data dictionary, where each column is represented as a tuple of the column name (dictionary key from the data dictionary) and @@ -77,6 +81,53 @@ def get_annotated_columns(data_dict: dict) -> list(tuple[str, dict]): ] +def recursive_find_values_for_key(data: dict, target: str) -> list: + """ + Recursively search for a key in a possibly nested dictionary and return a list of all values found for that key. + + TODO: This function currently only considers nested dicts, and would need to be expanded if Neurobagel + data dictionaries grow to have controlled terms inside list objects. + """ + target_values = [] + if isinstance(data, dict): + for key, value in data.items(): + if key == target: + target_values.append(value) + else: + target_values.extend( + recursive_find_values_for_key(data=value, target=target) + ) + return target_values + + +def find_unsupported_namespaces_and_term_urls( + data_dict: dict, +) -> tuple[list, dict]: + """ + From a provided data dictionary, find all term URLs that contain an unsupported namespace prefix. + Return a tuple of unsupported prefixes and a dictionary of the offending column names and their unrecognized term URLs. + """ + unsupported_prefixes = set() + unrecognized_term_urls = {} + + for col, content in get_annotated_columns(data_dict): + for col_term_url in recursive_find_values_for_key( + content["Annotations"], "TermURL" + ): + prefix = col_term_url.split(":")[0] + if prefix not in SUPPORTED_NAMESPACE_PREFIXES: + unsupported_prefixes.add(prefix) + unrecognized_term_urls[col] = col_term_url + + # sort the prefixes for a predictable order in the error message + return sorted(unsupported_prefixes), unrecognized_term_urls + + +def find_deprecated_namespaces(namespaces: list) -> list: + """Return the deprecated vocabulary namespace prefixes found in a list of namespace prefixes.""" + return [ns for ns in namespaces if ns in DEPRECATED_NAMESPACE_PREFIXES] + + def map_categories_to_columns(data_dict: dict) -> dict: """ Maps all pre-defined Neurobagel categories (e.g. "Sex") to a list containing all column names (if any) that @@ -315,6 +366,26 @@ def validate_data_dict(data_dict: dict) -> None: "The provided data dictionary must contain at least one column with Neurobagel annotations." ) + unsupported_namespaces, unrecognized_term_urls = ( + find_unsupported_namespaces_and_term_urls(data_dict) + ) + if unsupported_namespaces: + namespace_deprecation_msg = "" + if deprecated_namespaces := find_deprecated_namespaces( + unsupported_namespaces + ): + namespace_deprecation_msg = ( + f"\n\nMore info: The following vocabularies have been deprecated by Neurobagel: {deprecated_namespaces}. " + "Please update your data dictionary using the latest version of the annotation tool at https://annotate.neurobagel.org." + ) + raise LookupError( + f"The provided data dictionary contains unsupported vocabulary namespace prefixes: {unsupported_namespaces}\n" + f"Unsupported vocabularies are used for terms in the following columns' annotations: {unrecognized_term_urls}\n" + "Please ensure that the data dictionary only includes terms from Neurobagel recognized vocabularies. " + "(See https://neurobagel.org/data_models/dictionaries/.)" + f"{namespace_deprecation_msg}" + ) + if ( len( get_columns_about( diff --git a/generate_neurobagel_example_jsonlds.sh b/generate_neurobagel_example_jsonlds.sh new file mode 100755 index 00000000..2a96d92f --- /dev/null +++ b/generate_neurobagel_example_jsonlds.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# Steps to use: +# 1. cd into the tests/neurobagel_examples submodule and create a new branch that will contain the updated example files +# 2. Navigate back to the bagel-cli repository root directory and run this script from there to regenerate the example synthetic JSONLD files inside of the tests/neurobagel_examples submodule +# in neurobagel_examples. +# 3. Navigate again to tests/neurobagel_examples and from there, commit the changes, push the changes to the submodule origin, and open a PR there to merge the updated examples. + +docker build -t bagel . +cd tests + +data_dir=neurobagel_examples/data-upload + +# Phenotypic data only JSONLD +docker run --rm --volume=$PWD:/data/neurobagel/bagel-cli -w /data/neurobagel/bagel-cli bagel pheno \ + --pheno "${data_dir}/example_synthetic.tsv" \ + --dictionary "${data_dir}/example_synthetic.json" \ + --name "BIDS synthetic" \ + --output "${data_dir}/example_synthetic.jsonld" \ + --overwrite + +# Phenotypic & BIDS data JSONLD +docker run --rm --volume=$PWD:/data/neurobagel/bagel-cli -w /data/neurobagel/bagel-cli bagel bids \ + --jsonld-path ${data_dir}/example_synthetic.jsonld \ + --bids-dir bids-examples/synthetic \ + --output ${data_dir}/pheno-bids-output/example_synthetic_pheno-bids.jsonld \ + --overwrite + +# Phenotypic & derivatives data JSONLD +docker run --rm --volume=$PWD:/data/neurobagel/bagel-cli -w /data/neurobagel/bagel-cli bagel derivatives \ + --tabular ${data_dir}/nipoppy_proc_status_synthetic.tsv \ + --jsonld-path ${data_dir}/example_synthetic.jsonld \ + --output "${data_dir}/pheno-derivatives-output/example_synthetic_pheno-derivatives.jsonld" \ + --overwrite + +# Phenotypic, BIDS, and derivatives data JSONLD +docker run --rm --volume=$PWD:/data/neurobagel/bagel-cli -w /data/neurobagel/bagel-cli bagel derivatives \ + --tabular ${data_dir}/nipoppy_proc_status_synthetic.tsv \ + --jsonld-path "${data_dir}/pheno-bids-output/example_synthetic_pheno-bids.jsonld" \ + --output "${data_dir}/pheno-bids-derivatives-output/example_synthetic_pheno-bids-derivatives.jsonld" \ + --overwrite diff --git a/tests/data/README.md b/tests/data/README.md index c1171e4d..a7ca06ea 100644 --- a/tests/data/README.md +++ b/tests/data/README.md @@ -8,8 +8,8 @@ | 2 | valid, unique `participant` and `session` IDs | same as example 1 | pass | | 3 | same as example 2 | valid BIDS data dictionary, BUT: does not contain Neurobagel `"Annotations"` key | fail | | 4 | valid, has additional columns not described in `.json` | same as example 1 | pass | -| 5 | valid, has additional unique value, not documented in `.json` | same as example 1 | fail | -| 6 | valid, same as example 5. has annotation tool columns | valid, contains `"MissingValues"` attribute for categorical variable | pass | +| 5 | valid, has assessment tool columns | invalid, has TermURLs from unsupported vocabularies | fail | +| 6 | valid, same as example 5. | valid, contains `"MissingValues"` attribute for categorical variable | pass | | invalid | valid, only exists to be used together with the (invalid) .json | invalid, missing the `"TermURL"` attribute for identifiers | fail | | 7 | has fewer columns than are annotated in `.json` | same as example 1 | fail | | 8 | valid, based on ex2 has multiple participant_id columns | valid, based on ex2 multiple participant_id column annotations | fail* | diff --git a/tests/data/example10.json b/tests/data/example10.json index 415a3c14..c4e1a00b 100644 --- a/tests/data/example10.json +++ b/tests/data/example10.json @@ -51,7 +51,7 @@ "Label": "Assessment tool" }, "IsPartOf": { - "TermURL": "cogatlas:1234", + "TermURL": "snomed:1234", "Label": "Imaginary tool" }, "MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"] @@ -65,7 +65,7 @@ "Label": "Assessment tool" }, "IsPartOf": { - "TermURL": "cogatlas:1234", + "TermURL": "snomed:1234", "Label": "Imaginary tool" }, "MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"] @@ -79,7 +79,7 @@ "Label": "Assessment tool" }, "IsPartOf": { - "TermURL": "cogatlas:4321", + "TermURL": "snomed:4321", "Label": "A different imaginary tool" }, "MissingValues": ["none"] diff --git a/tests/data/example11.json b/tests/data/example11.json index 09d46fa4..ae86dbc3 100644 --- a/tests/data/example11.json +++ b/tests/data/example11.json @@ -51,7 +51,7 @@ "Label": "Assessment tool" }, "IsPartOf": { - "TermURL": "cogatlas:1234", + "TermURL": "snomed:1234", "Label": "Imaginary tool" }, "MissingValues": ["missing"] @@ -65,7 +65,7 @@ "Label": "Assessment tool" }, "IsPartOf": { - "TermURL": "cogatlas:1234", + "TermURL": "snomed:1234", "Label": "Imaginary tool" }, "MissingValues": ["missing"] @@ -79,7 +79,7 @@ "Label": "Assessment tool" }, "IsPartOf": { - "TermURL": "cogatlas:4321", + "TermURL": "snomed:4321", "Label": "A different imaginary tool" }, "MissingValues": ["none"] diff --git a/tests/data/example13.json b/tests/data/example13.json index b4289070..5a609c71 100644 --- a/tests/data/example13.json +++ b/tests/data/example13.json @@ -93,7 +93,7 @@ "Label": "Assessment tool" }, "IsPartOf": { - "TermURL": "cogatlas:1234", + "TermURL": "snomed:1234", "Label": "Imaginary tool" }, "MissingValues": ["missing"] @@ -107,7 +107,7 @@ "Label": "Assessment tool" }, "IsPartOf": { - "TermURL": "cogatlas:1234", + "TermURL": "snomed:1234", "Label": "Imaginary tool" }, "MissingValues": ["missing"] @@ -121,7 +121,7 @@ "Label": "Assessment tool" }, "IsPartOf": { - "TermURL": "cogatlas:4321", + "TermURL": "snomed:4321", "Label": "A different imaginary tool" }, "MissingValues": ["not completed"] diff --git a/tests/data/example5.json b/tests/data/example5.json index 7746e4bd..fa9d924c 100644 --- a/tests/data/example5.json +++ b/tests/data/example5.json @@ -39,7 +39,50 @@ "TermURL": "ncit:C94342", "Label": "Healthy Control" } - } + }, + "MissingValues": ["OTHER"] + } + }, + "tool_item1": { + "Description": "item 1 scores for an imaginary tool", + "Annotations": { + "IsAbout": { + "TermURL": "nb:Assessment", + "Label": "Assessment tool" + }, + "IsPartOf": { + "TermURL": "unknownvocab:1234", + "Label": "Imaginary tool" + }, + "MissingValues": ["missing"] + } + }, + "tool_item2": { + "Description": "item 2 scores for an imaginary tool", + "Annotations": { + "IsAbout": { + "TermURL": "nb:Assessment", + "Label": "Assessment tool" + }, + "IsPartOf": { + "TermURL": "unknownvocab:1234", + "Label": "Imaginary tool" + }, + "MissingValues": ["missing"] + } + }, + "other_tool_item1": { + "Description": "item 1 scores for a different imaginary tool", + "Annotations": { + "IsAbout": { + "TermURL": "nb:Assessment", + "Label": "Assessment tool" + }, + "IsPartOf": { + "TermURL": "cogatlas:4321", + "Label": "A different imaginary tool" + }, + "MissingValues": ["none"] } } } \ No newline at end of file diff --git a/tests/data/example5.tsv b/tests/data/example5.tsv index 420e7f97..5bf84f13 100644 --- a/tests/data/example5.tsv +++ b/tests/data/example5.tsv @@ -1,5 +1,7 @@ -participant_id session_id group -sub-01 ses-01 PAT -sub-01 ses-02 PAT -sub-02 ses-01 OTHER -sub-02 ses-02 CTRL +participant_id session_id group tool_item1 tool_item2 other_tool_item1 +sub-01 ses-01 PAT 11.0 "missing" "none" +sub-01 ses-02 PAT "missing" 12.0 "none" +sub-02 ses-01 OTHER "missing" "missing" "none" +sub-02 ses-02 OTHER "missing" "missing" "none" +sub-03 ses-01 CTRL 10.0 8.0 "ok" +sub-03 ses-02 CTRL 10.0 8.0 "bad" diff --git a/tests/data/example6.json b/tests/data/example6.json index 09d46fa4..ae86dbc3 100644 --- a/tests/data/example6.json +++ b/tests/data/example6.json @@ -51,7 +51,7 @@ "Label": "Assessment tool" }, "IsPartOf": { - "TermURL": "cogatlas:1234", + "TermURL": "snomed:1234", "Label": "Imaginary tool" }, "MissingValues": ["missing"] @@ -65,7 +65,7 @@ "Label": "Assessment tool" }, "IsPartOf": { - "TermURL": "cogatlas:1234", + "TermURL": "snomed:1234", "Label": "Imaginary tool" }, "MissingValues": ["missing"] @@ -79,7 +79,7 @@ "Label": "Assessment tool" }, "IsPartOf": { - "TermURL": "cogatlas:4321", + "TermURL": "snomed:4321", "Label": "A different imaginary tool" }, "MissingValues": ["none"] diff --git a/tests/data/example9.json b/tests/data/example9.json index 09d46fa4..ae86dbc3 100644 --- a/tests/data/example9.json +++ b/tests/data/example9.json @@ -51,7 +51,7 @@ "Label": "Assessment tool" }, "IsPartOf": { - "TermURL": "cogatlas:1234", + "TermURL": "snomed:1234", "Label": "Imaginary tool" }, "MissingValues": ["missing"] @@ -65,7 +65,7 @@ "Label": "Assessment tool" }, "IsPartOf": { - "TermURL": "cogatlas:1234", + "TermURL": "snomed:1234", "Label": "Imaginary tool" }, "MissingValues": ["missing"] @@ -79,7 +79,7 @@ "Label": "Assessment tool" }, "IsPartOf": { - "TermURL": "cogatlas:4321", + "TermURL": "snomed:4321", "Label": "A different imaginary tool" }, "MissingValues": ["none"] diff --git a/tests/integration/test_cli_pheno.py b/tests/integration/test_cli_pheno.py index 8bb60891..19b814e1 100644 --- a/tests/integration/test_cli_pheno.py +++ b/tests/integration/test_cli_pheno.py @@ -116,6 +116,16 @@ def test_pheno_valid_inputs_run_successfully( LookupError, ["do not have unique combinations of participant and session IDs"], ), + ( + "example5", + LookupError, + [ + "unsupported vocabulary namespace prefixes", + "['cogatlas', 'unknownvocab']", + "vocabularies have been deprecated", + "['cogatlas']", + ], + ), ], ) def test_invalid_inputs_are_handled_gracefully( @@ -491,14 +501,14 @@ def test_controlled_term_classes_have_uri_type( "assessment, subject_idx", [ ( - [{"identifier": "cogatlas:1234", "schemaKey": "Assessment"}], + [{"identifier": "snomed:1234", "schemaKey": "Assessment"}], 0, ), (None, 1), ( [ - {"identifier": "cogatlas:1234", "schemaKey": "Assessment"}, - {"identifier": "cogatlas:4321", "schemaKey": "Assessment"}, + {"identifier": "snomed:1234", "schemaKey": "Assessment"}, + {"identifier": "snomed:4321", "schemaKey": "Assessment"}, ], 2, ), diff --git a/tests/neurobagel_examples b/tests/neurobagel_examples index 872d2ca9..5690cd4f 160000 --- a/tests/neurobagel_examples +++ b/tests/neurobagel_examples @@ -1 +1 @@ -Subproject commit 872d2ca9b6a84feaf7dc0db956d5794648119416 +Subproject commit 5690cd4f42ca9df9651e93d488b9acad3cdf3213 diff --git a/tests/unit/test_model_utils.py b/tests/unit/test_model_utils.py index 010ef60c..571c9119 100644 --- a/tests/unit/test_model_utils.py +++ b/tests/unit/test_model_utils.py @@ -209,7 +209,7 @@ def test_used_namespaces_in_context(test_data_upload_path, load_test_json): "@context" ] - for ns in mappings.ALL_NAMESPACES: + for ns in mappings.SUPPORTED_NAMESPACES: assert ( ns.pf in jsonld_context.keys() ), f"The namespace '{ns.pf}' was not found in the @context of {jsonld}." diff --git a/tests/unit/test_pheno_utils.py b/tests/unit/test_pheno_utils.py index 15988f8c..a3f7f405 100644 --- a/tests/unit/test_pheno_utils.py +++ b/tests/unit/test_pheno_utils.py @@ -107,6 +107,78 @@ def test_get_columns_with_annotations(): assert result[1] == example["participant_id"] +def test_find_unsupported_namespaces_and_term_urls(): + """Test that term URLs with unsupported namespaces are correctly identified in a data dictionary.""" + data_dict = { + "participant_id": { + "Description": "Participant ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:ParticipantID", + "Label": "Unique participant identifier", + }, + "Identifies": "participant", + }, + }, + "group": { + "Description": "Experimental group", + "Levels": {"PAT": "Patient", "HC": "Healthy control"}, + "Annotations": { + "IsAbout": {"TermURL": "nb:Diagnosis", "Label": "Diagnosis"}, + "Levels": { + "PAT": { + "TermURL": "snomed:49049000", + "Label": "Parkinson's disease", + }, + "HC": { + "TermURL": "unknownvocab:1234", + "Label": "Healthy control", + }, + }, + }, + }, + "updrs_total": { + "Description": "Total UPDRS scores", + "Annotations": { + "IsAbout": { + "TermURL": "nb:Assessment", + "Label": "Assessment tool", + }, + "IsPartOf": { + "TermURL": "deprecatedvocab:1234", + "Label": "Unified Parkinson's Disease Rating Scale", + }, + }, + }, + } + + assert pheno_utils.find_unsupported_namespaces_and_term_urls( + data_dict + ) == ( + ["deprecatedvocab", "unknownvocab"], + { + "group": "unknownvocab:1234", + "updrs_total": "deprecatedvocab:1234", + }, + ) + + +@pytest.mark.parametrize( + "namespaces,deprecated_namespaces", + [ + (["fakevocab", "unknownvocab"], []), + (["cogatlas", "unknownvocab"], ["cogatlas"]), + (["snomed", "cogatlas"], ["cogatlas"]), + ], +) +def test_find_deprecated_namespaces(namespaces, deprecated_namespaces): + """Test that vocabulary namespace prefixes deprecated by Neurobagel are correctly identified.""" + assert ( + pheno_utils.find_deprecated_namespaces(namespaces) + == deprecated_namespaces + ) + + def test_map_categories_to_columns(test_data, load_test_json): """Test that inverse mapping of concepts to columns is correctly created""" data_dict = load_test_json(test_data / "example2.json") @@ -122,8 +194,8 @@ def test_map_categories_to_columns(test_data, load_test_json): @pytest.mark.parametrize( "tool, columns", [ - ("cogatlas:1234", ["tool_item1", "tool_item2"]), - ("cogatlas:4321", ["other_tool_item1"]), + ("snomed:1234", ["tool_item1", "tool_item2"]), + ("snomed:4321", ["other_tool_item1"]), ], ) def test_map_tools_to_columns(test_data, load_test_json, tool, columns):