[MNT] Deprecate Cognitive Atlas vocab namespace & add check for unsup…

…ported namespaces (#410) * update test data README * update type hint * add test for phenotypic TSV with unrecognized vocab namespace * add check for unrecognized namespaces in data dict * add global var and check for deprecated namespaces * test extraction of unsupported namespaces * test deprecated namespace extraction and move checks to data dict validation * fix outdated docs link in README * add script to regenerate JSONLDs in neurobagel_examples submodule * rework example5 to have unsupported vocabs in data dict - example5 previously wasn't used anywhere and was conceptually a duplicate of example9 * update neurobagel_examples submodule * update tests * update JSONLD regeneration script docstring Co-authored-by: Sebastian Urchs <[email protected]> --------- Co-authored-by: Sebastian Urchs <[email protected]>
neurobagel · Jan 7, 2025 · 1b7f9fe · 1b7f9fe
1 parent ec504d5
commit 1b7f9fe
Show file tree

Hide file tree

Showing 17 changed files with 281 additions and 37 deletions.
diff --git a/README.md b/README.md
@@ -14,7 +14,7 @@
 
 The `bagel-cli` is a Python command-line tool to automatically parse and describe subject phenotypic and imaging attributes in an annotated dataset for integration into the Neurobagel graph.
 
-**Please refer to our [official Neurobagel documentation](https://neurobagel.org/cli/) for information on how to install and use the CLI.**
+**Please refer to our [official Neurobagel documentation](https://neurobagel.org/user_guide/cli/) for information on how to install and use the CLI.**
 
 
 ## Development environment

diff --git a/bagel/mappings.py b/bagel/mappings.py
@@ -16,8 +16,13 @@
 NP = Namespace(
     "np", "https://github.com/nipoppy/pipeline-catalog/tree/main/processing/"
 )
-# Store all supported amespaces in a list for easy iteration & testing
-ALL_NAMESPACES = [COGATLAS, NB, NCIT, NIDM, SNOMED, NP]
+
+# Store all supported and deprecated namespaces in a list for easy iteration & testing
+SUPPORTED_NAMESPACES = [NB, NCIT, NIDM, SNOMED, NP]
+SUPPORTED_NAMESPACE_PREFIXES = [ns.pf for ns in SUPPORTED_NAMESPACES]
+# Keep deprecated namespaces for informative user messages
+DEPRECATED_NAMESPACES = [COGATLAS]
+DEPRECATED_NAMESPACE_PREFIXES = [ns.pf for ns in DEPRECATED_NAMESPACES]
 
 BIDS = {
     "anat": NIDM.pf + ":Anatomical",

diff --git a/bagel/utilities/model_utils.py b/bagel/utilities/model_utils.py
@@ -7,15 +7,15 @@
 from pydantic import ValidationError
 
 from bagel import models
-from bagel.mappings import ALL_NAMESPACES, NB
+from bagel.mappings import NB, SUPPORTED_NAMESPACES
 from bagel.utilities import file_utils
 
 
 def generate_context():
     # Adapted from the dandi-schema context generation function
     # https://github.com/dandi/dandi-schema/blob/c616d87eaae8869770df0cb5405c24afdb9db096/dandischema/metadata.py
     field_preamble = {
-        namespace.pf: namespace.url for namespace in ALL_NAMESPACES
+        namespace.pf: namespace.url for namespace in SUPPORTED_NAMESPACES
     }
     fields = {}
     for klass_name, klass in inspect.getmembers(models):

diff --git a/bagel/utilities/pheno_utils.py b/bagel/utilities/pheno_utils.py
@@ -11,7 +11,11 @@
 from typer import BadParameter
 
 from bagel import dictionary_models, mappings
-from bagel.mappings import NB
+from bagel.mappings import (
+    DEPRECATED_NAMESPACE_PREFIXES,
+    NB,
+    SUPPORTED_NAMESPACE_PREFIXES,
+)
 
 DICTIONARY_SCHEMA = dictionary_models.DataDictionary.model_json_schema()
 
@@ -64,7 +68,7 @@ def get_columns_about(data_dict: dict, concept: str) -> list:
     ]
 
 
-def get_annotated_columns(data_dict: dict) -> list(tuple[str, dict]):
+def get_annotated_columns(data_dict: dict) -> list[tuple[str, dict]]:
     """
     Return a list of all columns that have Neurobagel 'Annotations' in a data dictionary,
     where each column is represented as a tuple of the column name (dictionary key from the data dictionary) and
@@ -77,6 +81,53 @@ def get_annotated_columns(data_dict: dict) -> list(tuple[str, dict]):
     ]
 
 
+def recursive_find_values_for_key(data: dict, target: str) -> list:
+    """
+    Recursively search for a key in a possibly nested dictionary and return a list of all values found for that key.
+
+    TODO: This function currently only considers nested dicts, and would need to be expanded if Neurobagel
+    data dictionaries grow to have controlled terms inside list objects.
+    """
+    target_values = []
+    if isinstance(data, dict):
+        for key, value in data.items():
+            if key == target:
+                target_values.append(value)
+            else:
+                target_values.extend(
+                    recursive_find_values_for_key(data=value, target=target)
+                )
+    return target_values
+
+
+def find_unsupported_namespaces_and_term_urls(
+    data_dict: dict,
+) -> tuple[list, dict]:
+    """
+    From a provided data dictionary, find all term URLs that contain an unsupported namespace prefix.
+    Return a tuple of unsupported prefixes and a dictionary of the offending column names and their unrecognized term URLs.
+    """
+    unsupported_prefixes = set()
+    unrecognized_term_urls = {}
+
+    for col, content in get_annotated_columns(data_dict):
+        for col_term_url in recursive_find_values_for_key(
+            content["Annotations"], "TermURL"
+        ):
+            prefix = col_term_url.split(":")[0]
+            if prefix not in SUPPORTED_NAMESPACE_PREFIXES:
+                unsupported_prefixes.add(prefix)
+                unrecognized_term_urls[col] = col_term_url
+
+    # sort the prefixes for a predictable order in the error message
+    return sorted(unsupported_prefixes), unrecognized_term_urls
+
+
+def find_deprecated_namespaces(namespaces: list) -> list:
+    """Return the deprecated vocabulary namespace prefixes found in a list of namespace prefixes."""
+    return [ns for ns in namespaces if ns in DEPRECATED_NAMESPACE_PREFIXES]
+
+
 def map_categories_to_columns(data_dict: dict) -> dict:
     """
     Maps all pre-defined Neurobagel categories (e.g. "Sex") to a list containing all column names (if any) that
@@ -315,6 +366,26 @@ def validate_data_dict(data_dict: dict) -> None:
             "The provided data dictionary must contain at least one column with Neurobagel annotations."
         )
 
+    unsupported_namespaces, unrecognized_term_urls = (
+        find_unsupported_namespaces_and_term_urls(data_dict)
+    )
+    if unsupported_namespaces:
+        namespace_deprecation_msg = ""
+        if deprecated_namespaces := find_deprecated_namespaces(
+            unsupported_namespaces
+        ):
+            namespace_deprecation_msg = (
+                f"\n\nMore info: The following vocabularies have been deprecated by Neurobagel: {deprecated_namespaces}. "
+                "Please update your data dictionary using the latest version of the annotation tool at https://annotate.neurobagel.org."
+            )
+        raise LookupError(
+            f"The provided data dictionary contains unsupported vocabulary namespace prefixes: {unsupported_namespaces}\n"
+            f"Unsupported vocabularies are used for terms in the following columns' annotations: {unrecognized_term_urls}\n"
+            "Please ensure that the data dictionary only includes terms from Neurobagel recognized vocabularies. "
+            "(See https://neurobagel.org/data_models/dictionaries/.)"
+            f"{namespace_deprecation_msg}"
+        )
+
     if (
         len(
             get_columns_about(

diff --git a/generate_neurobagel_example_jsonlds.sh b/generate_neurobagel_example_jsonlds.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Steps to use:
+# 1. cd into the tests/neurobagel_examples submodule and create a new branch that will contain the updated example files
+# 2. Navigate back to the bagel-cli repository root directory and run this script from there to regenerate the example synthetic JSONLD files inside of the tests/neurobagel_examples submodule
+# in neurobagel_examples.
+# 3. Navigate again to tests/neurobagel_examples and from there, commit the changes, push the changes to the submodule origin, and open a PR there to merge the updated examples.
+
+docker build -t bagel .
+cd tests
+
+data_dir=neurobagel_examples/data-upload
+
+# Phenotypic data only JSONLD
+docker run --rm --volume=$PWD:/data/neurobagel/bagel-cli -w /data/neurobagel/bagel-cli bagel pheno \
+    --pheno "${data_dir}/example_synthetic.tsv" \
+    --dictionary "${data_dir}/example_synthetic.json" \
+    --name "BIDS synthetic" \
+    --output "${data_dir}/example_synthetic.jsonld" \
+    --overwrite
+
+# Phenotypic & BIDS data JSONLD
+docker run --rm --volume=$PWD:/data/neurobagel/bagel-cli -w /data/neurobagel/bagel-cli bagel bids \
+    --jsonld-path ${data_dir}/example_synthetic.jsonld \
+    --bids-dir bids-examples/synthetic \
+    --output ${data_dir}/pheno-bids-output/example_synthetic_pheno-bids.jsonld \
+    --overwrite
+
+# Phenotypic & derivatives data JSONLD
+docker run --rm --volume=$PWD:/data/neurobagel/bagel-cli -w /data/neurobagel/bagel-cli bagel derivatives \
+    --tabular ${data_dir}/nipoppy_proc_status_synthetic.tsv \
+    --jsonld-path ${data_dir}/example_synthetic.jsonld \
+    --output "${data_dir}/pheno-derivatives-output/example_synthetic_pheno-derivatives.jsonld" \
+    --overwrite
+
+# Phenotypic, BIDS, and derivatives data JSONLD
+docker run --rm --volume=$PWD:/data/neurobagel/bagel-cli -w /data/neurobagel/bagel-cli bagel derivatives \
+    --tabular ${data_dir}/nipoppy_proc_status_synthetic.tsv \
+    --jsonld-path "${data_dir}/pheno-bids-output/example_synthetic_pheno-bids.jsonld" \
+    --output "${data_dir}/pheno-bids-derivatives-output/example_synthetic_pheno-bids-derivatives.jsonld" \
+    --overwrite
diff --git a/tests/data/README.md b/tests/data/README.md
@@ -8,8 +8,8 @@
 | 2 | valid, unique `participant` and `session` IDs | same as example 1 | pass |
 | 3 | same as example 2 | valid BIDS data dictionary, BUT: does not contain Neurobagel `"Annotations"` key | fail |
 | 4 | valid, has additional columns not described in `.json` | same as example 1 | pass |
-| 5 | valid, has additional unique value, not documented in `.json` | same as example 1 | fail |
-| 6 | valid, same as example 5. has annotation tool columns | valid, contains `"MissingValues"` attribute for categorical variable | pass |
+| 5 | valid, has assessment tool columns | invalid, has TermURLs from unsupported vocabularies | fail |
+| 6 | valid, same as example 5. | valid, contains `"MissingValues"` attribute for categorical variable | pass |
 | invalid | valid, only exists to be used together with the (invalid) .json | invalid, missing the `"TermURL"` attribute for identifiers | fail |
 | 7 | has fewer columns than are annotated in `.json` | same as example 1 | fail |
 | 8 | valid, based on ex2 has multiple participant_id columns | valid, based on ex2 multiple participant_id column annotations | fail* |

diff --git a/tests/data/example10.json b/tests/data/example10.json
@@ -51,7 +51,7 @@
                 "Label": "Assessment tool"
             },
             "IsPartOf": {
-                "TermURL": "cogatlas:1234",
+                "TermURL": "snomed:1234",
                 "Label": "Imaginary tool"
             },
             "MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"]
@@ -65,7 +65,7 @@
                 "Label": "Assessment tool"
             },
             "IsPartOf": {
-                "TermURL": "cogatlas:1234",
+                "TermURL": "snomed:1234",
                 "Label": "Imaginary tool"
             },
             "MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"]
@@ -79,7 +79,7 @@
                 "Label": "Assessment tool"
             },
             "IsPartOf": {
-                "TermURL": "cogatlas:4321",
+                "TermURL": "snomed:4321",
                 "Label": "A different imaginary tool"
             },
             "MissingValues": ["none"]

diff --git a/tests/data/example11.json b/tests/data/example11.json
@@ -51,7 +51,7 @@
                 "Label": "Assessment tool"
             },
             "IsPartOf": {
-                "TermURL": "cogatlas:1234",
+                "TermURL": "snomed:1234",
                 "Label": "Imaginary tool"
             },
             "MissingValues": ["missing"]
@@ -65,7 +65,7 @@
                 "Label": "Assessment tool"
             },
             "IsPartOf": {
-                "TermURL": "cogatlas:1234",
+                "TermURL": "snomed:1234",
                 "Label": "Imaginary tool"
             },
             "MissingValues": ["missing"]
@@ -79,7 +79,7 @@
                 "Label": "Assessment tool"
             },
             "IsPartOf": {
-                "TermURL": "cogatlas:4321",
+                "TermURL": "snomed:4321",
                 "Label": "A different imaginary tool"
             },
             "MissingValues": ["none"]

diff --git a/tests/data/example13.json b/tests/data/example13.json
@@ -93,7 +93,7 @@
                 "Label": "Assessment tool"
             },
             "IsPartOf": {
-                "TermURL": "cogatlas:1234",
+                "TermURL": "snomed:1234",
                 "Label": "Imaginary tool"
             },
             "MissingValues": ["missing"]
@@ -107,7 +107,7 @@
                 "Label": "Assessment tool"
             },
             "IsPartOf": {
-                "TermURL": "cogatlas:1234",
+                "TermURL": "snomed:1234",
                 "Label": "Imaginary tool"
             },
             "MissingValues": ["missing"]
@@ -121,7 +121,7 @@
                 "Label": "Assessment tool"
             },
             "IsPartOf": {
-                "TermURL": "cogatlas:4321",
+                "TermURL": "snomed:4321",
                 "Label": "A different imaginary tool"
             },
             "MissingValues": ["not completed"]

diff --git a/tests/data/example5.json b/tests/data/example5.json
@@ -39,7 +39,50 @@
                     "TermURL": "ncit:C94342",
                     "Label": "Healthy Control"
                 }
-            }
+            },
+            "MissingValues": ["OTHER"]
+        }
+    },
+    "tool_item1": {
+        "Description": "item 1 scores for an imaginary tool",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "nb:Assessment",
+                "Label": "Assessment tool"
+            },
+            "IsPartOf": {
+                "TermURL": "unknownvocab:1234",
+                "Label": "Imaginary tool"
+            },
+            "MissingValues": ["missing"]
+        }
+    },
+    "tool_item2": {
+        "Description": "item 2 scores for an imaginary tool",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "nb:Assessment",
+                "Label": "Assessment tool"
+            },
+            "IsPartOf": {
+                "TermURL": "unknownvocab:1234",
+                "Label": "Imaginary tool"
+            },
+            "MissingValues": ["missing"]
+        }
+    },
+    "other_tool_item1": {
+        "Description": "item 1 scores for a different imaginary tool",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "nb:Assessment",
+                "Label": "Assessment tool"
+            },
+            "IsPartOf": {
+                "TermURL": "cogatlas:4321",
+                "Label": "A different imaginary tool"
+            },
+            "MissingValues": ["none"]
         }
     }
 }
diff --git a/tests/data/example5.tsv b/tests/data/example5.tsv
@@ -1,5 +1,7 @@
-participant_id	session_id	group
-sub-01	ses-01	PAT
-sub-01	ses-02	PAT
-sub-02	ses-01	OTHER
-sub-02	ses-02	CTRL
+participant_id	session_id	group	tool_item1	tool_item2	other_tool_item1
+sub-01	ses-01	PAT	11.0	"missing"	"none"
+sub-01	ses-02	PAT	"missing"	12.0	"none"
+sub-02	ses-01	OTHER	"missing"	"missing"	"none"
+sub-02	ses-02	OTHER	"missing"	"missing"	"none"
+sub-03	ses-01	CTRL	10.0	8.0	"ok"
+sub-03	ses-02	CTRL	10.0	8.0	"bad"
diff --git a/tests/data/example6.json b/tests/data/example6.json
@@ -51,7 +51,7 @@
                 "Label": "Assessment tool"
             },
             "IsPartOf": {
-                "TermURL": "cogatlas:1234",
+                "TermURL": "snomed:1234",
                 "Label": "Imaginary tool"
             },
             "MissingValues": ["missing"]
@@ -65,7 +65,7 @@
                 "Label": "Assessment tool"
             },
             "IsPartOf": {
-                "TermURL": "cogatlas:1234",
+                "TermURL": "snomed:1234",
                 "Label": "Imaginary tool"
             },
             "MissingValues": ["missing"]
@@ -79,7 +79,7 @@
                 "Label": "Assessment tool"
             },
             "IsPartOf": {
-                "TermURL": "cogatlas:4321",
+                "TermURL": "snomed:4321",
                 "Label": "A different imaginary tool"
             },
             "MissingValues": ["none"]

diff --git a/tests/data/example9.json b/tests/data/example9.json
@@ -51,7 +51,7 @@
                 "Label": "Assessment tool"
             },
             "IsPartOf": {
-                "TermURL": "cogatlas:1234",
+                "TermURL": "snomed:1234",
                 "Label": "Imaginary tool"
             },
             "MissingValues": ["missing"]
@@ -65,7 +65,7 @@
                 "Label": "Assessment tool"
             },
             "IsPartOf": {
-                "TermURL": "cogatlas:1234",
+                "TermURL": "snomed:1234",
                 "Label": "Imaginary tool"
             },
             "MissingValues": ["missing"]
@@ -79,7 +79,7 @@
                 "Label": "Assessment tool"
             },
             "IsPartOf": {
-                "TermURL": "cogatlas:4321",
+                "TermURL": "snomed:4321",
                 "Label": "A different imaginary tool"
             },
             "MissingValues": ["none"]