Skip to content

Commit

Permalink
[MNT] Deprecate Cognitive Atlas vocab namespace & add check for unsup…
Browse files Browse the repository at this point in the history
…ported namespaces (#410)

* update test data README

* update type hint

* add test for phenotypic TSV with unrecognized vocab namespace

* add check for unrecognized namespaces in data dict

* add global var and check for deprecated namespaces

* test extraction of unsupported namespaces

* test deprecated namespace extraction and move checks to data dict validation

* fix outdated docs link in README

* add script to regenerate JSONLDs in neurobagel_examples submodule

* rework example5 to have unsupported vocabs in data dict
- example5 previously wasn't used anywhere and was conceptually a duplicate of example9

* update neurobagel_examples submodule

* update tests

* update JSONLD regeneration script docstring

Co-authored-by: Sebastian Urchs <[email protected]>

---------

Co-authored-by: Sebastian Urchs <[email protected]>
  • Loading branch information
alyssadai and surchs authored Jan 7, 2025
1 parent ec504d5 commit 1b7f9fe
Show file tree
Hide file tree
Showing 17 changed files with 281 additions and 37 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

The `bagel-cli` is a Python command-line tool to automatically parse and describe subject phenotypic and imaging attributes in an annotated dataset for integration into the Neurobagel graph.

**Please refer to our [official Neurobagel documentation](https://neurobagel.org/cli/) for information on how to install and use the CLI.**
**Please refer to our [official Neurobagel documentation](https://neurobagel.org/user_guide/cli/) for information on how to install and use the CLI.**


## Development environment
Expand Down
9 changes: 7 additions & 2 deletions bagel/mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,13 @@
NP = Namespace(
"np", "https://github.com/nipoppy/pipeline-catalog/tree/main/processing/"
)
# Store all supported namespaces in a list for easy iteration & testing
ALL_NAMESPACES = [COGATLAS, NB, NCIT, NIDM, SNOMED, NP]

# Supported and deprecated namespaces are kept in separate lists for easy iteration & testing.
# Deprecated namespaces are retained only so that user-facing messages can name them.
SUPPORTED_NAMESPACES = [NB, NCIT, NIDM, SNOMED, NP]
DEPRECATED_NAMESPACES = [COGATLAS]

# Prefix-only views of the lists above (the `pf` attribute of each namespace)
SUPPORTED_NAMESPACE_PREFIXES = [
    namespace.pf for namespace in SUPPORTED_NAMESPACES
]
DEPRECATED_NAMESPACE_PREFIXES = [
    namespace.pf for namespace in DEPRECATED_NAMESPACES
]

BIDS = {
"anat": NIDM.pf + ":Anatomical",
Expand Down
4 changes: 2 additions & 2 deletions bagel/utilities/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@
from pydantic import ValidationError

from bagel import models
from bagel.mappings import ALL_NAMESPACES, NB
from bagel.mappings import NB, SUPPORTED_NAMESPACES
from bagel.utilities import file_utils


def generate_context():
# Adapted from the dandi-schema context generation function
# https://github.com/dandi/dandi-schema/blob/c616d87eaae8869770df0cb5405c24afdb9db096/dandischema/metadata.py
field_preamble = {
namespace.pf: namespace.url for namespace in ALL_NAMESPACES
namespace.pf: namespace.url for namespace in SUPPORTED_NAMESPACES
}
fields = {}
for klass_name, klass in inspect.getmembers(models):
Expand Down
75 changes: 73 additions & 2 deletions bagel/utilities/pheno_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@
from typer import BadParameter

from bagel import dictionary_models, mappings
from bagel.mappings import NB
from bagel.mappings import (
DEPRECATED_NAMESPACE_PREFIXES,
NB,
SUPPORTED_NAMESPACE_PREFIXES,
)

DICTIONARY_SCHEMA = dictionary_models.DataDictionary.model_json_schema()

Expand Down Expand Up @@ -64,7 +68,7 @@ def get_columns_about(data_dict: dict, concept: str) -> list:
]


def get_annotated_columns(data_dict: dict) -> list(tuple[str, dict]):
def get_annotated_columns(data_dict: dict) -> list[tuple[str, dict]]:
"""
Return a list of all columns that have Neurobagel 'Annotations' in a data dictionary,
where each column is represented as a tuple of the column name (dictionary key from the data dictionary) and
Expand All @@ -77,6 +81,53 @@ def get_annotated_columns(data_dict: dict) -> list(tuple[str, dict]):
]


def recursive_find_values_for_key(data: dict, target: str) -> list:
    """
    Collect every value stored under the key `target` anywhere in a (possibly nested) dictionary.

    Values are returned in depth-first order, following each dictionary's own key order.
    A value found under `target` is returned as-is and is not searched any further.
    Anything that is not a dict (including a list) is treated as a leaf and yields no matches.

    TODO: Only nested dicts are traversed. This would need to be extended if Neurobagel
    data dictionaries grow to have controlled terms inside list objects.
    """
    # Non-dict input ends the recursion with no matches
    if not isinstance(data, dict):
        return []

    matches = []
    for name, item in data.items():
        if name != target:
            # Not the key we are after: look for it deeper inside this value
            matches += recursive_find_values_for_key(data=item, target=target)
            continue
        matches.append(item)
    return matches


def find_unsupported_namespaces_and_term_urls(
    data_dict: dict,
) -> tuple[list, dict]:
    """
    Scan the annotated columns of a data dictionary for term URLs whose namespace prefix is not supported.

    The prefix of a term URL is the text before its first ":".

    Returns a tuple of:
    - the sorted list of distinct unsupported prefixes
    - a dictionary mapping each offending column name to an unrecognized term URL found in its annotations
      (if a column has several, only the last one encountered is kept)
    """
    offending_urls_by_column = {}
    bad_prefixes = set()

    for column_name, column_entry in get_annotated_columns(data_dict):
        term_urls = recursive_find_values_for_key(
            column_entry["Annotations"], "TermURL"
        )
        for term_url in term_urls:
            namespace_prefix, _, _ = term_url.partition(":")
            if namespace_prefix in SUPPORTED_NAMESPACE_PREFIXES:
                continue
            bad_prefixes.add(namespace_prefix)
            offending_urls_by_column[column_name] = term_url

    # Sorting gives the prefixes a predictable order in the error message
    return sorted(bad_prefixes), offending_urls_by_column


def find_deprecated_namespaces(namespaces: list) -> list:
    """Filter a list of namespace prefixes down to those that are deprecated, preserving input order."""
    deprecated = []
    for prefix in namespaces:
        if prefix in DEPRECATED_NAMESPACE_PREFIXES:
            deprecated.append(prefix)
    return deprecated


def map_categories_to_columns(data_dict: dict) -> dict:
"""
Maps all pre-defined Neurobagel categories (e.g. "Sex") to a list containing all column names (if any) that
Expand Down Expand Up @@ -315,6 +366,26 @@ def validate_data_dict(data_dict: dict) -> None:
"The provided data dictionary must contain at least one column with Neurobagel annotations."
)

unsupported_namespaces, unrecognized_term_urls = (
find_unsupported_namespaces_and_term_urls(data_dict)
)
if unsupported_namespaces:
namespace_deprecation_msg = ""
if deprecated_namespaces := find_deprecated_namespaces(
unsupported_namespaces
):
namespace_deprecation_msg = (
f"\n\nMore info: The following vocabularies have been deprecated by Neurobagel: {deprecated_namespaces}. "
"Please update your data dictionary using the latest version of the annotation tool at https://annotate.neurobagel.org."
)
raise LookupError(
f"The provided data dictionary contains unsupported vocabulary namespace prefixes: {unsupported_namespaces}\n"
f"Unsupported vocabularies are used for terms in the following columns' annotations: {unrecognized_term_urls}\n"
"Please ensure that the data dictionary only includes terms from Neurobagel recognized vocabularies. "
"(See https://neurobagel.org/data_models/dictionaries/.)"
f"{namespace_deprecation_msg}"
)

if (
len(
get_columns_about(
Expand Down
41 changes: 41 additions & 0 deletions generate_neurobagel_example_jsonlds.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash
#
# Regenerate the example synthetic JSONLD files inside of the tests/neurobagel_examples submodule.
#
# Steps to use:
# 1. cd into the tests/neurobagel_examples submodule and create a new branch that will contain the updated example files
# 2. Navigate back to the bagel-cli repository root directory and run this script from there to regenerate
#    the example synthetic JSONLD files inside of the tests/neurobagel_examples submodule.
# 3. Navigate again to tests/neurobagel_examples and from there, commit the changes, push the changes to the submodule origin, and open a PR there to merge the updated examples.

# Stop at the first failing command (or unset variable): the later steps consume the
# image and the JSONLD files produced by the earlier ones, so continuing after a failure
# would build on stale or missing artefacts.
set -euo pipefail

docker build -t bagel .
cd tests

data_dir=neurobagel_examples/data-upload

# Phenotypic data only JSONLD
docker run --rm --volume="${PWD}:/data/neurobagel/bagel-cli" -w /data/neurobagel/bagel-cli bagel pheno \
    --pheno "${data_dir}/example_synthetic.tsv" \
    --dictionary "${data_dir}/example_synthetic.json" \
    --name "BIDS synthetic" \
    --output "${data_dir}/example_synthetic.jsonld" \
    --overwrite

# Phenotypic & BIDS data JSONLD
docker run --rm --volume="${PWD}:/data/neurobagel/bagel-cli" -w /data/neurobagel/bagel-cli bagel bids \
    --jsonld-path "${data_dir}/example_synthetic.jsonld" \
    --bids-dir bids-examples/synthetic \
    --output "${data_dir}/pheno-bids-output/example_synthetic_pheno-bids.jsonld" \
    --overwrite

# Phenotypic & derivatives data JSONLD
docker run --rm --volume="${PWD}:/data/neurobagel/bagel-cli" -w /data/neurobagel/bagel-cli bagel derivatives \
    --tabular "${data_dir}/nipoppy_proc_status_synthetic.tsv" \
    --jsonld-path "${data_dir}/example_synthetic.jsonld" \
    --output "${data_dir}/pheno-derivatives-output/example_synthetic_pheno-derivatives.jsonld" \
    --overwrite

# Phenotypic, BIDS, and derivatives data JSONLD
docker run --rm --volume="${PWD}:/data/neurobagel/bagel-cli" -w /data/neurobagel/bagel-cli bagel derivatives \
    --tabular "${data_dir}/nipoppy_proc_status_synthetic.tsv" \
    --jsonld-path "${data_dir}/pheno-bids-output/example_synthetic_pheno-bids.jsonld" \
    --output "${data_dir}/pheno-bids-derivatives-output/example_synthetic_pheno-bids-derivatives.jsonld" \
    --overwrite
4 changes: 2 additions & 2 deletions tests/data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
| 2 | valid, unique `participant` and `session` IDs | same as example 1 | pass |
| 3 | same as example 2 | valid BIDS data dictionary, BUT: does not contain Neurobagel `"Annotations"` key | fail |
| 4 | valid, has additional columns not described in `.json` | same as example 1 | pass |
| 5 | valid, has additional unique value, not documented in `.json` | same as example 1 | fail |
| 6 | valid, same as example 5. has annotation tool columns | valid, contains `"MissingValues"` attribute for categorical variable | pass |
| 5 | valid, has assessment tool columns | invalid, has TermURLs from unsupported vocabularies | fail |
| 6 | valid, same as example 5. | valid, contains `"MissingValues"` attribute for categorical variable | pass |
| invalid | valid, only exists to be used together with the (invalid) .json | invalid, missing the `"TermURL"` attribute for identifiers | fail |
| 7 | has fewer columns than are annotated in `.json` | same as example 1 | fail |
| 8 | valid, based on ex2 has multiple participant_id columns | valid, based on ex2 multiple participant_id column annotations | fail* |
Expand Down
6 changes: 3 additions & 3 deletions tests/data/example10.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"]
Expand All @@ -65,7 +65,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"]
Expand All @@ -79,7 +79,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:4321",
"TermURL": "snomed:4321",
"Label": "A different imaginary tool"
},
"MissingValues": ["none"]
Expand Down
6 changes: 3 additions & 3 deletions tests/data/example11.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
Expand All @@ -65,7 +65,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
Expand All @@ -79,7 +79,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:4321",
"TermURL": "snomed:4321",
"Label": "A different imaginary tool"
},
"MissingValues": ["none"]
Expand Down
6 changes: 3 additions & 3 deletions tests/data/example13.json
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
Expand All @@ -107,7 +107,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
Expand All @@ -121,7 +121,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:4321",
"TermURL": "snomed:4321",
"Label": "A different imaginary tool"
},
"MissingValues": ["not completed"]
Expand Down
45 changes: 44 additions & 1 deletion tests/data/example5.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,50 @@
"TermURL": "ncit:C94342",
"Label": "Healthy Control"
}
}
},
"MissingValues": ["OTHER"]
}
},
"tool_item1": {
"Description": "item 1 scores for an imaginary tool",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "unknownvocab:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
}
},
"tool_item2": {
"Description": "item 2 scores for an imaginary tool",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "unknownvocab:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
}
},
"other_tool_item1": {
"Description": "item 1 scores for a different imaginary tool",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:4321",
"Label": "A different imaginary tool"
},
"MissingValues": ["none"]
}
}
}
12 changes: 7 additions & 5 deletions tests/data/example5.tsv
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
participant_id session_id group
sub-01 ses-01 PAT
sub-01 ses-02 PAT
sub-02 ses-01 OTHER
sub-02 ses-02 CTRL
participant_id session_id group tool_item1 tool_item2 other_tool_item1
sub-01 ses-01 PAT 11.0 "missing" "none"
sub-01 ses-02 PAT "missing" 12.0 "none"
sub-02 ses-01 OTHER "missing" "missing" "none"
sub-02 ses-02 OTHER "missing" "missing" "none"
sub-03 ses-01 CTRL 10.0 8.0 "ok"
sub-03 ses-02 CTRL 10.0 8.0 "bad"
6 changes: 3 additions & 3 deletions tests/data/example6.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
Expand All @@ -65,7 +65,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
Expand All @@ -79,7 +79,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:4321",
"TermURL": "snomed:4321",
"Label": "A different imaginary tool"
},
"MissingValues": ["none"]
Expand Down
6 changes: 3 additions & 3 deletions tests/data/example9.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
Expand All @@ -65,7 +65,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:1234",
"TermURL": "snomed:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
Expand All @@ -79,7 +79,7 @@
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogatlas:4321",
"TermURL": "snomed:4321",
"Label": "A different imaginary tool"
},
"MissingValues": ["none"]
Expand Down
Loading

0 comments on commit 1b7f9fe

Please sign in to comment.