Integrate and test Zarr backend #513

Merged 31 commits on Sep 27, 2024
Commits (31)
- 7cf9de6 setup for zarr (CodyCBakerPhD, Sep 16, 2024)
- 8a5e506 Merge branch 'dev' into integrate_zarr (CodyCBakerPhD, Sep 16, 2024)
- bdb8278 Merge branch 'dev' into integrate_zarr (CodyCBakerPhD, Sep 16, 2024)
- 885f1de fix imports (bendichter, Sep 16, 2024)
- a28c8d7 Merge branch 'dev' into integrate_zarr (CodyCBakerPhD, Sep 17, 2024)
- 9563e7c fix test class names; add more zarr (CodyCBakerPhD, Sep 17, 2024)
- e4be201 changelog (CodyCBakerPhD, Sep 17, 2024)
- 8032c95 control when io is returned (CodyCBakerPhD, Sep 17, 2024)
- c58121e Merge branch 'dev' into integrate_zarr (CodyCBakerPhD, Sep 17, 2024)
- 949b98f debugs (CodyCBakerPhD, Sep 25, 2024)
- b23f9d3 debugs (CodyCBakerPhD, Sep 25, 2024)
- 34a9739 debugs (CodyCBakerPhD, Sep 25, 2024)
- e15635b debugs (CodyCBakerPhD, Sep 25, 2024)
- 4cfa225 Merge branch 'integrate_zarr' of https://github.com/neurodatawithoutb… (CodyCBakerPhD, Sep 25, 2024)
- 012f83c debugs (CodyCBakerPhD, Sep 25, 2024)
- 9985a50 down to only one (CodyCBakerPhD, Sep 25, 2024)
- 83b46ca remove f strings automatically (CodyCBakerPhD, Sep 25, 2024)
- b9c2019 remove some (CodyCBakerPhD, Sep 25, 2024)
- 2a20eb0 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Sep 25, 2024)
- 4b609b8 override mystery (CodyCBakerPhD, Sep 25, 2024)
- 48bbe0e Merge branch 'integrate_zarr' of https://github.com/neurodatawithoutb… (CodyCBakerPhD, Sep 25, 2024)
- 5e12529 fix streaming path (CodyCBakerPhD, Sep 25, 2024)
- 5b437a4 Merge branch 'dev' into integrate_zarr (CodyCBakerPhD, Sep 26, 2024)
- bf48ab0 Merge branch 'dev' into integrate_zarr (CodyCBakerPhD, Sep 26, 2024)
- 5e1d20a Update tests/test_inspector.py (CodyCBakerPhD, Sep 27, 2024)
- e566e81 PR suggestions (CodyCBakerPhD, Sep 27, 2024)
- 9dbfbfd adjust docstring (CodyCBakerPhD, Sep 27, 2024)
- 2022628 add Zarr to caching (CodyCBakerPhD, Sep 27, 2024)
- 491855e [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Sep 27, 2024)
- 10784f9 rollback caching (CodyCBakerPhD, Sep 27, 2024)
- 36421db [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Sep 27, 2024)
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,9 @@
### Deprecation (API)
* The `inspect_nwb` method has been removed. Please use `inspect_nwbfile` or `inspect_nwbfile_object` instead. [#505](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/505)

### New Features
* Added Zarr support. [#513](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/513)

### Improvements
* Removed the `robust_ros3_read` utility helper. [#506](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/506)
* Simplified the `nwbinspector.testing` configuration framework. [#509](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/509)
5 changes: 2 additions & 3 deletions MANIFEST.in
@@ -1,5 +1,4 @@
graft tests
global-exclude *.py[cod]
include src/nwbinspector/internal_configs/dandi.inspector_config.yaml
include src/nwbinspector/config.schema.json
include requirements.txt
include src/nwbinspector/_internal_configs/dandi.inspector_config.yaml
include src/nwbinspector/_internal_configs/config.schema.json
8 changes: 8 additions & 0 deletions src/nwbinspector/__init__.py
@@ -26,6 +26,10 @@
# Still keeping the legacy magic version attribute requested by some users
__version__ = importlib.metadata.version(distribution_name="nwbinspector")

# Note: this is not exposed at this outer level, but is used here to trigger the automatic submodule import
# (otherwise someone would have to import nwbinspector.testing explicitly)
from .testing import check_streaming_tests_enabled # noqa: F401

__all__ = [
"available_checks",
"default_check_registry",
@@ -51,4 +55,8 @@
"FormatterOptions",
"organize_messages",
"__version__",
# Public submodules
"checks",
"testing",
"utils",
]
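
A short, hedged sketch of what the expanded `__all__` and the eager `testing` import enable (assuming this branch is installed; the behavior is inferred from the diff's own comment about triggering the submodule import):

```python
# Hedged sketch: the eager `from .testing import ...` in __init__.py means the
# submodule is reachable as an attribute after a plain top-level import.
import nwbinspector

print(nwbinspector.__version__)           # legacy magic version attribute
print(nwbinspector.testing)               # available without an explicit submodule import
print("testing" in nwbinspector.__all__)  # True: now declared public
```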
7 changes: 5 additions & 2 deletions src/nwbinspector/_configuration.py
@@ -13,12 +13,15 @@
from . import available_checks
from ._registration import Importance

INTERNAL_CONFIGS = dict(dandi=Path(__file__).parent / "internal_configs" / "dandi.inspector_config.yaml")
INTERNAL_CONFIGS = dict(
dandi=Path(__file__).parent / "_internal_configs" / "dandi.inspector_config.yaml",
)


def validate_config(config: dict):
"""Validate an instance of configuration against the official schema."""
with open(file=Path(__file__).parent / "config.schema.json", mode="r") as fp:
config_schema_file_path = Path(__file__).parent / "_internal_configs" / "config.schema.json"
with open(file=config_schema_file_path, mode="r") as fp:
schema = json.load(fp=fp)
jsonschema.validate(instance=config, schema=schema)

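
A hedged usage sketch for the relocated schema (`validate_config` lives in the private `nwbinspector._configuration` module in this PR; the config dict below is a hypothetical example, not the shipped DANDI config):

```python
from nwbinspector._configuration import validate_config

# Hypothetical importance-override mapping; see
# _internal_configs/dandi.inspector_config.yaml for the real shipped format.
hypothetical_config = {"CRITICAL": ["check_subject_exists"]}

# Raises jsonschema.ValidationError if the dict does not match the schema.
validate_config(config=hypothetical_config)
```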
79 changes: 39 additions & 40 deletions src/nwbinspector/_nwb_inspection.py
@@ -12,8 +12,10 @@
from natsort import natsorted
from tqdm import tqdm

from . import available_checks, configure_checks
from . import available_checks
from ._configuration import configure_checks
from ._registration import Importance, InspectorMessage
from .tools._read_nwbfile import read_nwbfile
from .utils import (
FilePathType,
OptionalListOfStrings,
@@ -127,7 +129,7 @@ def inspect_all(
progress_bar_options = dict(position=0, leave=False)

if in_path.is_dir():
nwbfiles = list(in_path.rglob("*.nwb"))
nwbfiles = list(in_path.rglob("*.nwb*"))

# Remove any macOS sidecar files
nwbfiles = [nwbfile for nwbfile in nwbfiles if not nwbfile.name.startswith("._")]
@@ -141,17 +143,16 @@
# Manual identifier check over all files in the folder path
identifiers = defaultdict(list)
for nwbfile_path in nwbfiles:
with pynwb.NWBHDF5IO(path=nwbfile_path, mode="r", load_namespaces=True) as io:
try:
nwbfile = io.read()
identifiers[nwbfile.identifier].append(nwbfile_path)
except Exception as exception:
yield InspectorMessage(
message=traceback.format_exc(),
importance=Importance.ERROR,
check_function_name=f"During io.read() - {type(exception)}: {str(exception)}",
file_path=nwbfile_path,
)
try:
nwbfile = read_nwbfile(nwbfile_path=nwbfile_path)
identifiers[nwbfile.identifier].append(nwbfile_path)
except Exception as exception:
yield InspectorMessage(
message=traceback.format_exc(),
importance=Importance.ERROR,
check_function_name=f"During io.read() - {type(exception)}: {str(exception)}",
file_path=nwbfile_path,
)

if len(identifiers) != len(nwbfiles):
for identifier, nwbfiles_with_identifier in identifiers.items():
@@ -198,7 +199,7 @@ def inspect_all(
yield message
else:
for nwbfile_path in nwbfiles_iterable:
for message in inspect_nwbfile(nwbfile_path=nwbfile_path, checks=checks):
for message in inspect_nwbfile(nwbfile_path=nwbfile_path, checks=checks, skip_validate=skip_validate):
yield message


@@ -237,7 +238,7 @@ def inspect_nwbfile(
config : dict
Dictionary valid against our JSON configuration schema.
Can specify a mapping of importance levels and list of check functions whose importance you wish to change.
Typically loaded via json.load from a valid .json file
Typically loaded via `json.load` from a valid .json file.
ignore: list, optional
Names of functions to skip.
select: list, optional
@@ -267,10 +268,12 @@
filterwarnings(action="ignore", message="No cached namespaces found in .*")
filterwarnings(action="ignore", message="Ignoring cached namespace .*")

if not skip_validate:
validation_error_list, _ = pynwb.validate(paths=[nwbfile_path])
for validation_namespace_errors in validation_error_list:
for validation_error in validation_namespace_errors:
try:
in_memory_nwbfile, io = read_nwbfile(nwbfile_path=nwbfile_path, return_io=True)

if not skip_validate:
validation_errors = pynwb.validate(io=io)
for validation_error in validation_errors:
yield InspectorMessage(
message=validation_error.reason,
importance=Importance.PYNWB_VALIDATION,
@@ -279,27 +282,23 @@
file_path=nwbfile_path,
)

with pynwb.NWBHDF5IO(path=nwbfile_path, mode="r", load_namespaces=True) as io:
try:
in_memory_nwbfile = io.read()

for inspector_message in inspect_nwbfile_object(
nwbfile_object=in_memory_nwbfile,
checks=checks,
config=config,
ignore=ignore,
select=select,
importance_threshold=importance_threshold,
):
inspector_message.file_path = nwbfile_path
yield inspector_message
except Exception as exception:
yield InspectorMessage(
message=traceback.format_exc(),
importance=Importance.ERROR,
check_function_name=f"During io.read() - {type(exception)}: {str(exception)}",
file_path=nwbfile_path,
)
for inspector_message in inspect_nwbfile_object(
nwbfile_object=in_memory_nwbfile,
checks=checks,
config=config,
ignore=ignore,
select=select,
importance_threshold=importance_threshold,
):
inspector_message.file_path = nwbfile_path
yield inspector_message
except Exception as exception:
yield InspectorMessage(
message=traceback.format_exc(),
importance=Importance.ERROR,
check_function_name=f"During io.read() - {type(exception)}: {str(exception)}",
file_path=nwbfile_path,
)


# TODO: deprecate once subject types and dandi schemas have been extended
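
A minimal sketch of the unified read path after this change: `inspect_nwbfile` now delegates to `read_nwbfile` for backend detection, and `inspect_all` picks up Zarr stores via the widened `*.nwb*` glob. The file path below is hypothetical:

```python
from nwbinspector import inspect_nwbfile

# Works for both HDF5 (.nwb) and Zarr (.nwb.zarr) files; the backend is
# auto-detected inside read_nwbfile.
for message in inspect_nwbfile(nwbfile_path="sub-001_ses-001.nwb.zarr"):
    print(message.importance, message.check_function_name, message.message)
```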
12 changes: 8 additions & 4 deletions src/nwbinspector/_registration.py
@@ -4,6 +4,7 @@
from typing import Optional

import h5py
import zarr
from pynwb import NWBFile
from pynwb.ecephys import Device, ElectrodeGroup
from pynwb.file import Subject
@@ -93,7 +94,7 @@ def _auto_parse(check_function, obj, result: Optional[InspectorMessage] = None):


def _parse_location(neurodata_object) -> Optional[str]:
"""Grab the object location from a h5py.Dataset or a container content that is an h5py.Dataset object."""
"""Grab the object location from a dataset or a container content that is an dataset object."""
known_locations = {
NWBFile: "/",
Subject: "/general/subject",
@@ -105,13 +106,16 @@ def _parse_location(neurodata_object) -> Optional[str]:
for key, val in known_locations.items():
if isinstance(neurodata_object, key):
return val
"""Infer the human-readable path of the object within an NWBFile by tracing its parents."""

# Infer the human-readable path of the object within an NWBFile by tracing its parents
if neurodata_object.parent is None:
return "/"
# Best solution: object is or has a HDF5 Dataset
if isinstance(neurodata_object, h5py.Dataset):
if isinstance(neurodata_object, (h5py.Dataset, zarr.Array)):
return neurodata_object.name
else:
for field in neurodata_object.fields.values():
for field_name, field in neurodata_object.fields.items():
if isinstance(field, h5py.Dataset):
return field.parent.name
elif isinstance(field, zarr.Array):
return field.name.removesuffix(f"/{field_name}")
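
The `removesuffix` in the Zarr branch is needed because `zarr.Array.name` is the full path of the array itself, whereas the location of the containing object is wanted. A plain-string sketch of that step (paths are hypothetical):

```python
# For a field "data" stored at "/acquisition/TimeSeries/data", the parent
# container's location is recovered by stripping the trailing field name.
field_name = "data"
zarr_array_name = "/acquisition/TimeSeries/data"  # what zarr.Array.name would report

parent_location = zarr_array_name.removesuffix(f"/{field_name}")
assert parent_location == "/acquisition/TimeSeries"
```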
23 changes: 19 additions & 4 deletions src/nwbinspector/checks/_nwb_containers.py
@@ -3,6 +3,7 @@
import os

import h5py
import zarr
from pynwb import NWBContainer

from .._registration import Importance, InspectorMessage, Severity, register_check
@@ -19,9 +20,16 @@ def check_large_dataset_compression(nwb_container: NWBContainer, gb_lower_bound:
Best Practice: :ref:`best_practice_compression`
"""
for field in getattr(nwb_container, "fields", dict()).values():
if not isinstance(field, h5py.Dataset):
if not isinstance(field, (h5py.Dataset, zarr.Array)):
continue
if field.compression is None and field.size * field.dtype.itemsize > gb_lower_bound * 1e9:

compression_indicator = None
if isinstance(field, h5py.Dataset):
compression_indicator = field.compression
elif isinstance(field, zarr.Array):
compression_indicator = field.compressor

if compression_indicator is None and field.size * field.dtype.itemsize > gb_lower_bound * 1e9:
return InspectorMessage(
severity=Severity.HIGH,
message=f"{os.path.split(field.name)[1]} is a large uncompressed dataset! Please enable compression.",
@@ -44,10 +52,17 @@ def check_small_dataset_compression(
Best Practice: :ref:`best_practice_compression`
"""
for field in getattr(nwb_container, "fields", dict()).values():
if not isinstance(field, h5py.Dataset):
if not isinstance(field, (h5py.Dataset, zarr.Array)):
continue

compression_indicator = None
if isinstance(field, h5py.Dataset):
compression_indicator = field.compression
elif isinstance(field, zarr.Array):
compression_indicator = field.compressor

if (
field.compression is None
compression_indicator is None
and mb_lower_bound * 1e6 < field.size * field.dtype.itemsize < gb_upper_bound * 1e9
):
if field.size * field.dtype.itemsize > gb_severity_threshold * 1e9:
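
Both checks now probe compression the same way: h5py exposes `.compression` (a filter name or None) while Zarr exposes `.compressor` (a numcodecs codec or None). A hedged sketch of that dispatch as a standalone helper (the function name is illustrative, not part of the PR):

```python
from typing import Optional, Union

import h5py
import zarr

def get_compression_indicator(field: Union[h5py.Dataset, zarr.Array]) -> Optional[object]:
    """Return the backend-specific compression marker, or None if uncompressed."""
    if isinstance(field, h5py.Dataset):
        return field.compression  # e.g. "gzip", or None
    if isinstance(field, zarr.Array):
        return field.compressor   # a numcodecs codec object, or None
    return None
```

Both checks are meant to fire when this indicator is None and the dataset size crosses the relevant byte threshold.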
3 changes: 2 additions & 1 deletion src/nwbinspector/tools/__init__.py
@@ -1,8 +1,9 @@
from ._dandi import get_s3_urls_and_dandi_paths
from ._nwb import all_of_type, get_nwbfile_path_from_internal_object
from ._read_nwbfile import read_nwbfile
from ._read_nwbfile import BACKEND_IO_CLASSES, read_nwbfile

__all__ = [
"BACKEND_IO_CLASSES",
"get_s3_urls_and_dandi_paths",
"all_of_type",
"get_nwbfile_path_from_internal_object",
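
With `BACKEND_IO_CLASSES` promoted to the public API, downstream code can enumerate the supported backends rather than hard-coding them; a small sketch (assuming this branch is installed):

```python
from nwbinspector.tools import BACKEND_IO_CLASSES, read_nwbfile

# Expected to print ['hdf5', 'zarr'] now that hdmf_zarr is a hard dependency.
print(list(BACKEND_IO_CLASSES))
```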
39 changes: 23 additions & 16 deletions src/nwbinspector/tools/_read_nwbfile.py
@@ -5,17 +5,14 @@
from warnings import filterwarnings

import h5py
from hdmf.backends.io import HDMFIO
from hdmf_zarr import NWBZarrIO
from pynwb import NWBHDF5IO, NWBFile

_BACKEND_IO_CLASSES = dict(hdf5=NWBHDF5IO)

try:
from hdmf_zarr import NWBZarrIO

_BACKEND_IO_CLASSES.update(zarr=NWBZarrIO)
except ModuleNotFoundError as exception:
if str(exception) != "No module named 'hdmf_zarr'": # not the exception we're looking for, so re-raise
raise exception
BACKEND_IO_CLASSES = dict(
hdf5=NWBHDF5IO,
zarr=NWBZarrIO,
)


def _get_method(path: str):
@@ -47,11 +44,11 @@ def _get_backend(path: str, method: Literal["local", "fsspec", "ros3"]):
if method == "fsspec":
fs = _init_fsspec(path=path)
with fs.open(path=path, mode="rb") as file:
for backend_name, backend_class in _BACKEND_IO_CLASSES.items():
for backend_name, backend_class in BACKEND_IO_CLASSES.items():
if backend_class.can_read(path=file):
possible_backends.append(backend_name)
else:
for backend_name, backend_class in _BACKEND_IO_CLASSES.items():
for backend_name, backend_class in BACKEND_IO_CLASSES.items():
if backend_class.can_read(path):
possible_backends.append(backend_name)

@@ -69,7 +66,8 @@ def read_nwbfile(
nwbfile_path: Union[str, Path],
method: Optional[Literal["local", "fsspec", "ros3"]] = None,
backend: Optional[Literal["hdf5", "zarr"]] = None,
) -> NWBFile:
return_io: bool = False,
) -> Union[NWBFile, tuple[NWBFile, HDMFIO]]:
"""
Read an NWB file using the specified (or auto-detected) method and specified (or auto-detected) backend.

@@ -85,10 +83,16 @@
backend : "hdf5", "zarr", or None (default)
Type of backend used to write the file.
The default auto-detects the type of the file.
return_io : bool, default: False
Whether to return the HDMFIO object used to open the file.

Returns
-------
pynwb.NWBFile
nwbfile : pynwb.NWBFile
The in-memory NWBFile object.
io : hdmf.backends.io.HDMFIO, optional
Only passed if `return_io` is True.
The initialized HDMFIO object used to read the file.
"""
nwbfile_path = str(nwbfile_path) # If pathlib.Path, cast to str; if already str, no harm done

@@ -109,7 +113,7 @@
)

backend = backend or _get_backend(nwbfile_path, method)
if method == "local" and not _BACKEND_IO_CLASSES[ # Temporary until .can_read() is able to work on streamed bytes
if method == "local" and not BACKEND_IO_CLASSES[ # Temporary until .can_read() is able to work on streamed bytes
backend
].can_read(path=nwbfile_path):
raise IOError(f"The chosen backend ({backend}) is unable to read the file! Please select a different backend.")
@@ -127,7 +131,10 @@
io_kwargs.update(path=nwbfile_path)
if method == "ros3":
io_kwargs.update(driver="ros3")
io = _BACKEND_IO_CLASSES[backend](**io_kwargs)
io = BACKEND_IO_CLASSES[backend](**io_kwargs)
nwbfile = io.read()

return nwbfile
if return_io:
return (nwbfile, io)
else: # Note: do not be concerned about io object closing due to garbage collection here
return nwbfile # (it is attached as an attribute to the NWBFile object)
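
A hedged usage sketch for the new `return_io` flag (file path hypothetical). Per the closing comment in the diff, the io handle stays alive even when it is not returned, because the `NWBFile` keeps a reference to it; returning it simply allows explicit cleanup:

```python
from nwbinspector.tools import read_nwbfile

nwbfile, io = read_nwbfile(nwbfile_path="example.nwb.zarr", return_io=True)
try:
    print(nwbfile.identifier)
finally:
    io.close()  # explicit close is possible because the io object was returned
```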
3 changes: 2 additions & 1 deletion src/nwbinspector/utils/_utils.py
@@ -10,6 +10,7 @@

import h5py
import numpy as np
import zarr
from hdmf.backends.hdf5.h5_utils import H5Dataset
from numpy.typing import ArrayLike
from packaging import version
@@ -25,7 +26,7 @@

@lru_cache(maxsize=MAX_CACHE_ITEMS)
def _cache_data_retrieval_command(
data: h5py.Dataset, reduced_selection: tuple[tuple[Optional[int], Optional[int], Optional[int]]]
data: Union[h5py.Dataset, zarr.Array], reduced_selection: tuple[tuple[Optional[int], Optional[int], Optional[int]]]
) -> np.ndarray:
"""LRU caching for _cache_data_selection cannot be applied to list inputs; this expects the tuple or Dataset."""
selection = tuple([slice(*reduced_slice) for reduced_slice in reduced_selection]) # reconstitute the slices
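
The nested-tuple cache key exists because `functools.lru_cache` requires hashable arguments, and `slice` objects are not hashable before Python 3.12; each slice is reduced to a `(start, stop, step)` tuple and reconstituted inside the cached function. A pure-Python sketch of that round trip:

```python
# Reduced form used as the hashable cache key, then reconstituted as slices.
reduced_selection = ((0, 10, 1), (None, None, None))

selection = tuple(slice(*reduced_slice) for reduced_slice in reduced_selection)
assert selection == (slice(0, 10, 1), slice(None, None, None))
```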
File renamed without changes.