From 2a2d805d29ff5b4c1e3e2d42066f3a8574a52f09 Mon Sep 17 00:00:00 2001
From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com>
Date: Wed, 6 Mar 2024 12:48:50 -0500
Subject: [PATCH] Upgrade to Pydantic 2 (#767)

---
 CHANGELOG.md | 9 +
 requirements-minimal.txt | 5 +-
 src/neuroconv/tools/nwb_helpers/__init__.py | 15 +-
 .../_configuration_models/_base_backend.py | 16 +-
 .../_configuration_models/_base_dataset_io.py | 213 +++++++++---------
 .../_configuration_models/_hdf5_backend.py | 8 +-
 .../_configuration_models/_hdf5_dataset_io.py | 9 +-
 .../_configuration_models/_zarr_backend.py | 8 +-
 .../_configuration_models/_zarr_dataset_io.py | 37 ++-
 .../tools/nwb_helpers/_configure_backend.py | 8 +-
 .../nwb_helpers/_dataset_configuration.py | 10 +-
 src/neuroconv/tools/testing/__init__.py | 1 -
 .../testing/_mock/_mock_dataset_models.py | 85 +++----
 ...t_common_dataset_io_configuration_model.py | 96 ++------
 ...t_get_default_dataset_io_configurations.py | 80 +++----
 ...ataset_io_configurations_appended_files.py | 32 +--
 .../test_models/test_dataset_info_model.py | 43 ----
 .../test_dataset_io_configuration_model.py | 20 +-
 ...est_hdf5_dataset_io_configuration_model.py | 4 +-
 ...est_zarr_dataset_io_configuration_model.py | 21 +-
 20 files changed, 306 insertions(+), 414 deletions(-)
 delete mode 100644 tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_info_model.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8388f5ed7..fb1ae15df 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,14 @@
 # Upcoming
 
+### Improvements
+
+* Upgraded Pydantic support to `>v2.0.0`. [PR #767](https://github.com/catalystneuro/neuroconv/pull/767)
+* Absorbed the `DatasetInfo` model into the `DatasetIOConfiguration` model. [PR #767](https://github.com/catalystneuro/neuroconv/pull/767)
+* Keyword argument `field_name` of the `DatasetIOConfiguration.from_neurodata_object` method has been renamed to `dataset_name` to be more consistent with its usage. This only affects direct initialization of the model; usage via the `BackendConfiguration` constructor and its associated helper functions in `neuroconv.tools.nwb_helpers` is unaffected. [PR #767](https://github.com/catalystneuro/neuroconv/pull/767)
+* Manual construction of a `DatasetIOConfiguration` now requires the field `dataset_name`, and will be validated to match the final path of `location_in_file`. Usage via the automated constructors is unchanged.
[PR #767](https://github.com/catalystneuro/neuroconv/pull/767) + + + # v0.4.7 (February 21, 2024) ### Deprecation diff --git a/requirements-minimal.txt b/requirements-minimal.txt index dfd7e9619..a05b90ee1 100644 --- a/requirements-minimal.txt +++ b/requirements-minimal.txt @@ -4,10 +4,11 @@ jsonschema>=3.2.0 PyYAML>=5.4 scipy>=1.4.1 h5py>=3.9.0 -hdmf>=3.12.1 +hdmf>=3.12.2 hdmf_zarr>=0.4.0 pynwb>=2.3.2;python_version>='3.8' -pydantic>=1.10.13,<2.0.0 +pydantic>=2.0.0 +typing_extensions>=4.1.0 psutil>=5.8.0 tqdm>=4.60.0 pandas diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index 2c1951edf..1437d87d0 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -6,7 +6,7 @@ from ._backend_configuration import get_default_backend_configuration from ._configuration_models._base_backend import BackendConfiguration -from ._configuration_models._base_dataset_io import DatasetInfo, DatasetIOConfiguration +from ._configuration_models._base_dataset_io import DatasetIOConfiguration from ._configuration_models._hdf5_backend import HDF5BackendConfiguration from ._configuration_models._hdf5_dataset_io import ( AVAILABLE_HDF5_COMPRESSION_METHODS, @@ -37,11 +37,15 @@ "BACKEND_CONFIGURATIONS", "DATASET_IO_CONFIGURATIONS", "BACKEND_NWB_IO", + "BackendConfiguration", + "HDF5BackendConfiguration", + "ZarrBackendConfiguration", + "DatasetIOConfiguration", + "HDF5DatasetIOConfiguration", + "ZarrDatasetIOConfiguration", "get_default_backend_configuration", "get_default_dataset_io_configurations", "configure_backend", - "BackendConfiguration", - "DatasetIOConfiguration", "get_default_dataset_io_configurations", "get_default_backend_configuration", "add_device_from_metadata", @@ -49,9 +53,4 @@ "get_module", "make_nwbfile_from_metadata", "make_or_load_nwbfile", - "DatasetInfo", - "HDF5BackendConfiguration", - "HDF5DatasetIOConfiguration", - "ZarrBackendConfiguration", - "ZarrDatasetIOConfiguration", ] diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_backend.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_backend.py index b66d56975..7d6766e2b 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_backend.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_backend.py @@ -3,8 +3,9 @@ from typing import ClassVar, Dict, Literal, Type from hdmf.container import DataIO -from pydantic import BaseModel, Field +from pydantic import BaseModel, ConfigDict, Field from pynwb import NWBFile +from typing_extensions import Self from ._base_dataset_io import DatasetIOConfiguration from .._dataset_configuration import get_default_dataset_io_configurations @@ -13,10 +14,11 @@ class BackendConfiguration(BaseModel): """A model for matching collections of DatasetConfigurations to a specific backend.""" - backend: ClassVar[Literal["hdf5", "zarr"]] = Field( - description="The name of the backend used to configure the NWBFile." - ) - data_io_class: Type[DataIO] = Field(description="The DataIO class that is specific to this backend.") + backend: ClassVar[Literal["hdf5", "zarr"]] + data_io_class: ClassVar[Type[DataIO]] + + model_config = ConfigDict(validate_assignment=True) # Re-validate model on mutation + dataset_configurations: Dict[str, DatasetIOConfiguration] = Field( description=( "A mapping from object locations (e.g. 
`acquisition/TestElectricalSeriesAP/data`) " @@ -38,10 +40,10 @@ def __str__(self) -> str: return string @classmethod - def from_nwbfile(cls, nwbfile: NWBFile) -> "BackendConfiguration": + def from_nwbfile(cls, nwbfile: NWBFile) -> Self: default_dataset_configurations = get_default_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend) dataset_configurations = { - default_dataset_configuration.dataset_info.location_in_file: default_dataset_configuration + default_dataset_configuration.location_in_file: default_dataset_configuration for default_dataset_configuration in default_dataset_configurations } diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py index 0d4b842ac..9562fa83e 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py @@ -11,13 +11,21 @@ from hdmf import Container from hdmf.data_utils import GenericDataChunkIterator from hdmf.utils import get_data_shape -from pydantic import BaseModel, Field, root_validator +from pydantic import ( + BaseModel, + ConfigDict, + Field, + InstanceOf, + PositiveInt, + model_validator, +) from pynwb import NWBFile +from typing_extensions import Self from ...hdmf import SliceableDataChunkIterator -def _find_location_in_memory_nwbfile(current_location: str, neurodata_object: Container) -> str: +def _recursively_find_location_in_memory_nwbfile(current_location: str, neurodata_object: Container) -> str: """ Method for determining the location of a neurodata object within an in-memory NWBFile object. @@ -31,11 +39,20 @@ def _find_location_in_memory_nwbfile(current_location: str, neurodata_object: Co if isinstance(parent_field_value, dict) and neurodata_object.name in parent_field_value: return parent_field_name + "/" + neurodata_object.name + "/" + current_location return neurodata_object.name + "/" + current_location - return _find_location_in_memory_nwbfile( + return _recursively_find_location_in_memory_nwbfile( current_location=neurodata_object.name + "/" + current_location, neurodata_object=parent ) +def _find_location_in_memory_nwbfile(neurodata_object: Container, field_name: str) -> str: + """ + More readable call for the recursive location finder for a field of a neurodata object in an in-memory NWBFile. + + The recursive method forms from the buttom-up using the initial 'current_location' of the field itself. + """ + return _recursively_find_location_in_memory_nwbfile(current_location=field_name, neurodata_object=neurodata_object) + + def _infer_dtype_of_list(list_: List[Union[int, float, list]]) -> np.dtype: """ Attempt to infer the dtype of values in an arbitrarily sized and nested list. @@ -67,105 +84,55 @@ def _infer_dtype(dataset: Union[h5py.Dataset, zarr.Array]) -> np.dtype: return data_type -class DatasetInfo(BaseModel): - """A data model to represent immutable aspects of an object that will become a HDF5 or Zarr dataset on write.""" - - # TODO: When using Pydantic v2, replace with - # model_config = ConfigDict(allow_mutation=False) - class Config: # noqa: D106 - allow_mutation = False - arbitrary_types_allowed = True - - object_id: str = Field(description="The UUID of the neurodata object containing the dataset.") - location_in_file: str = Field( # TODO: in v2, use init_var=False or assign as a property - description="The relative location of the this dataset within the in-memory NWBFile. (e.g. 
'acquisition/TestElectricalSeries/data')" - ) - dataset_name: Literal["data", "timestamps"] = Field(description="The reference name of the dataset.") - dtype: np.dtype = Field( # TODO: When using Pydantic v2, replace np.dtype with InstanceOf[np.dtype] - description="The data type of elements of this dataset." - ) - full_shape: Tuple[int, ...] = Field(description="The maximum shape of the entire dataset.") - - def __hash__(self): - """To allow instances of this class to be used as keys in dictionaries.""" - return hash((type(self),) + tuple(self.__dict__.values())) - - def __str__(self) -> str: - """ - Not overriding __repr__ as this is intended to render only when wrapped in print(). - - Reason being two-fold; a standard `repr` is intended to be slightly more machine-readable / a more basic - representation of the true object state. But then also because an iterable of these objects, such as a - `List[DataSetInfo]`, would print out the nested representations, which only look good when using the basic - `repr` (that is, this fancy string print-out does not look good when nested in another container). - """ - source_size_in_gb = math.prod(self.full_shape) * self.dtype.itemsize / 1e9 - - string = ( - f"\n{self.location_in_file}" - f"\n{'-' * len(self.location_in_file)}" - f"\n dtype : {self.dtype}" - f"\n full shape of source array : {self.full_shape}" - f"\n full size of source array : {source_size_in_gb:0.2f} GB" - ) - return string - - def __init__(self, **values): - location = values["location_in_file"] - - # For more efficient / explicit reference downstream, instead of reparsing from location multiple times - dataset_name = location.split("/")[-1] - values.update(dataset_name=dataset_name) - super().__init__(**values) - - @classmethod - def from_neurodata_object(cls, neurodata_object: Container, field_name: str) -> "DatasetInfo": - location_in_file = _find_location_in_memory_nwbfile( - current_location=field_name, neurodata_object=neurodata_object - ) - candidate_dataset = getattr(neurodata_object, field_name) - - full_shape = get_data_shape(data=candidate_dataset) - dtype = _infer_dtype(dataset=candidate_dataset) - - return cls( - object_id=neurodata_object.object_id, - object_name=neurodata_object.name, - location_in_file=location_in_file, - full_shape=full_shape, - dtype=dtype, - ) - - class DatasetIOConfiguration(BaseModel, ABC): """A data model for configuring options about an object that will become a HDF5 or Zarr Dataset in the file.""" - # TODO: When using Pydantic v2, remove - class Config: - arbitrary_types_allowed = True + model_config = ConfigDict(validate_assignment=True) # Re-validate model on mutation + + # Immutable fields about the dataset + object_id: str = Field(description="The UUID of the neurodata object containing the dataset.", frozen=True) + location_in_file: str = Field( + description=( + "The location of the this dataset within the in-memory NWBFile relative to the top-level root, " + "e.g. 'acquisition/ElectricalSeries/data'." + ), + frozen=True, + ) + dataset_name: Literal["data", "timestamps"] = Field(description="The reference name of the dataset.", frozen=True) + dtype: InstanceOf[np.dtype] = Field(description="The data type of elements of this dataset.", frozen=True) + full_shape: Tuple[int, ...] = Field(description="The maximum shape of the entire dataset.", frozen=True) - dataset_info: DatasetInfo = Field(description="The immutable information about this dataset.") - chunk_shape: Tuple[int, ...] 
= Field( # When using Pydantic v2.0, specify PositiveInt + # User specifiable fields + chunk_shape: Union[Tuple[PositiveInt, ...], None] = Field( description=( "The specified shape to use when chunking the dataset. " "For optimized streaming speeds, a total size of around 10 MB is recommended." - ) + ), ) - buffer_shape: Tuple[int, ...] = Field( + buffer_shape: Union[Tuple[int, ...], None] = Field( description=( "The specified shape to use when iteratively loading data into memory while writing the dataset. " "For optimized writing speeds and minimal RAM usage, a total size of around 1 GB is recommended." - ) + ), ) - # TODO: When using Pydantic v2, wrap h5py._hl.filters.FilterRefBase and numcodecs.abc.Codec with InstanceOf - compression_method: Union[str, h5py._hl.filters.FilterRefBase, numcodecs.abc.Codec, None] = Field( - default="gzip", + compression_method: Union[ + str, InstanceOf[h5py._hl.filters.FilterRefBase], InstanceOf[numcodecs.abc.Codec], None + ] = Field( description="The specified compression method to apply to this dataset. Set to `None` to disable compression.", ) compression_options: Union[Dict[str, Any], None] = Field( default=None, description="The optional parameters to use for the specified compression method." ) + @abstractmethod + def get_data_io_kwargs(self) -> Dict[str, Any]: + """ + Fetch the properly structured dictionary of input arguments. + + Should be passed directly as dynamic keyword arguments (**kwargs) into a H5DataIO or ZarrDataIO. + """ + raise NotImplementedError + def __str__(self) -> str: """ Not overriding __repr__ as this is intended to render only when wrapped in print(). @@ -175,15 +142,15 @@ def __str__(self) -> str: `List[DatasetConfiguration]`, would print out the nested representations, which only look good when using the basic `repr` (that is, this fancy string print-out does not look good when nested in another container). """ - source_size_in_gb = math.prod(self.dataset_info.full_shape) * self.dataset_info.dtype.itemsize / 1e9 - maximum_ram_usage_per_iteration_in_gb = math.prod(self.buffer_shape) * self.dataset_info.dtype.itemsize / 1e9 - disk_space_usage_per_chunk_in_mb = math.prod(self.chunk_shape) * self.dataset_info.dtype.itemsize / 1e6 + source_size_in_gb = math.prod(self.full_shape) * self.dtype.itemsize / 1e9 + maximum_ram_usage_per_iteration_in_gb = math.prod(self.buffer_shape) * self.dtype.itemsize / 1e9 + disk_space_usage_per_chunk_in_mb = math.prod(self.chunk_shape) * self.dtype.itemsize / 1e6 string = ( - f"\n{self.dataset_info.location_in_file}" - f"\n{'-' * len(self.dataset_info.location_in_file)}" - f"\n dtype : {self.dataset_info.dtype}" - f"\n full shape of source array : {self.dataset_info.full_shape}" + f"\n{self.location_in_file}" + f"\n{'-' * len(self.location_in_file)}" + f"\n dtype : {self.dtype}" + f"\n full shape of source array : {self.full_shape}" f"\n full size of source array : {source_size_in_gb:0.2f} GB" # TODO: add nicer auto-selection/rendering of units and amount for source data size "\n" @@ -204,12 +171,18 @@ def __str__(self) -> str: return string - @root_validator + @model_validator(mode="before") def validate_all_shapes(cls, values: Dict[str, Any]) -> Dict[str, Any]: + location_in_file = values["location_in_file"] + dataset_name = values["dataset_name"] + + assert ( + dataset_name == location_in_file.split("/")[-1] + ), f"The `dataset_name` ({dataset_name}) does not match the end of the `location_in_file` ({location_in_file})!" 
+ chunk_shape = values["chunk_shape"] buffer_shape = values["buffer_shape"] - full_shape = values["dataset_info"].full_shape - location_in_file = values["dataset_info"].location_in_file # For more identifiable error messages. + full_shape = values["full_shape"] if len(chunk_shape) != len(buffer_shape): raise ValueError( @@ -254,35 +227,51 @@ def validate_all_shapes(cls, values: Dict[str, Any]) -> Dict[str, Any]: return values - @abstractmethod - def get_data_io_kwargs(self) -> Dict[str, Any]: + @classmethod + def from_neurodata_object(cls, neurodata_object: Container, dataset_name: Literal["data", "timestamps"]) -> Self: """ - Fetch the properly structured dictionary of input arguments. - - Should be passed directly as dynamic keyword arguments (**kwargs) into a H5DataIO or ZarrDataIO. + Construct an instance of a DatasetIOConfiguration for a dataset in a neurodata object in an NWBFile. + + Parameters + ---------- + neurodata_object : hdmf.Container + The neurodata object containing the field that will become a dataset when written to disk. + dataset_name : "data" or "timestamps" + The name of the field that will become a dataset when written to disk. + Some neurodata objects can have multiple such fields, such as `pynwb.TimeSeries` which can have both `data` + and `timestamps`, each of which can be configured separately. """ - raise NotImplementedError - - @classmethod - def from_neurodata_object(cls, neurodata_object: Container, field_name: str) -> "DatasetIOConfiguration": - candidate_dataset = getattr(neurodata_object, field_name) - - dataset_info = DatasetInfo.from_neurodata_object(neurodata_object=neurodata_object, field_name=field_name) + location_in_file = _find_location_in_memory_nwbfile(neurodata_object=neurodata_object, field_name=dataset_name) - dtype = dataset_info.dtype - full_shape = dataset_info.full_shape + candidate_dataset = getattr(neurodata_object, dataset_name) + full_shape = get_data_shape(data=candidate_dataset) + dtype = _infer_dtype(dataset=candidate_dataset) if isinstance(candidate_dataset, GenericDataChunkIterator): chunk_shape = candidate_dataset.chunk_shape buffer_shape = candidate_dataset.buffer_shape - elif dtype != "unknown": + compression_method = "gzip" + elif dtype != np.dtype("object"): chunk_shape = SliceableDataChunkIterator.estimate_default_chunk_shape( chunk_mb=10.0, maxshape=full_shape, dtype=np.dtype(dtype) ) buffer_shape = SliceableDataChunkIterator.estimate_default_buffer_shape( buffer_gb=0.5, chunk_shape=chunk_shape, maxshape=full_shape, dtype=np.dtype(dtype) ) - else: - pass + compression_method = "gzip" + elif dtype == np.dtype("object"): # Unclear what default chunking/compression should be for compound objects + chunk_shape = None + buffer_shape = None + compression_method = None - return cls(dataset_info=dataset_info, chunk_shape=chunk_shape, buffer_shape=buffer_shape) + return cls( + object_id=neurodata_object.object_id, + object_name=neurodata_object.name, + location_in_file=location_in_file, + dataset_name=dataset_name, + full_shape=full_shape, + dtype=dtype, + chunk_shape=chunk_shape, + buffer_shape=buffer_shape, + compression_method=compression_method, + ) diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_backend.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_backend.py index 6d199c237..2949e3bcb 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_backend.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_backend.py @@ -13,12 +13,8 @@ class 
HDF5BackendConfiguration(BackendConfiguration): """A model for matching collections of DatasetConfigurations specific to the HDF5 backend.""" backend: ClassVar[Literal["hdf5"]] = "hdf5" - # Field( # TODO: in pydantic v2 use property instead of class attribute - # default="hdf5", description="The name of the backend used to configure the NWBFile." - # ) - data_io_class: Type[H5DataIO] = Field( # TODO: in pydantic v2 use property instead of class attribute - default=H5DataIO, description="The DataIO class that is specific to HDF5." - ) + data_io_class: ClassVar[Type[H5DataIO]] = H5DataIO + dataset_configurations: Dict[str, HDF5DatasetIOConfiguration] = Field( description=( "A mapping from object locations to their HDF5DatasetConfiguration specification that contains all " diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py index e256c8c3b..828a37998 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py @@ -3,7 +3,7 @@ from typing import Any, Dict, Literal, Union import h5py -from pydantic import Field +from pydantic import Field, InstanceOf from ._base_dataset_io import DatasetIOConfiguration from ...importing import is_package_installed @@ -32,13 +32,8 @@ class HDF5DatasetIOConfiguration(DatasetIOConfiguration): """A data model for configuring options about an object that will become a HDF5 Dataset in the file.""" - # TODO: When using Pydantic v2, replace with `model_config = ConfigDict(...)` - class Config: - arbitrary_types_allowed = True - validate_assignment = True - compression_method: Union[ - Literal[tuple(AVAILABLE_HDF5_COMPRESSION_METHODS.keys())], h5py._hl.filters.FilterRefBase, None + Literal[tuple(AVAILABLE_HDF5_COMPRESSION_METHODS.keys())], InstanceOf[h5py._hl.filters.FilterRefBase], None ] = Field( default="gzip", description=( diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_backend.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_backend.py index 1c17a77a3..7f43a1299 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_backend.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_backend.py @@ -14,12 +14,8 @@ class ZarrBackendConfiguration(BackendConfiguration): """A model for matching collections of DatasetConfigurations specific to the Zarr backend.""" backend: ClassVar[Literal["zarr"]] = "zarr" - # Field( # TODO: in pydantic v2 use property instead of class attribute - # default="zarr", description="The name of the backend used to configure the NWBFile." - # ) - data_io_class: Type[ZarrDataIO] = Field( # TODO: in pydantic v2 use property instead of class attribute - default=ZarrDataIO, description="The DataIO class that is specific to Zarr." 
- ) + data_io_class: ClassVar[Type[ZarrDataIO]] = ZarrDataIO + dataset_configurations: Dict[str, ZarrDatasetIOConfiguration] = Field( description=( "A mapping from object locations to their ZarrDatasetConfiguration specification that contains all " diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_dataset_io.py index 10e3951ce..c070a20e9 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_dataset_io.py @@ -4,7 +4,7 @@ import numcodecs import zarr -from pydantic import Field, root_validator +from pydantic import Field, InstanceOf, model_validator from ._base_dataset_io import DatasetIOConfiguration @@ -45,21 +45,16 @@ class ZarrDatasetIOConfiguration(DatasetIOConfiguration): """A data model for configuring options about an object that will become a Zarr Dataset in the file.""" - # TODO: When using Pydantic v2, replace with `model_config = ConfigDict(...)` - class Config: - arbitrary_types_allowed = True - validate_assignment = True - - compression_method: Union[Literal[tuple(AVAILABLE_ZARR_COMPRESSION_METHODS.keys())], numcodecs.abc.Codec, None] = ( - Field( - default="gzip", # TODO: would like this to be 'auto' - description=( - "The specified compression method to apply to this dataset. " - "Can be either a string that matches an available method on your system, " - "or an instantiated numcodec.Codec object." - "Set to `None` to disable compression." - ), - ) + compression_method: Union[ + Literal[tuple(AVAILABLE_ZARR_COMPRESSION_METHODS.keys())], InstanceOf[numcodecs.abc.Codec], None + ] = Field( + default="gzip", # TODO: would like this to be 'auto' + description=( + "The specified compression method to apply to this dataset. " + "Can be either a string that matches an available method on your system, " + "or an instantiated numcodec.Codec object." + "Set to `None` to disable compression." + ), ) # TODO: actually provide better schematic rendering of options. Only support defaults in GUIDE for now. # Looks like they'll have to be hand-typed however... Can try parsing the numpy docstrings - no annotation typing. @@ -67,7 +62,7 @@ class Config: default=None, description="The optional parameters to use for the specified compression method." ) filter_methods: Union[ - List[Union[Literal[tuple(AVAILABLE_ZARR_COMPRESSION_METHODS.keys())], numcodecs.abc.Codec]], None + List[Union[Literal[tuple(AVAILABLE_ZARR_COMPRESSION_METHODS.keys())], InstanceOf[numcodecs.abc.Codec]]], None ] = Field( default=None, description=( @@ -81,7 +76,7 @@ class Config: default=None, description="The optional parameters to use for each specified filter method." ) - def __str__(self) -> str: + def __str__(self) -> str: # Inherited docstring from parent. 
noqa: D105 string = super().__str__() if self.filter_methods is not None: string += f"\n filter methods : {self.filter_methods}" @@ -92,10 +87,10 @@ def __str__(self) -> str: return string - @root_validator + @model_validator(mode="before") def validate_filter_methods_and_options_length_match(cls, values: Dict[str, Any]): - filter_methods = values["filter_methods"] - filter_options = values["filter_options"] + filter_methods = values.get("filter_methods", None) + filter_options = values.get("filter_options", None) if filter_methods is None and filter_options is not None: raise ValueError( diff --git a/src/neuroconv/tools/nwb_helpers/_configure_backend.py b/src/neuroconv/tools/nwb_helpers/_configure_backend.py index a2dcaec69..6bc055a04 100644 --- a/src/neuroconv/tools/nwb_helpers/_configure_backend.py +++ b/src/neuroconv/tools/nwb_helpers/_configure_backend.py @@ -17,8 +17,8 @@ def configure_backend( data_io_class = backend_configuration.data_io_class for dataset_configuration in backend_configuration.dataset_configurations.values(): - object_id = dataset_configuration.dataset_info.object_id - dataset_name = dataset_configuration.dataset_info.dataset_name + object_id = dataset_configuration.object_id + dataset_name = dataset_configuration.dataset_name data_io_kwargs = dataset_configuration.get_data_io_kwargs() # TODO: update buffer shape in iterator, if present @@ -30,7 +30,9 @@ def configure_backend( nwbfile_object.set_data_io(data_io_class=data_io_class, data_io_kwargs=data_io_kwargs) # TimeSeries data or timestamps elif isinstance(nwbfile_object, TimeSeries) and not is_dataset_linked: - nwbfile_object.set_data_io(dataset_name=dataset_name, data_io_class=data_io_class, **data_io_kwargs) + nwbfile_object.set_data_io( + dataset_name=dataset_name, data_io_class=data_io_class, data_io_kwargs=data_io_kwargs + ) # Skip the setting of a DataIO when target dataset is a link (assume it will be found in parent) elif isinstance(nwbfile_object, TimeSeries) and is_dataset_linked: continue diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index 60801b9c2..51f414244 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -114,7 +114,7 @@ def get_default_dataset_io_configurations( continue dataset_io_configuration = DatasetIOConfigurationClass.from_neurodata_object( - neurodata_object=column, field_name="data" + neurodata_object=column, dataset_name="data" ) yield dataset_io_configuration @@ -123,11 +123,11 @@ def get_default_dataset_io_configurations( # The most common example of this is ndx-events Events/LabeledEvents types time_series = neurodata_object # for readability - for field_name in ("data", "timestamps"): - if field_name not in time_series.fields: # timestamps is optional + for dataset_name in ("data", "timestamps"): + if dataset_name not in time_series.fields: # timestamps is optional continue - candidate_dataset = getattr(time_series, field_name) + candidate_dataset = getattr(time_series, dataset_name) if _is_dataset_written_to_file( candidate_dataset=candidate_dataset, backend=backend, existing_file=existing_file ): @@ -142,7 +142,7 @@ def get_default_dataset_io_configurations( continue # skip dataset_io_configuration = DatasetIOConfigurationClass.from_neurodata_object( - neurodata_object=time_series, field_name=field_name + neurodata_object=time_series, dataset_name=dataset_name ) yield dataset_io_configuration diff --git 
a/src/neuroconv/tools/testing/__init__.py b/src/neuroconv/tools/testing/__init__.py index 2d5b06497..7179a7544 100644 --- a/src/neuroconv/tools/testing/__init__.py +++ b/src/neuroconv/tools/testing/__init__.py @@ -1,5 +1,4 @@ from ._mock._mock_dataset_models import ( - mock_DatasetInfo, mock_HDF5BackendConfiguration, mock_HDF5DatasetIOConfiguration, mock_ZarrBackendConfiguration, diff --git a/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py b/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py index 6d23e0af3..4e16740d9 100644 --- a/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py +++ b/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py @@ -7,7 +7,6 @@ from ...nwb_helpers import ( AVAILABLE_HDF5_COMPRESSION_METHODS, AVAILABLE_ZARR_COMPRESSION_METHODS, - DatasetInfo, HDF5BackendConfiguration, HDF5DatasetIOConfiguration, ZarrBackendConfiguration, @@ -15,22 +14,14 @@ ) -def mock_DatasetInfo( +def mock_HDF5DatasetIOConfiguration( object_id: str = "481a0860-3a0c-40ec-b931-df4a3e9b101f", location_in_file: str = "acquisition/TestElectricalSeries/data", + dataset_name: Literal["data", "timestamps"] = "data", full_shape: Tuple[int, ...] = (60 * 30_000, 384), # ~1 minute of v1 NeuroPixels probe - dtype=np.dtype("int16"), -) -> DatasetInfo: - """Mock instance of a DatasetInfo with NeuroPixel-like values to showcase chunk/buffer recommendations.""" - return DatasetInfo( - object_id=object_id, - location_in_file=location_in_file, - full_shape=full_shape, - dtype=dtype, - ) - - -def mock_HDF5DatasetIOConfiguration( + dtype: np.dtype = np.dtype("int16"), + chunk_shape: Tuple[int, ...] = (78_125, 64), # ~10 MB + buffer_shape: Tuple[int, ...] = (1_250_000, 384), # ~1 GB compression_method: Union[ Literal[tuple(AVAILABLE_HDF5_COMPRESSION_METHODS.keys())], h5py._hl.filters.FilterRefBase, None ] = "gzip", @@ -38,15 +29,26 @@ def mock_HDF5DatasetIOConfiguration( ) -> HDF5DatasetIOConfiguration: """Mock object of a HDF5DatasetIOConfiguration with NeuroPixel-like values to show chunk/buffer recommendations.""" return HDF5DatasetIOConfiguration( - dataset_info=mock_DatasetInfo(), - chunk_shape=(78_125, 64), # ~10 MB - buffer_shape=(1_250_000, 384), # ~1 GB + object_id=object_id, + location_in_file=location_in_file, + dataset_name=dataset_name, + full_shape=full_shape, + dtype=dtype, + chunk_shape=chunk_shape, + buffer_shape=buffer_shape, compression_method=compression_method, compression_options=compression_options, ) def mock_ZarrDatasetIOConfiguration( + object_id: str = "481a0860-3a0c-40ec-b931-df4a3e9b101f", + location_in_file: str = "acquisition/TestElectricalSeries/data", + dataset_name: Literal["data", "timestamps"] = "data", + full_shape: Tuple[int, ...] = (60 * 30_000, 384), # ~1 minute of v1 NeuroPixels probe + dtype: np.dtype = np.dtype("int16"), + chunk_shape: Tuple[int, ...] = (78_125, 64), # ~10 MB + buffer_shape: Tuple[int, ...] 
= (1_250_000, 384), # ~1 GB compression_method: Union[ Literal[tuple(AVAILABLE_ZARR_COMPRESSION_METHODS.keys())], numcodecs.abc.Codec, None ] = "gzip", @@ -58,9 +60,13 @@ def mock_ZarrDatasetIOConfiguration( ) -> ZarrDatasetIOConfiguration: """Mock object of a ZarrDatasetIOConfiguration with NeuroPixel-like values to show chunk/buffer recommendations.""" return ZarrDatasetIOConfiguration( - dataset_info=mock_DatasetInfo(), - chunk_shape=(78_125, 64), # ~10 MB - buffer_shape=(1_250_000, 384), # ~1 GB + object_id=object_id, + location_in_file=location_in_file, + dataset_name=dataset_name, + full_shape=full_shape, + dtype=dtype, + chunk_shape=chunk_shape, + buffer_shape=buffer_shape, compression_method=compression_method, compression_options=compression_options, filter_methods=filter_methods, @@ -70,18 +76,15 @@ def mock_ZarrDatasetIOConfiguration( def mock_HDF5BackendConfiguration() -> HDF5BackendConfiguration: """Mock instance of a HDF5BackendConfiguration with two NeuroPixel-like datasets.""" - dataset_configurations = { - "acquisition/TestElectricalSeriesAP/data": HDF5DatasetIOConfiguration( - dataset_info=mock_DatasetInfo(location_in_file="acquisition/TestElectricalSeriesAP/data"), - chunk_shape=(78_125, 64), # ~10 MB - buffer_shape=(1_250_000, 384), # ~1 GB + dataset_configurations: Dict[str, HDF5DatasetIOConfiguration] = { + "acquisition/TestElectricalSeriesAP/data": mock_HDF5DatasetIOConfiguration( + location_in_file="acquisition/TestElectricalSeriesAP/data", dataset_name="data" ), - "acquisition/TestElectricalSeriesLF/data": HDF5DatasetIOConfiguration( - dataset_info=mock_DatasetInfo( - object_id="bc37e164-519f-4b65-a976-206440f1d325", - location_in_file="acquisition/TestElectricalSeriesLF/data", - full_shape=(75_000, 384), - ), + "acquisition/TestElectricalSeriesLF/data": mock_HDF5DatasetIOConfiguration( + object_id="bc37e164-519f-4b65-a976-206440f1d325", + location_in_file="acquisition/TestElectricalSeriesLF/data", + dataset_name="data", + full_shape=(75_000, 384), chunk_shape=(37_500, 128), # ~10 MB buffer_shape=(75_000, 384), ), @@ -92,19 +95,17 @@ def mock_HDF5BackendConfiguration() -> HDF5BackendConfiguration: def mock_ZarrBackendConfiguration() -> ZarrBackendConfiguration: """Mock instance of a HDF5BackendConfiguration with several NeuroPixel-like datasets.""" - dataset_configurations = { - "acquisition/TestElectricalSeriesAP/data": ZarrDatasetIOConfiguration( - dataset_info=mock_DatasetInfo(location_in_file="acquisition/TestElectricalSeriesAP/data"), - chunk_shape=(78_125, 64), - buffer_shape=(1_250_000, 384), # ~1 GB + dataset_configurations: Dict[str, ZarrDatasetIOConfiguration] = { + "acquisition/TestElectricalSeriesAP/data": mock_ZarrDatasetIOConfiguration( + location_in_file="acquisition/TestElectricalSeriesAP/data", + dataset_name="data", filter_methods=["delta"], ), - "acquisition/TestElectricalSeriesLF/data": ZarrDatasetIOConfiguration( - dataset_info=mock_DatasetInfo( - object_id="bc37e164-519f-4b65-a976-206440f1d325", - location_in_file="acquisition/TestElectricalSeriesLF/data", - full_shape=(75_000, 384), - ), + "acquisition/TestElectricalSeriesLF/data": mock_ZarrDatasetIOConfiguration( + object_id="bc37e164-519f-4b65-a976-206440f1d325", + location_in_file="acquisition/TestElectricalSeriesLF/data", + dataset_name="data", + full_shape=(75_000, 384), chunk_shape=(37_500, 128), # ~10 MB buffer_shape=(75_000, 384), filter_methods=["delta"], diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_io_configuration_model.py 
b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_io_configuration_model.py index a83076e98..0a6e56695 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_io_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_io_configuration_model.py @@ -9,183 +9,131 @@ ZarrDatasetIOConfiguration, ) from neuroconv.tools.testing import ( - mock_DatasetInfo, mock_HDF5DatasetIOConfiguration, mock_ZarrDatasetIOConfiguration, ) @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] + argnames="dataset_configuration_class", argvalues=[mock_HDF5DatasetIOConfiguration, mock_ZarrDatasetIOConfiguration] ) def test_validator_chunk_length_consistency( dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: - dataset_configuration_class( - dataset_info=mock_DatasetInfo(), - chunk_shape=(78_125, 64, 1), - buffer_shape=(1_250_000, 384), - ) + dataset_configuration_class(chunk_shape=(78_125, 64, 1), buffer_shape=(1_250_000, 384)) expected_error = ( "len(chunk_shape)=3 does not match len(buffer_shape)=2 for dataset at location " - "'acquisition/TestElectricalSeries/data'! (type=value_error)" + "'acquisition/TestElectricalSeries/data'! [type=value_error, " ) assert expected_error in str(error_info.value) @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] + argnames="dataset_configuration_class", argvalues=[mock_HDF5DatasetIOConfiguration, mock_ZarrDatasetIOConfiguration] ) def test_validator_chunk_and_buffer_length_consistency( dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: - dataset_configuration_class( - dataset_info=mock_DatasetInfo(), - chunk_shape=(78_125, 64, 1), - buffer_shape=(1_250_000, 384, 1), - ) + dataset_configuration_class(chunk_shape=(78_125, 64, 1), buffer_shape=(1_250_000, 384, 1)) expected_error = ( "len(buffer_shape)=3 does not match len(full_shape)=2 for dataset at location " - "'acquisition/TestElectricalSeries/data'! (type=value_error)" + "'acquisition/TestElectricalSeries/data'! [type=value_error, " ) assert expected_error in str(error_info.value) @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] + argnames="dataset_configuration_class", argvalues=[mock_HDF5DatasetIOConfiguration, mock_ZarrDatasetIOConfiguration] ) def test_validator_chunk_shape_nonpositive_elements( dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: - dataset_configuration_class( - dataset_info=mock_DatasetInfo(), - chunk_shape=(1, -2), - buffer_shape=(1_250_000, 384), - ) + dataset_configuration_class(chunk_shape=(1, -2), buffer_shape=(1_250_000, 384)) expected_error = ( "Some dimensions of the chunk_shape=(1, -2) are less than or equal to zero for dataset at " - "location 'acquisition/TestElectricalSeries/data'! (type=value_error)" + "location 'acquisition/TestElectricalSeries/data'! 
[type=value_error, " ) assert expected_error in str(error_info.value) @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] + argnames="dataset_configuration_class", argvalues=[mock_HDF5DatasetIOConfiguration, mock_ZarrDatasetIOConfiguration] ) def test_validator_buffer_shape_nonpositive_elements( dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: - dataset_configuration_class( - dataset_info=mock_DatasetInfo(), - chunk_shape=(78_125, 64), - buffer_shape=(78_125, -2), - ) + dataset_configuration_class(chunk_shape=(78_125, 64), buffer_shape=(78_125, -2)) expected_error = ( "Some dimensions of the buffer_shape=(78125, -2) are less than or equal to zero for dataset at " - "location 'acquisition/TestElectricalSeries/data'! (type=value_error)" + "location 'acquisition/TestElectricalSeries/data'! [type=value_error, " ) assert expected_error in str(error_info.value) @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] + argnames="dataset_configuration_class", argvalues=[mock_HDF5DatasetIOConfiguration, mock_ZarrDatasetIOConfiguration] ) def test_validator_chunk_shape_exceeds_buffer_shape( dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: - dataset_configuration_class( - dataset_info=mock_DatasetInfo(), - chunk_shape=(78_126, 64), - buffer_shape=(78_125, 384), - ) + dataset_configuration_class(chunk_shape=(78_126, 64), buffer_shape=(78_125, 384)) expected_error = ( "Some dimensions of the chunk_shape=(78126, 64) exceed the buffer_shape=(78125, 384) for dataset at location " - "'acquisition/TestElectricalSeries/data'! (type=value_error)" + "'acquisition/TestElectricalSeries/data'! [type=value_error, " ) assert expected_error in str(error_info.value) @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] + argnames="dataset_configuration_class", argvalues=[mock_HDF5DatasetIOConfiguration, mock_ZarrDatasetIOConfiguration] ) def test_validator_buffer_shape_exceeds_full_shape( dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: - dataset_configuration_class( - dataset_info=mock_DatasetInfo(), - chunk_shape=(78_125, 64), - buffer_shape=(1_250_000, 385), - ) + dataset_configuration_class(chunk_shape=(78_125, 64), buffer_shape=(1_250_000, 385)) expected_error = ( "Some dimensions of the buffer_shape=(1250000, 385) exceed the full_shape=(1800000, 384) for " - "dataset at location 'acquisition/TestElectricalSeries/data'! (type=value_error)" + "dataset at location 'acquisition/TestElectricalSeries/data'! 
[type=value_error, " ) assert expected_error in str(error_info.value) @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] + argnames="dataset_configuration_class", argvalues=[mock_HDF5DatasetIOConfiguration, mock_ZarrDatasetIOConfiguration] ) def test_validator_chunk_dimensions_do_not_evenly_divide_buffer( dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: dataset_configuration_class( - dataset_info=mock_DatasetInfo(), chunk_shape=(78_125, 7), buffer_shape=(1_250_000, 383), # Different trigger condition when not the full shape for an axis ) expected_error = ( "Some dimensions of the chunk_shape=(78125, 7) do not evenly divide the buffer_shape=(1250000, 383) for " - "dataset at location 'acquisition/TestElectricalSeries/data'! (type=value_error)" + "dataset at location 'acquisition/TestElectricalSeries/data'! [type=value_error, " ) assert expected_error in str(error_info.value) @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] + argnames="dataset_configuration_class", argvalues=[mock_HDF5DatasetIOConfiguration, mock_ZarrDatasetIOConfiguration] ) def test_validator_chunk_dimensions_do_not_evenly_divide_buffer_skip_full_shape( dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): """Any divisibility is allowed when the buffer shape is capped at the full length of an axis.""" - dataset_configuration_class( - dataset_info=mock_DatasetInfo(), - chunk_shape=(78_125, 7), - buffer_shape=(1_250_000, 384), - ) - - -@pytest.mark.parametrize( - argnames="mock_dataset_configuration", - argvalues=[mock_HDF5DatasetIOConfiguration(), mock_ZarrDatasetIOConfiguration()], -) -def test_mutation_validation( - mock_dataset_configuration: Union[mock_HDF5DatasetIOConfiguration, mock_ZarrDatasetIOConfiguration] -): - """ - Only testing on one dummy case to verify the root validator is triggered. - - Trust the rest should follow. - """ - with pytest.raises(ValueError) as error_info: - mock_dataset_configuration.chunk_shape = (1, -2) - - expected_error = ( - "Some dimensions of the chunk_shape=(1, -2) are less than or equal to zero for dataset at " - "location 'acquisition/TestElectricalSeries/data'! 
(type=value_error)" - ) - assert expected_error in str(error_info.value) + dataset_configuration_class(chunk_shape=(78_125, 7), buffer_shape=(1_250_000, 384)) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py index 768ceb634..28a4bccbd 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py @@ -39,10 +39,10 @@ def test_configuration_on_time_series(iterator: callable, backend: Literal["hdf5 dataset_configuration = dataset_configurations[0] assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) - assert dataset_configuration.dataset_info.object_id == time_series.object_id - assert dataset_configuration.dataset_info.location_in_file == "acquisition/TestTimeSeries/data" - assert dataset_configuration.dataset_info.full_shape == array.shape - assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.object_id == time_series.object_id + assert dataset_configuration.location_in_file == "acquisition/TestTimeSeries/data" + assert dataset_configuration.full_shape == array.shape + assert dataset_configuration.dtype == array.dtype assert dataset_configuration.chunk_shape == array.shape assert dataset_configuration.buffer_shape == array.shape assert dataset_configuration.compression_method == "gzip" @@ -81,10 +81,10 @@ def test_configuration_on_dynamic_table(iterator: callable, backend: Literal["hd dataset_configuration = dataset_configurations[0] assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) - assert dataset_configuration.dataset_info.object_id == column.object_id - assert dataset_configuration.dataset_info.location_in_file == "acquisition/TestDynamicTable/TestColumn/data" - assert dataset_configuration.dataset_info.full_shape == array.shape - assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.object_id == column.object_id + assert dataset_configuration.location_in_file == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.full_shape == array.shape + assert dataset_configuration.dtype == array.dtype assert dataset_configuration.chunk_shape == array.shape assert dataset_configuration.buffer_shape == array.shape assert dataset_configuration.compression_method == "gzip" @@ -117,11 +117,11 @@ def test_configuration_on_ragged_units_table(backend: Literal["hdf5", "zarr"]): dataset_configuration = next( dataset_configuration for dataset_configuration in dataset_configurations - if dataset_configuration.dataset_info.location_in_file == "units/spike_times/data" + if dataset_configuration.location_in_file == "units/spike_times/data" ) assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) - assert dataset_configuration.dataset_info.full_shape == (5,) - assert dataset_configuration.dataset_info.dtype == np.dtype("float64") + assert dataset_configuration.full_shape == (5,) + assert dataset_configuration.dtype == np.dtype("float64") assert dataset_configuration.chunk_shape == (5,) assert dataset_configuration.buffer_shape == (5,) assert dataset_configuration.compression_method == "gzip" @@ -134,11 +134,11 
@@ def test_configuration_on_ragged_units_table(backend: Literal["hdf5", "zarr"]): dataset_configuration = next( dataset_configuration for dataset_configuration in dataset_configurations - if dataset_configuration.dataset_info.location_in_file == "units/spike_times_index/data" + if dataset_configuration.location_in_file == "units/spike_times_index/data" ) assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) - assert dataset_configuration.dataset_info.full_shape == (2,) - assert dataset_configuration.dataset_info.dtype == np.dtype("uint8") + assert dataset_configuration.full_shape == (2,) + assert dataset_configuration.dtype == np.dtype("uint8") assert dataset_configuration.chunk_shape == (2,) assert dataset_configuration.buffer_shape == (2,) assert dataset_configuration.compression_method == "gzip" @@ -151,11 +151,11 @@ def test_configuration_on_ragged_units_table(backend: Literal["hdf5", "zarr"]): dataset_configuration = next( dataset_configuration for dataset_configuration in dataset_configurations - if dataset_configuration.dataset_info.location_in_file == "units/waveforms/data" + if dataset_configuration.location_in_file == "units/waveforms/data" ) assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) - assert dataset_configuration.dataset_info.full_shape == (12, 3) - assert dataset_configuration.dataset_info.dtype == np.dtype("int32") + assert dataset_configuration.full_shape == (12, 3) + assert dataset_configuration.dtype == np.dtype("int32") assert dataset_configuration.chunk_shape == (12, 3) assert dataset_configuration.buffer_shape == (12, 3) assert dataset_configuration.compression_method == "gzip" @@ -168,11 +168,11 @@ def test_configuration_on_ragged_units_table(backend: Literal["hdf5", "zarr"]): dataset_configuration = next( dataset_configuration for dataset_configuration in dataset_configurations - if dataset_configuration.dataset_info.location_in_file == "units/waveforms_index/data" + if dataset_configuration.location_in_file == "units/waveforms_index/data" ) assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) - assert dataset_configuration.dataset_info.full_shape == (4,) - assert dataset_configuration.dataset_info.dtype == np.dtype("uint8") + assert dataset_configuration.full_shape == (4,) + assert dataset_configuration.dtype == np.dtype("uint8") assert dataset_configuration.chunk_shape == (4,) assert dataset_configuration.buffer_shape == (4,) assert dataset_configuration.compression_method == "gzip" @@ -185,11 +185,11 @@ def test_configuration_on_ragged_units_table(backend: Literal["hdf5", "zarr"]): dataset_configuration = next( dataset_configuration for dataset_configuration in dataset_configurations - if dataset_configuration.dataset_info.location_in_file == "units/waveforms_index_index/data" + if dataset_configuration.location_in_file == "units/waveforms_index_index/data" ) assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) - assert dataset_configuration.dataset_info.full_shape == (2,) - assert dataset_configuration.dataset_info.dtype == np.dtype("uint8") + assert dataset_configuration.full_shape == (2,) + assert dataset_configuration.dtype == np.dtype("uint8") assert dataset_configuration.chunk_shape == (2,) assert dataset_configuration.buffer_shape == (2,) assert dataset_configuration.compression_method == "gzip" @@ -218,13 +218,10 @@ def test_configuration_on_compass_direction(iterator: callable, backend: Literal dataset_configuration = dataset_configurations[0] assert 
isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend])
-    assert dataset_configuration.dataset_info.object_id == spatial_series.object_id
-    assert (
-        dataset_configuration.dataset_info.location_in_file
-        == "processing/behavior/TestCompassDirection/TestSpatialSeries/data"
-    )
-    assert dataset_configuration.dataset_info.full_shape == array.shape
-    assert dataset_configuration.dataset_info.dtype == array.dtype
+    assert dataset_configuration.object_id == spatial_series.object_id
+    assert dataset_configuration.location_in_file == "processing/behavior/TestCompassDirection/TestSpatialSeries/data"
+    assert dataset_configuration.full_shape == array.shape
+    assert dataset_configuration.dtype == array.dtype
     assert dataset_configuration.chunk_shape == array.shape
     assert dataset_configuration.buffer_shape == array.shape
     assert dataset_configuration.compression_method == "gzip"
@@ -267,13 +264,13 @@ def test_configuration_on_ndx_events(backend: Literal["hdf5", "zarr"]):
     data_dataset_configuration = next(
         dataset_configuration
         for dataset_configuration in dataset_configurations
-        if dataset_configuration.dataset_info.dataset_name == "data"
+        if dataset_configuration.dataset_name == "data"
     )
     assert isinstance(data_dataset_configuration, DATASET_IO_CONFIGURATIONS[backend])
-    assert data_dataset_configuration.dataset_info.object_id == labeled_events.object_id
-    assert data_dataset_configuration.dataset_info.location_in_file == "processing/behavior/TestLabeledEvents/data"
-    assert data_dataset_configuration.dataset_info.full_shape == data.shape
-    assert data_dataset_configuration.dataset_info.dtype == data.dtype
+    assert data_dataset_configuration.object_id == labeled_events.object_id
+    assert data_dataset_configuration.location_in_file == "processing/behavior/TestLabeledEvents/data"
+    assert data_dataset_configuration.full_shape == data.shape
+    assert data_dataset_configuration.dtype == data.dtype
     assert data_dataset_configuration.chunk_shape == data.shape
     assert data_dataset_configuration.buffer_shape == data.shape
     assert data_dataset_configuration.compression_method == "gzip"
@@ -286,16 +283,13 @@ def test_configuration_on_ndx_events(backend: Literal["hdf5", "zarr"]):
     timestamps_dataset_configuration = next(
         dataset_configuration
        for dataset_configuration in dataset_configurations
-        if dataset_configuration.dataset_info.dataset_name == "timestamps"
+        if dataset_configuration.dataset_name == "timestamps"
     )
     assert isinstance(timestamps_dataset_configuration, DATASET_IO_CONFIGURATIONS[backend])
-    assert timestamps_dataset_configuration.dataset_info.object_id == labeled_events.object_id
-    assert (
-        timestamps_dataset_configuration.dataset_info.location_in_file
-        == "processing/behavior/TestLabeledEvents/timestamps"
-    )
-    assert timestamps_dataset_configuration.dataset_info.full_shape == timestamps.shape
-    assert timestamps_dataset_configuration.dataset_info.dtype == timestamps.dtype
+    assert timestamps_dataset_configuration.object_id == labeled_events.object_id
+    assert timestamps_dataset_configuration.location_in_file == "processing/behavior/TestLabeledEvents/timestamps"
+    assert timestamps_dataset_configuration.full_shape == timestamps.shape
+    assert timestamps_dataset_configuration.dtype == timestamps.dtype
     assert timestamps_dataset_configuration.chunk_shape == timestamps.shape
     assert timestamps_dataset_configuration.buffer_shape == timestamps.shape
     assert timestamps_dataset_configuration.compression_method == "gzip"
diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations_appended_files.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations_appended_files.py
index b78448399..dca727f03 100644
--- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations_appended_files.py
+++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations_appended_files.py
@@ -62,10 +62,10 @@ def test_unwrapped_time_series_hdf5(hdf5_nwbfile_path):
 
     dataset_configuration = dataset_configurations[0]
     assert isinstance(dataset_configuration, HDF5DatasetIOConfiguration)
-    assert dataset_configuration.dataset_info.object_id == new_time_series.object_id
-    assert dataset_configuration.dataset_info.location_in_file == "acquisition/NewTimeSeries/data"
-    assert dataset_configuration.dataset_info.full_shape == array.shape
-    assert dataset_configuration.dataset_info.dtype == array.dtype
+    assert dataset_configuration.object_id == new_time_series.object_id
+    assert dataset_configuration.location_in_file == "acquisition/NewTimeSeries/data"
+    assert dataset_configuration.full_shape == array.shape
+    assert dataset_configuration.dtype == array.dtype
     assert dataset_configuration.chunk_shape == array.shape
     assert dataset_configuration.buffer_shape == array.shape
     assert dataset_configuration.compression_method == "gzip"
@@ -85,10 +85,10 @@ def test_unwrapped_time_series_zarr(zarr_nwbfile_path):
 
     dataset_configuration = dataset_configurations[0]
     assert isinstance(dataset_configuration, ZarrDatasetIOConfiguration)
-    assert dataset_configuration.dataset_info.object_id == new_time_series.object_id
-    assert dataset_configuration.dataset_info.location_in_file == "acquisition/NewTimeSeries/data"
-    assert dataset_configuration.dataset_info.full_shape == array.shape
-    assert dataset_configuration.dataset_info.dtype == array.dtype
+    assert dataset_configuration.object_id == new_time_series.object_id
+    assert dataset_configuration.location_in_file == "acquisition/NewTimeSeries/data"
+    assert dataset_configuration.full_shape == array.shape
+    assert dataset_configuration.dtype == array.dtype
     assert dataset_configuration.chunk_shape == array.shape
     assert dataset_configuration.buffer_shape == array.shape
     assert dataset_configuration.compression_method == "gzip"
@@ -111,10 +111,10 @@ def test_unwrapped_dynamic_table_hdf5(hdf5_nwbfile_path):
 
     dataset_configuration = dataset_configurations[0]
     assert isinstance(dataset_configuration, HDF5DatasetIOConfiguration)
-    assert dataset_configuration.dataset_info.object_id == column.object_id
-    assert dataset_configuration.dataset_info.location_in_file == "acquisition/TestDynamicTable/TestColumn/data"
-    assert dataset_configuration.dataset_info.full_shape == array.shape
-    assert dataset_configuration.dataset_info.dtype == array.dtype
+    assert dataset_configuration.object_id == column.object_id
+    assert dataset_configuration.location_in_file == "acquisition/TestDynamicTable/TestColumn/data"
+    assert dataset_configuration.full_shape == array.shape
+    assert dataset_configuration.dtype == array.dtype
     assert dataset_configuration.chunk_shape == array.shape
     assert dataset_configuration.buffer_shape == array.shape
     assert dataset_configuration.compression_method == "gzip"
@@ -135,10 +135,10 @@ def test_unwrapped_dynamic_table_zarr(zarr_nwbfile_path):
 
     dataset_configuration = dataset_configurations[0]
     assert isinstance(dataset_configuration, ZarrDatasetIOConfiguration)
-    assert dataset_configuration.dataset_info.object_id == column.object_id
-    assert dataset_configuration.dataset_info.location_in_file == "acquisition/TestDynamicTable/TestColumn/data"
-    assert dataset_configuration.dataset_info.full_shape == array.shape
-    assert dataset_configuration.dataset_info.dtype == array.dtype
+    assert dataset_configuration.object_id == column.object_id
+    assert dataset_configuration.location_in_file == "acquisition/TestDynamicTable/TestColumn/data"
+    assert dataset_configuration.full_shape == array.shape
+    assert dataset_configuration.dtype == array.dtype
     assert dataset_configuration.chunk_shape == array.shape
     assert dataset_configuration.buffer_shape == array.shape
     assert dataset_configuration.compression_method == "gzip"
diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_info_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_info_model.py
deleted file mode 100644
index b9469ef54..000000000
--- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_info_model.py
+++ /dev/null
@@ -1,43 +0,0 @@
-"""Unit tests for the DatasetInfo Pydantic model."""
-
-from io import StringIO
-from unittest.mock import patch
-
-from neuroconv.tools.testing import mock_DatasetInfo
-
-
-def test_dataset_info_print():
-    """Test the printout display of a Dataset model looks nice."""
-    dataset_info = mock_DatasetInfo()
-
-    with patch("sys.stdout", new=StringIO()) as out:
-        print(dataset_info)
-
-    expected_print = """
-acquisition/TestElectricalSeries/data
--------------------------------------
-  dtype : int16
-  full shape of source array : (1800000, 384)
-  full size of source array : 1.38 GB
-"""
-    assert out.getvalue() == expected_print
-
-
-def test_dataset_info_repr():
-    """Test the programmatic repr of a Dataset model is more dataclass-like."""
-    dataset_info = mock_DatasetInfo()
-
-    # Important to keep the `repr` unmodified for appearance inside iterables of DatasetInfo objects
-    expected_repr = (
-        "DatasetInfo(object_id='481a0860-3a0c-40ec-b931-df4a3e9b101f', "
-        "location_in_file='acquisition/TestElectricalSeries/data', dataset_name='data', dtype=dtype('int16'), "
-        "full_shape=(1800000, 384))"
-    )
-    assert repr(dataset_info) == expected_repr
-
-
-def test_dataset_info_hashability():
-    dataset_info = mock_DatasetInfo()
-
-    test_dict = {dataset_info: True}  # Technically this alone would raise an error if it didn't work...
-    assert test_dict[dataset_info] is True  # ... but asserting this for good measure.
diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_io_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_io_configuration_model.py
index b5fffa134..060811ebf 100644
--- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_io_configuration_model.py
+++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_io_configuration_model.py
@@ -1,14 +1,23 @@
 """Unit tests for the DatasetConfiguration Pydantic model."""
 
+import numpy as np
 import pytest
 
 from neuroconv.tools.nwb_helpers import DatasetIOConfiguration
-from neuroconv.tools.testing import mock_DatasetInfo
 
 
 def test_get_data_io_kwargs_abstract_error():
     with pytest.raises(TypeError) as error_info:
-        DatasetIOConfiguration(dataset_info=mock_DatasetInfo(), chunk_shape=(78_125, 64), buffer_shape=(1_250_000, 384))
+        DatasetIOConfiguration(
+            object_id="481a0860-3a0c-40ec-b931-df4a3e9b101f",
+            location_in_file="acquisition/TestElectricalSeries/data",
+            dataset_name="data",
+            full_shape=(60 * 30_000, 384),  # ~1 minute of v1 NeuroPixels probe
+            dtype=np.dtype("int16"),
+            chunk_shape=(78_125, 64),
+            buffer_shape=(1_250_000, 384),
+            compression_method="gzip",
+        )
 
     assert "Can't instantiate abstract class DatasetIOConfiguration" in str(error_info.value)
 
@@ -18,9 +27,14 @@ def get_data_io_kwargs(self):
             super().get_data_io_kwargs()
 
     dataset_io_configuration = TestDatasetIOConfiguration(
-        dataset_info=mock_DatasetInfo(),
+        object_id="481a0860-3a0c-40ec-b931-df4a3e9b101f",
+        location_in_file="acquisition/TestElectricalSeries/data",
+        dataset_name="data",
+        full_shape=(60 * 30_000, 384),  # ~1 minute of v1 NeuroPixels probe
+        dtype=np.dtype("int16"),
         chunk_shape=(78_125, 64),
         buffer_shape=(1_250_000, 384),
+        compression_method="gzip",
     )
 
     with pytest.raises(NotImplementedError):
diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_io_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_io_configuration_model.py
index 63579a72b..fd3434493 100644
--- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_io_configuration_model.py
+++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_io_configuration_model.py
@@ -90,9 +90,9 @@ def test_hdf5_dataset_configuration_repr():
 
     # Important to keep the `repr` unmodified for appearance inside iterables of DatasetInfo objects
     expected_repr = (
-        "HDF5DatasetIOConfiguration(dataset_info=DatasetInfo(object_id='481a0860-3a0c-40ec-b931-df4a3e9b101f', "
+        "HDF5DatasetIOConfiguration(object_id='481a0860-3a0c-40ec-b931-df4a3e9b101f', "
         "location_in_file='acquisition/TestElectricalSeries/data', dataset_name='data', dtype=dtype('int16'), "
-        "full_shape=(1800000, 384)), chunk_shape=(78125, 64), buffer_shape=(1250000, 384), compression_method='gzip', "
+        "full_shape=(1800000, 384), chunk_shape=(78125, 64), buffer_shape=(1250000, 384), compression_method='gzip', "
         "compression_options=None)"
     )
     assert repr(hdf5_dataset_configuration) == expected_repr
diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_dataset_io_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_dataset_io_configuration_model.py
index ce7ddb0c7..2e5735c44 100644
--- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_dataset_io_configuration_model.py
+++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_dataset_io_configuration_model.py
@@ -6,11 +6,8 @@
 import pytest
 from numcodecs import GZip
 
-from neuroconv.tools.nwb_helpers import (
-    AVAILABLE_ZARR_COMPRESSION_METHODS,
-    ZarrDatasetIOConfiguration,
-)
-from neuroconv.tools.testing import mock_DatasetInfo, mock_ZarrDatasetIOConfiguration
+from neuroconv.tools.nwb_helpers import AVAILABLE_ZARR_COMPRESSION_METHODS
+from neuroconv.tools.testing import mock_ZarrDatasetIOConfiguration
 
 
 def test_zarr_dataset_io_configuration_print():
@@ -155,9 +152,9 @@ def test_zarr_dataset_configuration_repr():
 
     # Important to keep the `repr` unmodified for appearance inside iterables of DatasetInfo objects
     expected_repr = (
-        "ZarrDatasetIOConfiguration(dataset_info=DatasetInfo(object_id='481a0860-3a0c-40ec-b931-df4a3e9b101f', "
+        "ZarrDatasetIOConfiguration(object_id='481a0860-3a0c-40ec-b931-df4a3e9b101f', "
         "location_in_file='acquisition/TestElectricalSeries/data', dataset_name='data', dtype=dtype('int16'), "
-        "full_shape=(1800000, 384)), chunk_shape=(78125, 64), buffer_shape=(1250000, 384), compression_method='gzip', "
+        "full_shape=(1800000, 384), chunk_shape=(78125, 64), buffer_shape=(1250000, 384), compression_method='gzip', "
         "compression_options=None, filter_methods=None, filter_options=None)"
     )
     assert repr(zarr_dataset_configuration) == expected_repr
@@ -165,8 +162,7 @@ def test_zarr_dataset_configuration_repr():
 
 def test_validator_filter_options_has_methods():
     with pytest.raises(ValueError) as error_info:
-        ZarrDatasetIOConfiguration(
-            dataset_info=mock_DatasetInfo(),
+        mock_ZarrDatasetIOConfiguration(
             chunk_shape=(78_125, 64),
             buffer_shape=(1_250_000, 384),
             filter_methods=None,
@@ -175,15 +171,14 @@ def test_validator_filter_options_has_methods():
 
     expected_error = (
         "`filter_methods` is `None` but `filter_options` is not `None` "
-        "(received `filter_options=[{'clevel': 5}]`)! (type=value_error)"
+        "(received `filter_options=[{'clevel': 5}]`)! [type=value_error, "
     )
     assert expected_error in str(error_info.value)
 
 
 def test_validator_filter_methods_length_match_options():
     with pytest.raises(ValueError) as error_info:
-        ZarrDatasetIOConfiguration(
-            dataset_info=mock_DatasetInfo(),
+        mock_ZarrDatasetIOConfiguration(
             chunk_shape=(78_125, 64),
             buffer_shape=(1_250_000, 384),
             filter_methods=["blosc", "delta"],
@@ -192,7 +187,7 @@
 
     expected_error = (
         "Length mismatch between `filter_methods` (2 methods specified) and `filter_options` (1 options found)! "
-        "`filter_methods` and `filter_options` should be the same length. (type=value_error)"
+        "`filter_methods` and `filter_options` should be the same length. [type=value_error, "
     )
     assert expected_error in str(error_info.value)