From c4bca8ac37f56b68f228be37b274dc1dbc1fcbee Mon Sep 17 00:00:00 2001
From: Cody Baker
Date: Sun, 17 Sep 2023 21:51:31 -0400
Subject: [PATCH 01/27] port over tool function for defaults

---
 .../nwb_helpers/_dataset_configuration.py      | 213 +++++++++++
 .../test_get_default_backend_configuration.py  | 205 ++++++++++
 ...test_get_default_dataset_configurations.py  | 349 ++++++++++++++++++
 ...t_dataset_configurations_appended_files.py  | 146 ++++++++
 .../test_dataset_configuration_model.py        |   0
 .../test_dataset_info_model.py                 |   0
 .../test_hdf5_backend_configuration_model.py   |   0
 .../test_hdf5_dataset_configuration_model.py   |   0
 .../test_zarr_backend_configuration_model.py   |   0
 .../test_zarr_dataset_configuration_model.py   |   0
 10 files changed, 913 insertions(+)
 create mode 100644 src/neuroconv/tools/nwb_helpers/_dataset_configuration.py
 create mode 100644 tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_backend_configuration.py
 create mode 100644 tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py
 create mode 100644 tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations_appended_files.py
 rename tests/test_minimal/test_tools/test_backend_and_dataset_configuration/{ => test_models}/test_dataset_configuration_model.py (100%)
 rename tests/test_minimal/test_tools/test_backend_and_dataset_configuration/{ => test_models}/test_dataset_info_model.py (100%)
 rename tests/test_minimal/test_tools/test_backend_and_dataset_configuration/{ => test_models}/test_hdf5_backend_configuration_model.py (100%)
 rename tests/test_minimal/test_tools/test_backend_and_dataset_configuration/{ => test_models}/test_hdf5_dataset_configuration_model.py (100%)
 rename tests/test_minimal/test_tools/test_backend_and_dataset_configuration/{ => test_models}/test_zarr_backend_configuration_model.py (100%)
 rename tests/test_minimal/test_tools/test_backend_and_dataset_configuration/{ => test_models}/test_zarr_dataset_configuration_model.py (100%)

diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py
new file mode 100644
index 000000000..011e16a99
--- /dev/null
+++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py
@@ -0,0 +1,213 @@
+"""Collection of helper functions related to configuration of datasets dependent on backend."""
+from typing import Iterable, Literal, Union
+
+import h5py
+import numpy as np
+import zarr
+from hdmf import Container
+from hdmf.data_utils import DataChunkIterator, DataIO, GenericDataChunkIterator
+from hdmf.utils import get_data_shape
+from hdmf_zarr import NWBZarrIO
+from pynwb import NWBHDF5IO, NWBFile, TimeSeries
+from pynwb.base import DynamicTable
+
+from ._dataset_and_backend_models import (
+    BACKEND_TO_CONFIGURATION,
+    BACKEND_TO_DATASET_CONFIGURATION,
+    DatasetConfiguration,
+    DatasetInfo,
+    HDF5BackendConfiguration,
+    HDF5DatasetConfiguration,
+    ZarrBackendConfiguration,
+    ZarrDatasetConfiguration,
+)
+from ..hdmf import SliceableDataChunkIterator
+
+
+def _get_mode(io: Union[NWBHDF5IO, NWBZarrIO]) -> str:
+    """NWBHDF5IO and NWBZarrIO have different ways of storing the mode they used on a path."""
+    if isinstance(io, NWBHDF5IO):
+        return io.mode
+    elif isinstance(io, NWBZarrIO):
+        return io._ZarrIO__mode
+
+
+def _is_value_already_written_to_file(
+    candidate_dataset: Union[h5py.Dataset, zarr.Array],
+    backend: Literal["hdf5", "zarr"],
+    existing_file: Union[h5py.File, zarr.Group, None],
+) -> bool:
+    """
+    Determine if the neurodata object is already written to the file on disk.
+
+    This object should then be skipped by the `get_default_dataset_configurations` function when working in append mode.
+    """
+    return (
+        isinstance(candidate_dataset, h5py.Dataset)  # If the source data is an HDF5 Dataset
+        and backend == "hdf5"  # If working in append mode
+        and candidate_dataset.file == existing_file  # If the source HDF5 Dataset is the appending NWBFile
+    ) or (
+        isinstance(candidate_dataset, zarr.Array)  # If the source data is a Zarr Array
+        and backend == "zarr"  # If working in append mode
+        and candidate_dataset.store == existing_file  # If the source Zarr 'file' is the appending NWBFile
+    )
+
+
+def _parse_location_in_memory_nwbfile(current_location: str, neurodata_object: Container) -> str:
+    parent = neurodata_object.parent
+    if isinstance(parent, NWBFile):
+        # Items in defined top-level places like acquisition, intervals, etc. do not act as 'containers'
+        # in the .parent sense; ask if object is in their in-memory dictionaries instead
+        for outer_field_name, outer_field_value in parent.fields.items():
+            if isinstance(outer_field_value, dict) and neurodata_object.name in outer_field_value:
+                return outer_field_name + "/" + neurodata_object.name + "/" + current_location
+        return neurodata_object.name + "/" + current_location
+    return _parse_location_in_memory_nwbfile(
+        current_location=neurodata_object.name + "/" + current_location, neurodata_object=parent
+    )
+
+
+def _get_dataset_metadata(
+    neurodata_object: Union[TimeSeries, DynamicTable], field_name: str, backend: Literal["hdf5", "zarr"]
+) -> Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration]:
+    """Fill in the Dataset model with as many values as can be automatically detected or inferred."""
+    DatasetConfigurationClass = BACKEND_TO_DATASET_CONFIGURATION[backend]
+
+    candidate_dataset = getattr(neurodata_object, field_name)
+    # For now, skip over datasets already wrapped in DataIO
+    # Could maybe eventually support modifying chunks in place
+    # But setting buffer shape only possible if iterator was wrapped first
+    if not isinstance(candidate_dataset, DataIO):
+        # DataChunkIterator has best generic dtype inference, though logic is hard to peel out of it
+        # And it can fail in rare cases but not essential to our default configuration
+        try:
+            dtype = str(DataChunkIterator(candidate_dataset).dtype)  # string cast to be JSON friendly
+        except Exception as exception:
+            if str(exception) != "Data type could not be determined. Please specify dtype in DataChunkIterator init.":
+                raise exception
+            else:
+                dtype = "unknown"
+
+        maxshape = get_data_shape(data=candidate_dataset)
+
+        if isinstance(candidate_dataset, GenericDataChunkIterator):
+            chunk_shape = candidate_dataset.chunk_shape
+            buffer_shape = candidate_dataset.buffer_shape
+        elif dtype != "unknown":
+            # TODO: eventually replace this with staticmethods on hdmf.data_utils.GenericDataChunkIterator
+            chunk_shape = SliceableDataChunkIterator.estimate_default_chunk_shape(
+                chunk_mb=10.0, maxshape=maxshape, dtype=np.dtype(dtype)
+            )
+            buffer_shape = SliceableDataChunkIterator.estimate_default_buffer_shape(
+                buffer_gb=0.5, chunk_shape=chunk_shape, maxshape=maxshape, dtype=np.dtype(dtype)
+            )
+        else:
+            pass  # TODO: think on this; perhaps zarr's standalone estimator?
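+
+    # Descriptive note (added for clarity, not part of the original patch): the DatasetInfo built next
+    # records the dataset's identity and static properties (location, maxshape, dtype), while the
+    # chunk/buffer shapes estimated above become the adjustable write defaults for this dataset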
+ + dataset_info = DatasetInfo( + object_id=neurodata_object.object_id, + object_name=neurodata_object.name, + location=_parse_location_in_memory_nwbfile(current_location=field_name, neurodata_object=neurodata_object), + field=field_name, + maxshape=maxshape, + dtype=dtype, + ) + dataset_configuration = DatasetConfigurationClass( + dataset_info=dataset_info, chunk_shape=chunk_shape, buffer_shape=buffer_shape + ) + return dataset_configuration + + +def get_default_dataset_configurations( + nwbfile: NWBFile, + backend: Union[None, Literal["hdf5", "zarr"]] = None, # None for auto-detect from append mode, otherwise required +) -> Iterable[DatasetConfiguration]: + """ + Method for automatically detecting all objects in the file that could be wrapped in a DataIO. + + Parameters + ---------- + nwbfile : pynwb.NWBFile + An in-memory NWBFile object, either generated from the base class or read from an existing file of any backend. + backend : "hdf5" or "zarr" + Which backend format type you would like to use in configuring each datasets compression methods and options. + + Yields + ------ + DatasetConfiguration + A summary of each detected object that can be wrapped in a DataIO. + """ + if backend is None and nwbfile.read_io is None: + raise ValueError( + "Keyword argument `backend` (either 'hdf5' or 'zarr') must be specified if the `nwbfile` was not " + "read from an existing file!" + ) + if backend is None and nwbfile.read_io is not None and nwbfile.read_io.mode not in ("r+", "a"): + raise ValueError( + "Keyword argument `backend` (either 'hdf5' or 'zarr') must be specified if the `nwbfile` is being appended." + ) + + detected_backend = None + existing_file = None + if isinstance(nwbfile.read_io, NWBHDF5IO) and _get_mode(io=nwbfile.read_io) in ("r+", "a"): + detected_backend = "hdf5" + existing_file = nwbfile.read_io._file + elif isinstance(nwbfile.read_io, NWBZarrIO) and _get_mode(io=nwbfile.read_io) in ("r+", "a"): + detected_backend = "zarr" + existing_file = nwbfile.read_io.file.store + backend = backend or detected_backend + + if detected_backend is not None and detected_backend != backend: + raise ValueError( + f"Detected backend '{detected_backend}' for appending file, but specified `backend` " + f"({backend}) does not match! Set `backend=None` or remove the keyword argument to allow it to auto-detect." 
+ ) + + for neurodata_object in nwbfile.objects.values(): + if isinstance(neurodata_object, TimeSeries): + time_series = neurodata_object # for readability + + for field_name in ("data", "timestamps"): + if field_name not in time_series.fields: # timestamps is optional + continue + + candidate_dataset = getattr(time_series, field_name) + if _is_value_already_written_to_file( + candidate_dataset=candidate_dataset, backend=backend, existing_file=existing_file + ): + continue # skip + + # Edge case of in-memory ImageSeries with external mode; data is in fields and is empty array + if isinstance(candidate_dataset, np.ndarray) and not np.any(candidate_dataset): + continue # skip + + yield _get_dataset_metadata(neurodata_object=time_series, field_name=field_name, backend=backend) + elif isinstance(neurodata_object, DynamicTable): + dynamic_table = neurodata_object # for readability + + for column_name in dynamic_table.colnames: + candidate_dataset = dynamic_table[column_name].data # VectorData object + if _is_value_already_written_to_file( + candidate_dataset=candidate_dataset, backend=backend, existing_file=existing_file + ): + continue # skip + + yield _get_dataset_metadata( + neurodata_object=dynamic_table[column_name], field_name="data", backend=backend + ) + + +def get_default_backend_configuration( + nwbfile: NWBFile, backend: Literal["hdf5", "zarr"] +) -> Union[HDF5BackendConfiguration, ZarrBackendConfiguration]: + """Fill a default backend configuration to serve as a starting point for further customization.""" + BackendConfigurationClass = BACKEND_TO_CONFIGURATION[backend] + + default_dataset_configurations = get_default_dataset_configurations(nwbfile=nwbfile, backend=backend) + dataset_configurations = { + default_dataset_configuration.dataset_info.location: default_dataset_configuration + for default_dataset_configuration in default_dataset_configurations + } + + backend_configuration = BackendConfigurationClass(dataset_configurations=dataset_configurations) + return backend_configuration \ No newline at end of file diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_backend_configuration.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_backend_configuration.py new file mode 100644 index 000000000..45878e472 --- /dev/null +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_backend_configuration.py @@ -0,0 +1,205 @@ +"""Integration tests for `get_default_backend_configuration`.""" +from io import StringIO +from pathlib import Path +from unittest.mock import patch + +import numpy as np +import pytest +from hdmf_zarr import NWBZarrIO +from pynwb import NWBHDF5IO, NWBFile +from pynwb.testing.mock.base import mock_TimeSeries +from pynwb.testing.mock.file import mock_NWBFile + +from neuroconv.tools.nwb_helpers import ( + HDF5BackendConfiguration, + ZarrBackendConfiguration, + get_default_backend_configuration, + get_module, +) + + +def generate_complex_nwbfile() -> NWBFile: + nwbfile = mock_NWBFile() + + raw_array = np.array([[1, 2, 3], [4, 5, 6]]) + raw_time_series = mock_TimeSeries(name="RawTimeSeries", data=raw_array) + nwbfile.add_acquisition(raw_time_series) + + number_of_trials = 10 + for start_time, stop_time in zip( + np.linspace(start=0.0, stop=10.0, num=number_of_trials), np.linspace(start=1.0, stop=11.0, num=number_of_trials) + ): + nwbfile.add_trial(start_time=start_time, stop_time=stop_time) + + ecephys_module = 
get_module(nwbfile=nwbfile, name="ecephys") + processed_array = np.array([[7.0, 8.0], [9.0, 10.0], [11.0, 12.0], [13.0, 14.0]]) + processed_time_series = mock_TimeSeries(name="ProcessedTimeSeries", data=processed_array) + ecephys_module.add(processed_time_series) + + return nwbfile + + +@pytest.fixture(scope="session") +def hdf5_nwbfile_path(tmpdir_factory): + nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_backend_configuration_hdf5_nwbfile.nwb.h5") + if not Path(nwbfile_path).exists(): + nwbfile = generate_complex_nwbfile() + with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io: + io.write(nwbfile) + return str(nwbfile_path) + + +@pytest.fixture(scope="session") +def zarr_nwbfile_path(tmpdir_factory): + nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_backend_configuration_hdf5_nwbfile.nwb.zarr") + if not Path(nwbfile_path).exists(): + nwbfile = generate_complex_nwbfile() + with NWBZarrIO(path=str(nwbfile_path), mode="w") as io: + io.write(nwbfile) + return str(nwbfile_path) + + +def test_complex_hdf5(hdf5_nwbfile_path): + with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io: + nwbfile = io.read() + + raw_array = np.array([[11, 21, 31], [41, 51, 61]]) + raw_time_series = mock_TimeSeries(name="NewRawTimeSeries", data=raw_array) + nwbfile.add_acquisition(raw_time_series) + + number_of_epochs = 5 + for start_time, stop_time in zip( + np.linspace(start=0.0, stop=10.0, num=number_of_epochs), + np.linspace(start=1.0, stop=11.0, num=number_of_epochs), + ): + nwbfile.add_epoch(start_time=start_time, stop_time=stop_time) + + ecephys_module = get_module(nwbfile=nwbfile, name="ecephys") + processed_array = np.array([[7.1, 8.1], [9.1, 10.1], [11.1, 12.1], [13.1, 14.1]]) + processed_time_series = mock_TimeSeries(name="NewProcessedTimeSeries", data=processed_array) + ecephys_module.add(processed_time_series) + + backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend="hdf5") + + assert isinstance(backend_configuration, HDF5BackendConfiguration) + + dataset_configurations = backend_configuration.dataset_configurations + assert len(dataset_configurations) == 4 + assert "acquisition/NewRawTimeSeries/data" in dataset_configurations + assert "epochs/start_time/data" in dataset_configurations + assert "epochs/stop_time/data" in dataset_configurations + assert "processing/ecephys/NewProcessedTimeSeries/data" in dataset_configurations + + # Best summary test of expected output is the printout + with patch("sys.stdout", new=StringIO()) as stdout: + print(backend_configuration) + + expected_print = """Configurable datasets identified using the hdf5 backend +------------------------------------------------------- +epochs/start_time/data + maxshape : (5,) + dtype : float64 + chunk shape : (5,) + buffer shape : (5,) + compression method : gzip + compression options : None +epochs/stop_time/data + maxshape : (5,) + dtype : float64 + chunk shape : (5,) + buffer shape : (5,) + compression method : gzip + compression options : None +acquisition/NewRawTimeSeries/data + maxshape : (2, 3) + dtype : int32 + chunk shape : (2, 3) + buffer shape : (2, 3) + compression method : gzip + compression options : None +processing/ecephys/NewProcessedTimeSeries/data + maxshape : (4, 2) + dtype : float64 + chunk shape : (4, 2) + buffer shape : (4, 2) + compression method : gzip + compression options : None +""" + assert stdout.getvalue() == expected_print + + +def test_complex_zarr(zarr_nwbfile_path): + with NWBZarrIO(path=zarr_nwbfile_path, mode="a") as io: + nwbfile = 
io.read() + + raw_array = np.array([[11, 21, 31], [41, 51, 61]]) + raw_time_series = mock_TimeSeries(name="NewRawTimeSeries", data=raw_array) + nwbfile.add_acquisition(raw_time_series) + + number_of_epochs = 5 + for start_time, stop_time in zip( + np.linspace(start=0.0, stop=10.0, num=number_of_epochs), + np.linspace(start=1.0, stop=11.0, num=number_of_epochs), + ): + nwbfile.add_epoch(start_time=start_time, stop_time=stop_time) + + ecephys_module = get_module(nwbfile=nwbfile, name="ecephys") + processed_array = np.array([[7.1, 8.1], [9.1, 10.1], [11.1, 12.1], [13.1, 14.1]]) + processed_time_series = mock_TimeSeries(name="NewProcessedTimeSeries", data=processed_array) + ecephys_module.add(processed_time_series) + + backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend="zarr") + + assert isinstance(backend_configuration, ZarrBackendConfiguration) + + dataset_configurations = backend_configuration.dataset_configurations + assert len(dataset_configurations) == 4 + assert "acquisition/NewRawTimeSeries/data" in dataset_configurations + assert "epochs/start_time/data" in dataset_configurations + assert "epochs/stop_time/data" in dataset_configurations + assert "processing/ecephys/NewProcessedTimeSeries/data" in dataset_configurations + + # Best summary test of expected output is the printout + with patch("sys.stdout", new=StringIO()) as stdout: + print(backend_configuration) + + expected_print = """Configurable datasets identified using the zarr backend +------------------------------------------------------- +epochs/start_time/data + maxshape : (5,) + dtype : float64 + chunk shape : (5,) + buffer shape : (5,) + compression method : gzip + compression options : None + filter methods : None + filter options : None +epochs/stop_time/data + maxshape : (5,) + dtype : float64 + chunk shape : (5,) + buffer shape : (5,) + compression method : gzip + compression options : None + filter methods : None + filter options : None +acquisition/NewRawTimeSeries/data + maxshape : (2, 3) + dtype : int32 + chunk shape : (2, 3) + buffer shape : (2, 3) + compression method : gzip + compression options : None + filter methods : None + filter options : None +processing/ecephys/NewProcessedTimeSeries/data + maxshape : (4, 2) + dtype : float64 + chunk shape : (4, 2) + buffer shape : (4, 2) + compression method : gzip + compression options : None + filter methods : None + filter options : None +""" + assert stdout.getvalue() == expected_print diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py new file mode 100644 index 000000000..f08c85167 --- /dev/null +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py @@ -0,0 +1,349 @@ +"""Unit tests for `get_default_dataset_configurations`.""" +import numpy as np +from hdmf.common import VectorData +from hdmf.data_utils import DataChunkIterator +from pynwb.base import DynamicTable +from pynwb.image import ImageSeries +from pynwb.testing.mock.base import mock_TimeSeries +from pynwb.testing.mock.file import mock_NWBFile + +from neuroconv.tools.hdmf import SliceableDataChunkIterator +from neuroconv.tools.nwb_helpers import ( + HDF5DatasetConfiguration, + ZarrDatasetConfiguration, + get_default_dataset_configurations, +) + + +def test_unwrapped_time_series_hdf5(): + array 
= np.array([[1, 2, 3], [4, 5, 6]]) + + nwbfile = mock_NWBFile() + time_series = mock_TimeSeries(name="TestTimeSeries", data=array) + nwbfile.add_acquisition(time_series) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == time_series.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + +def test_unwrapped_time_series_zarr(): + array = np.array([[1, 2, 3], [4, 5, 6]]) + + nwbfile = mock_NWBFile() + time_series = mock_TimeSeries(name="TestTimeSeries", data=array) + nwbfile.add_acquisition(time_series) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, ZarrDatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == time_series.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +def test_generic_iterator_wrapped_time_series_hdf5(): + array = np.array([[1, 2, 3], [4, 5, 6]]) + + nwbfile = mock_NWBFile() + time_series = mock_TimeSeries(name="TestTimeSeries", data=SliceableDataChunkIterator(data=array)) + nwbfile.add_acquisition(time_series) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == time_series.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + +def test_classic_iterator_wrapped_simple_time_series_zarr(): + array = np.array([[1, 2, 3], [4, 5, 6]]) + + nwbfile = mock_NWBFile() + time_series = mock_TimeSeries(name="TestTimeSeries", data=DataChunkIterator(data=array)) + nwbfile.add_acquisition(time_series) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, 
backend="zarr")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, ZarrDatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == time_series.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +def test_classic_iterator_wrapped_time_series_hdf5(): + array = np.array([[1, 2, 3], [4, 5, 6]]) + + nwbfile = mock_NWBFile() + time_series = mock_TimeSeries(name="TestTimeSeries", data=DataChunkIterator(data=array)) + nwbfile.add_acquisition(time_series) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == time_series.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + +def test_generic_iterator_wrapped_simple_time_series_zarr(): + array = np.array([[1, 2, 3], [4, 5, 6]]) + + nwbfile = mock_NWBFile() + time_series = mock_TimeSeries(name="TestTimeSeries", data=SliceableDataChunkIterator(data=array)) + nwbfile.add_acquisition(time_series) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, ZarrDatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == time_series.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +def test_external_image_series_hdf5(): + nwbfile = mock_NWBFile() + image_series = ImageSeries(name="TestImageSeries", external_file=[""], rate=1.0) + nwbfile.add_acquisition(image_series) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 0 + + +def test_external_image_series_zarr(): + nwbfile = mock_NWBFile() + image_series = 
ImageSeries(name="TestImageSeries", external_file=[""], rate=1.0) + nwbfile.add_acquisition(image_series) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + + assert len(dataset_configurations) == 0 + + +def test_unwrapped_dynamic_table_hdf5(): + array = np.array([0.1, 0.2, 0.3]) + + nwbfile = mock_NWBFile() + column = VectorData(name="TestColumn", description="", data=array.squeeze()) + dynamic_table = DynamicTable(name="TestDynamicTable", description="", columns=[column]) + nwbfile.add_acquisition(dynamic_table) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == column.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + +def test_unwrapped_dynamic_table_zarr(): + array = np.array([0.1, 0.2, 0.3]) + + nwbfile = mock_NWBFile() + column = VectorData(name="TestColumn", description="", data=array.squeeze()) + dynamic_table = DynamicTable(name="TestDynamicTable", description="", columns=[column]) + nwbfile.add_acquisition(dynamic_table) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, ZarrDatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == column.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +def test_generic_iterator_wrapped_dynamic_table_hdf5(): + array = np.array([0.1, 0.2, 0.3]) + + nwbfile = mock_NWBFile() + column = VectorData(name="TestColumn", description="", data=SliceableDataChunkIterator(data=array.squeeze())) + dynamic_table = DynamicTable( + name="TestDynamicTable", + description="", + id=list(range(array.shape[0])), # Need to include ID since the data of the column is not wrapped in an IO + columns=[column], + ) + nwbfile.add_acquisition(dynamic_table) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == column.object_id + assert dataset_configuration.dataset_info.location == 
"acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.dataset_info.maxshape == (array.shape[0],) + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + +def test_generic_iterator_wrapped_dynamic_table_zarr(): + array = np.array([0.1, 0.2, 0.3]) + + nwbfile = mock_NWBFile() + column = VectorData(name="TestColumn", description="", data=SliceableDataChunkIterator(data=array.squeeze())) + dynamic_table = DynamicTable( + name="TestDynamicTable", + description="", + id=list(range(array.shape[0])), # Need to include ID since the data of the column is not wrapped in an IO + columns=[column], + ) + nwbfile.add_acquisition(dynamic_table) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, ZarrDatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == column.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.dataset_info.maxshape == (array.shape[0],) + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +def test_classic_iterator_wrapped_dynamic_table_hdf5(): + array = np.array([0.1, 0.2, 0.3]) + + nwbfile = mock_NWBFile() + column = VectorData(name="TestColumn", description="", data=DataChunkIterator(data=array.squeeze())) + dynamic_table = DynamicTable( + name="TestDynamicTable", + description="", + id=list(range(array.shape[0])), # Need to include ID since the data of the column is not wrapped in an IO + columns=[column], + ) + nwbfile.add_acquisition(dynamic_table) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == column.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.dataset_info.maxshape == (array.shape[0],) + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + +def test_classic_iterator_wrapped_dynamic_table_zarr(): + array = np.array([0.1, 0.2, 0.3]) + + nwbfile = mock_NWBFile() + column = VectorData(name="TestColumn", description="", data=DataChunkIterator(data=array.squeeze())) + dynamic_table = DynamicTable( + name="TestDynamicTable", + description="", + id=list(range(array.shape[0])), # Need to include ID since the data of the column is not wrapped in an IO 
+ columns=[column], + ) + nwbfile.add_acquisition(dynamic_table) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, ZarrDatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == column.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.dataset_info.maxshape == (array.shape[0],) + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None \ No newline at end of file diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations_appended_files.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations_appended_files.py new file mode 100644 index 000000000..dcda5d8a0 --- /dev/null +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations_appended_files.py @@ -0,0 +1,146 @@ +""" +Unit tests for `get_default_dataset_configurations` operating on already written files open in append mode. +Mostly testing that the right objects are skipped from identification as candidates for configuration. +""" +from pathlib import Path + +import numpy as np +import pytest +from hdmf.common import VectorData +from hdmf_zarr import NWBZarrIO +from pynwb import NWBHDF5IO, NWBFile +from pynwb.base import DynamicTable +from pynwb.testing.mock.base import mock_TimeSeries +from pynwb.testing.mock.file import mock_NWBFile + +from neuroconv.tools.nwb_helpers import ( + HDF5DatasetConfiguration, + ZarrDatasetConfiguration, + get_default_dataset_configurations, +) + + +def generate_nwbfile_with_existing_time_series() -> NWBFile: + nwbfile = mock_NWBFile() + array = np.array([[1, 2, 3], [4, 5, 6]]) + time_series = mock_TimeSeries(name="ExistingTimeSeries", data=array) + nwbfile.add_acquisition(time_series) + return nwbfile + + +@pytest.fixture(scope="session") +def hdf5_nwbfile_path(tmpdir_factory): + nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_dataset_configurations_hdf5_nwbfile_.nwb.h5") + if not Path(nwbfile_path).exists(): + nwbfile = generate_nwbfile_with_existing_time_series() + with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io: + io.write(nwbfile) + return str(nwbfile_path) + + +@pytest.fixture(scope="session") +def zarr_nwbfile_path(tmpdir_factory): + nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_dataset_configurations_zarr_nwbfile.nwb.zarr") + if not Path(nwbfile_path).exists(): + nwbfile = generate_nwbfile_with_existing_time_series() + with NWBZarrIO(path=str(nwbfile_path), mode="w") as io: + io.write(nwbfile) + return str(nwbfile_path) + + +def test_unwrapped_time_series_hdf5(hdf5_nwbfile_path): + array = np.array([[1, 2, 3], [4, 5, 6]]) + + with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io: + nwbfile = io.read() + new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) + 
nwbfile.add_acquisition(new_time_series) + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == new_time_series.object_id + assert dataset_configuration.dataset_info.location == "acquisition/NewTimeSeries/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + +def test_unwrapped_time_series_zarr(zarr_nwbfile_path): + array = np.array([[1, 2, 3], [4, 5, 6]]) + + with NWBZarrIO(path=zarr_nwbfile_path, mode="a") as io: + nwbfile = io.read() + new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) + nwbfile.add_acquisition(new_time_series) + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, ZarrDatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == new_time_series.object_id + assert dataset_configuration.dataset_info.location == "acquisition/NewTimeSeries/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +def test_unwrapped_dynamic_table_hdf5(hdf5_nwbfile_path): + array = np.array([0.1, 0.2, 0.3]) + + with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io: + nwbfile = io.read() + column = VectorData(name="TestColumn", description="", data=array.squeeze()) + dynamic_table = DynamicTable(name="TestDynamicTable", description="", columns=[column]) + nwbfile.add_acquisition(dynamic_table) + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == column.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + +def test_unwrapped_dynamic_table_zarr(zarr_nwbfile_path): + array = np.array([0.1, 0.2, 0.3]) + + with NWBZarrIO(path=zarr_nwbfile_path, mode="a") as io: + nwbfile = io.read() + column = VectorData(name="TestColumn", description="", data=array.squeeze()) + dynamic_table = 
DynamicTable(name="TestDynamicTable", description="", columns=[column]) + nwbfile.add_acquisition(dynamic_table) + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, ZarrDatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == column.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None \ No newline at end of file diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_dataset_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_configuration_model.py similarity index 100% rename from tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_dataset_configuration_model.py rename to tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_configuration_model.py diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_dataset_info_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_info_model.py similarity index 100% rename from tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_dataset_info_model.py rename to tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_info_model.py diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_backend_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_backend_configuration_model.py similarity index 100% rename from tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_backend_configuration_model.py rename to tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_backend_configuration_model.py diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_dataset_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_configuration_model.py similarity index 100% rename from tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_dataset_configuration_model.py rename to tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_configuration_model.py diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_backend_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_backend_configuration_model.py similarity index 100% rename from tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_backend_configuration_model.py rename to 
tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_backend_configuration_model.py diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_dataset_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_dataset_configuration_model.py similarity index 100% rename from tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_dataset_configuration_model.py rename to tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_dataset_configuration_model.py From 38a1fa3722c1ba23bd5f0f906dc1262a8a46055f Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Sun, 17 Sep 2023 21:53:37 -0400 Subject: [PATCH 02/27] modify iterator as well --- src/neuroconv/tools/hdmf.py | 80 ++++++++++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 19 deletions(-) diff --git a/src/neuroconv/tools/hdmf.py b/src/neuroconv/tools/hdmf.py index 6ad712d51..b82b12e1f 100644 --- a/src/neuroconv/tools/hdmf.py +++ b/src/neuroconv/tools/hdmf.py @@ -1,35 +1,77 @@ """Collection of modifications of HDMF functions that are to be tested/used on this repo until propagation upstream.""" +import math from typing import Tuple import numpy as np from hdmf.data_utils import GenericDataChunkIterator as HDMFGenericDataChunkIterator +from pydantic import Field +from typing_extensions import Annotated class GenericDataChunkIterator(HDMFGenericDataChunkIterator): def _get_default_buffer_shape(self, buffer_gb: float = 1.0) -> Tuple[int]: - num_axes = len(self.maxshape) - chunk_bytes = np.prod(self.chunk_shape) * self.dtype.itemsize + return self.estimate_default_buffer_shape( + buffer_gb=buffer_gb, chunk_shape=self.chunk_shape, maxshape=self.maxshape, dtype=self.dtype + ) + + # TODO: move this to the core iterator in HDMF so it can be easily swapped out as well as run on its own + @staticmethod + def estimate_default_chunk_shape( + chunk_mb: Annotated[float, Field(gt=0.0)], + maxshape: Tuple[int, ...], + dtype: np.dtype, + ) -> Tuple[int, ...]: + """ + Select chunk shape with size in MB less than the threshold of chunk_mb. + + Keeps the dimensional ratios of the original data. + """ + assert chunk_mb > 0.0, f"chunk_mb ({chunk_mb}) must be greater than zero!" + # Eventually, Pydantic validation can handle this validation for us + + n_dims = len(maxshape) + itemsize = dtype.itemsize + chunk_bytes = chunk_mb * 1e6 + + min_maxshape = min(maxshape) + v = tuple(math.floor(maxshape_axis / min_maxshape) for maxshape_axis in maxshape) + prod_v = math.prod(v) + while prod_v * itemsize > chunk_bytes and prod_v != 1: + non_unit_min_v = min(x for x in v if x != 1) + v = tuple(math.floor(x / non_unit_min_v) if x != 1 else x for x in v) + prod_v = math.prod(v) + k = math.floor((chunk_bytes / (prod_v * itemsize)) ** (1 / n_dims)) + return tuple([min(k * x, maxshape[dim]) for dim, x in enumerate(v)]) + + # TODO: move this to the core iterator in HDMF so it can be easily swapped out as well as run on its own + @staticmethod + def estimate_default_buffer_shape( + buffer_gb: Annotated[float, Field(gt=0.0)], + chunk_shape: Tuple[int, ...], + maxshape: Tuple[int, ...], + dtype: np.dtype, + ) -> Tuple[int]: + num_axes = len(maxshape) + chunk_bytes = math.prod(chunk_shape) * dtype.itemsize assert buffer_gb > 0, f"buffer_gb ({buffer_gb}) must be greater than zero!" 
assert ( buffer_gb >= chunk_bytes / 1e9 ), f"buffer_gb ({buffer_gb}) must be greater than the chunk size ({chunk_bytes / 1e9})!" - assert all( - np.array(self.chunk_shape) > 0 - ), f"Some dimensions of chunk_shape ({self.chunk_shape}) are less than zero!" + assert all(np.array(chunk_shape) > 0), f"Some dimensions of chunk_shape ({chunk_shape}) are less than zero!" - maxshape = np.array(self.maxshape) + maxshape = np.array(maxshape) # Early termination condition - if np.prod(maxshape) * self.dtype.itemsize / 1e9 < buffer_gb: - return tuple(self.maxshape) + if math.prod(maxshape) * dtype.itemsize / 1e9 < buffer_gb: + return tuple(maxshape) buffer_bytes = chunk_bytes - axis_sizes_bytes = maxshape * self.dtype.itemsize - smallest_chunk_axis, second_smallest_chunk_axis, *_ = np.argsort(self.chunk_shape) + axis_sizes_bytes = maxshape * dtype.itemsize + smallest_chunk_axis, second_smallest_chunk_axis, *_ = np.argsort(chunk_shape) target_buffer_bytes = buffer_gb * 1e9 # If the smallest full axis does not fit within the buffer size, form a square along the two smallest axes - sub_square_buffer_shape = np.array(self.chunk_shape) + sub_square_buffer_shape = np.array(chunk_shape) if min(axis_sizes_bytes) > target_buffer_bytes: k1 = np.floor((target_buffer_bytes / chunk_bytes) ** 0.5) for axis in [smallest_chunk_axis, second_smallest_chunk_axis]: @@ -40,32 +82,32 @@ def _get_default_buffer_shape(self, buffer_gb: float = 1.0) -> Tuple[int]: chunk_to_buffer_ratio = buffer_gb * 1e9 / chunk_bytes chunk_scaling_factor = np.floor(chunk_to_buffer_ratio ** (1 / num_axes)) unpadded_buffer_shape = [ - np.clip(a=int(x), a_min=self.chunk_shape[j], a_max=self.maxshape[j]) - for j, x in enumerate(chunk_scaling_factor * np.array(self.chunk_shape)) + np.clip(a=int(x), a_min=chunk_shape[j], a_max=maxshape[j]) + for j, x in enumerate(chunk_scaling_factor * np.array(chunk_shape)) ] - unpadded_buffer_bytes = np.prod(unpadded_buffer_shape) * self.dtype.itemsize + unpadded_buffer_bytes = math.prod(unpadded_buffer_shape) * dtype.itemsize # Method that starts by filling the smallest axis completely or calculates best partial fill - padded_buffer_shape = np.array(self.chunk_shape) - chunks_per_axis = np.ceil(maxshape / self.chunk_shape) + padded_buffer_shape = np.array(chunk_shape) + chunks_per_axis = np.ceil(maxshape / chunk_shape) small_axis_fill_size = chunk_bytes * min(chunks_per_axis) full_axes_used = np.zeros(shape=num_axes, dtype=bool) if small_axis_fill_size <= target_buffer_bytes: buffer_bytes = small_axis_fill_size - padded_buffer_shape[smallest_chunk_axis] = self.maxshape[smallest_chunk_axis] + padded_buffer_shape[smallest_chunk_axis] = maxshape[smallest_chunk_axis] full_axes_used[smallest_chunk_axis] = True for axis, chunks_on_axis in enumerate(chunks_per_axis): if full_axes_used[axis]: # If the smallest axis, skip since already used continue if chunks_on_axis * buffer_bytes <= target_buffer_bytes: # If multiple axes can be used together buffer_bytes *= chunks_on_axis - padded_buffer_shape[axis] = self.maxshape[axis] + padded_buffer_shape[axis] = maxshape[axis] else: # Found an axis that is too large to use with the rest of the buffer; calculate how much can be used k3 = np.floor(target_buffer_bytes / buffer_bytes) padded_buffer_shape[axis] *= k3 break - padded_buffer_bytes = np.prod(padded_buffer_shape) * self.dtype.itemsize + padded_buffer_bytes = math.prod(padded_buffer_shape) * dtype.itemsize if padded_buffer_bytes >= unpadded_buffer_bytes: return tuple(padded_buffer_shape) From 
a981068e75afdaac344683ed0f5730b59868bc69 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Sun, 17 Sep 2023 22:25:20 -0400 Subject: [PATCH 03/27] factor out backend config stuff to other PR --- .../nwb_helpers/_dataset_configuration.py | 16 -- .../test_get_default_backend_configuration.py | 205 ------------------ 2 files changed, 221 deletions(-) delete mode 100644 tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_backend_configuration.py diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index 011e16a99..0d8c74fde 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -195,19 +195,3 @@ def get_default_dataset_configurations( yield _get_dataset_metadata( neurodata_object=dynamic_table[column_name], field_name="data", backend=backend ) - - -def get_default_backend_configuration( - nwbfile: NWBFile, backend: Literal["hdf5", "zarr"] -) -> Union[HDF5BackendConfiguration, ZarrBackendConfiguration]: - """Fill a default backend configuration to serve as a starting point for further customization.""" - BackendConfigurationClass = BACKEND_TO_CONFIGURATION[backend] - - default_dataset_configurations = get_default_dataset_configurations(nwbfile=nwbfile, backend=backend) - dataset_configurations = { - default_dataset_configuration.dataset_info.location: default_dataset_configuration - for default_dataset_configuration in default_dataset_configurations - } - - backend_configuration = BackendConfigurationClass(dataset_configurations=dataset_configurations) - return backend_configuration \ No newline at end of file diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_backend_configuration.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_backend_configuration.py deleted file mode 100644 index 45878e472..000000000 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_backend_configuration.py +++ /dev/null @@ -1,205 +0,0 @@ -"""Integration tests for `get_default_backend_configuration`.""" -from io import StringIO -from pathlib import Path -from unittest.mock import patch - -import numpy as np -import pytest -from hdmf_zarr import NWBZarrIO -from pynwb import NWBHDF5IO, NWBFile -from pynwb.testing.mock.base import mock_TimeSeries -from pynwb.testing.mock.file import mock_NWBFile - -from neuroconv.tools.nwb_helpers import ( - HDF5BackendConfiguration, - ZarrBackendConfiguration, - get_default_backend_configuration, - get_module, -) - - -def generate_complex_nwbfile() -> NWBFile: - nwbfile = mock_NWBFile() - - raw_array = np.array([[1, 2, 3], [4, 5, 6]]) - raw_time_series = mock_TimeSeries(name="RawTimeSeries", data=raw_array) - nwbfile.add_acquisition(raw_time_series) - - number_of_trials = 10 - for start_time, stop_time in zip( - np.linspace(start=0.0, stop=10.0, num=number_of_trials), np.linspace(start=1.0, stop=11.0, num=number_of_trials) - ): - nwbfile.add_trial(start_time=start_time, stop_time=stop_time) - - ecephys_module = get_module(nwbfile=nwbfile, name="ecephys") - processed_array = np.array([[7.0, 8.0], [9.0, 10.0], [11.0, 12.0], [13.0, 14.0]]) - processed_time_series = mock_TimeSeries(name="ProcessedTimeSeries", data=processed_array) - ecephys_module.add(processed_time_series) - - return nwbfile - - 
-@pytest.fixture(scope="session") -def hdf5_nwbfile_path(tmpdir_factory): - nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_backend_configuration_hdf5_nwbfile.nwb.h5") - if not Path(nwbfile_path).exists(): - nwbfile = generate_complex_nwbfile() - with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io: - io.write(nwbfile) - return str(nwbfile_path) - - -@pytest.fixture(scope="session") -def zarr_nwbfile_path(tmpdir_factory): - nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_backend_configuration_hdf5_nwbfile.nwb.zarr") - if not Path(nwbfile_path).exists(): - nwbfile = generate_complex_nwbfile() - with NWBZarrIO(path=str(nwbfile_path), mode="w") as io: - io.write(nwbfile) - return str(nwbfile_path) - - -def test_complex_hdf5(hdf5_nwbfile_path): - with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io: - nwbfile = io.read() - - raw_array = np.array([[11, 21, 31], [41, 51, 61]]) - raw_time_series = mock_TimeSeries(name="NewRawTimeSeries", data=raw_array) - nwbfile.add_acquisition(raw_time_series) - - number_of_epochs = 5 - for start_time, stop_time in zip( - np.linspace(start=0.0, stop=10.0, num=number_of_epochs), - np.linspace(start=1.0, stop=11.0, num=number_of_epochs), - ): - nwbfile.add_epoch(start_time=start_time, stop_time=stop_time) - - ecephys_module = get_module(nwbfile=nwbfile, name="ecephys") - processed_array = np.array([[7.1, 8.1], [9.1, 10.1], [11.1, 12.1], [13.1, 14.1]]) - processed_time_series = mock_TimeSeries(name="NewProcessedTimeSeries", data=processed_array) - ecephys_module.add(processed_time_series) - - backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend="hdf5") - - assert isinstance(backend_configuration, HDF5BackendConfiguration) - - dataset_configurations = backend_configuration.dataset_configurations - assert len(dataset_configurations) == 4 - assert "acquisition/NewRawTimeSeries/data" in dataset_configurations - assert "epochs/start_time/data" in dataset_configurations - assert "epochs/stop_time/data" in dataset_configurations - assert "processing/ecephys/NewProcessedTimeSeries/data" in dataset_configurations - - # Best summary test of expected output is the printout - with patch("sys.stdout", new=StringIO()) as stdout: - print(backend_configuration) - - expected_print = """Configurable datasets identified using the hdf5 backend -------------------------------------------------------- -epochs/start_time/data - maxshape : (5,) - dtype : float64 - chunk shape : (5,) - buffer shape : (5,) - compression method : gzip - compression options : None -epochs/stop_time/data - maxshape : (5,) - dtype : float64 - chunk shape : (5,) - buffer shape : (5,) - compression method : gzip - compression options : None -acquisition/NewRawTimeSeries/data - maxshape : (2, 3) - dtype : int32 - chunk shape : (2, 3) - buffer shape : (2, 3) - compression method : gzip - compression options : None -processing/ecephys/NewProcessedTimeSeries/data - maxshape : (4, 2) - dtype : float64 - chunk shape : (4, 2) - buffer shape : (4, 2) - compression method : gzip - compression options : None -""" - assert stdout.getvalue() == expected_print - - -def test_complex_zarr(zarr_nwbfile_path): - with NWBZarrIO(path=zarr_nwbfile_path, mode="a") as io: - nwbfile = io.read() - - raw_array = np.array([[11, 21, 31], [41, 51, 61]]) - raw_time_series = mock_TimeSeries(name="NewRawTimeSeries", data=raw_array) - nwbfile.add_acquisition(raw_time_series) - - number_of_epochs = 5 - for start_time, stop_time in zip( - np.linspace(start=0.0, stop=10.0, 
num=number_of_epochs), - np.linspace(start=1.0, stop=11.0, num=number_of_epochs), - ): - nwbfile.add_epoch(start_time=start_time, stop_time=stop_time) - - ecephys_module = get_module(nwbfile=nwbfile, name="ecephys") - processed_array = np.array([[7.1, 8.1], [9.1, 10.1], [11.1, 12.1], [13.1, 14.1]]) - processed_time_series = mock_TimeSeries(name="NewProcessedTimeSeries", data=processed_array) - ecephys_module.add(processed_time_series) - - backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend="zarr") - - assert isinstance(backend_configuration, ZarrBackendConfiguration) - - dataset_configurations = backend_configuration.dataset_configurations - assert len(dataset_configurations) == 4 - assert "acquisition/NewRawTimeSeries/data" in dataset_configurations - assert "epochs/start_time/data" in dataset_configurations - assert "epochs/stop_time/data" in dataset_configurations - assert "processing/ecephys/NewProcessedTimeSeries/data" in dataset_configurations - - # Best summary test of expected output is the printout - with patch("sys.stdout", new=StringIO()) as stdout: - print(backend_configuration) - - expected_print = """Configurable datasets identified using the zarr backend -------------------------------------------------------- -epochs/start_time/data - maxshape : (5,) - dtype : float64 - chunk shape : (5,) - buffer shape : (5,) - compression method : gzip - compression options : None - filter methods : None - filter options : None -epochs/stop_time/data - maxshape : (5,) - dtype : float64 - chunk shape : (5,) - buffer shape : (5,) - compression method : gzip - compression options : None - filter methods : None - filter options : None -acquisition/NewRawTimeSeries/data - maxshape : (2, 3) - dtype : int32 - chunk shape : (2, 3) - buffer shape : (2, 3) - compression method : gzip - compression options : None - filter methods : None - filter options : None -processing/ecephys/NewProcessedTimeSeries/data - maxshape : (4, 2) - dtype : float64 - chunk shape : (4, 2) - buffer shape : (4, 2) - compression method : gzip - compression options : None - filter methods : None - filter options : None -""" - assert stdout.getvalue() == expected_print From 966592c31e66e1aed27e988492430c8298ff3759 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 02:46:11 +0000 Subject: [PATCH 04/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/neuroconv/tools/nwb_helpers/__init__.py | 12 ++++++++++-- .../tools/nwb_helpers/_models/_base_models.py | 2 +- .../tools/nwb_helpers/_models/_hdf5_models.py | 6 +++--- .../tools/nwb_helpers/_models/_zarr_models.py | 6 +++--- src/neuroconv/tools/testing/__init__.py | 8 ++++---- .../tools/testing/_mock/_mock_dataset_models.py | 6 +++--- .../test_get_default_dataset_configurations.py | 2 +- ..._default_dataset_configurations_appended_files.py | 2 +- .../test_hdf5_backend_configuration_model.py | 1 - .../test_hdf5_dataset_configuration_model.py | 5 ++++- .../test_zarr_backend_configuration_model.py | 1 - .../test_zarr_dataset_configuration_model.py | 5 ++++- 12 files changed, 34 insertions(+), 22 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index 89c738d51..0982439bb 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -6,8 +6,16 @@ make_or_load_nwbfile, ) from ._models._base_models import 
DatasetConfiguration, DatasetInfo -from ._models._hdf5_models import HDF5BackendConfiguration, HDF5DatasetConfiguration, AVAILABLE_HDF5_COMPRESSION_METHODS -from ._models._zarr_models import ZarrBackendConfiguration, ZarrDatasetConfiguration, AVAILABLE_ZARR_COMPRESSION_METHODS +from ._models._hdf5_models import ( + AVAILABLE_HDF5_COMPRESSION_METHODS, + HDF5BackendConfiguration, + HDF5DatasetConfiguration, +) +from ._models._zarr_models import ( + AVAILABLE_ZARR_COMPRESSION_METHODS, + ZarrBackendConfiguration, + ZarrDatasetConfiguration, +) BACKEND_TO_DATASET_CONFIGURATION = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration) BACKEND_TO_CONFIGURATION = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) diff --git a/src/neuroconv/tools/nwb_helpers/_models/_base_models.py b/src/neuroconv/tools/nwb_helpers/_models/_base_models.py index eb0408c2e..ad940fb6f 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_base_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_base_models.py @@ -1,5 +1,5 @@ """Base Pydantic models for DatasetInfo and DatasetConfiguration.""" -from typing import Any, Dict, Tuple, Union, Literal, Type +from typing import Any, Dict, Literal, Tuple, Type, Union import h5py import numcodecs diff --git a/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py b/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py index e21015f89..6e1108432 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py @@ -1,12 +1,12 @@ """Base Pydantic models for the HDF5DatasetConfiguration.""" -from typing import Any, Dict, Literal, Union, Type +from typing import Any, Dict, Literal, Type, Union import h5py -from pynwb import H5DataIO from nwbinspector.utils import is_module_installed from pydantic import Field +from pynwb import H5DataIO -from ._base_models import DatasetConfiguration, BackendConfiguration +from ._base_models import BackendConfiguration, DatasetConfiguration _base_hdf5_filters = set(h5py.filters.decode) - set( ( diff --git a/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py index 0e83d58e5..7b7b2dcbc 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py @@ -1,13 +1,13 @@ """Base Pydantic models for the ZarrDatasetConfiguration.""" -from typing import Any, Dict, Literal, Union, List, Type +from typing import Any, Dict, List, Literal, Type, Union import numcodecs -import zarr import psutil +import zarr from hdmf_zarr import ZarrDataIO from pydantic import Field, root_validator -from ._base_models import DatasetConfiguration, BackendConfiguration +from ._base_models import BackendConfiguration, DatasetConfiguration _available_zarr_filters = ( set(zarr.codec_registry.keys()) diff --git a/src/neuroconv/tools/testing/__init__.py b/src/neuroconv/tools/testing/__init__.py index 3c987fdd0..502634466 100644 --- a/src/neuroconv/tools/testing/__init__.py +++ b/src/neuroconv/tools/testing/__init__.py @@ -1,10 +1,10 @@ -from .mock_files import generate_path_expander_demo_ibl -from .mock_interfaces import MockBehaviorEventInterface, MockSpikeGLXNIDQInterface -from .mock_ttl_signals import generate_mock_ttl_signal, regenerate_test_cases from ._mock._mock_dataset_models import ( mock_DatasetInfo, mock_HDF5BackendConfiguration, - mock_ZarrBackendConfiguration, mock_HDF5DatasetConfiguration, + mock_ZarrBackendConfiguration, 
mock_ZarrDatasetConfiguration, ) +from .mock_files import generate_path_expander_demo_ibl +from .mock_interfaces import MockBehaviorEventInterface, MockSpikeGLXNIDQInterface +from .mock_ttl_signals import generate_mock_ttl_signal, regenerate_test_cases diff --git a/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py b/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py index 024ad9d3c..67b82c1bb 100644 --- a/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py +++ b/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py @@ -1,17 +1,17 @@ -from typing import Any, Tuple, Dict, Union, Literal, Iterable +from typing import Any, Dict, Iterable, Literal, Tuple, Union import h5py import numcodecs import numpy as np from ...nwb_helpers import ( + AVAILABLE_HDF5_COMPRESSION_METHODS, + AVAILABLE_ZARR_COMPRESSION_METHODS, DatasetInfo, HDF5BackendConfiguration, HDF5DatasetConfiguration, ZarrBackendConfiguration, ZarrDatasetConfiguration, - AVAILABLE_HDF5_COMPRESSION_METHODS, - AVAILABLE_ZARR_COMPRESSION_METHODS, ) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py index f08c85167..be0c6f0d8 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py @@ -346,4 +346,4 @@ def test_classic_iterator_wrapped_dynamic_table_zarr(): assert dataset_configuration.compression_method == "gzip" assert dataset_configuration.compression_options is None assert dataset_configuration.filter_methods is None - assert dataset_configuration.filter_options is None \ No newline at end of file + assert dataset_configuration.filter_options is None diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations_appended_files.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations_appended_files.py index dcda5d8a0..48950e0b7 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations_appended_files.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations_appended_files.py @@ -143,4 +143,4 @@ def test_unwrapped_dynamic_table_zarr(zarr_nwbfile_path): assert dataset_configuration.compression_method == "gzip" assert dataset_configuration.compression_options is None assert dataset_configuration.filter_methods is None - assert dataset_configuration.filter_options is None \ No newline at end of file + assert dataset_configuration.filter_options is None diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_backend_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_backend_configuration_model.py index e77694d08..290ca6a1b 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_backend_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_backend_configuration_model.py @@ -2,7 +2,6 @@ from io 
import StringIO from unittest.mock import patch - from neuroconv.tools.testing import mock_HDF5BackendConfiguration diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_configuration_model.py index 39e4a787d..83e0f71b8 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_configuration_model.py @@ -4,7 +4,10 @@ import pytest -from neuroconv.tools.nwb_helpers import HDF5DatasetConfiguration, AVAILABLE_HDF5_COMPRESSION_METHODS +from neuroconv.tools.nwb_helpers import ( + AVAILABLE_HDF5_COMPRESSION_METHODS, + HDF5DatasetConfiguration, +) from neuroconv.tools.testing import mock_DatasetInfo, mock_HDF5DatasetConfiguration diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_backend_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_backend_configuration_model.py index 66d7dbc03..9e235df77 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_backend_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_backend_configuration_model.py @@ -2,7 +2,6 @@ from io import StringIO from unittest.mock import patch - from neuroconv.tools.testing import mock_ZarrBackendConfiguration diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_dataset_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_dataset_configuration_model.py index 6350b0d4f..36e469259 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_dataset_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_dataset_configuration_model.py @@ -4,7 +4,10 @@ import pytest -from neuroconv.tools.nwb_helpers import ZarrDatasetConfiguration, AVAILABLE_ZARR_COMPRESSION_METHODS +from neuroconv.tools.nwb_helpers import ( + AVAILABLE_ZARR_COMPRESSION_METHODS, + ZarrDatasetConfiguration, +) from neuroconv.tools.testing import mock_DatasetInfo, mock_ZarrDatasetConfiguration From 2e7af8496e4883180aa640dae0d71803f6b785c1 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Sun, 17 Sep 2023 23:00:34 -0400 Subject: [PATCH 05/27] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ee7c7c08..ea285d351 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ * Added Pydantic data models of `DatasetInfo` (immutable summary of core dataset values such as maximum shape and dtype) and `DatasetConfiguration` for both HDF5 and Zarr datasets (the optional layer that specifies chunk/buffering/compression). [PR #567](https://github.com/catalystneuro/neuroconv/pull/567) * Added Pydantic data models of `BackendConfiguration` for both HDF5 and Zarr datasets (container/mapper of all the `DatasetConfiguration`s for a particular file). 
[PR #568](https://github.com/catalystneuro/neuroconv/pull/568) +* Added tool function `get_default_dataset_configurations` for identifying and collecting all fields of an in-memory `NWBFile` that could become datasets on disk; and return instances of the Pydantic dataset models filled with default values for chunking/buffering/compression. [PR #569](https://github.com/catalystneuro/neuroconv/pull/569) From 85bc9277f3b379d1dcea1c8f1f847d0072b1384a Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Sun, 17 Sep 2023 23:05:54 -0400 Subject: [PATCH 06/27] Update __init__.py --- src/neuroconv/tools/nwb_helpers/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index 0982439bb..f961974b2 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -5,7 +5,7 @@ make_nwbfile_from_metadata, make_or_load_nwbfile, ) -from ._models._base_models import DatasetConfiguration, DatasetInfo +from ._models._base_models import DatasetInfo from ._models._hdf5_models import ( AVAILABLE_HDF5_COMPRESSION_METHODS, HDF5BackendConfiguration, @@ -16,6 +16,7 @@ ZarrBackendConfiguration, ZarrDatasetConfiguration, ) +from ._dataset_configuration import get_default_backend_configuration BACKEND_TO_DATASET_CONFIGURATION = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration) BACKEND_TO_CONFIGURATION = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) From 8307156709a17f2c2b0561ea9591e93eded67694 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 03:06:04 +0000 Subject: [PATCH 07/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/neuroconv/tools/nwb_helpers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index f961974b2..0501ea5be 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -1,3 +1,4 @@ +from ._dataset_configuration import get_default_backend_configuration from ._metadata_and_file_helpers import ( add_device_from_metadata, get_default_nwbfile_metadata, @@ -16,7 +17,6 @@ ZarrBackendConfiguration, ZarrDatasetConfiguration, ) -from ._dataset_configuration import get_default_backend_configuration BACKEND_TO_DATASET_CONFIGURATION = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration) BACKEND_TO_CONFIGURATION = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) From 13c9b37c3f41c23bb515a4be2e52d62fa0b4a498 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 4 Oct 2023 04:33:36 -0400 Subject: [PATCH 08/27] use dataset_name in DatasetInfo; other debugs --- src/neuroconv/tools/nwb_helpers/__init__.py | 5 +- .../nwb_helpers/_dataset_configuration.py | 34 +- ...test_get_default_dataset_configurations.py | 413 +++++++++--------- ...t_dataset_configurations_appended_files.py | 8 +- 4 files changed, 232 insertions(+), 228 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index 0501ea5be..e381a3294 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -1,4 +1,4 @@ -from ._dataset_configuration import 
get_default_backend_configuration +from ._dataset_configuration import get_default_dataset_configurations from ._metadata_and_file_helpers import ( add_device_from_metadata, get_default_nwbfile_metadata, @@ -17,6 +17,3 @@ ZarrBackendConfiguration, ZarrDatasetConfiguration, ) - -BACKEND_TO_DATASET_CONFIGURATION = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration) -BACKEND_TO_CONFIGURATION = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index 0d8c74fde..e18b3d17f 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -11,18 +11,14 @@ from pynwb import NWBHDF5IO, NWBFile, TimeSeries from pynwb.base import DynamicTable -from ._dataset_and_backend_models import ( - BACKEND_TO_CONFIGURATION, - BACKEND_TO_DATASET_CONFIGURATION, - DatasetConfiguration, - DatasetInfo, - HDF5BackendConfiguration, - HDF5DatasetConfiguration, - ZarrBackendConfiguration, - ZarrDatasetConfiguration, -) +from ._models._base_models import DatasetConfiguration, DatasetInfo +from ._models._hdf5_models import HDF5BackendConfiguration, HDF5DatasetConfiguration +from ._models._zarr_models import ZarrBackendConfiguration, ZarrDatasetConfiguration from ..hdmf import SliceableDataChunkIterator +BACKEND_TO_DATASET_CONFIGURATION = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration) +BACKEND_TO_CONFIGURATION = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) + def _get_mode(io: Union[NWBHDF5IO, NWBZarrIO]) -> str: """NWBHDF5IO and NWBZarrIO have different ways of storing the mode they used on a path.""" @@ -81,14 +77,14 @@ def _get_dataset_metadata( # DataChunkIterator has best generic dtype inference, though logic is hard to peel out of it # And it can fail in rare cases but not essential to our default configuration try: - dtype = str(DataChunkIterator(candidate_dataset).dtype) # string cast to be JSON friendly + dtype = DataChunkIterator(candidate_dataset).dtype except Exception as exception: if str(exception) != "Data type could not be determined. Please specify dtype in DataChunkIterator init.": raise exception else: - dtype = "unknown" + dtype = np.dtype("object") - maxshape = get_data_shape(data=candidate_dataset) + full_shape = get_data_shape(data=candidate_dataset) if isinstance(candidate_dataset, GenericDataChunkIterator): chunk_shape = candidate_dataset.chunk_shape @@ -96,20 +92,22 @@ def _get_dataset_metadata( elif dtype != "unknown": # TODO: eventually replace this with staticmethods on hdmf.data_utils.GenericDataChunkIterator chunk_shape = SliceableDataChunkIterator.estimate_default_chunk_shape( - chunk_mb=10.0, maxshape=maxshape, dtype=np.dtype(dtype) + chunk_mb=10.0, maxshape=full_shape, dtype=np.dtype(dtype) ) buffer_shape = SliceableDataChunkIterator.estimate_default_buffer_shape( - buffer_gb=0.5, chunk_shape=chunk_shape, maxshape=maxshape, dtype=np.dtype(dtype) + buffer_gb=0.5, chunk_shape=chunk_shape, maxshape=full_shape, dtype=np.dtype(dtype) ) else: pass # TODO: think on this; perhaps zarr's standalone estimator? 
+ location = _parse_location_in_memory_nwbfile(current_location=field_name, neurodata_object=neurodata_object) + dataset_name = location.strip("/")[-1] dataset_info = DatasetInfo( object_id=neurodata_object.object_id, object_name=neurodata_object.name, - location=_parse_location_in_memory_nwbfile(current_location=field_name, neurodata_object=neurodata_object), - field=field_name, - maxshape=maxshape, + location=location, + dataset_name=dataset_name, + full_shape=full_shape, dtype=dtype, ) dataset_configuration = DatasetConfigurationClass( diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py index be0c6f0d8..00d030802 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py @@ -7,6 +7,7 @@ from pynwb.testing.mock.base import mock_TimeSeries from pynwb.testing.mock.file import mock_NWBFile +import pytest from neuroconv.tools.hdmf import SliceableDataChunkIterator from neuroconv.tools.nwb_helpers import ( HDF5DatasetConfiguration, @@ -15,11 +16,13 @@ ) -def test_unwrapped_time_series_hdf5(): +@pytest.mark.parametrize("iterator", [lambda x: x, SliceableDataChunkIterator, DataChunkIterator]) +def test_unwrapped_time_series_hdf5(iterator: callable): array = np.array([[1, 2, 3], [4, 5, 6]]) + data = iterator(array) nwbfile = mock_NWBFile() - time_series = mock_TimeSeries(name="TestTimeSeries", data=array) + time_series = mock_TimeSeries(name="TestTimeSeries", data=data) nwbfile.add_acquisition(time_series) dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) @@ -30,7 +33,7 @@ def test_unwrapped_time_series_hdf5(): assert isinstance(dataset_configuration, HDF5DatasetConfiguration) assert dataset_configuration.dataset_info.object_id == time_series.object_id assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" - assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.full_shape == array.shape assert dataset_configuration.dataset_info.dtype == array.dtype assert dataset_configuration.chunk_shape == array.shape assert dataset_configuration.buffer_shape == array.shape @@ -38,11 +41,13 @@ def test_unwrapped_time_series_hdf5(): assert dataset_configuration.compression_options is None -def test_unwrapped_time_series_zarr(): +@pytest.mark.parametrize("iterator", [lambda x: x, SliceableDataChunkIterator, DataChunkIterator]) +def test_unwrapped_time_series_zarr(iterator: callable): array = np.array([[1, 2, 3], [4, 5, 6]]) + data = iterator(array) nwbfile = mock_NWBFile() - time_series = mock_TimeSeries(name="TestTimeSeries", data=array) + time_series = mock_TimeSeries(name="TestTimeSeries", data=data) nwbfile.add_acquisition(time_series) dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) @@ -53,7 +58,7 @@ def test_unwrapped_time_series_zarr(): assert isinstance(dataset_configuration, ZarrDatasetConfiguration) assert dataset_configuration.dataset_info.object_id == time_series.object_id assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" - assert 
dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.full_shape == array.shape assert dataset_configuration.dataset_info.dtype == array.dtype assert dataset_configuration.chunk_shape == array.shape assert dataset_configuration.buffer_shape == array.shape @@ -63,100 +68,100 @@ def test_unwrapped_time_series_zarr(): assert dataset_configuration.filter_options is None -def test_generic_iterator_wrapped_time_series_hdf5(): - array = np.array([[1, 2, 3], [4, 5, 6]]) +# def test_generic_iterator_wrapped_time_series_hdf5(): +# array = np.array([[1, 2, 3], [4, 5, 6]]) - nwbfile = mock_NWBFile() - time_series = mock_TimeSeries(name="TestTimeSeries", data=SliceableDataChunkIterator(data=array)) - nwbfile.add_acquisition(time_series) +# nwbfile = mock_NWBFile() +# time_series = mock_TimeSeries(name="TestTimeSeries", data=SliceableDataChunkIterator(data=array)) +# nwbfile.add_acquisition(time_series) - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) +# dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) - assert len(dataset_configurations) == 1 +# assert len(dataset_configurations) == 1 - dataset_configuration = dataset_configurations[0] - assert isinstance(dataset_configuration, HDF5DatasetConfiguration) - assert dataset_configuration.dataset_info.object_id == time_series.object_id - assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" - assert dataset_configuration.dataset_info.maxshape == array.shape - assert dataset_configuration.dataset_info.dtype == array.dtype - assert dataset_configuration.chunk_shape == array.shape - assert dataset_configuration.buffer_shape == array.shape - assert dataset_configuration.compression_method == "gzip" - assert dataset_configuration.compression_options is None +# dataset_configuration = dataset_configurations[0] +# assert isinstance(dataset_configuration, HDF5DatasetConfiguration) +# assert dataset_configuration.dataset_info.object_id == time_series.object_id +# assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" +# assert dataset_configuration.dataset_info.full_shape == array.shape +# assert dataset_configuration.dataset_info.dtype == array.dtype +# assert dataset_configuration.chunk_shape == array.shape +# assert dataset_configuration.buffer_shape == array.shape +# assert dataset_configuration.compression_method == "gzip" +# assert dataset_configuration.compression_options is None -def test_classic_iterator_wrapped_simple_time_series_zarr(): - array = np.array([[1, 2, 3], [4, 5, 6]]) +# def test_classic_iterator_wrapped_simple_time_series_zarr(): +# array = np.array([[1, 2, 3], [4, 5, 6]]) - nwbfile = mock_NWBFile() - time_series = mock_TimeSeries(name="TestTimeSeries", data=DataChunkIterator(data=array)) - nwbfile.add_acquisition(time_series) +# nwbfile = mock_NWBFile() +# time_series = mock_TimeSeries(name="TestTimeSeries", data=DataChunkIterator(data=array)) +# nwbfile.add_acquisition(time_series) - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) +# dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) - assert len(dataset_configurations) == 1 +# assert len(dataset_configurations) == 1 - dataset_configuration = dataset_configurations[0] - assert isinstance(dataset_configuration, ZarrDatasetConfiguration) - assert 
dataset_configuration.dataset_info.object_id == time_series.object_id - assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" - assert dataset_configuration.dataset_info.maxshape == array.shape - assert dataset_configuration.dataset_info.dtype == array.dtype - assert dataset_configuration.chunk_shape == array.shape - assert dataset_configuration.buffer_shape == array.shape - assert dataset_configuration.compression_method == "gzip" - assert dataset_configuration.compression_options is None - assert dataset_configuration.filter_methods is None - assert dataset_configuration.filter_options is None +# dataset_configuration = dataset_configurations[0] +# assert isinstance(dataset_configuration, ZarrDatasetConfiguration) +# assert dataset_configuration.dataset_info.object_id == time_series.object_id +# assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" +# assert dataset_configuration.dataset_info.full_shape == array.shape +# assert dataset_configuration.dataset_info.dtype == array.dtype +# assert dataset_configuration.chunk_shape == array.shape +# assert dataset_configuration.buffer_shape == array.shape +# assert dataset_configuration.compression_method == "gzip" +# assert dataset_configuration.compression_options is None +# assert dataset_configuration.filter_methods is None +# assert dataset_configuration.filter_options is None -def test_classic_iterator_wrapped_time_series_hdf5(): - array = np.array([[1, 2, 3], [4, 5, 6]]) +# def test_classic_iterator_wrapped_time_series_hdf5(): +# array = np.array([[1, 2, 3], [4, 5, 6]]) - nwbfile = mock_NWBFile() - time_series = mock_TimeSeries(name="TestTimeSeries", data=DataChunkIterator(data=array)) - nwbfile.add_acquisition(time_series) +# nwbfile = mock_NWBFile() +# time_series = mock_TimeSeries(name="TestTimeSeries", data=DataChunkIterator(data=array)) +# nwbfile.add_acquisition(time_series) - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) +# dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) - assert len(dataset_configurations) == 1 +# assert len(dataset_configurations) == 1 - dataset_configuration = dataset_configurations[0] - assert isinstance(dataset_configuration, HDF5DatasetConfiguration) - assert dataset_configuration.dataset_info.object_id == time_series.object_id - assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" - assert dataset_configuration.dataset_info.maxshape == array.shape - assert dataset_configuration.dataset_info.dtype == array.dtype - assert dataset_configuration.chunk_shape == array.shape - assert dataset_configuration.buffer_shape == array.shape - assert dataset_configuration.compression_method == "gzip" - assert dataset_configuration.compression_options is None +# dataset_configuration = dataset_configurations[0] +# assert isinstance(dataset_configuration, HDF5DatasetConfiguration) +# assert dataset_configuration.dataset_info.object_id == time_series.object_id +# assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" +# assert dataset_configuration.dataset_info.full_shape == array.shape +# assert dataset_configuration.dataset_info.dtype == array.dtype +# assert dataset_configuration.chunk_shape == array.shape +# assert dataset_configuration.buffer_shape == array.shape +# assert dataset_configuration.compression_method == "gzip" +# assert dataset_configuration.compression_options is None 
-def test_generic_iterator_wrapped_simple_time_series_zarr(): - array = np.array([[1, 2, 3], [4, 5, 6]]) +# def test_generic_iterator_wrapped_simple_time_series_zarr(): +# array = np.array([[1, 2, 3], [4, 5, 6]]) - nwbfile = mock_NWBFile() - time_series = mock_TimeSeries(name="TestTimeSeries", data=SliceableDataChunkIterator(data=array)) - nwbfile.add_acquisition(time_series) +# nwbfile = mock_NWBFile() +# time_series = mock_TimeSeries(name="TestTimeSeries", data=SliceableDataChunkIterator(data=array)) +# nwbfile.add_acquisition(time_series) - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) +# dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) - assert len(dataset_configurations) == 1 +# assert len(dataset_configurations) == 1 - dataset_configuration = dataset_configurations[0] - assert isinstance(dataset_configuration, ZarrDatasetConfiguration) - assert dataset_configuration.dataset_info.object_id == time_series.object_id - assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" - assert dataset_configuration.dataset_info.maxshape == array.shape - assert dataset_configuration.dataset_info.dtype == array.dtype - assert dataset_configuration.chunk_shape == array.shape - assert dataset_configuration.buffer_shape == array.shape - assert dataset_configuration.compression_method == "gzip" - assert dataset_configuration.compression_options is None - assert dataset_configuration.filter_methods is None - assert dataset_configuration.filter_options is None +# dataset_configuration = dataset_configurations[0] +# assert isinstance(dataset_configuration, ZarrDatasetConfiguration) +# assert dataset_configuration.dataset_info.object_id == time_series.object_id +# assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" +# assert dataset_configuration.dataset_info.full_shape == array.shape +# assert dataset_configuration.dataset_info.dtype == array.dtype +# assert dataset_configuration.chunk_shape == array.shape +# assert dataset_configuration.buffer_shape == array.shape +# assert dataset_configuration.compression_method == "gzip" +# assert dataset_configuration.compression_options is None +# assert dataset_configuration.filter_methods is None +# assert dataset_configuration.filter_options is None def test_external_image_series_hdf5(): @@ -179,11 +184,13 @@ def test_external_image_series_zarr(): assert len(dataset_configurations) == 0 -def test_unwrapped_dynamic_table_hdf5(): +@pytest.mark.parametrize("iterator", [lambda x: x, SliceableDataChunkIterator, DataChunkIterator]) +def test_unwrapped_dynamic_table_hdf5(iterator: callable): array = np.array([0.1, 0.2, 0.3]) + data = iterator(array.squeeze()) nwbfile = mock_NWBFile() - column = VectorData(name="TestColumn", description="", data=array.squeeze()) + column = VectorData(name="TestColumn", description="", data=data) dynamic_table = DynamicTable(name="TestDynamicTable", description="", columns=[column]) nwbfile.add_acquisition(dynamic_table) @@ -195,7 +202,7 @@ def test_unwrapped_dynamic_table_hdf5(): assert isinstance(dataset_configuration, HDF5DatasetConfiguration) assert dataset_configuration.dataset_info.object_id == column.object_id assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" - assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.full_shape == array.shape assert 
dataset_configuration.dataset_info.dtype == array.dtype assert dataset_configuration.chunk_shape == array.shape assert dataset_configuration.buffer_shape == array.shape @@ -203,11 +210,13 @@ def test_unwrapped_dynamic_table_hdf5(): assert dataset_configuration.compression_options is None -def test_unwrapped_dynamic_table_zarr(): +@pytest.mark.parametrize("iterator", [lambda x: x, SliceableDataChunkIterator, DataChunkIterator]) +def test_unwrapped_dynamic_table_zarr(iterator: callable): array = np.array([0.1, 0.2, 0.3]) + data = iterator(array.squeeze()) nwbfile = mock_NWBFile() - column = VectorData(name="TestColumn", description="", data=array.squeeze()) + column = VectorData(name="TestColumn", description="", data=data) dynamic_table = DynamicTable(name="TestDynamicTable", description="", columns=[column]) nwbfile.add_acquisition(dynamic_table) @@ -219,67 +228,7 @@ def test_unwrapped_dynamic_table_zarr(): assert isinstance(dataset_configuration, ZarrDatasetConfiguration) assert dataset_configuration.dataset_info.object_id == column.object_id assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" - assert dataset_configuration.dataset_info.maxshape == array.shape - assert dataset_configuration.dataset_info.dtype == array.dtype - assert dataset_configuration.chunk_shape == array.shape - assert dataset_configuration.buffer_shape == array.shape - assert dataset_configuration.compression_method == "gzip" - assert dataset_configuration.compression_options is None - assert dataset_configuration.filter_methods is None - assert dataset_configuration.filter_options is None - - -def test_generic_iterator_wrapped_dynamic_table_hdf5(): - array = np.array([0.1, 0.2, 0.3]) - - nwbfile = mock_NWBFile() - column = VectorData(name="TestColumn", description="", data=SliceableDataChunkIterator(data=array.squeeze())) - dynamic_table = DynamicTable( - name="TestDynamicTable", - description="", - id=list(range(array.shape[0])), # Need to include ID since the data of the column is not wrapped in an IO - columns=[column], - ) - nwbfile.add_acquisition(dynamic_table) - - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) - - assert len(dataset_configurations) == 1 - - dataset_configuration = dataset_configurations[0] - assert isinstance(dataset_configuration, HDF5DatasetConfiguration) - assert dataset_configuration.dataset_info.object_id == column.object_id - assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" - assert dataset_configuration.dataset_info.maxshape == (array.shape[0],) - assert dataset_configuration.dataset_info.dtype == array.dtype - assert dataset_configuration.chunk_shape == array.shape - assert dataset_configuration.buffer_shape == array.shape - assert dataset_configuration.compression_method == "gzip" - assert dataset_configuration.compression_options is None - - -def test_generic_iterator_wrapped_dynamic_table_zarr(): - array = np.array([0.1, 0.2, 0.3]) - - nwbfile = mock_NWBFile() - column = VectorData(name="TestColumn", description="", data=SliceableDataChunkIterator(data=array.squeeze())) - dynamic_table = DynamicTable( - name="TestDynamicTable", - description="", - id=list(range(array.shape[0])), # Need to include ID since the data of the column is not wrapped in an IO - columns=[column], - ) - nwbfile.add_acquisition(dynamic_table) - - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) - - assert 
len(dataset_configurations) == 1 - - dataset_configuration = dataset_configurations[0] - assert isinstance(dataset_configuration, ZarrDatasetConfiguration) - assert dataset_configuration.dataset_info.object_id == column.object_id - assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" - assert dataset_configuration.dataset_info.maxshape == (array.shape[0],) + assert dataset_configuration.dataset_info.full_shape == array.shape assert dataset_configuration.dataset_info.dtype == array.dtype assert dataset_configuration.chunk_shape == array.shape assert dataset_configuration.buffer_shape == array.shape @@ -289,61 +238,121 @@ def test_generic_iterator_wrapped_dynamic_table_zarr(): assert dataset_configuration.filter_options is None -def test_classic_iterator_wrapped_dynamic_table_hdf5(): - array = np.array([0.1, 0.2, 0.3]) - - nwbfile = mock_NWBFile() - column = VectorData(name="TestColumn", description="", data=DataChunkIterator(data=array.squeeze())) - dynamic_table = DynamicTable( - name="TestDynamicTable", - description="", - id=list(range(array.shape[0])), # Need to include ID since the data of the column is not wrapped in an IO - columns=[column], - ) - nwbfile.add_acquisition(dynamic_table) - - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) - - assert len(dataset_configurations) == 1 - - dataset_configuration = dataset_configurations[0] - assert isinstance(dataset_configuration, HDF5DatasetConfiguration) - assert dataset_configuration.dataset_info.object_id == column.object_id - assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" - assert dataset_configuration.dataset_info.maxshape == (array.shape[0],) - assert dataset_configuration.dataset_info.dtype == array.dtype - assert dataset_configuration.chunk_shape == array.shape - assert dataset_configuration.buffer_shape == array.shape - assert dataset_configuration.compression_method == "gzip" - assert dataset_configuration.compression_options is None - - -def test_classic_iterator_wrapped_dynamic_table_zarr(): - array = np.array([0.1, 0.2, 0.3]) - - nwbfile = mock_NWBFile() - column = VectorData(name="TestColumn", description="", data=DataChunkIterator(data=array.squeeze())) - dynamic_table = DynamicTable( - name="TestDynamicTable", - description="", - id=list(range(array.shape[0])), # Need to include ID since the data of the column is not wrapped in an IO - columns=[column], - ) - nwbfile.add_acquisition(dynamic_table) - - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) - - assert len(dataset_configurations) == 1 - - dataset_configuration = dataset_configurations[0] - assert isinstance(dataset_configuration, ZarrDatasetConfiguration) - assert dataset_configuration.dataset_info.object_id == column.object_id - assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" - assert dataset_configuration.dataset_info.maxshape == (array.shape[0],) - assert dataset_configuration.dataset_info.dtype == array.dtype - assert dataset_configuration.chunk_shape == array.shape - assert dataset_configuration.buffer_shape == array.shape - assert dataset_configuration.compression_method == "gzip" - assert dataset_configuration.compression_options is None - assert dataset_configuration.filter_methods is None - assert dataset_configuration.filter_options is None +# def test_generic_iterator_wrapped_dynamic_table_hdf5(): +# 
array = np.array([0.1, 0.2, 0.3]) + +# nwbfile = mock_NWBFile() +# column = VectorData(name="TestColumn", description="", data=SliceableDataChunkIterator(data=array.squeeze())) +# dynamic_table = DynamicTable( +# name="TestDynamicTable", +# description="", +# id=list(range(array.shape[0])), # Need to include ID since the data of the column is not wrapped in an IO +# columns=[column], +# ) +# nwbfile.add_acquisition(dynamic_table) + +# dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + +# assert len(dataset_configurations) == 1 + +# dataset_configuration = dataset_configurations[0] +# assert isinstance(dataset_configuration, HDF5DatasetConfiguration) +# assert dataset_configuration.dataset_info.object_id == column.object_id +# assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" +# assert dataset_configuration.dataset_info.full_shape == (array.shape[0],) +# assert dataset_configuration.dataset_info.dtype == array.dtype +# assert dataset_configuration.chunk_shape == array.shape +# assert dataset_configuration.buffer_shape == array.shape +# assert dataset_configuration.compression_method == "gzip" +# assert dataset_configuration.compression_options is None + + +# def test_generic_iterator_wrapped_dynamic_table_zarr(): +# array = np.array([0.1, 0.2, 0.3]) + +# nwbfile = mock_NWBFile() +# column = VectorData(name="TestColumn", description="", data=SliceableDataChunkIterator(data=array.squeeze())) +# dynamic_table = DynamicTable( +# name="TestDynamicTable", +# description="", +# id=list(range(array.shape[0])), # Need to include ID since the data of the column is not wrapped in an IO +# columns=[column], +# ) +# nwbfile.add_acquisition(dynamic_table) + +# dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + +# assert len(dataset_configurations) == 1 + +# dataset_configuration = dataset_configurations[0] +# assert isinstance(dataset_configuration, ZarrDatasetConfiguration) +# assert dataset_configuration.dataset_info.object_id == column.object_id +# assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" +# assert dataset_configuration.dataset_info.full_shape == (array.shape[0],) +# assert dataset_configuration.dataset_info.dtype == array.dtype +# assert dataset_configuration.chunk_shape == array.shape +# assert dataset_configuration.buffer_shape == array.shape +# assert dataset_configuration.compression_method == "gzip" +# assert dataset_configuration.compression_options is None +# assert dataset_configuration.filter_methods is None +# assert dataset_configuration.filter_options is None + + +# def test_classic_iterator_wrapped_dynamic_table_hdf5(): +# array = np.array([0.1, 0.2, 0.3]) + +# nwbfile = mock_NWBFile() +# column = VectorData(name="TestColumn", description="", data=DataChunkIterator(data=array.squeeze())) +# dynamic_table = DynamicTable( +# name="TestDynamicTable", +# description="", +# id=list(range(array.shape[0])), # Need to include ID since the data of the column is not wrapped in an IO +# columns=[column], +# ) +# nwbfile.add_acquisition(dynamic_table) + +# dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + +# assert len(dataset_configurations) == 1 + +# dataset_configuration = dataset_configurations[0] +# assert isinstance(dataset_configuration, HDF5DatasetConfiguration) +# assert dataset_configuration.dataset_info.object_id == 
column.object_id +# assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" +# assert dataset_configuration.dataset_info.full_shape == (array.shape[0],) +# assert dataset_configuration.dataset_info.dtype == array.dtype +# assert dataset_configuration.chunk_shape == array.shape +# assert dataset_configuration.buffer_shape == array.shape +# assert dataset_configuration.compression_method == "gzip" +# assert dataset_configuration.compression_options is None + + +# def test_classic_iterator_wrapped_dynamic_table_zarr(): +# array = np.array([0.1, 0.2, 0.3]) + +# nwbfile = mock_NWBFile() +# column = VectorData(name="TestColumn", description="", data=DataChunkIterator(data=array.squeeze())) +# dynamic_table = DynamicTable( +# name="TestDynamicTable", +# description="", +# id=list(range(array.shape[0])), # Need to include ID since the data of the column is not wrapped in an IO +# columns=[column], +# ) +# nwbfile.add_acquisition(dynamic_table) + +# dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + +# assert len(dataset_configurations) == 1 + +# dataset_configuration = dataset_configurations[0] +# assert isinstance(dataset_configuration, ZarrDatasetConfiguration) +# assert dataset_configuration.dataset_info.object_id == column.object_id +# assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" +# assert dataset_configuration.dataset_info.full_shape == (array.shape[0],) +# assert dataset_configuration.dataset_info.dtype == array.dtype +# assert dataset_configuration.chunk_shape == array.shape +# assert dataset_configuration.buffer_shape == array.shape +# assert dataset_configuration.compression_method == "gzip" +# assert dataset_configuration.compression_options is None +# assert dataset_configuration.filter_methods is None +# assert dataset_configuration.filter_options is None diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations_appended_files.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations_appended_files.py index 48950e0b7..f33334e93 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations_appended_files.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations_appended_files.py @@ -63,7 +63,7 @@ def test_unwrapped_time_series_hdf5(hdf5_nwbfile_path): assert isinstance(dataset_configuration, HDF5DatasetConfiguration) assert dataset_configuration.dataset_info.object_id == new_time_series.object_id assert dataset_configuration.dataset_info.location == "acquisition/NewTimeSeries/data" - assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.full_shape == array.shape assert dataset_configuration.dataset_info.dtype == array.dtype assert dataset_configuration.chunk_shape == array.shape assert dataset_configuration.buffer_shape == array.shape @@ -86,7 +86,7 @@ def test_unwrapped_time_series_zarr(zarr_nwbfile_path): assert isinstance(dataset_configuration, ZarrDatasetConfiguration) assert dataset_configuration.dataset_info.object_id == new_time_series.object_id assert dataset_configuration.dataset_info.location == "acquisition/NewTimeSeries/data" - assert dataset_configuration.dataset_info.maxshape == 
array.shape + assert dataset_configuration.dataset_info.full_shape == array.shape assert dataset_configuration.dataset_info.dtype == array.dtype assert dataset_configuration.chunk_shape == array.shape assert dataset_configuration.buffer_shape == array.shape @@ -112,7 +112,7 @@ def test_unwrapped_dynamic_table_hdf5(hdf5_nwbfile_path): assert isinstance(dataset_configuration, HDF5DatasetConfiguration) assert dataset_configuration.dataset_info.object_id == column.object_id assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" - assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.full_shape == array.shape assert dataset_configuration.dataset_info.dtype == array.dtype assert dataset_configuration.chunk_shape == array.shape assert dataset_configuration.buffer_shape == array.shape @@ -136,7 +136,7 @@ def test_unwrapped_dynamic_table_zarr(zarr_nwbfile_path): assert isinstance(dataset_configuration, ZarrDatasetConfiguration) assert dataset_configuration.dataset_info.object_id == column.object_id assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" - assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.full_shape == array.shape assert dataset_configuration.dataset_info.dtype == array.dtype assert dataset_configuration.chunk_shape == array.shape assert dataset_configuration.buffer_shape == array.shape From b63161ae76ff3e915d48f2e8b9cf1a395cd0c352 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 4 Oct 2023 08:37:26 +0000 Subject: [PATCH 09/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../test_helpers/test_get_default_dataset_configurations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py index 00d030802..1c4a80adc 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py @@ -1,5 +1,6 @@ """Unit tests for `get_default_dataset_configurations`.""" import numpy as np +import pytest from hdmf.common import VectorData from hdmf.data_utils import DataChunkIterator from pynwb.base import DynamicTable @@ -7,7 +8,6 @@ from pynwb.testing.mock.base import mock_TimeSeries from pynwb.testing.mock.file import mock_NWBFile -import pytest from neuroconv.tools.hdmf import SliceableDataChunkIterator from neuroconv.tools.nwb_helpers import ( HDF5DatasetConfiguration, From d55e2a2b4037786dffd74643bc47ce43f4967b0f Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Thu, 5 Oct 2023 03:59:26 -0400 Subject: [PATCH 10/27] remove comments --- ...test_get_default_dataset_configurations.py | 216 ------------------ 1 file changed, 216 deletions(-) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py index 
00d030802..8485c0dac 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py @@ -68,102 +68,6 @@ def test_unwrapped_time_series_zarr(iterator: callable): assert dataset_configuration.filter_options is None -# def test_generic_iterator_wrapped_time_series_hdf5(): -# array = np.array([[1, 2, 3], [4, 5, 6]]) - -# nwbfile = mock_NWBFile() -# time_series = mock_TimeSeries(name="TestTimeSeries", data=SliceableDataChunkIterator(data=array)) -# nwbfile.add_acquisition(time_series) - -# dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) - -# assert len(dataset_configurations) == 1 - -# dataset_configuration = dataset_configurations[0] -# assert isinstance(dataset_configuration, HDF5DatasetConfiguration) -# assert dataset_configuration.dataset_info.object_id == time_series.object_id -# assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" -# assert dataset_configuration.dataset_info.full_shape == array.shape -# assert dataset_configuration.dataset_info.dtype == array.dtype -# assert dataset_configuration.chunk_shape == array.shape -# assert dataset_configuration.buffer_shape == array.shape -# assert dataset_configuration.compression_method == "gzip" -# assert dataset_configuration.compression_options is None - - -# def test_classic_iterator_wrapped_simple_time_series_zarr(): -# array = np.array([[1, 2, 3], [4, 5, 6]]) - -# nwbfile = mock_NWBFile() -# time_series = mock_TimeSeries(name="TestTimeSeries", data=DataChunkIterator(data=array)) -# nwbfile.add_acquisition(time_series) - -# dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) - -# assert len(dataset_configurations) == 1 - -# dataset_configuration = dataset_configurations[0] -# assert isinstance(dataset_configuration, ZarrDatasetConfiguration) -# assert dataset_configuration.dataset_info.object_id == time_series.object_id -# assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" -# assert dataset_configuration.dataset_info.full_shape == array.shape -# assert dataset_configuration.dataset_info.dtype == array.dtype -# assert dataset_configuration.chunk_shape == array.shape -# assert dataset_configuration.buffer_shape == array.shape -# assert dataset_configuration.compression_method == "gzip" -# assert dataset_configuration.compression_options is None -# assert dataset_configuration.filter_methods is None -# assert dataset_configuration.filter_options is None - - -# def test_classic_iterator_wrapped_time_series_hdf5(): -# array = np.array([[1, 2, 3], [4, 5, 6]]) - -# nwbfile = mock_NWBFile() -# time_series = mock_TimeSeries(name="TestTimeSeries", data=DataChunkIterator(data=array)) -# nwbfile.add_acquisition(time_series) - -# dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) - -# assert len(dataset_configurations) == 1 - -# dataset_configuration = dataset_configurations[0] -# assert isinstance(dataset_configuration, HDF5DatasetConfiguration) -# assert dataset_configuration.dataset_info.object_id == time_series.object_id -# assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" -# assert dataset_configuration.dataset_info.full_shape == array.shape -# assert dataset_configuration.dataset_info.dtype 
== array.dtype -# assert dataset_configuration.chunk_shape == array.shape -# assert dataset_configuration.buffer_shape == array.shape -# assert dataset_configuration.compression_method == "gzip" -# assert dataset_configuration.compression_options is None - - -# def test_generic_iterator_wrapped_simple_time_series_zarr(): -# array = np.array([[1, 2, 3], [4, 5, 6]]) - -# nwbfile = mock_NWBFile() -# time_series = mock_TimeSeries(name="TestTimeSeries", data=SliceableDataChunkIterator(data=array)) -# nwbfile.add_acquisition(time_series) - -# dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) - -# assert len(dataset_configurations) == 1 - -# dataset_configuration = dataset_configurations[0] -# assert isinstance(dataset_configuration, ZarrDatasetConfiguration) -# assert dataset_configuration.dataset_info.object_id == time_series.object_id -# assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" -# assert dataset_configuration.dataset_info.full_shape == array.shape -# assert dataset_configuration.dataset_info.dtype == array.dtype -# assert dataset_configuration.chunk_shape == array.shape -# assert dataset_configuration.buffer_shape == array.shape -# assert dataset_configuration.compression_method == "gzip" -# assert dataset_configuration.compression_options is None -# assert dataset_configuration.filter_methods is None -# assert dataset_configuration.filter_options is None - - def test_external_image_series_hdf5(): nwbfile = mock_NWBFile() image_series = ImageSeries(name="TestImageSeries", external_file=[""], rate=1.0) @@ -236,123 +140,3 @@ def test_unwrapped_dynamic_table_zarr(iterator: callable): assert dataset_configuration.compression_options is None assert dataset_configuration.filter_methods is None assert dataset_configuration.filter_options is None - - -# def test_generic_iterator_wrapped_dynamic_table_hdf5(): -# array = np.array([0.1, 0.2, 0.3]) - -# nwbfile = mock_NWBFile() -# column = VectorData(name="TestColumn", description="", data=SliceableDataChunkIterator(data=array.squeeze())) -# dynamic_table = DynamicTable( -# name="TestDynamicTable", -# description="", -# id=list(range(array.shape[0])), # Need to include ID since the data of the column is not wrapped in an IO -# columns=[column], -# ) -# nwbfile.add_acquisition(dynamic_table) - -# dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) - -# assert len(dataset_configurations) == 1 - -# dataset_configuration = dataset_configurations[0] -# assert isinstance(dataset_configuration, HDF5DatasetConfiguration) -# assert dataset_configuration.dataset_info.object_id == column.object_id -# assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" -# assert dataset_configuration.dataset_info.full_shape == (array.shape[0],) -# assert dataset_configuration.dataset_info.dtype == array.dtype -# assert dataset_configuration.chunk_shape == array.shape -# assert dataset_configuration.buffer_shape == array.shape -# assert dataset_configuration.compression_method == "gzip" -# assert dataset_configuration.compression_options is None - - -# def test_generic_iterator_wrapped_dynamic_table_zarr(): -# array = np.array([0.1, 0.2, 0.3]) - -# nwbfile = mock_NWBFile() -# column = VectorData(name="TestColumn", description="", data=SliceableDataChunkIterator(data=array.squeeze())) -# dynamic_table = DynamicTable( -# name="TestDynamicTable", -# description="", -# id=list(range(array.shape[0])), 
# Need to include ID since the data of the column is not wrapped in an IO -# columns=[column], -# ) -# nwbfile.add_acquisition(dynamic_table) - -# dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) - -# assert len(dataset_configurations) == 1 - -# dataset_configuration = dataset_configurations[0] -# assert isinstance(dataset_configuration, ZarrDatasetConfiguration) -# assert dataset_configuration.dataset_info.object_id == column.object_id -# assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" -# assert dataset_configuration.dataset_info.full_shape == (array.shape[0],) -# assert dataset_configuration.dataset_info.dtype == array.dtype -# assert dataset_configuration.chunk_shape == array.shape -# assert dataset_configuration.buffer_shape == array.shape -# assert dataset_configuration.compression_method == "gzip" -# assert dataset_configuration.compression_options is None -# assert dataset_configuration.filter_methods is None -# assert dataset_configuration.filter_options is None - - -# def test_classic_iterator_wrapped_dynamic_table_hdf5(): -# array = np.array([0.1, 0.2, 0.3]) - -# nwbfile = mock_NWBFile() -# column = VectorData(name="TestColumn", description="", data=DataChunkIterator(data=array.squeeze())) -# dynamic_table = DynamicTable( -# name="TestDynamicTable", -# description="", -# id=list(range(array.shape[0])), # Need to include ID since the data of the column is not wrapped in an IO -# columns=[column], -# ) -# nwbfile.add_acquisition(dynamic_table) - -# dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) - -# assert len(dataset_configurations) == 1 - -# dataset_configuration = dataset_configurations[0] -# assert isinstance(dataset_configuration, HDF5DatasetConfiguration) -# assert dataset_configuration.dataset_info.object_id == column.object_id -# assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" -# assert dataset_configuration.dataset_info.full_shape == (array.shape[0],) -# assert dataset_configuration.dataset_info.dtype == array.dtype -# assert dataset_configuration.chunk_shape == array.shape -# assert dataset_configuration.buffer_shape == array.shape -# assert dataset_configuration.compression_method == "gzip" -# assert dataset_configuration.compression_options is None - - -# def test_classic_iterator_wrapped_dynamic_table_zarr(): -# array = np.array([0.1, 0.2, 0.3]) - -# nwbfile = mock_NWBFile() -# column = VectorData(name="TestColumn", description="", data=DataChunkIterator(data=array.squeeze())) -# dynamic_table = DynamicTable( -# name="TestDynamicTable", -# description="", -# id=list(range(array.shape[0])), # Need to include ID since the data of the column is not wrapped in an IO -# columns=[column], -# ) -# nwbfile.add_acquisition(dynamic_table) - -# dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) - -# assert len(dataset_configurations) == 1 - -# dataset_configuration = dataset_configurations[0] -# assert isinstance(dataset_configuration, ZarrDatasetConfiguration) -# assert dataset_configuration.dataset_info.object_id == column.object_id -# assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" -# assert dataset_configuration.dataset_info.full_shape == (array.shape[0],) -# assert dataset_configuration.dataset_info.dtype == array.dtype -# assert dataset_configuration.chunk_shape == 
array.shape -# assert dataset_configuration.buffer_shape == array.shape -# assert dataset_configuration.compression_method == "gzip" -# assert dataset_configuration.compression_options is None -# assert dataset_configuration.filter_methods is None -# assert dataset_configuration.filter_options is None From 3c7cde84423d5e27e44d6b8bdbf58f73da1d5c59 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Tue, 7 Nov 2023 20:37:50 -0500 Subject: [PATCH 11/27] remove unused typing --- src/neuroconv/tools/hdmf.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/src/neuroconv/tools/hdmf.py b/src/neuroconv/tools/hdmf.py index b82b12e1f..843fda22b 100644 --- a/src/neuroconv/tools/hdmf.py +++ b/src/neuroconv/tools/hdmf.py @@ -4,8 +4,6 @@ import numpy as np from hdmf.data_utils import GenericDataChunkIterator as HDMFGenericDataChunkIterator -from pydantic import Field -from typing_extensions import Annotated class GenericDataChunkIterator(HDMFGenericDataChunkIterator): @@ -16,11 +14,7 @@ def _get_default_buffer_shape(self, buffer_gb: float = 1.0) -> Tuple[int]: # TODO: move this to the core iterator in HDMF so it can be easily swapped out as well as run on its own @staticmethod - def estimate_default_chunk_shape( - chunk_mb: Annotated[float, Field(gt=0.0)], - maxshape: Tuple[int, ...], - dtype: np.dtype, - ) -> Tuple[int, ...]: + def estimate_default_chunk_shape(chunk_mb: float, maxshape: Tuple[int, ...], dtype: np.dtype) -> Tuple[int, ...]: """ Select chunk shape with size in MB less than the threshold of chunk_mb. @@ -46,10 +40,7 @@ def estimate_default_chunk_shape( # TODO: move this to the core iterator in HDMF so it can be easily swapped out as well as run on its own @staticmethod def estimate_default_buffer_shape( - buffer_gb: Annotated[float, Field(gt=0.0)], - chunk_shape: Tuple[int, ...], - maxshape: Tuple[int, ...], - dtype: np.dtype, + buffer_gb: float, chunk_shape: Tuple[int, ...], maxshape: Tuple[int, ...], dtype: np.dtype ) -> Tuple[int]: num_axes = len(maxshape) chunk_bytes = math.prod(chunk_shape) * dtype.itemsize @@ -117,7 +108,7 @@ def estimate_default_buffer_shape( class SliceableDataChunkIterator(GenericDataChunkIterator): """ - Generic data chunk iterator that works for any memory mapped array, such as a np.memmap or an h5py.Dataset + Generic data chunk iterator that works for any memory mapped array, such as a np.memmap or h5py.Dataset object. 
""" def __init__(self, data, **kwargs): From b845ac6a2fec0fb6fe6f42fbcacd5cda85855a06 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Tue, 7 Nov 2023 20:58:06 -0500 Subject: [PATCH 12/27] improve error message and fix import test --- tests/imports.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/imports.py b/tests/imports.py index 781a4bb97..656ddfea9 100644 --- a/tests/imports.py +++ b/tests/imports.py @@ -4,7 +4,6 @@ Run them by using: pytest tests/import_structure.py::TestImportStructure::test_name """ - from unittest import TestCase @@ -44,7 +43,7 @@ def test_top_level(self): "BaseExtractorInterface", "run_conversion_from_yaml", ] - self.assertCountEqual(first=current_structure, second=expected_structure) + assert sorted(current_structure) == sorted(expected_structure) def test_tools(self): """Python dir() calls (and __dict__ as well) update dynamically based on global imports.""" @@ -64,8 +63,9 @@ def test_tools(self): "deploy_process", "LocalPathExpander", "get_module", + "hdmf", ] - self.assertCountEqual(first=current_structure, second=expected_structure) + assert sorted(current_structure) == sorted(expected_structure) def test_datainterfaces(self): from neuroconv import datainterfaces @@ -87,4 +87,4 @@ def test_datainterfaces(self): "interfaces_by_category", ] + interface_name_list - self.assertCountEqual(first=current_structure, second=expected_structure) + assert sorted(current_structure) == sorted(expected_structure) From 5c7fb6bdf326107d173cb53e0783e6c60689eb83 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Tue, 7 Nov 2023 21:18:30 -0500 Subject: [PATCH 13/27] add global static maps; further condense tests with parametrize --- src/neuroconv/tools/nwb_helpers/__init__.py | 3 + ...test_get_default_dataset_configurations.py | 100 ++++-------------- 2 files changed, 25 insertions(+), 78 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index e381a3294..c17fa3ef0 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -17,3 +17,6 @@ ZarrBackendConfiguration, ZarrDatasetConfiguration, ) + +DATASET_CONFIGURATIONS = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration) +BACKEND_CONFIGURATIONS = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py index bf34abd2d..76469979e 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py @@ -1,4 +1,6 @@ """Unit tests for `get_default_dataset_configurations`.""" +from typing import Literal + import numpy as np import pytest from hdmf.common import VectorData @@ -9,15 +11,12 @@ from pynwb.testing.mock.file import mock_NWBFile from neuroconv.tools.hdmf import SliceableDataChunkIterator -from neuroconv.tools.nwb_helpers import ( - HDF5DatasetConfiguration, - ZarrDatasetConfiguration, - get_default_dataset_configurations, -) +from neuroconv.tools.nwb_helpers import DATASET_CONFIGURATIONS, get_default_dataset_configurations @pytest.mark.parametrize("iterator", [lambda x: x, SliceableDataChunkIterator, 
DataChunkIterator]) -def test_unwrapped_time_series_hdf5(iterator: callable): +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) +def test_configuration_on_unwrapped_time_series_hdf5(iterator: callable, backend: Literal["hdf5", "zarr"]): array = np.array([[1, 2, 3], [4, 5, 6]]) data = iterator(array) @@ -25,12 +24,12 @@ def test_unwrapped_time_series_hdf5(iterator: callable): time_series = mock_TimeSeries(name="TestTimeSeries", data=data) nwbfile.add_acquisition(time_series) - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend=backend)) assert len(dataset_configurations) == 1 dataset_configuration = dataset_configurations[0] - assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert isinstance(dataset_configuration, DATASET_CONFIGURATIONS[backend]) assert dataset_configuration.dataset_info.object_id == time_series.object_id assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" assert dataset_configuration.dataset_info.full_shape == array.shape @@ -40,70 +39,39 @@ def test_unwrapped_time_series_hdf5(iterator: callable): assert dataset_configuration.compression_method == "gzip" assert dataset_configuration.compression_options is None - -@pytest.mark.parametrize("iterator", [lambda x: x, SliceableDataChunkIterator, DataChunkIterator]) -def test_unwrapped_time_series_zarr(iterator: callable): - array = np.array([[1, 2, 3], [4, 5, 6]]) - data = iterator(array) - - nwbfile = mock_NWBFile() - time_series = mock_TimeSeries(name="TestTimeSeries", data=data) - nwbfile.add_acquisition(time_series) - - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) - - assert len(dataset_configurations) == 1 - - dataset_configuration = dataset_configurations[0] - assert isinstance(dataset_configuration, ZarrDatasetConfiguration) - assert dataset_configuration.dataset_info.object_id == time_series.object_id - assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" - assert dataset_configuration.dataset_info.full_shape == array.shape - assert dataset_configuration.dataset_info.dtype == array.dtype - assert dataset_configuration.chunk_shape == array.shape - assert dataset_configuration.buffer_shape == array.shape - assert dataset_configuration.compression_method == "gzip" - assert dataset_configuration.compression_options is None - assert dataset_configuration.filter_methods is None - assert dataset_configuration.filter_options is None + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None -def test_external_image_series_hdf5(): +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) +def test_configuration_on_external_image_series_hdf5(backend: Literal["hdf5", "zarr"]): nwbfile = mock_NWBFile() image_series = ImageSeries(name="TestImageSeries", external_file=[""], rate=1.0) nwbfile.add_acquisition(image_series) - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) - - assert len(dataset_configurations) == 0 - - -def test_external_image_series_zarr(): - nwbfile = mock_NWBFile() - image_series = ImageSeries(name="TestImageSeries", external_file=[""], rate=1.0) - nwbfile.add_acquisition(image_series) - - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + dataset_configurations 
= list(get_default_dataset_configurations(nwbfile=nwbfile, backend=backend)) assert len(dataset_configurations) == 0 @pytest.mark.parametrize("iterator", [lambda x: x, SliceableDataChunkIterator, DataChunkIterator]) -def test_unwrapped_dynamic_table_hdf5(iterator: callable): +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) +def test_configuration_on_dynamic_table(iterator: callable, backend: Literal["hdf5", "zarr"]): array = np.array([0.1, 0.2, 0.3]) - data = iterator(array.squeeze()) + data = iterator(array) nwbfile = mock_NWBFile() column = VectorData(name="TestColumn", description="", data=data) - dynamic_table = DynamicTable(name="TestDynamicTable", description="", columns=[column]) + dynamic_table = DynamicTable(name="TestDynamicTable", description="", columns=[column], id=list(range(len(array)))) nwbfile.add_acquisition(dynamic_table) - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend=backend)) assert len(dataset_configurations) == 1 dataset_configuration = dataset_configurations[0] - assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert isinstance(dataset_configuration, DATASET_CONFIGURATIONS[backend]) assert dataset_configuration.dataset_info.object_id == column.object_id assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" assert dataset_configuration.dataset_info.full_shape == array.shape @@ -113,30 +81,6 @@ def test_unwrapped_dynamic_table_hdf5(iterator: callable): assert dataset_configuration.compression_method == "gzip" assert dataset_configuration.compression_options is None - -@pytest.mark.parametrize("iterator", [lambda x: x, SliceableDataChunkIterator, DataChunkIterator]) -def test_unwrapped_dynamic_table_zarr(iterator: callable): - array = np.array([0.1, 0.2, 0.3]) - data = iterator(array.squeeze()) - - nwbfile = mock_NWBFile() - column = VectorData(name="TestColumn", description="", data=data) - dynamic_table = DynamicTable(name="TestDynamicTable", description="", columns=[column]) - nwbfile.add_acquisition(dynamic_table) - - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) - - assert len(dataset_configurations) == 1 - - dataset_configuration = dataset_configurations[0] - assert isinstance(dataset_configuration, ZarrDatasetConfiguration) - assert dataset_configuration.dataset_info.object_id == column.object_id - assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" - assert dataset_configuration.dataset_info.full_shape == array.shape - assert dataset_configuration.dataset_info.dtype == array.dtype - assert dataset_configuration.chunk_shape == array.shape - assert dataset_configuration.buffer_shape == array.shape - assert dataset_configuration.compression_method == "gzip" - assert dataset_configuration.compression_options is None - assert dataset_configuration.filter_methods is None - assert dataset_configuration.filter_options is None + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None From 91aab8c27579c140793ed2d196128f4aacc1a425 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Nov 2023 02:18:53 +0000 Subject: [PATCH 14/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci --- .../test_helpers/test_get_default_dataset_configurations.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py index 76469979e..2ed3b81d5 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py @@ -11,7 +11,10 @@ from pynwb.testing.mock.file import mock_NWBFile from neuroconv.tools.hdmf import SliceableDataChunkIterator -from neuroconv.tools.nwb_helpers import DATASET_CONFIGURATIONS, get_default_dataset_configurations +from neuroconv.tools.nwb_helpers import ( + DATASET_CONFIGURATIONS, + get_default_dataset_configurations, +) @pytest.mark.parametrize("iterator", [lambda x: x, SliceableDataChunkIterator, DataChunkIterator]) From 65eee6bad4f93ce147ee68a02d1c14b053ace342 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Tue, 7 Nov 2023 22:25:22 -0500 Subject: [PATCH 15/27] fix name --- .../test_helpers/test_get_default_dataset_configurations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py index 76469979e..0b79fdedc 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py @@ -16,7 +16,7 @@ @pytest.mark.parametrize("iterator", [lambda x: x, SliceableDataChunkIterator, DataChunkIterator]) @pytest.mark.parametrize("backend", ["hdf5", "zarr"]) -def test_configuration_on_unwrapped_time_series_hdf5(iterator: callable, backend: Literal["hdf5", "zarr"]): +def test_configuration_on_time_series(iterator: callable, backend: Literal["hdf5", "zarr"]): array = np.array([[1, 2, 3], [4, 5, 6]]) data = iterator(array) @@ -45,7 +45,7 @@ def test_configuration_on_unwrapped_time_series_hdf5(iterator: callable, backend @pytest.mark.parametrize("backend", ["hdf5", "zarr"]) -def test_configuration_on_external_image_series_hdf5(backend: Literal["hdf5", "zarr"]): +def test_configuration_on_external_image_series(backend: Literal["hdf5", "zarr"]): nwbfile = mock_NWBFile() image_series = ImageSeries(name="TestImageSeries", external_file=[""], rate=1.0) nwbfile.add_acquisition(image_series) From 673e2f99e295fc5b47f9a29c6400d30adcc41687 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 20 Nov 2023 14:44:59 -0500 Subject: [PATCH 16/27] Apply suggestions from code review Co-authored-by: Heberto Mayorquin --- src/neuroconv/tools/nwb_helpers/_dataset_configuration.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index e18b3d17f..2d9023130 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ 
b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -21,7 +21,7 @@ def _get_mode(io: Union[NWBHDF5IO, NWBZarrIO]) -> str: - """NWBHDF5IO and NWBZarrIO have different ways of storing the mode they used on a path.""" + """NWBHDF5IO and NWBZarrIO have different ways of storing the io mode (e.g. "r", "a", "w") they used on a path.""" if isinstance(io, NWBHDF5IO): return io.mode elif isinstance(io, NWBZarrIO): @@ -65,7 +65,7 @@ def _parse_location_in_memory_nwbfile(current_location: str, neurodata_object: C def _get_dataset_metadata( neurodata_object: Union[TimeSeries, DynamicTable], field_name: str, backend: Literal["hdf5", "zarr"] -) -> Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration]: +) -> Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration, None]: """Fill in the Dataset model with as many values as can be automatically detected or inferred.""" DatasetConfigurationClass = BACKEND_TO_DATASET_CONFIGURATION[backend] @@ -176,7 +176,7 @@ def get_default_dataset_configurations( continue # skip # Edge case of in-memory ImageSeries with external mode; data is in fields and is empty array - if isinstance(candidate_dataset, np.ndarray) and not np.any(candidate_dataset): + if isinstance(candidate_dataset, np.ndarray) and candidate_dataset.size == 0: continue # skip yield _get_dataset_metadata(neurodata_object=time_series, field_name=field_name, backend=backend) From 8630316c05fae29e4666559350c8cbbdcdb40c4e Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Tue, 21 Nov 2023 10:27:49 -0500 Subject: [PATCH 17/27] PR suggestions --- .../nwb_helpers/_dataset_configuration.py | 113 ++++++++++-------- 1 file changed, 61 insertions(+), 52 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index 2d9023130..67b0a14b6 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -1,5 +1,5 @@ """Collection of helper functions related to configuration of datasets dependent on backend.""" -from typing import Iterable, Literal, Union +from typing import Iterable, Literal, Union, Generator import h5py import numpy as np @@ -20,7 +20,7 @@ BACKEND_TO_CONFIGURATION = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) -def _get_mode(io: Union[NWBHDF5IO, NWBZarrIO]) -> str: +def _get_io_mode(io: Union[NWBHDF5IO, NWBZarrIO]) -> str: """NWBHDF5IO and NWBZarrIO have different ways of storing the io mode (e.g. 
"r", "a", "w") they used on a path.""" if isinstance(io, NWBHDF5IO): return io.mode @@ -28,7 +28,7 @@ def _get_mode(io: Union[NWBHDF5IO, NWBZarrIO]) -> str: return io._ZarrIO__mode -def _is_value_already_written_to_file( +def _is_dataset_written_to_file( candidate_dataset: Union[h5py.Dataset, zarr.Array], backend: Literal["hdf5", "zarr"], existing_file: Union[h5py.File, zarr.Group, None], @@ -40,11 +40,11 @@ def _is_value_already_written_to_file( """ return ( isinstance(candidate_dataset, h5py.Dataset) # If the source data is an HDF5 Dataset - and backend == "hdf5" # If working in append mode + and backend == "hdf5" and candidate_dataset.file == existing_file # If the source HDF5 Dataset is the appending NWBFile ) or ( isinstance(candidate_dataset, zarr.Array) # If the source data is an Zarr Array - and backend == "zarr" # If working in append mode + and backend == "zarr" and candidate_dataset.store == existing_file # If the source Zarr 'file' is the appending NWBFile ) @@ -54,15 +54,30 @@ def _parse_location_in_memory_nwbfile(current_location: str, neurodata_object: C if isinstance(parent, NWBFile): # Items in defined top-level places like acquisition, intervals, etc. do not act as 'containers' # in the .parent sense; ask if object is in their in-memory dictionaries instead - for outer_field_name, outer_field_value in parent.fields.items(): - if isinstance(outer_field_value, dict) and neurodata_object.name in outer_field_value: - return outer_field_name + "/" + neurodata_object.name + "/" + current_location + for parent_field_name, parent_field_value in parent.fields.items(): + if isinstance(parent_field_value, dict) and neurodata_object.name in parent_field_value: + return parent_field_name + "/" + neurodata_object.name + "/" + current_location return neurodata_object.name + "/" + current_location return _parse_location_in_memory_nwbfile( current_location=neurodata_object.name + "/" + current_location, neurodata_object=parent ) +def _infer_dtype_using_data_chunk_iterator(candidate_dataset: Union[h5py.Dataset, zarr.Array]): + """ + The DataChunkIterator has one of the best generic dtype inference, though logic is hard to peel out of it. + + It can fail in rare cases but not essential to our default configuration + """ + try: + return DataChunkIterator(candidate_dataset).dtype + except Exception as exception: + if str(exception) != "Data type could not be determined. Please specify dtype in DataChunkIterator init.": + raise exception + else: + return np.dtype("object") + + def _get_dataset_metadata( neurodata_object: Union[TimeSeries, DynamicTable], field_name: str, backend: Literal["hdf5", "zarr"] ) -> Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration, None]: @@ -70,56 +85,50 @@ def _get_dataset_metadata( DatasetConfigurationClass = BACKEND_TO_DATASET_CONFIGURATION[backend] candidate_dataset = getattr(neurodata_object, field_name) + # For now, skip over datasets already wrapped in DataIO # Could maybe eventually support modifying chunks in place # But setting buffer shape only possible if iterator was wrapped first - if not isinstance(candidate_dataset, DataIO): - # DataChunkIterator has best generic dtype inference, though logic is hard to peel out of it - # And it can fail in rare cases but not essential to our default configuration - try: - dtype = DataChunkIterator(candidate_dataset).dtype - except Exception as exception: - if str(exception) != "Data type could not be determined. 
Please specify dtype in DataChunkIterator init.": - raise exception - else: - dtype = np.dtype("object") - - full_shape = get_data_shape(data=candidate_dataset) - - if isinstance(candidate_dataset, GenericDataChunkIterator): - chunk_shape = candidate_dataset.chunk_shape - buffer_shape = candidate_dataset.buffer_shape - elif dtype != "unknown": - # TODO: eventually replace this with staticmethods on hdmf.data_utils.GenericDataChunkIterator - chunk_shape = SliceableDataChunkIterator.estimate_default_chunk_shape( - chunk_mb=10.0, maxshape=full_shape, dtype=np.dtype(dtype) - ) - buffer_shape = SliceableDataChunkIterator.estimate_default_buffer_shape( - buffer_gb=0.5, chunk_shape=chunk_shape, maxshape=full_shape, dtype=np.dtype(dtype) - ) - else: - pass # TODO: think on this; perhaps zarr's standalone estimator? - - location = _parse_location_in_memory_nwbfile(current_location=field_name, neurodata_object=neurodata_object) - dataset_name = location.strip("/")[-1] - dataset_info = DatasetInfo( - object_id=neurodata_object.object_id, - object_name=neurodata_object.name, - location=location, - dataset_name=dataset_name, - full_shape=full_shape, - dtype=dtype, + if isinstance(candidate_dataset, DataIO): + return None + + # DataChunkIterator has best generic dtype inference, though logic is hard to peel out of it + # And it can fail in rare cases but not essential to our default configuration + dtype = _infer_dtype_using_data_chunk_iterator(candidate_dataset=candidate_dataset) + full_shape = get_data_shape(data=candidate_dataset) + + if isinstance(candidate_dataset, GenericDataChunkIterator): + chunk_shape = candidate_dataset.chunk_shape + buffer_shape = candidate_dataset.buffer_shape + elif dtype != "unknown": + # TODO: eventually replace this with staticmethods on hdmf.data_utils.GenericDataChunkIterator + chunk_shape = SliceableDataChunkIterator.estimate_default_chunk_shape( + chunk_mb=10.0, maxshape=full_shape, dtype=np.dtype(dtype) ) - dataset_configuration = DatasetConfigurationClass( - dataset_info=dataset_info, chunk_shape=chunk_shape, buffer_shape=buffer_shape + buffer_shape = SliceableDataChunkIterator.estimate_default_buffer_shape( + buffer_gb=0.5, chunk_shape=chunk_shape, maxshape=full_shape, dtype=np.dtype(dtype) ) - return dataset_configuration + else: + pass # TODO: think on this; perhaps zarr's standalone estimator? + + location = _parse_location_in_memory_nwbfile(current_location=field_name, neurodata_object=neurodata_object) + dataset_info = DatasetInfo( + object_id=neurodata_object.object_id, + object_name=neurodata_object.name, + location=location, + full_shape=full_shape, + dtype=dtype, + ) + dataset_configuration = DatasetConfigurationClass( + dataset_info=dataset_info, chunk_shape=chunk_shape, buffer_shape=buffer_shape + ) + return dataset_configuration def get_default_dataset_configurations( nwbfile: NWBFile, backend: Union[None, Literal["hdf5", "zarr"]] = None, # None for auto-detect from append mode, otherwise required -) -> Iterable[DatasetConfiguration]: +) -> Generator[DatasetConfiguration, None, None]: """ Method for automatically detecting all objects in the file that could be wrapped in a DataIO. 
@@ -147,10 +156,10 @@ def get_default_dataset_configurations( detected_backend = None existing_file = None - if isinstance(nwbfile.read_io, NWBHDF5IO) and _get_mode(io=nwbfile.read_io) in ("r+", "a"): + if isinstance(nwbfile.read_io, NWBHDF5IO) and _get_io_mode(io=nwbfile.read_io) in ("r+", "a"): detected_backend = "hdf5" existing_file = nwbfile.read_io._file - elif isinstance(nwbfile.read_io, NWBZarrIO) and _get_mode(io=nwbfile.read_io) in ("r+", "a"): + elif isinstance(nwbfile.read_io, NWBZarrIO) and _get_io_mode(io=nwbfile.read_io) in ("r+", "a"): detected_backend = "zarr" existing_file = nwbfile.read_io.file.store backend = backend or detected_backend @@ -170,7 +179,7 @@ def get_default_dataset_configurations( continue candidate_dataset = getattr(time_series, field_name) - if _is_value_already_written_to_file( + if _is_dataset_written_to_file( candidate_dataset=candidate_dataset, backend=backend, existing_file=existing_file ): continue # skip @@ -185,7 +194,7 @@ def get_default_dataset_configurations( for column_name in dynamic_table.colnames: candidate_dataset = dynamic_table[column_name].data # VectorData object - if _is_value_already_written_to_file( + if _is_dataset_written_to_file( candidate_dataset=candidate_dataset, backend=backend, existing_file=existing_file ): continue # skip From f7e1be6dad73116a12a6e9dac96ebf5a15e3bf21 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Tue, 21 Nov 2023 10:29:19 -0500 Subject: [PATCH 18/27] Update src/neuroconv/tools/nwb_helpers/_dataset_configuration.py --- src/neuroconv/tools/nwb_helpers/_dataset_configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index 67b0a14b6..6a98b23e9 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -53,7 +53,7 @@ def _parse_location_in_memory_nwbfile(current_location: str, neurodata_object: C parent = neurodata_object.parent if isinstance(parent, NWBFile): # Items in defined top-level places like acquisition, intervals, etc. 
do not act as 'containers' - # in the .parent sense; ask if object is in their in-memory dictionaries instead + # in that they do not set the `.parent` attribute; ask if object is in their in-memory dictionaries instead for parent_field_name, parent_field_value in parent.fields.items(): if isinstance(parent_field_value, dict) and neurodata_object.name in parent_field_value: return parent_field_name + "/" + neurodata_object.name + "/" + current_location From 6f0806aeedbe3061dbac0b92cd071e4227528e0e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Nov 2023 15:30:07 +0000 Subject: [PATCH 19/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/neuroconv/tools/nwb_helpers/_dataset_configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index 6a98b23e9..f964ea472 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -1,5 +1,5 @@ """Collection of helper functions related to configuration of datasets dependent on backend.""" -from typing import Iterable, Literal, Union, Generator +from typing import Generator, Iterable, Literal, Union import h5py import numpy as np From b07a541a030eda6e5e36800a142854bb76b18276 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Tue, 21 Nov 2023 10:53:22 -0500 Subject: [PATCH 20/27] add IO to dataset config names --- src/neuroconv/tools/nwb_helpers/__init__.py | 8 ++-- .../nwb_helpers/_dataset_configuration.py | 22 +++++----- .../tools/nwb_helpers/_models/_base_models.py | 4 +- .../tools/nwb_helpers/_models/_hdf5_models.py | 6 +-- .../tools/nwb_helpers/_models/_zarr_models.py | 6 +-- src/neuroconv/tools/testing/__init__.py | 4 +- .../testing/_mock/_mock_dataset_models.py | 28 ++++++------- ..._common_dataset_io_configuration_model.py} | 40 +++++++++---------- ..._get_default_dataset_io_configurations.py} | 16 ++++---- ...taset_io_configurations_appended_files.py} | 24 +++++------ ...=> test_dataset_io_configuration_model.py} | 12 +++--- .../test_hdf5_backend_configuration_model.py | 2 +- ...st_hdf5_dataset_io_configuration_model.py} | 26 ++++++------ .../test_zarr_backend_configuration_model.py | 2 +- ...st_zarr_dataset_io_configuration_model.py} | 40 +++++++++---------- 15 files changed, 119 insertions(+), 121 deletions(-) rename tests/test_minimal/test_tools/test_backend_and_dataset_configuration/{test_common_dataset_configuration_model.py => test_common_dataset_io_configuration_model.py} (80%) rename tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/{test_get_default_dataset_configurations.py => test_get_default_dataset_io_configurations.py} (84%) rename tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/{test_get_default_dataset_configurations_appended_files.py => test_get_default_dataset_io_configurations_appended_files.py} (86%) rename tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/{test_dataset_configuration_model.py => test_dataset_io_configuration_model.py} (58%) rename tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/{test_hdf5_dataset_configuration_model.py => test_hdf5_dataset_io_configuration_model.py} (73%) rename 
tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/{test_zarr_dataset_configuration_model.py => test_zarr_dataset_io_configuration_model.py} (79%) diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index c17fa3ef0..cb78a67a5 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -1,4 +1,4 @@ -from ._dataset_configuration import get_default_dataset_configurations +from ._dataset_configuration import get_default_dataset_io_configurations from ._metadata_and_file_helpers import ( add_device_from_metadata, get_default_nwbfile_metadata, @@ -10,13 +10,13 @@ from ._models._hdf5_models import ( AVAILABLE_HDF5_COMPRESSION_METHODS, HDF5BackendConfiguration, - HDF5DatasetConfiguration, + HDF5DatasetIOConfiguration, ) from ._models._zarr_models import ( AVAILABLE_ZARR_COMPRESSION_METHODS, ZarrBackendConfiguration, - ZarrDatasetConfiguration, + ZarrDatasetIOConfiguration, ) -DATASET_CONFIGURATIONS = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration) BACKEND_CONFIGURATIONS = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) +DATASET_IO_CONFIGURATIONS = dict(hdf5=HDF5DatasetIOConfiguration, zarr=ZarrDatasetIOConfiguration) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index 6a98b23e9..775f7c923 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -1,5 +1,5 @@ """Collection of helper functions related to configuration of datasets dependent on backend.""" -from typing import Iterable, Literal, Union, Generator +from typing import Literal, Union, Generator import h5py import numpy as np @@ -11,12 +11,12 @@ from pynwb import NWBHDF5IO, NWBFile, TimeSeries from pynwb.base import DynamicTable -from ._models._base_models import DatasetConfiguration, DatasetInfo -from ._models._hdf5_models import HDF5BackendConfiguration, HDF5DatasetConfiguration -from ._models._zarr_models import ZarrBackendConfiguration, ZarrDatasetConfiguration +from ._models._base_models import DatasetIOConfiguration, DatasetInfo +from ._models._hdf5_models import HDF5BackendConfiguration, HDF5DatasetIOConfiguration +from ._models._zarr_models import ZarrBackendConfiguration, ZarrDatasetIOConfiguration from ..hdmf import SliceableDataChunkIterator -BACKEND_TO_DATASET_CONFIGURATION = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration) +BACKEND_TO_DATASET_CONFIGURATION = dict(hdf5=HDF5DatasetIOConfiguration, zarr=ZarrDatasetIOConfiguration) BACKEND_TO_CONFIGURATION = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) @@ -80,9 +80,9 @@ def _infer_dtype_using_data_chunk_iterator(candidate_dataset: Union[h5py.Dataset def _get_dataset_metadata( neurodata_object: Union[TimeSeries, DynamicTable], field_name: str, backend: Literal["hdf5", "zarr"] -) -> Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration, None]: +) -> Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration, None]: """Fill in the Dataset model with as many values as can be automatically detected or inferred.""" - DatasetConfigurationClass = BACKEND_TO_DATASET_CONFIGURATION[backend] + DatasetIOConfigurationClass = BACKEND_TO_DATASET_CONFIGURATION[backend] candidate_dataset = getattr(neurodata_object, field_name) @@ -119,16 +119,16 @@ def _get_dataset_metadata( full_shape=full_shape, dtype=dtype, ) - 
dataset_configuration = DatasetConfigurationClass( + dataset_configuration = DatasetIOConfigurationClass( dataset_info=dataset_info, chunk_shape=chunk_shape, buffer_shape=buffer_shape ) return dataset_configuration -def get_default_dataset_configurations( +def get_default_dataset_io_configurations( nwbfile: NWBFile, backend: Union[None, Literal["hdf5", "zarr"]] = None, # None for auto-detect from append mode, otherwise required -) -> Generator[DatasetConfiguration, None, None]: +) -> Generator[DatasetIOConfiguration, None, None]: """ Method for automatically detecting all objects in the file that could be wrapped in a DataIO. @@ -141,7 +141,7 @@ def get_default_dataset_configurations( Yields ------ - DatasetConfiguration + DatasetIOConfiguration A summary of each detected object that can be wrapped in a DataIO. """ if backend is None and nwbfile.read_io is None: diff --git a/src/neuroconv/tools/nwb_helpers/_models/_base_models.py b/src/neuroconv/tools/nwb_helpers/_models/_base_models.py index 72b364dea..8a6486e74 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_base_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_base_models.py @@ -62,7 +62,7 @@ def __init__(self, **values): super().__init__(**values) -class DatasetConfiguration(BaseModel, ABC): +class DatasetIOConfiguration(BaseModel, ABC): """A data model for configuring options about an object that will become a HDF5 or Zarr Dataset in the file.""" # TODO: When using Pydantic v2, remove @@ -188,7 +188,7 @@ class BackendConfiguration(BaseModel): backend: Literal["hdf5", "zarr"] = Field(description="The name of the backend used to configure the NWBFile.") data_io_class: Type[DataIO] = Field(description="The DataIO class that is specific to this backend.") - dataset_configurations: Dict[str, DatasetConfiguration] = Field( + dataset_configurations: Dict[str, DatasetIOConfiguration] = Field( description=( "A mapping from object locations (e.g. `acquisition/TestElectricalSeriesAP/data`) " "to their DatasetConfiguration specification that contains all information " diff --git a/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py b/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py index daf772688..b34671154 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py @@ -6,7 +6,7 @@ from pydantic import Field from pynwb import H5DataIO -from ._base_models import BackendConfiguration, DatasetConfiguration +from ._base_models import BackendConfiguration, DatasetIOConfiguration _base_hdf5_filters = set(h5py.filters.decode) _excluded_hdf5_filters = set( @@ -29,7 +29,7 @@ ) -class HDF5DatasetConfiguration(DatasetConfiguration): +class HDF5DatasetIOConfiguration(DatasetIOConfiguration): """A data model for configuring options about an object that will become a HDF5 Dataset in the file.""" # TODO: When using Pydantic v2, replace with `model_config = ConfigDict(...)` @@ -90,7 +90,7 @@ class HDF5BackendConfiguration(BackendConfiguration): data_io_class: Type[H5DataIO] = Field( # TODO: in pydantic v2 use property instead of class attribute default=H5DataIO, description="The DataIO class that is specific to HDF5." ) - dataset_configurations: Dict[str, HDF5DatasetConfiguration] = Field( + dataset_configurations: Dict[str, HDF5DatasetIOConfiguration] = Field( description=( "A mapping from object locations to their HDF5DatasetConfiguration specification that contains all " "information for writing the datasets to disk using the HDF5 backend." 
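For orientation, the renamed generator above is used the way the unit tests in this series exercise it. The snippet below is a minimal sketch, assuming the post-rename names exported from neuroconv.tools.nwb_helpers and the pynwb testing mocks imported in the test modules; the TimeSeries name is illustrative, and the exact chunk/buffer shapes come from the default estimators rather than being fixed values.

    import numpy as np
    from pynwb.testing.mock.base import mock_TimeSeries
    from pynwb.testing.mock.file import mock_NWBFile

    from neuroconv.tools.nwb_helpers import get_default_dataset_io_configurations

    # Build an in-memory NWBFile holding a small TimeSeries, as the tests in this series do.
    nwbfile = mock_NWBFile()
    time_series = mock_TimeSeries(name="ExampleTimeSeries", data=np.array([[1, 2, 3], [4, 5, 6]]))
    nwbfile.add_acquisition(time_series)

    # One DatasetIOConfiguration is yielded per dataset that could be wrapped in a DataIO;
    # `backend` selects between the HDF5 and Zarr configuration models.
    for dataset_configuration in get_default_dataset_io_configurations(nwbfile=nwbfile, backend="hdf5"):
        print(dataset_configuration.dataset_info.location)  # e.g. "acquisition/ExampleTimeSeries/data"
        print(dataset_configuration.chunk_shape, dataset_configuration.compression_method)
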
diff --git a/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py index 760c7c2a9..14214b513 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py @@ -7,7 +7,7 @@ from hdmf_zarr import ZarrDataIO from pydantic import Field, root_validator -from ._base_models import BackendConfiguration, DatasetConfiguration +from ._base_models import BackendConfiguration, DatasetIOConfiguration _base_zarr_codecs = set(zarr.codec_registry.keys()) _lossy_zarr_codecs = set(("astype", "bitround", "quantize")) @@ -43,7 +43,7 @@ } -class ZarrDatasetConfiguration(DatasetConfiguration): +class ZarrDatasetIOConfiguration(DatasetIOConfiguration): """A data model for configuring options about an object that will become a Zarr Dataset in the file.""" # TODO: When using Pydantic v2, replace with `model_config = ConfigDict(...)` @@ -147,7 +147,7 @@ class ZarrBackendConfiguration(BackendConfiguration): data_io_class: Type[ZarrDataIO] = Field( default=ZarrDataIO, description="The DataIO class that is specific to Zarr." ) - dataset_configurations: Dict[str, ZarrDatasetConfiguration] = Field( + dataset_configurations: Dict[str, ZarrDatasetIOConfiguration] = Field( description=( "A mapping from object locations to their ZarrDatasetConfiguration specification that contains all " "information for writing the datasets to disk using the Zarr backend." diff --git a/src/neuroconv/tools/testing/__init__.py b/src/neuroconv/tools/testing/__init__.py index 502634466..2d5b06497 100644 --- a/src/neuroconv/tools/testing/__init__.py +++ b/src/neuroconv/tools/testing/__init__.py @@ -1,9 +1,9 @@ from ._mock._mock_dataset_models import ( mock_DatasetInfo, mock_HDF5BackendConfiguration, - mock_HDF5DatasetConfiguration, + mock_HDF5DatasetIOConfiguration, mock_ZarrBackendConfiguration, - mock_ZarrDatasetConfiguration, + mock_ZarrDatasetIOConfiguration, ) from .mock_files import generate_path_expander_demo_ibl from .mock_interfaces import MockBehaviorEventInterface, MockSpikeGLXNIDQInterface diff --git a/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py b/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py index 6860f7078..e8ea80826 100644 --- a/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py +++ b/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py @@ -9,9 +9,9 @@ AVAILABLE_ZARR_COMPRESSION_METHODS, DatasetInfo, HDF5BackendConfiguration, - HDF5DatasetConfiguration, + HDF5DatasetIOConfiguration, ZarrBackendConfiguration, - ZarrDatasetConfiguration, + ZarrDatasetIOConfiguration, ) @@ -30,14 +30,14 @@ def mock_DatasetInfo( ) -def mock_HDF5DatasetConfiguration( +def mock_HDF5DatasetIOConfiguration( compression_method: Union[ Literal[tuple(AVAILABLE_HDF5_COMPRESSION_METHODS.keys())], h5py._hl.filters.FilterRefBase, None ] = "gzip", compression_options: Union[Dict[str, Any], None] = None, -) -> HDF5DatasetConfiguration: - """Mock instance of a HDF5DatasetConfiguration with NeuroPixel-like values to show chunk/buffer recommendations.""" - return HDF5DatasetConfiguration( +) -> HDF5DatasetIOConfiguration: + """Mock object of a HDF5DatasetIOConfiguration with NeuroPixel-like values to show chunk/buffer recommendations.""" + return HDF5DatasetIOConfiguration( dataset_info=mock_DatasetInfo(), chunk_shape=(78_125, 64), # ~10 MB buffer_shape=(1_250_000, 384), # ~1 GB @@ -46,7 +46,7 @@ def mock_HDF5DatasetConfiguration( ) -def mock_ZarrDatasetConfiguration( +def 
mock_ZarrDatasetIOConfiguration( compression_method: Union[ Literal[tuple(AVAILABLE_ZARR_COMPRESSION_METHODS.keys())], numcodecs.abc.Codec, None ] = "gzip", @@ -55,9 +55,9 @@ def mock_ZarrDatasetConfiguration( Union[Literal[tuple(AVAILABLE_ZARR_COMPRESSION_METHODS.keys())], numcodecs.abc.Codec, None] ] = None, filter_options: Union[Iterable[Dict[str, Any]], None] = None, -) -> ZarrDatasetConfiguration: - """Mock instance of a ZarrDatasetConfiguration with NeuroPixel-like values to show chunk/buffer recommendations.""" - return ZarrDatasetConfiguration( +) -> ZarrDatasetIOConfiguration: + """Mock object of a ZarrDatasetIOConfiguration with NeuroPixel-like values to show chunk/buffer recommendations.""" + return ZarrDatasetIOConfiguration( dataset_info=mock_DatasetInfo(), chunk_shape=(78_125, 64), # ~10 MB buffer_shape=(1_250_000, 384), # ~1 GB @@ -71,12 +71,12 @@ def mock_ZarrDatasetConfiguration( def mock_HDF5BackendConfiguration() -> HDF5BackendConfiguration: """Mock instance of a HDF5BackendConfiguration with two NeuroPixel-like datasets.""" dataset_configurations = { - "acquisition/TestElectricalSeriesAP/data": HDF5DatasetConfiguration( + "acquisition/TestElectricalSeriesAP/data": HDF5DatasetIOConfiguration( dataset_info=mock_DatasetInfo(location="acquisition/TestElectricalSeriesAP/data"), chunk_shape=(78_125, 64), # ~10 MB buffer_shape=(1_250_000, 384), # ~1 GB ), - "acquisition/TestElectricalSeriesLF/data": HDF5DatasetConfiguration( + "acquisition/TestElectricalSeriesLF/data": HDF5DatasetIOConfiguration( dataset_info=mock_DatasetInfo( object_id="bc37e164-519f-4b65-a976-206440f1d325", location="acquisition/TestElectricalSeriesLF/data", @@ -93,13 +93,13 @@ def mock_HDF5BackendConfiguration() -> HDF5BackendConfiguration: def mock_ZarrBackendConfiguration() -> ZarrBackendConfiguration: """Mock instance of a HDF5BackendConfiguration with several NeuroPixel-like datasets.""" dataset_configurations = { - "acquisition/TestElectricalSeriesAP/data": ZarrDatasetConfiguration( + "acquisition/TestElectricalSeriesAP/data": ZarrDatasetIOConfiguration( dataset_info=mock_DatasetInfo(location="acquisition/TestElectricalSeriesAP/data"), chunk_shape=(78_125, 64), buffer_shape=(1_250_000, 384), # ~1 GB filter_methods=["delta"], ), - "acquisition/TestElectricalSeriesLF/data": ZarrDatasetConfiguration( + "acquisition/TestElectricalSeriesLF/data": ZarrDatasetIOConfiguration( dataset_info=mock_DatasetInfo( object_id="bc37e164-519f-4b65-a976-206440f1d325", location="acquisition/TestElectricalSeriesLF/data", diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_io_configuration_model.py similarity index 80% rename from tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_configuration_model.py rename to tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_io_configuration_model.py index 892638a2c..3babb046e 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_io_configuration_model.py @@ -4,21 +4,21 @@ import pytest from neuroconv.tools.nwb_helpers import ( - HDF5DatasetConfiguration, - ZarrDatasetConfiguration, + HDF5DatasetIOConfiguration, + ZarrDatasetIOConfiguration, ) from neuroconv.tools.testing import ( 
mock_DatasetInfo, - mock_HDF5DatasetConfiguration, - mock_ZarrDatasetConfiguration, + mock_HDF5DatasetIOConfiguration, + mock_ZarrDatasetIOConfiguration, ) @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ) def test_validator_chunk_length_consistency( - dataset_configuration_class: Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: dataset_configuration_class( @@ -35,10 +35,10 @@ def test_validator_chunk_length_consistency( @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ) def test_validator_chunk_and_buffer_length_consistency( - dataset_configuration_class: Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: dataset_configuration_class( @@ -55,10 +55,10 @@ def test_validator_chunk_and_buffer_length_consistency( @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ) def test_validator_chunk_shape_nonpositive_elements( - dataset_configuration_class: Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: dataset_configuration_class( @@ -75,10 +75,10 @@ def test_validator_chunk_shape_nonpositive_elements( @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ) def test_validator_buffer_shape_nonpositive_elements( - dataset_configuration_class: Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: dataset_configuration_class( @@ -95,10 +95,10 @@ def test_validator_buffer_shape_nonpositive_elements( @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ) def test_validator_chunk_shape_exceeds_buffer_shape( - dataset_configuration_class: Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: dataset_configuration_class( @@ -115,10 +115,10 @@ def test_validator_chunk_shape_exceeds_buffer_shape( @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ) def 
test_validator_buffer_shape_exceeds_full_shape( - dataset_configuration_class: Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: dataset_configuration_class( @@ -135,10 +135,10 @@ def test_validator_buffer_shape_exceeds_full_shape( @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ) def test_validator_chunk_dimensions_do_not_evenly_divide_buffer( - dataset_configuration_class: Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: dataset_configuration_class( @@ -155,10 +155,10 @@ def test_validator_chunk_dimensions_do_not_evenly_divide_buffer( @pytest.mark.parametrize( - argnames="mock_dataset_configuration", argvalues=[mock_HDF5DatasetConfiguration(), mock_ZarrDatasetConfiguration()] + argnames="mock_dataset_configuration", argvalues=[mock_HDF5DatasetIOConfiguration(), mock_ZarrDatasetIOConfiguration()] ) def test_mutation_validation( - mock_dataset_configuration: Union[mock_HDF5DatasetConfiguration, mock_ZarrDatasetConfiguration] + mock_dataset_configuration: Union[mock_HDF5DatasetIOConfiguration, mock_ZarrDatasetIOConfiguration] ): """ Only testing on one dummy case to verify the root validator is triggered. diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py similarity index 84% rename from tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py rename to tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py index 9b88c0027..b91cc45fb 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py @@ -1,4 +1,4 @@ -"""Unit tests for `get_default_dataset_configurations`.""" +"""Unit tests for `get_default_dataset_io_configurations`.""" from typing import Literal import numpy as np @@ -12,8 +12,8 @@ from neuroconv.tools.hdmf import SliceableDataChunkIterator from neuroconv.tools.nwb_helpers import ( - DATASET_CONFIGURATIONS, - get_default_dataset_configurations, + DATASET_IO_CONFIGURATIONS, + get_default_dataset_io_configurations, ) @@ -27,12 +27,12 @@ def test_configuration_on_time_series(iterator: callable, backend: Literal["hdf5 time_series = mock_TimeSeries(name="TestTimeSeries", data=data) nwbfile.add_acquisition(time_series) - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend=backend)) + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) assert len(dataset_configurations) == 1 dataset_configuration = dataset_configurations[0] - assert isinstance(dataset_configuration, DATASET_CONFIGURATIONS[backend]) + assert isinstance(dataset_configuration, 
DATASET_IO_CONFIGURATIONS[backend]) assert dataset_configuration.dataset_info.object_id == time_series.object_id assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" assert dataset_configuration.dataset_info.full_shape == array.shape @@ -53,7 +53,7 @@ def test_configuration_on_external_image_series(backend: Literal["hdf5", "zarr"] image_series = ImageSeries(name="TestImageSeries", external_file=[""], rate=1.0) nwbfile.add_acquisition(image_series) - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend=backend)) + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) assert len(dataset_configurations) == 0 @@ -69,12 +69,12 @@ def test_configuration_on_dynamic_table(iterator: callable, backend: Literal["hd dynamic_table = DynamicTable(name="TestDynamicTable", description="", columns=[column], id=list(range(len(array)))) nwbfile.add_acquisition(dynamic_table) - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend=backend)) + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) assert len(dataset_configurations) == 1 dataset_configuration = dataset_configurations[0] - assert isinstance(dataset_configuration, DATASET_CONFIGURATIONS[backend]) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) assert dataset_configuration.dataset_info.object_id == column.object_id assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" assert dataset_configuration.dataset_info.full_shape == array.shape diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations_appended_files.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations_appended_files.py similarity index 86% rename from tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations_appended_files.py rename to tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations_appended_files.py index f33334e93..3125bfc73 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_configurations_appended_files.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations_appended_files.py @@ -1,5 +1,5 @@ """ -Unit tests for `get_default_dataset_configurations` operating on already written files open in append mode. +Unit tests for `get_default_dataset_io_configurations` operating on already written files open in append mode. Mostly testing that the right objects are skipped from identification as candidates for configuration. 
""" from pathlib import Path @@ -14,9 +14,9 @@ from pynwb.testing.mock.file import mock_NWBFile from neuroconv.tools.nwb_helpers import ( - HDF5DatasetConfiguration, - ZarrDatasetConfiguration, - get_default_dataset_configurations, + HDF5DatasetIOConfiguration, + ZarrDatasetIOConfiguration, + get_default_dataset_io_configurations, ) @@ -55,12 +55,12 @@ def test_unwrapped_time_series_hdf5(hdf5_nwbfile_path): nwbfile = io.read() new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) nwbfile.add_acquisition(new_time_series) - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend="hdf5")) assert len(dataset_configurations) == 1 dataset_configuration = dataset_configurations[0] - assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert isinstance(dataset_configuration, HDF5DatasetIOConfiguration) assert dataset_configuration.dataset_info.object_id == new_time_series.object_id assert dataset_configuration.dataset_info.location == "acquisition/NewTimeSeries/data" assert dataset_configuration.dataset_info.full_shape == array.shape @@ -78,12 +78,12 @@ def test_unwrapped_time_series_zarr(zarr_nwbfile_path): nwbfile = io.read() new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) nwbfile.add_acquisition(new_time_series) - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend="zarr")) assert len(dataset_configurations) == 1 dataset_configuration = dataset_configurations[0] - assert isinstance(dataset_configuration, ZarrDatasetConfiguration) + assert isinstance(dataset_configuration, ZarrDatasetIOConfiguration) assert dataset_configuration.dataset_info.object_id == new_time_series.object_id assert dataset_configuration.dataset_info.location == "acquisition/NewTimeSeries/data" assert dataset_configuration.dataset_info.full_shape == array.shape @@ -104,12 +104,12 @@ def test_unwrapped_dynamic_table_hdf5(hdf5_nwbfile_path): column = VectorData(name="TestColumn", description="", data=array.squeeze()) dynamic_table = DynamicTable(name="TestDynamicTable", description="", columns=[column]) nwbfile.add_acquisition(dynamic_table) - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend="hdf5")) assert len(dataset_configurations) == 1 dataset_configuration = dataset_configurations[0] - assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert isinstance(dataset_configuration, HDF5DatasetIOConfiguration) assert dataset_configuration.dataset_info.object_id == column.object_id assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" assert dataset_configuration.dataset_info.full_shape == array.shape @@ -128,12 +128,12 @@ def test_unwrapped_dynamic_table_zarr(zarr_nwbfile_path): column = VectorData(name="TestColumn", description="", data=array.squeeze()) dynamic_table = DynamicTable(name="TestDynamicTable", description="", columns=[column]) nwbfile.add_acquisition(dynamic_table) - dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend="zarr")) assert 
len(dataset_configurations) == 1 dataset_configuration = dataset_configurations[0] - assert isinstance(dataset_configuration, ZarrDatasetConfiguration) + assert isinstance(dataset_configuration, ZarrDatasetIOConfiguration) assert dataset_configuration.dataset_info.object_id == column.object_id assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" assert dataset_configuration.dataset_info.full_shape == array.shape diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_io_configuration_model.py similarity index 58% rename from tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_configuration_model.py rename to tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_io_configuration_model.py index fd9e624a3..33b32d10a 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_io_configuration_model.py @@ -1,26 +1,26 @@ """Unit tests for the DatasetConfiguration Pydantic model.""" import pytest -from neuroconv.tools.nwb_helpers._models._base_models import DatasetConfiguration +from neuroconv.tools.nwb_helpers._models._base_models import DatasetIOConfiguration from neuroconv.tools.testing import mock_DatasetInfo def test_get_data_io_kwargs_abstract_error(): with pytest.raises(TypeError) as error_info: - DatasetConfiguration(dataset_info=mock_DatasetInfo(), chunk_shape=(78_125, 64), buffer_shape=(1_250_000, 384)) - assert "Can't instantiate abstract class DatasetConfiguration with abstract" in str(error_info.value) + DatasetIOConfiguration(dataset_info=mock_DatasetInfo(), chunk_shape=(78_125, 64), buffer_shape=(1_250_000, 384)) + assert "Can't instantiate abstract class DatasetIOConfiguration with abstract" in str(error_info.value) def test_get_data_io_kwargs_not_implemented(): - class TestDatasetConfiguration(DatasetConfiguration): + class TestDatasetIOConfiguration(DatasetIOConfiguration): def get_data_io_kwargs(self): super().get_data_io_kwargs() - dataset_configuration = TestDatasetConfiguration( + dataset_io_configuration = TestDatasetIOConfiguration( dataset_info=mock_DatasetInfo(), chunk_shape=(78_125, 64), buffer_shape=(1_250_000, 384), ) with pytest.raises(NotImplementedError): - dataset_configuration.get_data_io_kwargs() + dataset_io_configuration.get_data_io_kwargs() diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_backend_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_backend_configuration_model.py index 2d6242ad1..7377ff1b8 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_backend_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_backend_configuration_model.py @@ -6,7 +6,7 @@ def test_hdf5_backend_configuration_print(): - """Test the printout display of a HDF5DatasetConfiguration model looks nice.""" + """Test the printout display of a HDF5BackendConfiguration model looks nice.""" hdf5_backend_configuration = mock_HDF5BackendConfiguration() with patch("sys.stdout", 
new=StringIO()) as out: diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_io_configuration_model.py similarity index 73% rename from tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_configuration_model.py rename to tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_io_configuration_model.py index d6de7ab4c..b31387fbf 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_io_configuration_model.py @@ -1,16 +1,14 @@ -"""Unit tests for the HDF5DatasetConfiguration Pydantic model.""" +"""Unit tests for the HDF5DatasetIOConfiguration Pydantic model.""" from io import StringIO from unittest.mock import patch -import pytest - from neuroconv.tools.nwb_helpers import AVAILABLE_HDF5_COMPRESSION_METHODS -from neuroconv.tools.testing import mock_HDF5DatasetConfiguration +from neuroconv.tools.testing import mock_HDF5DatasetIOConfiguration def test_hdf5_dataset_configuration_print(): - """Test the printout display of a HDF5DatasetConfiguration model looks nice.""" - hdf5_dataset_configuration = mock_HDF5DatasetConfiguration() + """Test the printout display of a HDF5DatasetIOConfiguration model looks nice.""" + hdf5_dataset_configuration = mock_HDF5DatasetIOConfiguration() with patch("sys.stdout", new=StringIO()) as out: print(hdf5_dataset_configuration) @@ -35,8 +33,8 @@ def test_hdf5_dataset_configuration_print(): def test_hdf5_dataset_configuration_print_with_compression_options(): - """Test the printout display of a HDF5DatasetConfiguration model looks nice.""" - hdf5_dataset_configuration = mock_HDF5DatasetConfiguration(compression_options=dict(level=5)) + """Test the printout display of a HDF5DatasetIOConfiguration model looks nice.""" + hdf5_dataset_configuration = mock_HDF5DatasetIOConfiguration(compression_options=dict(level=5)) with patch("sys.stdout", new=StringIO()) as out: print(hdf5_dataset_configuration) @@ -62,8 +60,8 @@ def test_hdf5_dataset_configuration_print_with_compression_options(): def test_hdf5_dataset_configuration_print_with_compression_disabled(): - """Test the printout display of a HDF5DatasetConfiguration model looks nice.""" - hdf5_dataset_configuration = mock_HDF5DatasetConfiguration(compression_method=None) + """Test the printout display of a HDF5DatasetIOConfiguration model looks nice.""" + hdf5_dataset_configuration = mock_HDF5DatasetIOConfiguration(compression_method=None) with patch("sys.stdout", new=StringIO()) as out: print(hdf5_dataset_configuration) @@ -86,12 +84,12 @@ def test_hdf5_dataset_configuration_print_with_compression_disabled(): def test_hdf5_dataset_configuration_repr(): - """Test the programmatic repr of a HDF5DatasetConfiguration model is more dataclass-like.""" - hdf5_dataset_configuration = mock_HDF5DatasetConfiguration() + """Test the programmatic repr of a HDF5DatasetIOConfiguration model is more dataclass-like.""" + hdf5_dataset_configuration = mock_HDF5DatasetIOConfiguration() # Important to keep the `repr` unmodified for appearance inside iterables of DatasetInfo objects expected_repr = ( - 
"HDF5DatasetConfiguration(dataset_info=DatasetInfo(object_id='481a0860-3a0c-40ec-b931-df4a3e9b101f', " + "HDF5DatasetIOConfiguration(dataset_info=DatasetInfo(object_id='481a0860-3a0c-40ec-b931-df4a3e9b101f', " "location='acquisition/TestElectricalSeries/data', dataset_name='data', dtype=dtype('int16'), " "full_shape=(1800000, 384)), chunk_shape=(78125, 64), buffer_shape=(1250000, 384), compression_method='gzip', " "compression_options=None)" @@ -108,7 +106,7 @@ def test_default_compression_is_always_available(): def test_get_data_io_kwargs(): - hdf5_dataset_configuration = mock_HDF5DatasetConfiguration() + hdf5_dataset_configuration = mock_HDF5DatasetIOConfiguration() assert hdf5_dataset_configuration.get_data_io_kwargs() == dict( chunks=(78125, 64), compression="gzip", compression_opts=None diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_backend_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_backend_configuration_model.py index e8017c719..da417710c 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_backend_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_backend_configuration_model.py @@ -6,7 +6,7 @@ def test_zarr_backend_configuration_print(): - """Test the printout display of a HDF5DatasetConfiguration model looks nice.""" + """Test the printout display of a ZarrBackendConfiguration model looks nice.""" zarr_backend_configuration = mock_ZarrBackendConfiguration() with patch("sys.stdout", new=StringIO()) as out: diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_dataset_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_dataset_io_configuration_model.py similarity index 79% rename from tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_dataset_configuration_model.py rename to tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_dataset_io_configuration_model.py index 8ddc5bf7e..e99c1dbca 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_dataset_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_dataset_io_configuration_model.py @@ -1,4 +1,4 @@ -"""Unit tests for the ZarrDatasetConfiguration Pydantic model.""" +"""Unit tests for the ZarrDatasetIOConfiguration Pydantic model.""" from io import StringIO from unittest.mock import patch @@ -7,14 +7,14 @@ from neuroconv.tools.nwb_helpers import ( AVAILABLE_ZARR_COMPRESSION_METHODS, - ZarrDatasetConfiguration, + ZarrDatasetIOConfiguration, ) -from neuroconv.tools.testing import mock_DatasetInfo, mock_ZarrDatasetConfiguration +from neuroconv.tools.testing import mock_DatasetInfo, mock_ZarrDatasetIOConfiguration -def test_zarr_dataset_configuration_print(): - """Test the printout display of a ZarrDatasetConfiguration model looks nice.""" - zarr_dataset_configuration = mock_ZarrDatasetConfiguration() +def test_zarr_dataset_io_configuration_print(): + """Test the printout display of a ZarrDatasetIOConfiguration model looks nice.""" + zarr_dataset_configuration = mock_ZarrDatasetIOConfiguration() with patch("sys.stdout", new=StringIO()) as out: print(zarr_dataset_configuration) @@ -39,8 +39,8 
@@ def test_zarr_dataset_configuration_print(): def test_zarr_dataset_configuration_print_with_compression_options(): - """Test the printout display of a ZarrDatasetConfiguration model looks nice.""" - zarr_dataset_configuration = mock_ZarrDatasetConfiguration(compression_options=dict(level=5)) + """Test the printout display of a ZarrDatasetIOConfiguration model looks nice.""" + zarr_dataset_configuration = mock_ZarrDatasetIOConfiguration(compression_options=dict(level=5)) with patch("sys.stdout", new=StringIO()) as out: print(zarr_dataset_configuration) @@ -66,8 +66,8 @@ def test_zarr_dataset_configuration_print_with_compression_options(): def test_zarr_dataset_configuration_print_with_compression_disabled(): - """Test the printout display of a ZarrDatasetConfiguration model looks nice.""" - zarr_dataset_configuration = mock_ZarrDatasetConfiguration(compression_method=None) + """Test the printout display of a ZarrDatasetIOConfiguration model looks nice.""" + zarr_dataset_configuration = mock_ZarrDatasetIOConfiguration(compression_method=None) with patch("sys.stdout", new=StringIO()) as out: print(zarr_dataset_configuration) @@ -90,8 +90,8 @@ def test_zarr_dataset_configuration_print_with_compression_disabled(): def test_zarr_dataset_configuration_print_with_filter_methods(): - """Test the printout display of a ZarrDatasetConfiguration model looks nice.""" - zarr_dataset_configuration = mock_ZarrDatasetConfiguration(filter_methods=["delta"]) + """Test the printout display of a ZarrDatasetIOConfiguration model looks nice.""" + zarr_dataset_configuration = mock_ZarrDatasetIOConfiguration(filter_methods=["delta"]) with patch("sys.stdout", new=StringIO()) as out: print(zarr_dataset_configuration) @@ -118,8 +118,8 @@ def test_zarr_dataset_configuration_print_with_filter_methods(): def test_zarr_dataset_configuration_print_with_filter_options(): - """Test the printout display of a ZarrDatasetConfiguration model looks nice.""" - zarr_dataset_configuration = mock_ZarrDatasetConfiguration( + """Test the printout display of a ZarrDatasetIOConfiguration model looks nice.""" + zarr_dataset_configuration = mock_ZarrDatasetIOConfiguration( filter_methods=["blosc"], filter_options=[dict(clevel=5)] ) @@ -149,12 +149,12 @@ def test_zarr_dataset_configuration_print_with_filter_options(): def test_zarr_dataset_configuration_repr(): - """Test the programmatic repr of a ZarrDatasetConfiguration model is more dataclass-like.""" - zarr_dataset_configuration = mock_ZarrDatasetConfiguration() + """Test the programmatic repr of a ZarrDatasetIOConfiguration model is more dataclass-like.""" + zarr_dataset_configuration = mock_ZarrDatasetIOConfiguration() # Important to keep the `repr` unmodified for appearance inside iterables of DatasetInfo objects expected_repr = ( - "ZarrDatasetConfiguration(dataset_info=DatasetInfo(object_id='481a0860-3a0c-40ec-b931-df4a3e9b101f', " + "ZarrDatasetIOConfiguration(dataset_info=DatasetInfo(object_id='481a0860-3a0c-40ec-b931-df4a3e9b101f', " "location='acquisition/TestElectricalSeries/data', dataset_name='data', dtype=dtype('int16'), " "full_shape=(1800000, 384)), chunk_shape=(78125, 64), buffer_shape=(1250000, 384), compression_method='gzip', " "compression_options=None, filter_methods=None, filter_options=None)" @@ -164,7 +164,7 @@ def test_zarr_dataset_configuration_repr(): def test_validator_filter_options_has_methods(): with pytest.raises(ValueError) as error_info: - ZarrDatasetConfiguration( + ZarrDatasetIOConfiguration( dataset_info=mock_DatasetInfo(), chunk_shape=(78_125, 
64), buffer_shape=(1_250_000, 384), @@ -181,7 +181,7 @@ def test_validator_filter_options_has_methods(): def test_validator_filter_methods_length_match_options(): with pytest.raises(ValueError) as error_info: - ZarrDatasetConfiguration( + ZarrDatasetIOConfiguration( dataset_info=mock_DatasetInfo(), chunk_shape=(78_125, 64), buffer_shape=(1_250_000, 384), @@ -205,7 +205,7 @@ def test_default_compression_is_always_available(): def test_get_data_io_kwargs(): - zarr_dataset_configuration = mock_ZarrDatasetConfiguration() + zarr_dataset_configuration = mock_ZarrDatasetIOConfiguration() assert zarr_dataset_configuration.get_data_io_kwargs() == dict( chunks=(78125, 64), compressor=GZip(level=1), filters=None From 89915ab2a5ad449ee49049e7fb6acde74dd97a2b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Nov 2023 15:54:17 +0000 Subject: [PATCH 21/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/neuroconv/tools/nwb_helpers/_dataset_configuration.py | 4 ++-- .../test_common_dataset_io_configuration_model.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index 775f7c923..2417ad19e 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -1,5 +1,5 @@ """Collection of helper functions related to configuration of datasets dependent on backend.""" -from typing import Literal, Union, Generator +from typing import Generator, Literal, Union import h5py import numpy as np @@ -11,7 +11,7 @@ from pynwb import NWBHDF5IO, NWBFile, TimeSeries from pynwb.base import DynamicTable -from ._models._base_models import DatasetIOConfiguration, DatasetInfo +from ._models._base_models import DatasetInfo, DatasetIOConfiguration from ._models._hdf5_models import HDF5BackendConfiguration, HDF5DatasetIOConfiguration from ._models._zarr_models import ZarrBackendConfiguration, ZarrDatasetIOConfiguration from ..hdmf import SliceableDataChunkIterator diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_io_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_io_configuration_model.py index 3babb046e..c8a6738b7 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_io_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_io_configuration_model.py @@ -155,7 +155,8 @@ def test_validator_chunk_dimensions_do_not_evenly_divide_buffer( @pytest.mark.parametrize( - argnames="mock_dataset_configuration", argvalues=[mock_HDF5DatasetIOConfiguration(), mock_ZarrDatasetIOConfiguration()] + argnames="mock_dataset_configuration", + argvalues=[mock_HDF5DatasetIOConfiguration(), mock_ZarrDatasetIOConfiguration()], ) def test_mutation_validation( mock_dataset_configuration: Union[mock_HDF5DatasetIOConfiguration, mock_ZarrDatasetIOConfiguration] From bfe1049abd8e25773c2a37833acbddf8207dd5b8 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Tue, 21 Nov 2023 11:48:09 -0500 Subject: [PATCH 22/27] fix minimal test --- src/neuroconv/tools/hdmf.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/neuroconv/tools/hdmf.py b/src/neuroconv/tools/hdmf.py index 
ebfdf54c0..46f0fd865 100644 --- a/src/neuroconv/tools/hdmf.py +++ b/src/neuroconv/tools/hdmf.py @@ -58,14 +58,14 @@ def estimate_default_buffer_shape( return tuple(maxshape) buffer_bytes = chunk_bytes - axis_sizes_bytes = maxshape * self.dtype.itemsize + axis_sizes_bytes = maxshape * dtype.itemsize target_buffer_bytes = buffer_gb * 1e9 if min(axis_sizes_bytes) > target_buffer_bytes: if num_axes > 1: - smallest_chunk_axis, second_smallest_chunk_axis, *_ = np.argsort(self.chunk_shape) - # If the smallest full axis does not fit within the buffer size, form a square along the two smallest axes - sub_square_buffer_shape = np.array(self.chunk_shape) + smallest_chunk_axis, second_smallest_chunk_axis, *_ = np.argsort(chunk_shape) + # If the smallest full axis does not fit within the buffer size, form a square along the smallest axes + sub_square_buffer_shape = np.array(chunk_shape) if min(axis_sizes_bytes) > target_buffer_bytes: k1 = math.floor((target_buffer_bytes / chunk_bytes) ** 0.5) for axis in [smallest_chunk_axis, second_smallest_chunk_axis]: @@ -78,7 +78,7 @@ def estimate_default_buffer_shape( k1 = math.floor(target_buffer_bytes / chunk_bytes) return tuple( [ - k1 * self.chunk_shape[0], + k1 * chunk_shape[0], ] ) else: From f1683fa46c94efc676d7b7297123b5bf3d493902 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Tue, 21 Nov 2023 12:57:20 -0500 Subject: [PATCH 23/27] alter private method name --- .../tools/nwb_helpers/_dataset_configuration.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index 2417ad19e..f0528e84d 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -49,7 +49,12 @@ def _is_dataset_written_to_file( ) -def _parse_location_in_memory_nwbfile(current_location: str, neurodata_object: Container) -> str: +def _find_location_in_memory_nwbfile(current_location: str, neurodata_object: Container) -> str: + """ + Determine the location of a neurodata object within an in-memory NWBFile object. + + Distinct from similar methods in other packages, such as the NWB Inspector, which rely on the file being read from disk. + """ parent = neurodata_object.parent if isinstance(parent, NWBFile): # Items in defined top-level places like acquisition, intervals, etc. do not act as 'containers' @@ -58,7 +63,7 @@ def _parse_location_in_memory_nwbfile(current_location: str, neurodata_object: C if isinstance(parent_field_value, dict) and neurodata_object.name in parent_field_value: return parent_field_name + "/" + neurodata_object.name + "/" + current_location return neurodata_object.name + "/" + current_location - return _parse_location_in_memory_nwbfile( + return _find_location_in_memory_nwbfile( current_location=neurodata_object.name + "/" + current_location, neurodata_object=parent ) @@ -92,8 +97,6 @@ def _get_dataset_metadata( if isinstance(candidate_dataset, DataIO): return None - # DataChunkIterator has best generic dtype inference, though logic is hard to peel out of it - # And it can fail in rare cases but not essential to our default configuration dtype = _infer_dtype_using_data_chunk_iterator(candidate_dataset=candidate_dataset) full_shape = get_data_shape(data=candidate_dataset) @@ -111,7 +114,7 @@ else: pass # TODO: think on this; perhaps zarr's standalone estimator?
- location = _parse_location_in_memory_nwbfile(current_location=field_name, neurodata_object=neurodata_object) + location = _find_location_in_memory_nwbfile(current_location=field_name, neurodata_object=neurodata_object) dataset_info = DatasetInfo( object_id=neurodata_object.object_id, object_name=neurodata_object.name, From 185a69da8638dd16685f32a7aa39520a1b9220a4 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Tue, 21 Nov 2023 14:02:19 -0500 Subject: [PATCH 24/27] add extra tests --- .../nwb_helpers/_dataset_configuration.py | 30 ++--- ...t_get_default_dataset_io_configurations.py | 111 +++++++++++++++++- 2 files changed, 122 insertions(+), 19 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index f0528e84d..14e5d1c79 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -174,7 +174,22 @@ def get_default_dataset_io_configurations( ) for neurodata_object in nwbfile.objects.values(): - if isinstance(neurodata_object, TimeSeries): + if isinstance(neurodata_object, DynamicTable): + dynamic_table = neurodata_object # for readability + + for column_name in dynamic_table.colnames: + candidate_dataset = dynamic_table[column_name].data # VectorData object + if _is_dataset_written_to_file( + candidate_dataset=candidate_dataset, backend=backend, existing_file=existing_file + ): + continue # skip + + yield _get_dataset_metadata( + neurodata_object=dynamic_table[column_name], field_name="data", backend=backend + ) + else: + # Primarily for TimeSeries, but also any extended class that has 'data' or 'timestamps' + # The most common example of this is ndx-events Events/LabeledEvents types time_series = neurodata_object # for readability for field_name in ("data", "timestamps"): @@ -192,16 +207,3 @@ def get_default_dataset_io_configurations( continue # skip yield _get_dataset_metadata(neurodata_object=time_series, field_name=field_name, backend=backend) - elif isinstance(neurodata_object, DynamicTable): - dynamic_table = neurodata_object # for readability - - for column_name in dynamic_table.colnames: - candidate_dataset = dynamic_table[column_name].data # VectorData object - if _is_dataset_written_to_file( - candidate_dataset=candidate_dataset, backend=backend, existing_file=existing_file - ): - continue # skip - - yield _get_dataset_metadata( - neurodata_object=dynamic_table[column_name], field_name="data", backend=backend - ) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py index b91cc45fb..9d9943d6e 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py @@ -7,14 +7,14 @@ from hdmf.data_utils import DataChunkIterator from pynwb.base import DynamicTable from pynwb.image import ImageSeries -from pynwb.testing.mock.base import mock_TimeSeries +from pynwb.behavior import CompassDirection from pynwb.testing.mock.file import mock_NWBFile +from pynwb.testing.mock.base import mock_TimeSeries +from pynwb.testing.mock.behavior import mock_SpatialSeries +from nwbinspector.utils import 
is_module_installed from neuroconv.tools.hdmf import SliceableDataChunkIterator -from neuroconv.tools.nwb_helpers import ( - DATASET_IO_CONFIGURATIONS, - get_default_dataset_io_configurations, -) +from neuroconv.tools.nwb_helpers import DATASET_IO_CONFIGURATIONS, get_default_dataset_io_configurations, get_module @pytest.mark.parametrize("iterator", [lambda x: x, SliceableDataChunkIterator, DataChunkIterator]) @@ -87,3 +87,104 @@ def test_configuration_on_dynamic_table(iterator: callable, backend: Literal["hd if backend == "zarr": assert dataset_configuration.filter_methods is None assert dataset_configuration.filter_options is None + + +@pytest.mark.parametrize("iterator", [lambda x: x, SliceableDataChunkIterator, DataChunkIterator]) +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) +def test_configuration_on_compass_direction(iterator: callable, backend: Literal["hdf5", "zarr"]): + array = np.array([[1, 2, 3], [4, 5, 6]]) + data = iterator(array) + + nwbfile = mock_NWBFile() + spatial_series = mock_SpatialSeries(name="TestSpatialSeries", data=data) + compass_direction = CompassDirection(name="TestCompassDirection", spatial_series=spatial_series) + behavior_module = get_module(nwbfile=nwbfile, name="behavior") + behavior_module.add(compass_direction) + + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.dataset_info.object_id == spatial_series.object_id + assert ( + dataset_configuration.dataset_info.location == "processing/behavior/TestCompassDirection/TestSpatialSeries/data" + ) + assert dataset_configuration.dataset_info.full_shape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +@pytest.mark.skipif( + not is_module_installed(module_name="ndx_events"), reason="The extra testing package 'ndx-events' is not installed!" 
+) +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) +def test_configuration_on_ndx_events(backend: Literal["hdf5", "zarr"]): + from ndx_events import LabeledEvents + + # ndx_events data fields do not support wrapping in DataChunkIterators - data is nearly always small enough + # to fit entirely in memory + data = np.array([1, 2, 3], dtype="uint32") + timestamps = np.array([4.5, 6.7, 8.9]) + + nwbfile = mock_NWBFile() + labeled_events = LabeledEvents( + name="TestLabeledEvents", + description="", + timestamps=timestamps, + data=data, + labels=["response_left", "cue_onset", "cue_offset"], + ) + behavior_module = get_module(nwbfile=nwbfile, name="behavior") + behavior_module.add(labeled_events) + + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) + + # Note that the labels dataset is not caught since we search only for 'data' and 'timestamps' fields + assert len(dataset_configurations) == 2 + + data_dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.dataset_info.dataset_name == "data" + ) + assert isinstance(data_dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert data_dataset_configuration.dataset_info.object_id == labeled_events.object_id + assert data_dataset_configuration.dataset_info.location == "processing/behavior/TestLabeledEvents/data" + assert data_dataset_configuration.dataset_info.full_shape == data.shape + assert data_dataset_configuration.dataset_info.dtype == data.dtype + assert data_dataset_configuration.chunk_shape == data.shape + assert data_dataset_configuration.buffer_shape == data.shape + assert data_dataset_configuration.compression_method == "gzip" + assert data_dataset_configuration.compression_options is None + + if backend == "zarr": + assert data_dataset_configuration.filter_methods is None + assert data_dataset_configuration.filter_options is None + + timestamps_dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.dataset_info.dataset_name == "timestamps" + ) + assert isinstance(timestamps_dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert timestamps_dataset_configuration.dataset_info.object_id == labeled_events.object_id + assert timestamps_dataset_configuration.dataset_info.location == "processing/behavior/TestLabeledEvents/timestamps" + assert timestamps_dataset_configuration.dataset_info.full_shape == timestamps.shape + assert timestamps_dataset_configuration.dataset_info.dtype == timestamps.dtype + assert timestamps_dataset_configuration.chunk_shape == timestamps.shape + assert timestamps_dataset_configuration.buffer_shape == timestamps.shape + assert timestamps_dataset_configuration.compression_method == "gzip" + assert timestamps_dataset_configuration.compression_options is None + + if backend == "zarr": + assert timestamps_dataset_configuration.filter_methods is None + assert timestamps_dataset_configuration.filter_options is None From 6fad0030fcd8369f385f98f0d48d76b72850c338 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Nov 2023 19:03:39 +0000 Subject: [PATCH 25/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../test_get_default_dataset_io_configurations.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git 
a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py index 9d9943d6e..1d87821ab 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py @@ -5,16 +5,20 @@ import pytest from hdmf.common import VectorData from hdmf.data_utils import DataChunkIterator +from nwbinspector.utils import is_module_installed from pynwb.base import DynamicTable -from pynwb.image import ImageSeries from pynwb.behavior import CompassDirection -from pynwb.testing.mock.file import mock_NWBFile +from pynwb.image import ImageSeries from pynwb.testing.mock.base import mock_TimeSeries from pynwb.testing.mock.behavior import mock_SpatialSeries -from nwbinspector.utils import is_module_installed +from pynwb.testing.mock.file import mock_NWBFile from neuroconv.tools.hdmf import SliceableDataChunkIterator -from neuroconv.tools.nwb_helpers import DATASET_IO_CONFIGURATIONS, get_default_dataset_io_configurations, get_module +from neuroconv.tools.nwb_helpers import ( + DATASET_IO_CONFIGURATIONS, + get_default_dataset_io_configurations, + get_module, +) @pytest.mark.parametrize("iterator", [lambda x: x, SliceableDataChunkIterator, DataChunkIterator]) From 5ce5914660f23e50ec08ed720f5f7eadd99d6593 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Wed, 22 Nov 2023 11:15:24 -0500 Subject: [PATCH 26/27] add test for ragged tables; debug --- .../nwb_helpers/_dataset_configuration.py | 9 +- ...t_get_default_dataset_io_configurations.py | 106 ++++++++++++++++++ 2 files changed, 110 insertions(+), 5 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index 14e5d1c79..4e7783aff 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -177,16 +177,15 @@ def get_default_dataset_io_configurations( if isinstance(neurodata_object, DynamicTable): dynamic_table = neurodata_object # for readability - for column_name in dynamic_table.colnames: - candidate_dataset = dynamic_table[column_name].data # VectorData object + for column in dynamic_table.columns: + column_name = column.name + candidate_dataset = column.data # VectorData object if _is_dataset_written_to_file( candidate_dataset=candidate_dataset, backend=backend, existing_file=existing_file ): continue # skip - yield _get_dataset_metadata( - neurodata_object=dynamic_table[column_name], field_name="data", backend=backend - ) + yield _get_dataset_metadata(neurodata_object=column, field_name="data", backend=backend) else: # Primarily for TimeSeries, but also any extended class that has 'data' or 'timestamps' # The most common example of this is ndx-events Events/LabeledEvents types diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py index 1d87821ab..7db52da3d 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py 
+++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py @@ -9,6 +9,7 @@ from pynwb.base import DynamicTable from pynwb.behavior import CompassDirection from pynwb.image import ImageSeries +from pynwb.misc import Units from pynwb.testing.mock.base import mock_TimeSeries from pynwb.testing.mock.behavior import mock_SpatialSeries from pynwb.testing.mock.file import mock_NWBFile @@ -93,6 +94,111 @@ def test_configuration_on_dynamic_table(iterator: callable, backend: Literal["hd assert dataset_configuration.filter_options is None +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) +def test_configuration_on_ragged_units_table(backend: Literal["hdf5", "zarr"]): + nwbfile = mock_NWBFile() + units = Units(name="units", description="") + + spike_times = np.array([0.0, 1.0, 2.0]) + waveforms = np.array([[[1, 2, 3], [1, 2, 3], [1, 2, 3]], [[1, 2, 3], [1, 2, 3], [1, 2, 3]]]) + units.add_unit(spike_times=spike_times, waveforms=waveforms) + + spike_times = np.array([3.0, 4.0]) + waveforms = np.array([[[4, 5], [4, 5], [4, 5]], [[4, 5], [4, 5], [4, 5]]]) + units.add_unit(spike_times=spike_times, waveforms=waveforms) + + nwbfile.units = units + + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) + + assert len(dataset_configurations) == 5 + + dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.dataset_info.location == "units/spike_times/data" + ) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.dataset_info.full_shape == (5,) + assert dataset_configuration.dataset_info.dtype == np.dtype("float64") + assert dataset_configuration.chunk_shape == (5,) + assert dataset_configuration.buffer_shape == (5,) + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.dataset_info.location == "units/spike_times_index/data" + ) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.dataset_info.full_shape == (2,) + assert dataset_configuration.dataset_info.dtype == np.dtype("uint8") + assert dataset_configuration.chunk_shape == (2,) + assert dataset_configuration.buffer_shape == (2,) + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.dataset_info.location == "units/waveforms/data" + ) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.dataset_info.full_shape == (12, 3) + assert dataset_configuration.dataset_info.dtype == np.dtype("int32") + assert dataset_configuration.chunk_shape == (12, 3) + assert dataset_configuration.buffer_shape == (12, 3) + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + if 
backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.dataset_info.location == "units/waveforms_index/data" + ) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.dataset_info.full_shape == (4,) + assert dataset_configuration.dataset_info.dtype == np.dtype("uint8") + assert dataset_configuration.chunk_shape == (4,) + assert dataset_configuration.buffer_shape == (4,) + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.dataset_info.location == "units/waveforms_index_index/data" + ) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.dataset_info.full_shape == (2,) + assert dataset_configuration.dataset_info.dtype == np.dtype("uint8") + assert dataset_configuration.chunk_shape == (2,) + assert dataset_configuration.buffer_shape == (2,) + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + @pytest.mark.parametrize("iterator", [lambda x: x, SliceableDataChunkIterator, DataChunkIterator]) @pytest.mark.parametrize("backend", ["hdf5", "zarr"]) def test_configuration_on_compass_direction(iterator: callable, backend: Literal["hdf5", "zarr"]): From 3032755fbe7e4004f0fc8cca70309738b850737e Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Wed, 22 Nov 2023 11:29:00 -0500 Subject: [PATCH 27/27] adjust for cross-platform --- .../test_get_default_dataset_io_configurations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py index 7db52da3d..69545adbf 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py @@ -100,11 +100,11 @@ def test_configuration_on_ragged_units_table(backend: Literal["hdf5", "zarr"]): units = Units(name="units", description="") spike_times = np.array([0.0, 1.0, 2.0]) - waveforms = np.array([[[1, 2, 3], [1, 2, 3], [1, 2, 3]], [[1, 2, 3], [1, 2, 3], [1, 2, 3]]]) + waveforms = np.array([[[1, 2, 3], [1, 2, 3], [1, 2, 3]], [[1, 2, 3], [1, 2, 3], [1, 2, 3]]], dtype="int32") units.add_unit(spike_times=spike_times, waveforms=waveforms) spike_times = np.array([3.0, 4.0]) - waveforms = np.array([[[4, 5], [4, 5], [4, 5]], [[4, 5], [4, 5], [4, 5]]]) + waveforms = np.array([[[4, 5], [4, 5], [4, 5]], [[4, 5], [4, 5], [4, 5]]], dtype="int32") units.add_unit(spike_times=spike_times, waveforms=waveforms) nwbfile.units = units