From de97c31b1919acd6194e2af7d63ed5e9e7a13dec Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Mon, 28 Aug 2023 15:59:30 -0400 Subject: [PATCH 01/17] add first placeholders --- CHANGELOG.md | 6 ++- src/neuroconv/tools/nwb_helpers.py | 80 +++++++++++++++++++++++++++++- 2 files changed, 84 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 35245cc10..b0e5dee2d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,11 @@ whether to load the imaging planes as a volume (`"contiguous"`) or separately (`"disjoint"`). The available data streams for the defined `plane_separation_type` can be checked by `BrukerTiffMultiPlaneImagingInterface.get_streams(folder_path, plane_separation_type)` method. [PR #507](https://github.com/catalystneuro/neuroconv/pull/507) -* Add tool function `nwb_helpers.get_io_dataset` and corresponding private methods and dataclass for detecting datasets from an in-memory `NWBFile` that can be wrapped in an H5DataIO before being written to a new or existing file. [PR #549](https://github.com/catalystneuro/neuroconv/pull/549) +* Add tool function `nwb_helpers.get_io_datasets` and corresponding private methods and Pydantic model for detecting datasets from an in-memory `NWBFile` that can be wrapped in an H5DataIO before being written to a new or existing file. [PR #549](https://github.com/catalystneuro/neuroconv/pull/549) + +* Add tool function `nwb_helpers.get_default_dataset_configurations(nwbfile: NWBFile) -> Dict[Dataset, DatasetConfiguration]` and Pydantic models `BackendConfiguration` for representing top-level backend configuration and `nwb_helpers.DatasetConfiguration` for representing configurable properties of the datasets (chunking & compression options) depending on each backend before writing to disk. + +* Add tool function `nwb_helpers.configure_datasets(nwbfile: NWBFile, dataset_configurations: Dict[Dataset, DatasetConfiguration])` for configuring backend and dataset options for an `NWBFile` before writing to disk. diff --git a/src/neuroconv/tools/nwb_helpers.py b/src/neuroconv/tools/nwb_helpers.py index 09878efb5..d20833574 100644 --- a/src/neuroconv/tools/nwb_helpers.py +++ b/src/neuroconv/tools/nwb_helpers.py @@ -5,13 +5,14 @@ from copy import deepcopy from datetime import datetime from pathlib import Path -from typing import Iterable, Optional, Tuple, Union, Literal +from typing import Iterable, Optional, Tuple, Union, Literal, Dict, Type from warnings import warn import zarr import h5py import jsonschema import numpy as np +import hdf5plugin from hdmf.data_utils import DataIO from hdmf.utils import get_data_shape from hdmf_zarr import NWBZarrIO @@ -19,6 +20,7 @@ from pynwb import NWBHDF5IO, NWBFile, TimeSeries from pynwb.base import DynamicTable from pynwb.file import Subject +from nwbinspector.utils import is_module_installed from ..utils import FilePathType, dict_deep_update from ..utils.dict import DeepDict, load_dict_from_file @@ -311,3 +313,79 @@ def get_io_datasets(nwbfile: NWBFile) -> Iterable[Dataset]: continue # skip yield _get_dataset_metadata(neurodata_object=neurodata_object[column_name], field_name="data") + + +class DatasetConfiguration(BaseModel): + """A data model for configruing options about an object that will become a HDF5 or Zarr Dataset in the file.""" + + object_id: str + object_name: str + parent: str + field: Literal["data", "timestamps"] + chunk_shape: Tuple[int, ...] + buffer_shape: Tuple[int, ...] + maxshape: Tuple[int, ...] 
+ compression_type: str + compression_options: None # TODO: think about how to annotate generally + dtype: str # Think about how to constrain/specify this more + + def __str__(self) -> str: + """Not overriding __repr__ as this is intended to render only when wrapped in print().""" + string = ( + f"{self.object_name} of {self.parent}\n" + + f"{'-' * (len(self.object_name) + 4 + len(self.parent))}\n" + + f" {self.field}\n" + + f" maxshape: {self.maxshape}\n" + + f" dtype: {self.dtype}" + ) + return string + + +_available_hdf5_filters = set(h5py.filters.decode) - set(("shuffle", "fletcher32", "scaleoffset")) +if is_module_installed(module_name="hdf5plugin"): + _available_hdf5_filters = _available_hdf5_filters | set( + (filter_.filter_name for filter_ in hdf5plugin.get_filters()) + ) +AVAILABLE_HDF5_COMPRESSION_METHODS = Literal[tuple(_available_hdf5_filters)] + + +class HDF5DatasetConfiguration(BaseModel): + """A data model for configruing options about an object that will become a HDF5 Dataset in the file.""" + + object_id: str + object_name: str + parent: str + field: Literal["data", "timestamps"] + chunk_shape: Tuple[int, ...] + buffer_shape: Tuple[int, ...] + maxshape: Tuple[int, ...] + compression_type: AVAILABLE_HDF5_COMPRESSION_METHODS + compression_options: None # TODO + dtype: str # Think about how to constrain/specify this more + + +class ZarrDatasetConfiguration(BaseModel): + """A data model for configruing options about an object that will become a Zarr Dataset in the file.""" + + object_id: str + object_name: str + parent: str + field: Literal["data", "timestamps"] + chunk_shape: Tuple[int, ...] + buffer_shape: Tuple[int, ...] + maxshape: Tuple[int, ...] + compression_type: None # TODO + compression_options: None # TODO + dtype: str # Think about how to constrain/specify this more + + +class BackendConfiguration(BaseModel): + """A model for matching collections of DatasetConfigurations specific to a backend with its name and DataIO.""" + + backend_type: Literal["hdf5", "zarr"] + data_io: Type[DataIO] + dataset_configurations: Iterable[DatasetConfiguration] + + +def get_default_dataset_configurations(nwbfile: NWBFile) -> Dict[Dataset, DatasetConfiguration]: + pass # TODO From 1b16b7c476a92a844608dcadad7dbac0ce1141df Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 28 Aug 2023 20:02:54 +0000 Subject: [PATCH 02/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- CHANGELOG.md | 2 +- src/neuroconv/tools/nwb_helpers.py | 8 ++++---- .../test_minimal/test_tools/test_backend_configuration.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b0e5dee2d..619dae685 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ * Add tool function `nwb_helpers.get_default_dataset_configurations(nwbfile: NWBFile) -> Dict[Dataset, DatasetConfiguration]` and Pydantic models `BackendConfiguration` for representing top-level backend configuration and `nwb_helpers.DatasetConfiguration` for representing configurable properties of the datasets (chunking & compression options) depending on each backend before writing to disk. -* Add tool function `nwb_helpers.configure_datasets(nwbfile: NWBFile, dataset_configurations: Dict[Dataset, DatasetConfiguration])` for configuring backend and dataset options for an `NWBFile` before writing to disk. 
+* Add tool function `nwb_helpers.configure_datasets(nwbfile: NWBFile, dataset_configurations: Dict[Dataset, DatasetConfiguration])` for configuring backend and dataset options for an `NWBFile` before writing to disk. diff --git a/src/neuroconv/tools/nwb_helpers.py b/src/neuroconv/tools/nwb_helpers.py index d20833574..ce5b02ad3 100644 --- a/src/neuroconv/tools/nwb_helpers.py +++ b/src/neuroconv/tools/nwb_helpers.py @@ -5,22 +5,22 @@ from copy import deepcopy from datetime import datetime from pathlib import Path -from typing import Iterable, Optional, Tuple, Union, Literal, Dict, Type +from typing import Dict, Iterable, Literal, Optional, Tuple, Type, Union from warnings import warn -import zarr import h5py +import hdf5plugin import jsonschema import numpy as np -import hdf5plugin +import zarr from hdmf.data_utils import DataIO from hdmf.utils import get_data_shape from hdmf_zarr import NWBZarrIO +from nwbinspector.utils import is_module_installed from pydantic import BaseModel from pynwb import NWBHDF5IO, NWBFile, TimeSeries from pynwb.base import DynamicTable from pynwb.file import Subject -from nwbinspector.utils import is_module_installed from ..utils import FilePathType, dict_deep_update from ..utils.dict import DeepDict, load_dict_from_file diff --git a/tests/test_minimal/test_tools/test_backend_configuration.py b/tests/test_minimal/test_tools/test_backend_configuration.py index 2fa2b2ac9..37d4e2ccc 100644 --- a/tests/test_minimal/test_tools/test_backend_configuration.py +++ b/tests/test_minimal/test_tools/test_backend_configuration.py @@ -1,5 +1,5 @@ -from pathlib import Path from io import StringIO +from pathlib import Path from unittest.mock import patch import numpy as np From cb38e7dd243cb9ec76f013e88d2a9bcc953e2d83 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Tue, 29 Aug 2023 10:13:13 -0400 Subject: [PATCH 03/17] reorganize tests --- src/neuroconv/tools/nwb_helpers.py | 36 +++-- .../test_configurable_dataset.py | 48 +++++++ .../test_get_configurable_datasets.py | 135 ++++++++++++++++++ ...t_default_backend_configuration - Copy.py} | 0 4 files changed, 211 insertions(+), 8 deletions(-) create mode 100644 tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_configurable_dataset.py create mode 100644 tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_configurable_datasets.py rename tests/test_minimal/test_tools/{test_backend_configuration.py => test_backend_and_dataset_configuration/test_get_default_backend_configuration - Copy.py} (100%) diff --git a/src/neuroconv/tools/nwb_helpers.py b/src/neuroconv/tools/nwb_helpers.py index d20833574..686d31622 100644 --- a/src/neuroconv/tools/nwb_helpers.py +++ b/src/neuroconv/tools/nwb_helpers.py @@ -5,7 +5,7 @@ from copy import deepcopy from datetime import datetime from pathlib import Path -from typing import Iterable, Optional, Tuple, Union, Literal, Dict, Type +from typing import Iterable, Optional, Tuple, Union, Literal, Dict, Type, Any from warnings import warn import zarr @@ -16,11 +16,12 @@ from hdmf.data_utils import DataIO from hdmf.utils import get_data_shape from hdmf_zarr import NWBZarrIO -from pydantic import BaseModel +from pydantic import BaseModel, root_validator from pynwb import NWBHDF5IO, NWBFile, TimeSeries from pynwb.base import DynamicTable from pynwb.file import Subject from nwbinspector.utils import is_module_installed +from numcodecs.registry import codec_registry from ..utils import FilePathType, dict_deep_update from ..utils.dict import DeepDict, 
load_dict_from_file @@ -325,8 +326,8 @@ class DatasetConfiguration(BaseModel): chunk_shape: Tuple[int, ...] buffer_shape: Tuple[int, ...] maxshape: Tuple[int, ...] - compression_type: str - compression_options: None # TODO: think about how to annotate generally + compression_method: str + compression_options: Dict[str, Any] dtype: str # Think about how to constrain/specify this more def __str__(self) -> str: @@ -359,11 +360,17 @@ class HDF5DatasetConfiguration(BaseModel): chunk_shape: Tuple[int, ...] buffer_shape: Tuple[int, ...] maxshape: Tuple[int, ...] - compression_type: AVAILABLE_HDF5_COMPRESSION_METHODS - compression_options: None # TODO + compression_method: AVAILABLE_HDF5_COMPRESSION_METHODS = "gzip" + # TODO: actually provide better schematic rendering of options. Only support defaults in GUIDE for now + # Looks like they'll have to be hand-typed however... Can try parsing the google docstrings but no annotation typing + compression_options: Dict[str, Any] dtype: str # Think about how to constrain/specify this more +_available_zarr_filters = set(codec_registry.keys()) - set(("json2", "pickle")) +AVAILABLE_ZARR_COMPRESSION_METHODS = Literal[tuple(_available_zarr_filters)] + + class ZarrDatasetConfiguration(BaseModel): """A data model for configruing options about an object that will become a Zarr Dataset in the file.""" @@ -374,10 +381,23 @@ class ZarrDatasetConfiguration(BaseModel): chunk_shape: Tuple[int, ...] buffer_shape: Tuple[int, ...] maxshape: Tuple[int, ...] - compression_type: None # TODO - compression_options: None # TODO + filter_methods: Tuple[AVAILABLE_ZARR_COMPRESSION_METHODS, ...] + filter_options: Tuple[Dict[str, Any]] + compression_method: AVAILABLE_ZARR_COMPRESSION_METHODS + # TODO: actually provide better schematic rendering of options. Only support defaults in GUIDE for now + # Looks like they'll have to be hand-typed however... 
Can try parsing the google docstrings but no annotation typing + compression_option: Dict[str, Any] dtype: str # Think about how to constrain/specify this more + @root_validator() + def verify_filter_methods_and_options_match(cls, values): + password = values.get("password") + confirm_password = values.get("confirm_password") + + if password != confirm_password: + raise ValueError("The two passwords did not match.") + return values + class BackendConfiguration(BaseModel): """A model for matching collections of DatasetConfigurations specific to a backend with its name and DataIO.""" diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_configurable_dataset.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_configurable_dataset.py new file mode 100644 index 000000000..42e43ed16 --- /dev/null +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_configurable_dataset.py @@ -0,0 +1,48 @@ +"""Unit tests for the ConfigurableDataset Pydantic model.""" +from io import StringIO +from unittest.mock import patch + + +from neuroconv.tools.nwb_helpers import Dataset + + +def test_dataset_print(): + """Test the printout display of a Dataset modellooks nice.""" + test_dataset = Dataset( + object_id="abc123", + object_name="TestObject", + parent="TestParent", + field="data", + maxshape=(2, 4), + dtype="int16", + ) + + with patch("sys.stdout", new=StringIO()) as out: + print(test_dataset) + + expected_print = """TestObject of TestParent +------------------------ + data + maxshape: (2, 4) + dtype: int16 +""" + assert out.getvalue() == expected_print + + +def test_dataset_repr(): + """Test the programmatic repr of a Dataset model is more dataclass-like.""" + test_dataset = Dataset( + object_id="abc123", + object_name="TestObject", + parent="TestParent", + field="data", + maxshape=(2, 4), + dtype="int16", + ) + + # Important to keep the `repr` unmodified for appearence inside lists of Datasets + expected_repr = ( + "Dataset(object_id='abc123', object_name='TestObject', parent='TestParent', " + "field='data', maxshape=(2, 4), dtype='int16')" + ) + assert repr(test_dataset) == expected_repr diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_configurable_datasets.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_configurable_datasets.py new file mode 100644 index 000000000..fa82618b9 --- /dev/null +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_configurable_datasets.py @@ -0,0 +1,135 @@ +from pathlib import Path + +import numpy as np +import pytest +from hdmf_zarr import NWBZarrIO +from pynwb import NWBHDF5IO, NWBFile +from pynwb.base import DynamicTable +from pynwb.testing.mock.base import mock_TimeSeries +from pynwb.testing.mock.file import mock_NWBFile + +from neuroconv.tools.nwb_helpers import Dataset, get_io_datasets + + +def generate_1d_array() -> NWBFile: + array = np.array([[0.1, 0.2, 0.3]]) + return array + + +def generate_2d_array() -> NWBFile: + array = np.array([[1, 2, 3], [4, 5, 6]]) + return array + + +def generate_nwbfile_with_datasets() -> NWBFile: + nwbfile = mock_NWBFile() + array = generate_2d_array() + time_series = mock_TimeSeries(data=array) + nwbfile.add_acquisition(time_series) + return nwbfile + + +@pytest.fixture(scope="session") +def hdf5_nwbfile_path(tmpdir_factory): + nwbfile_path = tmpdir_factory.mktemp("data").join("test_hdf5_nwbfile_with_datasets.nwb") + if not Path(nwbfile_path).exists(): 
+ nwbfile = generate_nwbfile_with_datasets() + with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io: + io.write(nwbfile) + return str(nwbfile_path) + + +@pytest.fixture(scope="session") +def zarr_nwbfile_path(tmpdir_factory): + nwbfile_path = tmpdir_factory.mktemp("data").join("test_zarr_nwbfile_with_datasets.nwb") + if not Path(nwbfile_path).exists(): + nwbfile = generate_nwbfile_with_datasets() + with NWBZarrIO(path=str(nwbfile_path), mode="w") as io: + io.write(nwbfile) + return str(nwbfile_path) + + +def test_simple_time_series(): + array = generate_2d_array() + + nwbfile = mock_NWBFile() + time_series = mock_TimeSeries(name="TimeSeries", data=array) + nwbfile.add_acquisition(time_series) + + results = list(get_io_datasets(nwbfile=nwbfile)) + + assert len(results) == 1 + + result = results[0] + assert isinstance(result, Dataset) + assert result.object_name == "TimeSeries" + assert result.field == "data" + assert result.maxshape == array.shape + assert result.dtype == array.dtype + + +def test_simple_dynamic_table(): + array = generate_1d_array() + + nwbfile = mock_NWBFile() + column_length = array.shape[1] + dynamic_table = DynamicTable( + name="DynamicTable", + description="", + id=list(range(column_length)), # Need to include ID since the data of the column is not wrapped in an IO + ) + dynamic_table.add_column(name="TestColumn", description="", data=array.squeeze()) + nwbfile.add_acquisition(dynamic_table) + + results = list(get_io_datasets(nwbfile=nwbfile)) + + assert len(results) == 1 + + result = results[0] + assert isinstance(result, Dataset) + assert result.object_name == "TestColumn" + assert result.field == "data" + assert result.maxshape == (column_length,) + assert result.dtype == str(array.dtype) + + +def test_simple_on_appended_hdf5_file(hdf5_nwbfile_path): + array = generate_2d_array() + + with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io: + nwbfile = io.read() + array = generate_2d_array() + new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) + nwbfile.add_acquisition(new_time_series) + + results = list(get_io_datasets(nwbfile=nwbfile)) + + assert len(results) == 1 + + result = results[0] + assert isinstance(result, Dataset) + assert result.object_name == "NewTimeSeries" + assert result.field == "data" + assert result.maxshape == array.shape + assert result.dtype == str(array.dtype) # TODO: add tests for if source specification was np.dtype et al. + + +def test_simple_on_appended_zarr_file(zarr_nwbfile_path): + array = generate_2d_array() + + with NWBZarrIO(path=zarr_nwbfile_path, mode="a") as io: + nwbfile = io.read() + array = generate_2d_array() + new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) + nwbfile.add_acquisition(new_time_series) + + results = list(get_io_datasets(nwbfile=nwbfile)) + + assert len(results) == 1 + + result = results[0] + assert isinstance(result, Dataset) + assert result.object_name == "NewTimeSeries" + assert result.field == "data" + assert result.maxshape == array.shape + assert result.dtype == str(array.dtype) # TODO: add tests for if source specification was np.dtype et al. 
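The new test module above pins down the behavior of `get_io_datasets` on fresh and appended files. As a rough usage sketch, grounded only in the mock-based setup those tests use (and not taken from the patches themselves), the detection step added in this series could be driven like this; the surrounding write logic that would wrap each result in an H5DataIO is assumed, not shown:

    # Illustrative sketch, mirroring the mock-based setup in the tests above.
    import numpy as np
    from pynwb.testing.mock.base import mock_TimeSeries
    from pynwb.testing.mock.file import mock_NWBFile

    from neuroconv.tools.nwb_helpers import get_io_datasets

    nwbfile = mock_NWBFile()
    nwbfile.add_acquisition(mock_TimeSeries(name="TimeSeries", data=np.array([[1, 2, 3], [4, 5, 6]])))

    # Each yielded `Dataset` model records the object_id, object_name, parent, field ("data" or
    # "timestamps"), maxshape, and dtype of a field that is eligible for DataIO wrapping.
    for dataset in get_io_datasets(nwbfile=nwbfile):
        print(dataset)  # the overridden __str__ renders the name, parent, field, maxshape, and dtype

Note that later patches in this series rename `Dataset` to `ConfigurableDataset` and `get_io_datasets` to `get_configurable_datasets`, so the names above only track the API as of this commit.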
diff --git a/tests/test_minimal/test_tools/test_backend_configuration.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration - Copy.py similarity index 100% rename from tests/test_minimal/test_tools/test_backend_configuration.py rename to tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration - Copy.py From 9017d6d127c93a3ad912eecc1e036aa0568e3043 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 29 Aug 2023 14:49:17 +0000 Subject: [PATCH 04/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/neuroconv/tools/nwb_helpers/__init__.py | 11 +++++++++-- .../tools/nwb_helpers/_dataset_and_backend_models.py | 4 ++-- .../tools/nwb_helpers/_dataset_configuration.py | 2 +- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index c860007e3..c3178847f 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -1,3 +1,12 @@ +from ._dataset_and_backend_models import ( + BackendConfiguration, + ConfigurableDataset, + DatasetConfiguration, +) +from ._dataset_configuration import ( + get_configurable_datasets, + get_default_dataset_configurations, +) from ._metadata_and_file_helpers import ( add_device_from_metadata, get_default_nwbfile_metadata, @@ -5,5 +14,3 @@ make_nwbfile_from_metadata, make_or_load_nwbfile, ) -from ._dataset_and_backend_models import ConfigurableDataset, DatasetConfiguration, BackendConfiguration -from ._dataset_configuration import get_configurable_datasets, get_default_dataset_configurations diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py index 9acd1d6fa..09cda6966 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py @@ -1,12 +1,12 @@ """Collection of helper functions related to configuration of datasets dependent on backend.""" -from typing import Iterable, Literal, Tuple, Dict, Any, Type +from typing import Any, Dict, Iterable, Literal, Tuple, Type import h5py import hdf5plugin import zarr from hdmf.data_utils import DataIO -from pydantic import BaseModel, root_validator from nwbinspector.utils import is_module_installed +from pydantic import BaseModel, root_validator class ConfigurableDataset(BaseModel): diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index 8638b355a..4ddac1804 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -1,5 +1,5 @@ """Collection of helper functions related to configuration of datasets dependent on backend.""" -from typing import Iterable, Literal, Union, Dict +from typing import Dict, Iterable, Literal, Union import h5py import zarr From e0cb578beac146e982e5cf288bdf3cbb4eb36c67 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Tue, 29 Aug 2023 11:00:34 -0400 Subject: [PATCH 05/17] add complex validiation --- .../nwb_helpers/_dataset_and_backend_models.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py 
b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py index 09cda6966..bd5317f08 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py @@ -113,10 +113,15 @@ class ZarrDatasetConfiguration(BaseModel): dtype: str # Think about how to constrain/specify this more @root_validator() - def verify_filter_methods_and_options_match(cls, values): - password = values.get("password") - confirm_password = values.get("confirm_password") - - if password != confirm_password: - raise ValueError("The two passwords did not match.") + def verify_filter_methods_and_options_match(cls, values: Dict[str, Any]): + filter_methods = values.get("filter_methods") + filter_options = values.get("filter_options") + + len_filter_methods = len(filter_methods) + len_filter_options = len(filter_options) + if len_filter_methods != len_filter_options: + raise ValueError( + "Length mismatch between `filter_methods` ({len_filter_methods} methods specified) and " + "`filter_options` ({len_filter_options} options found)! These two must match one-to-one." + ) return values From ab2245225267905cf40f571c23e929b546b63db3 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Thu, 31 Aug 2023 15:21:37 -0400 Subject: [PATCH 06/17] add backend models exposing parallelization; enhance models; finish prototype of default config fetcher --- src/neuroconv/tools/nwb_helpers/__init__.py | 2 +- .../_dataset_and_backend_models.py | 131 ++++++++++------- .../nwb_helpers/_dataset_configuration.py | 72 ++++++++-- .../test_get_configurable_datasets.py | 1 + .../test_get_default_backend_configuration.py | 136 ++++++++++++++++++ 5 files changed, 280 insertions(+), 62 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index c3178847f..c517aa79c 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -5,7 +5,7 @@ ) from ._dataset_configuration import ( get_configurable_datasets, - get_default_dataset_configurations, + get_default_backend_configuration, ) from ._metadata_and_file_helpers import ( add_device_from_metadata, diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py index bd5317f08..423b0a714 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py @@ -1,12 +1,15 @@ """Collection of helper functions related to configuration of datasets dependent on backend.""" -from typing import Any, Dict, Iterable, Literal, Tuple, Type +from typing import Any, Dict, Literal, Tuple, Type, Union +import psutil import h5py import hdf5plugin import zarr +from pynwb import NWBHDF5IO +from hdmf_zarr import NWBZarrIO from hdmf.data_utils import DataIO from nwbinspector.utils import is_module_installed -from pydantic import BaseModel, root_validator +from pydantic import BaseModel, root_validator, Field class ConfigurableDataset(BaseModel): @@ -31,38 +34,49 @@ def __str__(self) -> str: return string -class DatasetConfiguration(BaseModel): +class DatasetConfiguration(ConfigurableDataset): """A data model for configruing options about an object that will become a HDF5 or Zarr Dataset in the file.""" - object_id: str - object_name: str - parent: str - field: Literal["data", "timestamps"] chunk_shape: Tuple[int, ...] buffer_shape: Tuple[int, ...] - maxshape: Tuple[int, ...] 
- compression_method: str - compression_options: Dict[str, Any] - dtype: str # Think about how to constrain/specify this more - - def __str__(self) -> str: - """Not overriding __repr__ as this is intended to render only when wrapped in print().""" - string = ( - f"{self.object_name} of {self.parent}\n" - + f"{'-' * (len(self.object_name) + 4 + len(self.parent))}\n" - + f" {self.field}\n" - + f" maxshape: {self.maxshape}\n" - + f" dtype: {self.dtype}" - ) - return string + compression_method: Union[str, None] # Backend configurations should specify Literals; None means no compression + compression_options: Union[Dict[str, Any], None] = None class BackendConfiguration(BaseModel): """A model for matching collections of DatasetConfigurations specific to a backend with its name and DataIO.""" backend_type: Literal["hdf5", "zarr"] - data_io: Type[DataIO] - dataset_configurations: Iterable[DatasetConfiguration] + data_io: Type[DataIO] # Auto-set by __init__ + dataset_configurations: Dict[ConfigurableDataset, DatasetConfiguration] + + def __init__( + self, + backend_type: Literal["hdf5", "zarr"], + dataset_configurations: Dict[ConfigurableDataset, DatasetConfiguration], + ): + backend_to_data_io = dict(hdf5=NWBHDF5IO, zarr=NWBZarrIO) + data_io = backend_to_data_io[backend_type] + super().__init__( + backend_to_data_io=backend_to_data_io, data_io=data_io, dataset_configurations=dataset_configurations + ) + + +class HDF5BackendConfiguration(BackendConfiguration): + """A model for matching collections of DatasetConfigurations specific to the HDF5 backend.""" + + pass # No extra arguments exposed to HDF5 backend + + +class ZarrBackendConfiguration(BackendConfiguration): + """A model for matching collections of DatasetConfigurations specific to the Zarr backend.""" + + number_of_jobs: int = Field( + description="Number of jobs to use in parallel during write.", + ge=psutil.cpu_count(), # TODO: should we specify logical=False in cpu_count? + le=psutil.cpu_count(), + default=-2, # -2 translates to 'all CPU except for one' + ) _available_hdf5_filters = set(h5py.filters.decode) - set(("shuffle", "fletcher32", "scaleoffset")) @@ -73,44 +87,55 @@ class BackendConfiguration(BaseModel): AVAILABLE_HDF5_COMPRESSION_METHODS = Literal[tuple(_available_hdf5_filters)] -class HDF5DatasetConfiguration(BaseModel): +class HDF5DatasetConfiguration(DatasetConfiguration): """A data model for configruing options about an object that will become a HDF5 Dataset in the file.""" - object_id: str - object_name: str - parent: str - field: Literal["data", "timestamps"] - chunk_shape: Tuple[int, ...] - buffer_shape: Tuple[int, ...] - maxshape: Tuple[int, ...] - compression_method: AVAILABLE_HDF5_COMPRESSION_METHODS = "gzip" + compression_method: Union[AVAILABLE_HDF5_COMPRESSION_METHODS, None] = "gzip" # TODO: actually provide better schematic rendering of options. Only support defaults in GUIDE for now # Looks like they'll have to be hand-typed however... 
Can try parsing the google docstrings but no annotation typing - compression_options: Dict[str, Any] - dtype: str # Think about how to constrain/specify this more - - -_available_zarr_filters = set(zarr.codec_registry.keys()) - set(("json2", "pickle")) + compression_options: Union[Dict[str, Any], None] = None + + +_available_zarr_filters = ( + set(zarr.codec_registry.keys()) + - set( + # These filters do nothing for us, or are things that ought to be implemented at lower HDMF levels + # or indirectly using HDMF data structures + ( + "json2", + "pickle", + "astype", + "vlen-utf8", + "vlen-array", + "vlen-bytes", + "adler32", + "crc32", + "fixedscaleoffset", + "msgpack2", + "base64", + "n5_wrapper", + ) + ) + - set( # Forbidding lossy codecs for now, but they could be allowed in the future with warnings + ("bitround", "quantize") + ) +) +# TODO: would like to eventually (as separate feature) add an 'auto' method to Zarr +# to harness the wider range of potential methods that are ideal for certain dtypes or structures +# E.g., 'packbits' for boolean (logical) VectorData columns +# | set(("auto",)) AVAILABLE_ZARR_COMPRESSION_METHODS = Literal[tuple(_available_zarr_filters)] -class ZarrDatasetConfiguration(BaseModel): +class ZarrDatasetConfiguration(DatasetConfiguration): """A data model for configruing options about an object that will become a Zarr Dataset in the file.""" - object_id: str - object_name: str - parent: str - field: Literal["data", "timestamps"] - chunk_shape: Tuple[int, ...] - buffer_shape: Tuple[int, ...] - maxshape: Tuple[int, ...] - filter_methods: Tuple[AVAILABLE_ZARR_COMPRESSION_METHODS, ...] - filter_options: Tuple[Dict[str, Any]] - compression_method: AVAILABLE_ZARR_COMPRESSION_METHODS + filter_methods: Union[Tuple[AVAILABLE_ZARR_COMPRESSION_METHODS, ...], None] = None + filter_options: Union[Tuple[Dict[str, Any]], None] = None + compression_method: Union[AVAILABLE_ZARR_COMPRESSION_METHODS, None] = "gzip" # TODO: would like this to be 'auto' # TODO: actually provide better schematic rendering of options. Only support defaults in GUIDE for now # Looks like they'll have to be hand-typed however... Can try parsing the google docstrings but no annotation typing - compression_option: Dict[str, Any] - dtype: str # Think about how to constrain/specify this more + compression_options: Union[Dict[str, Any], None] = None @root_validator() def verify_filter_methods_and_options_match(cls, values: Dict[str, Any]): @@ -121,7 +146,7 @@ def verify_filter_methods_and_options_match(cls, values: Dict[str, Any]): len_filter_options = len(filter_options) if len_filter_methods != len_filter_options: raise ValueError( - "Length mismatch between `filter_methods` ({len_filter_methods} methods specified) and " - "`filter_options` ({len_filter_options} options found)! These two must match one-to-one." + f"Length mismatch between `filter_methods` ({len_filter_methods} methods specified) and " + f"`filter_options` ({len_filter_options} options found)! These two must match one-to-one." 
) return values diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index 4ddac1804..e9ef70765 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -3,13 +3,16 @@ import h5py import zarr -from hdmf.data_utils import DataIO +from hdmf.data_utils import DataIO, GenericDataChunkIterator from hdmf.utils import get_data_shape from hdmf_zarr import NWBZarrIO from pynwb import NWBHDF5IO, NWBFile, TimeSeries from pynwb.base import DynamicTable from ._dataset_and_backend_models import ( + BackendConfiguration, + HDF5BackendConfiguration, + ZarrBackendConfiguration, ConfigurableDataset, DatasetConfiguration, HDF5DatasetConfiguration, @@ -64,7 +67,7 @@ def get_configurable_datasets(nwbfile: NWBFile) -> Iterable[ConfigurableDataset] Yields ------ - Dataset + ConfigurableDataset A summary of each detected object that can be wrapped in a DataIO. """ backend_type = None # Used for filtering out datasets that have already been written to disk when appending @@ -82,23 +85,76 @@ def get_configurable_datasets(nwbfile: NWBFile) -> Iterable[ConfigurableDataset] for field_name in ("data", "timestamps"): if field_name not in neurodata_object.fields: # timestamps is optional continue + + field_value = getattr(neurodata_object, field_name) if _value_already_written_to_file( - value=getattr(neurodata_object, field_name), - backend_type=backend_type, - existing_file=existing_file, + value=field_value, backend_type=backend_type, existing_file=existing_file ): continue # skip + # Currently requiring a ConfigurableDataset to apply only to data wrapped in a GenericDataChunkIterator + # TODO: in follow-up, can maybe be wrapped automatically? + if not isinstance(field_value, GenericDataChunkIterator): + continue # skip yield _get_dataset_metadata(neurodata_object=neurodata_object, field_name=field_name) elif isinstance(neurodata_object, DynamicTable): for column_name in getattr(neurodata_object, "colnames"): + column_value = getattr(neurodata_object, column_name) if _value_already_written_to_file( - value=getattr(neurodata_object, column_name), backend_type=backend_type, existing_file=existing_file + value=column_value, backend_type=backend_type, existing_file=existing_file ): continue # skip + # Currently requiring a ConfigurableDataset to apply only to data wrapped in a GenericDataChunkIterator + # TODO: in follow-up, can maybe be wrapped automatically? + if not isinstance(column_value, GenericDataChunkIterator): + continue # skip yield _get_dataset_metadata(neurodata_object=neurodata_object[column_name], field_name="data") -def get_default_dataset_configurations(nwbfile: NWBFile) -> Dict[ConfigurableDataset, DatasetConfiguration]: - pass # TODO +def _get_default_configuration( + nwbfile: NWBFile, backend_type: Literal["hdf5", "zarr"], configurable_dataset: ConfigurableDataset +) -> DatasetConfiguration: + backend_to_dataset_configuration_class = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration) + DatasetConfigurationClass = backend_to_dataset_configuration_class[backend_type] + + neurodata_object = nwbfile.objects[configurable_dataset.object_id] + field_value = getattr(neurodata_object, configurable_dataset.field) + iterator = field_value # Currently restricting to values that are already wrapped in GenericDataChunkIterators + # TODO: in follow-up, can maybe be wrapped automatically? 
+ + dataset_configuration = DatasetConfigurationClass( + object_id=configurable_dataset.object_id, + object_name=configurable_dataset.object_name, + parent=configurable_dataset.parent, + field=configurable_dataset.field, + maxshape=configurable_dataset.maxshape, + dtype=configurable_dataset.dtype, + chunk_shape=iterator.chunk_shape, + buffer_shape=iterator.buffer_shape, + # Let the compression and/or filters default to the back-end specific values + ) + return dataset_configuration + + +def get_default_backend_configuration(nwbfile: NWBFile, backend_type: Literal["hdf5", "zarr"]) -> BackendConfiguration: + """Fill a default backend configuration to serve as a starting point for further customization.""" + backend_type_to_backend_configuration_classes = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) + + configurable_datasets = get_configurable_datasets(nwbfile=nwbfile) + + dataset_configurations = dict() + for configurable_dataset in configurable_datasets: + dataset_configurations.update( + { + configurable_dataset: _get_default_configuration( + nwbfile=nwbfile, backend_type=backend_type, configurable_dataset=configurable_dataset + ) + } + ) + + DatasetConfigurationClass = backend_type_to_backend_configuration_classes[backend_type] + backend_configuration = DatasetConfigurationClass( + backend_type=backend_type, dataset_configurations=dataset_configurations + ) + return backend_configuration diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_configurable_datasets.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_configurable_datasets.py index 137889f7b..b39866fa5 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_configurable_datasets.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_configurable_datasets.py @@ -1,3 +1,4 @@ +"""Unit tests for `get_configurable_datasets`.""" from pathlib import Path import numpy as np diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py index e69de29bb..a28c75b7f 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py @@ -0,0 +1,136 @@ +"""Unit tests for `get_default_backend_configuration`.""" +from pathlib import Path + +import numpy as np +import pytest +from hdmf_zarr import NWBZarrIO +from pynwb import NWBHDF5IO, NWBFile +from pynwb.base import DynamicTable +from pynwb.testing.mock.base import mock_TimeSeries +from pynwb.testing.mock.file import mock_NWBFile + +from neuroconv.tools.nwb_helpers import ConfigurableDataset, get_default_backend_configuration + + +def generate_1d_array() -> NWBFile: + array = np.array([[0.1, 0.2, 0.3]]) + return array + + +def generate_2d_array() -> NWBFile: + array = np.array([[1, 2, 3], [4, 5, 6]]) + return array + + +def generate_nwbfile_with_ConfigurableDatasets() -> NWBFile: + nwbfile = mock_NWBFile() + array = generate_2d_array() + time_series = mock_TimeSeries(data=array) + nwbfile.add_acquisition(time_series) + return nwbfile + + +@pytest.fixture(scope="session") +def hdf5_nwbfile_path(tmpdir_factory): + nwbfile_path = 
tmpdir_factory.mktemp("data").join("test_hdf5_nwbfile_with_configurable_datasets.nwb.h5") + if not Path(nwbfile_path).exists(): + nwbfile = generate_nwbfile_with_ConfigurableDatasets() + with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io: + io.write(nwbfile) + return str(nwbfile_path) + + +@pytest.fixture(scope="session") +def zarr_nwbfile_path(tmpdir_factory): + nwbfile_path = tmpdir_factory.mktemp("data").join("test_zarr_nwbfile_with_configurable_datasets.nwb.zarr") + if not Path(nwbfile_path).exists(): + nwbfile = generate_nwbfile_with_ConfigurableDatasets() + with NWBZarrIO(path=str(nwbfile_path), mode="w") as io: + io.write(nwbfile) + return str(nwbfile_path) + + +def test_simple_time_series(): + array = generate_2d_array() + + nwbfile = mock_NWBFile() + time_series = mock_TimeSeries(name="TimeSeries", data=array) + nwbfile.add_acquisition(time_series) + + results = list(get_default_backend_configuration(nwbfile=nwbfile)) + + assert len(results) == 1 + + result = results[0] + assert isinstance(result, ConfigurableDataset) + assert result.object_name == "TimeSeries" + assert result.field == "data" + assert result.maxshape == array.shape + assert result.dtype == array.dtype + + +def test_simple_dynamic_table(): + array = generate_1d_array() + + nwbfile = mock_NWBFile() + column_length = array.shape[1] + dynamic_table = DynamicTable( + name="DynamicTable", + description="", + id=list(range(column_length)), # Need to include ID since the data of the column is not wrapped in an IO + ) + dynamic_table.add_column(name="TestColumn", description="", data=array.squeeze()) + nwbfile.add_acquisition(dynamic_table) + + results = list(get_default_backend_configuration(nwbfile=nwbfile)) + + assert len(results) == 1 + + result = results[0] + assert isinstance(result, ConfigurableDataset) + assert result.object_name == "TestColumn" + assert result.field == "data" + assert result.maxshape == (column_length,) + assert result.dtype == str(array.dtype) + + +def test_simple_on_appended_hdf5_file(hdf5_nwbfile_path): + array = generate_2d_array() + + with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io: + nwbfile = io.read() + array = generate_2d_array() + new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) + nwbfile.add_acquisition(new_time_series) + + results = list(get_default_backend_configuration(nwbfile=nwbfile)) + + assert len(results) == 1 + + result = results[0] + assert isinstance(result, ConfigurableDataset) + assert result.object_name == "NewTimeSeries" + assert result.field == "data" + assert result.maxshape == array.shape + assert result.dtype == str(array.dtype) # TODO: add tests for if source specification was np.dtype et al. + + +def test_simple_on_appended_zarr_file(zarr_nwbfile_path): + array = generate_2d_array() + + with NWBZarrIO(path=zarr_nwbfile_path, mode="a") as io: + nwbfile = io.read() + array = generate_2d_array() + new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) + nwbfile.add_acquisition(new_time_series) + + results = list(get_default_backend_configuration(nwbfile=nwbfile)) + + assert len(results) == 1 + + result = results[0] + assert isinstance(result, ConfigurableDataset) + assert result.object_name == "NewTimeSeries" + assert result.field == "data" + assert result.maxshape == array.shape + assert result.dtype == str(array.dtype) # TODO: add tests for if source specification was np.dtype et al. 
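Patch 06 above fleshes out `get_default_backend_configuration` together with the backend- and dataset-level Pydantic models, and the new unit tests show the intended call pattern. The sketch below restates that pattern outside of test code; it is an editorial illustration assembled from those tests rather than code from the patches, and since the follow-up debugging only lands in the later commits of this series, it describes the target usage rather than something guaranteed to run against this exact commit:

    # Sketch of the intended usage, reusing the same toy chunk/buffer shapes as the unit tests.
    import numpy as np
    from pynwb.testing.mock.base import mock_TimeSeries
    from pynwb.testing.mock.file import mock_NWBFile

    from neuroconv.tools.hdmf import SliceableDataChunkIterator
    from neuroconv.tools.nwb_helpers import get_default_backend_configuration

    nwbfile = mock_NWBFile()
    iterator = SliceableDataChunkIterator(data=np.array([[1, 2, 3], [4, 5, 6]]), chunk_shape=(1, 2), buffer_shape=(1, 3))
    nwbfile.add_acquisition(mock_TimeSeries(name="TimeSeries", data=iterator))

    # For "hdf5", this returns an HDF5BackendConfiguration whose dataset_configurations maps each
    # detected ConfigurableDataset to an HDF5DatasetConfiguration pre-filled with the iterator's
    # chunk/buffer shapes and the backend default compression ("gzip").
    backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend_type="hdf5")
    for configurable_dataset, dataset_configuration in backend_configuration.dataset_configurations.items():
        print(configurable_dataset.object_name, dataset_configuration.chunk_shape, dataset_configuration.compression_method)

Keying `dataset_configurations` by the `ConfigurableDataset` model itself is also what motivates the `allow_mutation = False` config and custom `__hash__` added in a later patch of this series, since the model instances must be usable as dictionary keys.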
From bfa9aa0ea6acf709a34b9f6c819fae853b010a6f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 31 Aug 2023 19:21:53 +0000 Subject: [PATCH 07/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../tools/nwb_helpers/_dataset_and_backend_models.py | 8 ++++---- src/neuroconv/tools/nwb_helpers/_dataset_configuration.py | 4 ++-- .../test_get_default_backend_configuration.py | 5 ++++- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py index 423b0a714..d03b746ab 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py @@ -1,15 +1,15 @@ """Collection of helper functions related to configuration of datasets dependent on backend.""" from typing import Any, Dict, Literal, Tuple, Type, Union -import psutil import h5py import hdf5plugin +import psutil import zarr -from pynwb import NWBHDF5IO -from hdmf_zarr import NWBZarrIO from hdmf.data_utils import DataIO +from hdmf_zarr import NWBZarrIO from nwbinspector.utils import is_module_installed -from pydantic import BaseModel, root_validator, Field +from pydantic import BaseModel, Field, root_validator +from pynwb import NWBHDF5IO class ConfigurableDataset(BaseModel): diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index e9ef70765..ff90936ec 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -11,11 +11,11 @@ from ._dataset_and_backend_models import ( BackendConfiguration, - HDF5BackendConfiguration, - ZarrBackendConfiguration, ConfigurableDataset, DatasetConfiguration, + HDF5BackendConfiguration, HDF5DatasetConfiguration, + ZarrBackendConfiguration, ZarrDatasetConfiguration, ) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py index a28c75b7f..7134f93f5 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py @@ -9,7 +9,10 @@ from pynwb.testing.mock.base import mock_TimeSeries from pynwb.testing.mock.file import mock_NWBFile -from neuroconv.tools.nwb_helpers import ConfigurableDataset, get_default_backend_configuration +from neuroconv.tools.nwb_helpers import ( + ConfigurableDataset, + get_default_backend_configuration, +) def generate_1d_array() -> NWBFile: From a45e1c00be375ce7a9644c9ca2b7f39bd4d7d29c Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Thu, 31 Aug 2023 15:55:38 -0400 Subject: [PATCH 08/17] debug previous tests --- .../tools/nwb_helpers/_dataset_configuration.py | 2 +- .../test_get_configurable_datasets.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index ff90936ec..c2473efb2 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ 
-99,7 +99,7 @@ def get_configurable_datasets(nwbfile: NWBFile) -> Iterable[ConfigurableDataset] yield _get_dataset_metadata(neurodata_object=neurodata_object, field_name=field_name) elif isinstance(neurodata_object, DynamicTable): for column_name in getattr(neurodata_object, "colnames"): - column_value = getattr(neurodata_object, column_name) + column_value = getattr(neurodata_object, column_name).data if _value_already_written_to_file( value=column_value, backend_type=backend_type, existing_file=existing_file ): diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_configurable_datasets.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_configurable_datasets.py index b39866fa5..db212ffe1 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_configurable_datasets.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_configurable_datasets.py @@ -3,12 +3,14 @@ import numpy as np import pytest +from hdmf.common import VectorData from hdmf_zarr import NWBZarrIO from pynwb import NWBHDF5IO, NWBFile from pynwb.base import DynamicTable from pynwb.testing.mock.base import mock_TimeSeries from pynwb.testing.mock.file import mock_NWBFile +from neuroconv.tools.hdmf import SliceableDataChunkIterator from neuroconv.tools.nwb_helpers import ConfigurableDataset, get_configurable_datasets @@ -54,7 +56,7 @@ def test_simple_time_series(): array = generate_2d_array() nwbfile = mock_NWBFile() - time_series = mock_TimeSeries(name="TimeSeries", data=array) + time_series = mock_TimeSeries(name="TimeSeries", data=SliceableDataChunkIterator(data=array)) nwbfile.add_acquisition(time_series) results = list(get_configurable_datasets(nwbfile=nwbfile)) @@ -74,12 +76,14 @@ def test_simple_dynamic_table(): nwbfile = mock_NWBFile() column_length = array.shape[1] + column = VectorData(name="TestColumn", description="", data=SliceableDataChunkIterator(data=array.squeeze())) dynamic_table = DynamicTable( name="DynamicTable", description="", id=list(range(column_length)), # Need to include ID since the data of the column is not wrapped in an IO + columns=[column], ) - dynamic_table.add_column(name="TestColumn", description="", data=array.squeeze()) + # dynamic_table.add_column(name="TestColumn", description="", data=SliceableDataChunkIterator(data=array.squeeze())) nwbfile.add_acquisition(dynamic_table) results = list(get_configurable_datasets(nwbfile=nwbfile)) @@ -100,7 +104,7 @@ def test_simple_on_appended_hdf5_file(hdf5_nwbfile_path): with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io: nwbfile = io.read() array = generate_2d_array() - new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) + new_time_series = mock_TimeSeries(name="NewTimeSeries", data=SliceableDataChunkIterator(data=array)) nwbfile.add_acquisition(new_time_series) results = list(get_configurable_datasets(nwbfile=nwbfile)) @@ -121,7 +125,7 @@ def test_simple_on_appended_zarr_file(zarr_nwbfile_path): with NWBZarrIO(path=zarr_nwbfile_path, mode="a") as io: nwbfile = io.read() array = generate_2d_array() - new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) + new_time_series = mock_TimeSeries(name="NewTimeSeries", data=SliceableDataChunkIterator(data=array)) nwbfile.add_acquisition(new_time_series) results = list(get_configurable_datasets(nwbfile=nwbfile)) From 907fc0d4e82e287fc5aeb3ab09205120da1f7a11 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Thu, 31 Aug 2023 19:09:36 -0400 Subject: 
[PATCH 09/17] debugs and first working tests --- src/neuroconv/tools/nwb_helpers/__init__.py | 5 +- .../_dataset_and_backend_models.py | 82 +++++++++---------- .../nwb_helpers/_dataset_configuration.py | 11 +-- .../test_get_default_backend_configuration.py | 81 +++++++++++++++--- 4 files changed, 121 insertions(+), 58 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index c517aa79c..6e4a8df8e 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -1,7 +1,10 @@ from ._dataset_and_backend_models import ( - BackendConfiguration, + HDF5BackendConfiguration, + ZarrBackendConfiguration, ConfigurableDataset, DatasetConfiguration, + HDF5DatasetConfiguration, + ZarrDatasetConfiguration, ) from ._dataset_configuration import ( get_configurable_datasets, diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py index d03b746ab..32fabbb6f 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py @@ -5,11 +5,10 @@ import hdf5plugin import psutil import zarr -from hdmf.data_utils import DataIO -from hdmf_zarr import NWBZarrIO +from hdmf.backends.hdf5 import H5DataIO +from hdmf_zarr import ZarrDataIO from nwbinspector.utils import is_module_installed from pydantic import BaseModel, Field, root_validator -from pynwb import NWBHDF5IO class ConfigurableDataset(BaseModel): @@ -22,6 +21,13 @@ class ConfigurableDataset(BaseModel): maxshape: Tuple[int, ...] dtype: str # Think about how to constrain/specify this more + class Config: # noqa: D106 + allow_mutation = False # To enforce hashability + + def __hash__(self): + """To allow instances of this class to be used as keys in dictionaries.""" + return hash((type(self),) + tuple(self.__dict__.values())) + def __str__(self) -> str: """Not overriding __repr__ as this is intended to render only when wrapped in print().""" string = ( @@ -43,42 +49,6 @@ class DatasetConfiguration(ConfigurableDataset): compression_options: Union[Dict[str, Any], None] = None -class BackendConfiguration(BaseModel): - """A model for matching collections of DatasetConfigurations specific to a backend with its name and DataIO.""" - - backend_type: Literal["hdf5", "zarr"] - data_io: Type[DataIO] # Auto-set by __init__ - dataset_configurations: Dict[ConfigurableDataset, DatasetConfiguration] - - def __init__( - self, - backend_type: Literal["hdf5", "zarr"], - dataset_configurations: Dict[ConfigurableDataset, DatasetConfiguration], - ): - backend_to_data_io = dict(hdf5=NWBHDF5IO, zarr=NWBZarrIO) - data_io = backend_to_data_io[backend_type] - super().__init__( - backend_to_data_io=backend_to_data_io, data_io=data_io, dataset_configurations=dataset_configurations - ) - - -class HDF5BackendConfiguration(BackendConfiguration): - """A model for matching collections of DatasetConfigurations specific to the HDF5 backend.""" - - pass # No extra arguments exposed to HDF5 backend - - -class ZarrBackendConfiguration(BackendConfiguration): - """A model for matching collections of DatasetConfigurations specific to the Zarr backend.""" - - number_of_jobs: int = Field( - description="Number of jobs to use in parallel during write.", - ge=psutil.cpu_count(), # TODO: should we specify logical=False in cpu_count? 
- le=psutil.cpu_count(), - default=-2, # -2 translates to 'all CPU except for one' - ) - - _available_hdf5_filters = set(h5py.filters.decode) - set(("shuffle", "fletcher32", "scaleoffset")) if is_module_installed(module_name="hdf5plugin"): _available_hdf5_filters = _available_hdf5_filters | set( @@ -138,9 +108,14 @@ class ZarrDatasetConfiguration(DatasetConfiguration): compression_options: Union[Dict[str, Any], None] = None @root_validator() - def verify_filter_methods_and_options_match(cls, values: Dict[str, Any]): - filter_methods = values.get("filter_methods") - filter_options = values.get("filter_options") + def validate_filter_methods_and_options_match(cls, values: Dict[str, Any]): + filter_methods = values["filter_methods"] + filter_options = values["filter_options"] + + if filter_methods is None and filter_options is not None: + raise ValueError(f"`filter_methods` is `None` but `filter_options` is not ({filter_options})!") + elif filter_methods is None and filter_options is None: + return values len_filter_methods = len(filter_methods) len_filter_options = len(filter_options) @@ -149,4 +124,27 @@ def verify_filter_methods_and_options_match(cls, values: Dict[str, Any]): f"Length mismatch between `filter_methods` ({len_filter_methods} methods specified) and " f"`filter_options` ({len_filter_options} options found)! These two must match one-to-one." ) + return values + + +class HDF5BackendConfiguration(BaseModel): + """A model for matching collections of DatasetConfigurations specific to the HDF5 backend.""" + + backend_type: Literal["hdf5"] = "hdf5" + data_io: Type[H5DataIO] = H5DataIO + dataset_configurations: Dict[ConfigurableDataset, HDF5DatasetConfiguration] + + +class ZarrBackendConfiguration(BaseModel): + """A model for matching collections of DatasetConfigurations specific to the Zarr backend.""" + + backend_type: Literal["zarr"] = "zarr" + data_io: Type[ZarrDataIO] = ZarrDataIO + dataset_configurations: Dict[ConfigurableDataset, ZarrDatasetConfiguration] + number_of_jobs: int = Field( + description="Number of jobs to use in parallel during write.", + ge=-psutil.cpu_count(), # TODO: should we specify logical=False in cpu_count? 
+ le=psutil.cpu_count(), + default=-2, # -2 translates to 'all CPU except for one' + ) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index c2473efb2..d07c5759f 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -10,7 +10,6 @@ from pynwb.base import DynamicTable from ._dataset_and_backend_models import ( - BackendConfiguration, ConfigurableDataset, DatasetConfiguration, HDF5BackendConfiguration, @@ -134,10 +133,13 @@ def _get_default_configuration( buffer_shape=iterator.buffer_shape, # Let the compression and/or filters default to the back-end specific values ) + return dataset_configuration -def get_default_backend_configuration(nwbfile: NWBFile, backend_type: Literal["hdf5", "zarr"]) -> BackendConfiguration: +def get_default_backend_configuration( + nwbfile: NWBFile, backend_type: Literal["hdf5", "zarr"] +) -> Union[HDF5BackendConfiguration, ZarrBackendConfiguration]: """Fill a default backend configuration to serve as a starting point for further customization.""" backend_type_to_backend_configuration_classes = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) @@ -154,7 +156,6 @@ def get_default_backend_configuration(nwbfile: NWBFile, backend_type: Literal["h ) DatasetConfigurationClass = backend_type_to_backend_configuration_classes[backend_type] - backend_configuration = DatasetConfigurationClass( - backend_type=backend_type, dataset_configurations=dataset_configurations - ) + backend_configuration = DatasetConfigurationClass(dataset_configurations=dataset_configurations) + return backend_configuration diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py index 7134f93f5..74aea6b1a 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py @@ -9,8 +9,13 @@ from pynwb.testing.mock.base import mock_TimeSeries from pynwb.testing.mock.file import mock_NWBFile +from neuroconv.tools.hdmf import SliceableDataChunkIterator from neuroconv.tools.nwb_helpers import ( ConfigurableDataset, + HDF5BackendConfiguration, + ZarrBackendConfiguration, + HDF5DatasetConfiguration, + ZarrDatasetConfiguration, get_default_backend_configuration, ) @@ -53,23 +58,79 @@ def zarr_nwbfile_path(tmpdir_factory): return str(nwbfile_path) -def test_simple_time_series(): +def test_simple_time_series_hdf5_backend(): array = generate_2d_array() nwbfile = mock_NWBFile() - time_series = mock_TimeSeries(name="TimeSeries", data=array) + time_series = mock_TimeSeries( + name="TimeSeries", data=SliceableDataChunkIterator(data=array, chunk_shape=(1, 2), buffer_shape=(1, 3)) + ) nwbfile.add_acquisition(time_series) - results = list(get_default_backend_configuration(nwbfile=nwbfile)) + default_backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend_type="hdf5") - assert len(results) == 1 + assert isinstance(default_backend_configuration, HDF5BackendConfiguration) + assert len(default_backend_configuration.dataset_configurations) == 1 - result = results[0] - assert isinstance(result, ConfigurableDataset) - assert result.object_name == "TimeSeries" - assert 
result.field == "data" - assert result.maxshape == array.shape - assert result.dtype == array.dtype + default_configurable_dataset, default_dataset_configuration = next( + iter(default_backend_configuration.dataset_configurations.items()) + ) + assert isinstance(default_configurable_dataset, ConfigurableDataset) + assert default_configurable_dataset.object_name == "TimeSeries" + assert default_configurable_dataset.parent == "root" + assert default_configurable_dataset.field == "data" + assert default_configurable_dataset.maxshape == (2, 3) + assert default_configurable_dataset.dtype == "int32" + + assert isinstance(default_dataset_configuration, HDF5DatasetConfiguration) + assert default_dataset_configuration.object_name == "TimeSeries" + assert default_dataset_configuration.parent == "root" + assert default_dataset_configuration.field == "data" + assert default_dataset_configuration.chunk_shape == (1, 2) + assert default_dataset_configuration.buffer_shape == (1, 3) + assert default_dataset_configuration.maxshape == (2, 3) + assert default_dataset_configuration.dtype == "int32" + assert default_dataset_configuration.compression_method == "gzip" + assert default_dataset_configuration.compression_options is None + + +def test_simple_time_series_zarr_backend(): + array = generate_2d_array() + + nwbfile = mock_NWBFile() + time_series = mock_TimeSeries( + name="TimeSeries", data=SliceableDataChunkIterator(data=array, chunk_shape=(1, 2), buffer_shape=(1, 3)) + ) + nwbfile.add_acquisition(time_series) + + default_backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend_type="zarr") + + assert isinstance(default_backend_configuration, ZarrBackendConfiguration) + assert default_backend_configuration.number_of_jobs == -2 + assert len(default_backend_configuration.dataset_configurations) == 1 + + default_configurable_dataset, default_dataset_configuration = next( + iter(default_backend_configuration.dataset_configurations.items()) + ) + assert isinstance(default_configurable_dataset, ConfigurableDataset) + assert default_configurable_dataset.object_name == "TimeSeries" + assert default_configurable_dataset.parent == "root" + assert default_configurable_dataset.field == "data" + assert default_configurable_dataset.maxshape == (2, 3) + assert default_configurable_dataset.dtype == "int32" + + assert isinstance(default_dataset_configuration, ZarrDatasetConfiguration) + assert default_dataset_configuration.object_name == "TimeSeries" + assert default_dataset_configuration.parent == "root" + assert default_dataset_configuration.field == "data" + assert default_dataset_configuration.chunk_shape == (1, 2) + assert default_dataset_configuration.buffer_shape == (1, 3) + assert default_dataset_configuration.maxshape == (2, 3) + assert default_dataset_configuration.dtype == "int32" + assert default_dataset_configuration.compression_method == "gzip" + assert default_dataset_configuration.compression_options is None + assert default_dataset_configuration.filter_methods is None + assert default_dataset_configuration.filter_options is None def test_simple_dynamic_table(): From 400893d7df7ab31b444d49b173e17358ade3eecb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 31 Aug 2023 23:09:49 +0000 Subject: [PATCH 10/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/neuroconv/tools/nwb_helpers/__init__.py | 4 ++-- .../test_get_default_backend_configuration.py | 2 +- 2 
files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index 6e4a8df8e..41edd93b6 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -1,9 +1,9 @@ from ._dataset_and_backend_models import ( - HDF5BackendConfiguration, - ZarrBackendConfiguration, ConfigurableDataset, DatasetConfiguration, + HDF5BackendConfiguration, HDF5DatasetConfiguration, + ZarrBackendConfiguration, ZarrDatasetConfiguration, ) from ._dataset_configuration import ( diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py index 74aea6b1a..43f636eeb 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py @@ -13,8 +13,8 @@ from neuroconv.tools.nwb_helpers import ( ConfigurableDataset, HDF5BackendConfiguration, - ZarrBackendConfiguration, HDF5DatasetConfiguration, + ZarrBackendConfiguration, ZarrDatasetConfiguration, get_default_backend_configuration, ) From 2c0713fb1aa7465c4c67d2a453937fafa28d808c Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Fri, 1 Sep 2023 00:48:48 -0400 Subject: [PATCH 11/17] add chunk and buffer shape inferece; force staticmethods in neuroconv iterator; pin to dev hdmf --- requirements-minimal.txt | 2 +- src/neuroconv/tools/hdmf.py | 81 ++++++++++--- src/neuroconv/tools/nwb_helpers/__init__.py | 1 - .../_dataset_and_backend_models.py | 56 ++++----- .../nwb_helpers/_dataset_configuration.py | 112 +++++++++++------- 5 files changed, 156 insertions(+), 96 deletions(-) diff --git a/requirements-minimal.txt b/requirements-minimal.txt index 447077ecc..06b339cb4 100644 --- a/requirements-minimal.txt +++ b/requirements-minimal.txt @@ -3,7 +3,7 @@ jsonschema>=3.2.0 PyYAML>=5.4 scipy>=1.4.1 h5py>=2.10.0 -hdmf>=3.4.7 +hdmf @ git+https://github.com/hdmf-dev/hdmf.git@master hdmf_zarr>=0.3.0 pynwb>=2.3.2;python_version>='3.8' psutil>=5.8.0 diff --git a/src/neuroconv/tools/hdmf.py b/src/neuroconv/tools/hdmf.py index 6ad712d51..af1707546 100644 --- a/src/neuroconv/tools/hdmf.py +++ b/src/neuroconv/tools/hdmf.py @@ -1,35 +1,78 @@ """Collection of modifications of HDMF functions that are to be tested/used on this repo until propagation upstream.""" +import math from typing import Tuple +from pydantic import Field +from typing_extensions import Annotated + import numpy as np from hdmf.data_utils import GenericDataChunkIterator as HDMFGenericDataChunkIterator class GenericDataChunkIterator(HDMFGenericDataChunkIterator): def _get_default_buffer_shape(self, buffer_gb: float = 1.0) -> Tuple[int]: - num_axes = len(self.maxshape) - chunk_bytes = np.prod(self.chunk_shape) * self.dtype.itemsize + self.estimate_default_buffer_shape( + buffer_gb=buffer_gb, chunk_shape=self.chunk_shape, maxshape=self.maxshape, dtype=self.dtype + ) + + # TODO: move this to the core iterator in HDMF so it can be easily swapped out as well as run on its own + @staticmethod + def estimate_default_chunk_shape( + chunk_mb: Annotated[float, Field(gt=0.0)], + maxshape: Tuple[int, ...], + dtype: np.dtype, + ) -> Tuple[int, ...]: + """ + Select chunk shape with size in MB less than the threshold of chunk_mb. 
+ + Keeps the dimensional ratios of the original data. + """ + assert chunk_mb > 0.0, f"chunk_mb ({chunk_mb}) must be greater than zero!" + # Eventually, Pydantic validation can handle this validation for us + + n_dims = len(maxshape) + itemsize = dtype.itemsize + chunk_bytes = chunk_mb * 1e6 + + min_maxshape = min(maxshape) + v = tuple(math.floor(maxshape_axis / min_maxshape) for maxshape_axis in maxshape) + prod_v = math.prod(v) + while prod_v * itemsize > chunk_bytes and prod_v != 1: + non_unit_min_v = min(x for x in v if x != 1) + v = tuple(math.floor(x / non_unit_min_v) if x != 1 else x for x in v) + prod_v = math.prod(v) + k = math.floor((chunk_bytes / (prod_v * itemsize)) ** (1 / n_dims)) + return tuple([min(k * x, maxshape[dim]) for dim, x in enumerate(v)]) + + # TODO: move this to the core iterator in HDMF so it can be easily swapped out as well as run on its own + @staticmethod + def estimate_default_buffer_shape( + buffer_gb: Annotated[float, Field(gt=0.0)], + chunk_shape: Tuple[int, ...], + maxshape: Tuple[int, ...], + dtype: np.dtype, + ) -> Tuple[int]: + num_axes = len(maxshape) + chunk_bytes = math.prod(chunk_shape) * dtype.itemsize assert buffer_gb > 0, f"buffer_gb ({buffer_gb}) must be greater than zero!" assert ( buffer_gb >= chunk_bytes / 1e9 ), f"buffer_gb ({buffer_gb}) must be greater than the chunk size ({chunk_bytes / 1e9})!" - assert all( - np.array(self.chunk_shape) > 0 - ), f"Some dimensions of chunk_shape ({self.chunk_shape}) are less than zero!" + assert all(np.array(chunk_shape) > 0), f"Some dimensions of chunk_shape ({chunk_shape}) are less than zero!" - maxshape = np.array(self.maxshape) + maxshape = np.array(maxshape) # Early termination condition - if np.prod(maxshape) * self.dtype.itemsize / 1e9 < buffer_gb: - return tuple(self.maxshape) + if math.prod(maxshape) * dtype.itemsize / 1e9 < buffer_gb: + return tuple(maxshape) buffer_bytes = chunk_bytes - axis_sizes_bytes = maxshape * self.dtype.itemsize - smallest_chunk_axis, second_smallest_chunk_axis, *_ = np.argsort(self.chunk_shape) + axis_sizes_bytes = maxshape * dtype.itemsize + smallest_chunk_axis, second_smallest_chunk_axis, *_ = np.argsort(chunk_shape) target_buffer_bytes = buffer_gb * 1e9 # If the smallest full axis does not fit within the buffer size, form a square along the two smallest axes - sub_square_buffer_shape = np.array(self.chunk_shape) + sub_square_buffer_shape = np.array(chunk_shape) if min(axis_sizes_bytes) > target_buffer_bytes: k1 = np.floor((target_buffer_bytes / chunk_bytes) ** 0.5) for axis in [smallest_chunk_axis, second_smallest_chunk_axis]: @@ -40,32 +83,32 @@ def _get_default_buffer_shape(self, buffer_gb: float = 1.0) -> Tuple[int]: chunk_to_buffer_ratio = buffer_gb * 1e9 / chunk_bytes chunk_scaling_factor = np.floor(chunk_to_buffer_ratio ** (1 / num_axes)) unpadded_buffer_shape = [ - np.clip(a=int(x), a_min=self.chunk_shape[j], a_max=self.maxshape[j]) - for j, x in enumerate(chunk_scaling_factor * np.array(self.chunk_shape)) + np.clip(a=int(x), a_min=chunk_shape[j], a_max=maxshape[j]) + for j, x in enumerate(chunk_scaling_factor * np.array(chunk_shape)) ] - unpadded_buffer_bytes = np.prod(unpadded_buffer_shape) * self.dtype.itemsize + unpadded_buffer_bytes = math.prod(unpadded_buffer_shape) * dtype.itemsize # Method that starts by filling the smallest axis completely or calculates best partial fill - padded_buffer_shape = np.array(self.chunk_shape) - chunks_per_axis = np.ceil(maxshape / self.chunk_shape) + padded_buffer_shape = np.array(chunk_shape) + chunks_per_axis = 
np.ceil(maxshape / chunk_shape) small_axis_fill_size = chunk_bytes * min(chunks_per_axis) full_axes_used = np.zeros(shape=num_axes, dtype=bool) if small_axis_fill_size <= target_buffer_bytes: buffer_bytes = small_axis_fill_size - padded_buffer_shape[smallest_chunk_axis] = self.maxshape[smallest_chunk_axis] + padded_buffer_shape[smallest_chunk_axis] = maxshape[smallest_chunk_axis] full_axes_used[smallest_chunk_axis] = True for axis, chunks_on_axis in enumerate(chunks_per_axis): if full_axes_used[axis]: # If the smallest axis, skip since already used continue if chunks_on_axis * buffer_bytes <= target_buffer_bytes: # If multiple axes can be used together buffer_bytes *= chunks_on_axis - padded_buffer_shape[axis] = self.maxshape[axis] + padded_buffer_shape[axis] = maxshape[axis] else: # Found an axis that is too large to use with the rest of the buffer; calculate how much can be used k3 = np.floor(target_buffer_bytes / buffer_bytes) padded_buffer_shape[axis] *= k3 break - padded_buffer_bytes = np.prod(padded_buffer_shape) * self.dtype.itemsize + padded_buffer_bytes = math.prod(padded_buffer_shape) * dtype.itemsize if padded_buffer_bytes >= unpadded_buffer_bytes: return tuple(padded_buffer_shape) diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index 6e4a8df8e..e07b6e08f 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -1,7 +1,6 @@ from ._dataset_and_backend_models import ( HDF5BackendConfiguration, ZarrBackendConfiguration, - ConfigurableDataset, DatasetConfiguration, HDF5DatasetConfiguration, ZarrDatasetConfiguration, diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py index 32fabbb6f..8eb8dfa98 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py @@ -1,5 +1,5 @@ """Collection of helper functions related to configuration of datasets dependent on backend.""" -from typing import Any, Dict, Literal, Tuple, Type, Union +from typing import Any, Dict, Literal, Tuple, Type, Union, Iterable import h5py import hdf5plugin @@ -11,22 +11,19 @@ from pydantic import BaseModel, Field, root_validator -class ConfigurableDataset(BaseModel): - """A data model for summarizing information about an object that will become a HDF5 or Zarr Dataset in the file.""" +class DatasetConfiguration(BaseModel): + """A data model for configruing options about an object that will become a HDF5 or Zarr Dataset in the file.""" object_id: str object_name: str parent: str field: Literal["data", "timestamps"] + chunk_shape: Tuple[int, ...] + buffer_shape: Tuple[int, ...] maxshape: Tuple[int, ...] 
dtype: str # Think about how to constrain/specify this more - - class Config: # noqa: D106 - allow_mutation = False # To enforce hashability - - def __hash__(self): - """To allow instances of this class to be used as keys in dictionaries.""" - return hash((type(self),) + tuple(self.__dict__.values())) + compression_method: Union[str, None] # Backend configurations should specify Literals; None means no compression + compression_options: Union[Dict[str, Any], None] = None def __str__(self) -> str: """Not overriding __repr__ as this is intended to render only when wrapped in print().""" @@ -40,15 +37,6 @@ def __str__(self) -> str: return string -class DatasetConfiguration(ConfigurableDataset): - """A data model for configruing options about an object that will become a HDF5 or Zarr Dataset in the file.""" - - chunk_shape: Tuple[int, ...] - buffer_shape: Tuple[int, ...] - compression_method: Union[str, None] # Backend configurations should specify Literals; None means no compression - compression_options: Union[Dict[str, Any], None] = None - - _available_hdf5_filters = set(h5py.filters.decode) - set(("shuffle", "fletcher32", "scaleoffset")) if is_module_installed(module_name="hdf5plugin"): _available_hdf5_filters = _available_hdf5_filters | set( @@ -72,22 +60,20 @@ class HDF5DatasetConfiguration(DatasetConfiguration): # These filters do nothing for us, or are things that ought to be implemented at lower HDMF levels # or indirectly using HDMF data structures ( - "json2", - "pickle", - "astype", - "vlen-utf8", - "vlen-array", - "vlen-bytes", - "adler32", - "crc32", - "fixedscaleoffset", - "msgpack2", - "base64", - "n5_wrapper", + "json2", # no data savings + "pickle", # no data savings + "vlen-utf8", # enforced by HDMF + "vlen-array", # enforced by HDMF + "vlen-bytes", # enforced by HDMF + "adler32", # checksum + "crc32", # checksum + "fixedscaleoffset", # enforced indrectly by HDMF/PyNWB data types + "base64", # unsure what this would ever be used for + "n5_wrapper", # different data format ) ) - set( # Forbidding lossy codecs for now, but they could be allowed in the future with warnings - ("bitround", "quantize") + ("astype", "bitround", "quantize") ) ) # TODO: would like to eventually (as separate feature) add an 'auto' method to Zarr @@ -127,13 +113,15 @@ def validate_filter_methods_and_options_match(cls, values: Dict[str, Any]): return values + # think about extra validation that msgpack2 compression only ideal for datasets of vlen strings + class HDF5BackendConfiguration(BaseModel): """A model for matching collections of DatasetConfigurations specific to the HDF5 backend.""" backend_type: Literal["hdf5"] = "hdf5" data_io: Type[H5DataIO] = H5DataIO - dataset_configurations: Dict[ConfigurableDataset, HDF5DatasetConfiguration] + dataset_configurations: Iterable[HDF5DatasetConfiguration] class ZarrBackendConfiguration(BaseModel): @@ -141,7 +129,7 @@ class ZarrBackendConfiguration(BaseModel): backend_type: Literal["zarr"] = "zarr" data_io: Type[ZarrDataIO] = ZarrDataIO - dataset_configurations: Dict[ConfigurableDataset, ZarrDatasetConfiguration] + dataset_configurations: Iterable[ZarrDatasetConfiguration] number_of_jobs: int = Field( description="Number of jobs to use in parallel during write.", ge=-psutil.cpu_count(), # TODO: should we specify logical=False in cpu_count? 
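The `number_of_jobs` field retained on `ZarrBackendConfiguration` above follows the joblib-style negative convention noted in its inline comment (-2 meaning all CPUs except one). As an illustration only, here is a minimal sketch of how such a value could be resolved into a concrete worker count at write time; the helper name `resolve_number_of_jobs` and the clamping behavior are assumptions for this sketch, not part of the patch:

import psutil


def resolve_number_of_jobs(number_of_jobs: int) -> int:
    """Hypothetical helper: map the negative job-count convention onto a concrete number of workers."""
    total_cpus = psutil.cpu_count()  # the Pydantic field above is bounded by this same value
    if number_of_jobs < 0:
        # -1 -> all CPUs, -2 -> all CPUs except one, and so on, floored at a single worker
        return max(total_cpus + 1 + number_of_jobs, 1)
    return min(number_of_jobs, total_cpus)


# With the default of -2 on an 8-core machine, this resolves to 7 parallel workers.
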
diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index d07c5759f..8189743cf 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -1,14 +1,16 @@ """Collection of helper functions related to configuration of datasets dependent on backend.""" -from typing import Dict, Iterable, Literal, Union +from typing import Iterable, Literal, Union import h5py import zarr -from hdmf.data_utils import DataIO, GenericDataChunkIterator +import numpy as np +from hdmf.data_utils import DataIO, GenericDataChunkIterator, DataChunkIterator from hdmf.utils import get_data_shape from hdmf_zarr import NWBZarrIO from pynwb import NWBHDF5IO, NWBFile, TimeSeries from pynwb.base import DynamicTable +from ..hdmf import SliceableDataChunkIterator from ._dataset_and_backend_models import ( ConfigurableDataset, DatasetConfiguration, @@ -19,23 +21,8 @@ ) -def _get_dataset_metadata(neurodata_object: Union[TimeSeries, DynamicTable], field_name: str) -> ConfigurableDataset: - """Fill in the Dataset model with as many values as can be automatically detected or inferred.""" - field_value = getattr(neurodata_object, field_name) - if field_value is not None and not isinstance(field_value, DataIO): - return ConfigurableDataset( - object_id=neurodata_object.object_id, - object_name=neurodata_object.name, - parent=neurodata_object.get_ancestor().name, - field=field_name, - maxshape=get_data_shape(data=field_value), - # think on cases that don't have a dtype attr - dtype=str(getattr(field_value, "dtype", "unknown")), - ) - - -def _value_already_written_to_file( - value: Union[h5py.Dataset, zarr.Array], +def _is_value_already_written_to_file( + candidate_dataset: Union[h5py.Dataset, zarr.Array], backend_type: Literal["hdf5", "zarr"], existing_file: Union[h5py.File, zarr.Group, None], ) -> bool: @@ -45,17 +32,60 @@ def _value_already_written_to_file( This object should then be skipped by the `get_io_datasets` function when working in append mode. 
""" return ( - isinstance(value, h5py.Dataset) # If the source data is an HDF5 Dataset + isinstance(candidate_dataset, h5py.Dataset) # If the source data is an HDF5 Dataset and backend_type == "hdf5" # If working in append mode - and value.file == existing_file # If the source HDF5 Dataset is the appending NWBFile + and candidate_dataset.file == existing_file # If the source HDF5 Dataset is the appending NWBFile ) or ( - isinstance(value, zarr.Array) # If the source data is an Zarr Array + isinstance(candidate_dataset, zarr.Array) # If the source data is an Zarr Array and backend_type == "zarr" # If working in append mode - and value.store == existing_file # If the source Zarr 'file' is the appending NWBFile + and candidate_dataset.store == existing_file # If the source Zarr 'file' is the appending NWBFile ) -def get_configurable_datasets(nwbfile: NWBFile) -> Iterable[ConfigurableDataset]: +def _get_dataset_metadata(neurodata_object: Union[TimeSeries, DynamicTable], field_name: str) -> DatasetConfiguration: + """Fill in the Dataset model with as many values as can be automatically detected or inferred.""" + candidate_dataset = getattr(neurodata_object, field_name) + # For now, skip over datasets already wrapped in DataIO + # Could maybe eventually support modifying chunks in place + # But setting buffer shape only possible if iterator was wrapped first + if not isinstance(candidate_dataset, DataIO): + # DataChunkIterator has best generic dtype inference, though logic is hard to peel out of it + # And it can fail in rare cases but not essential to our default configuration + try: + dtype = str(DataChunkIterator(candidate_dataset).dtype) # string cast to be JSON friendly + except Exception as exception: + if str(exception) != "Data type could not be determined. Please specify dtype in DataChunkIterator init.": + raise exception + else: + dtype = "unknown" + + maxshape = get_data_shape(data=candidate_dataset) + + if isinstance(candidate_dataset, GenericDataChunkIterator): + chunk_shape = candidate_dataset.chunk_shape + buffer_shape = candidate_dataset.buffer_shape + elif dtype != "unknown": + # TODO: eventually replace this with staticmethods on hdmf.data_utils.GenericDataChunkIterator + chunk_shape = SliceableDataChunkIterator.estimate_chunk_shape(chunk_mb=10.0, maxshape=maxshape, dtype=dtype) + buffer_shape = SliceableDataChunkIterator.estimate_buffer_shape( + buffer_gb=0.5, chunk_shape=chunk_shape, maxshape=maxshape, dtype=dtype + ) + else: + pass # TODO: think on this; perhaps zarr's standalone estimator? + + return DatasetConfiguration( + object_id=neurodata_object.object_id, + object_name=neurodata_object.name, + parent=neurodata_object.get_ancestor().name, # or should this be full location relative to root? + field=field_name, + chunk_shape=chunk_shape, + buffer_shape=buffer_shape, + maxshape=maxshape, + dtype=dtype, + ) + + +def get_configurable_datasets(nwbfile: NWBFile) -> Iterable[DatasetConfiguration]: """ Method for automatically detecting all objects in the file that could be wrapped in a DataIO. @@ -81,34 +111,34 @@ def get_configurable_datasets(nwbfile: NWBFile) -> Iterable[ConfigurableDataset] for _, neurodata_object in nwbfile.objects.items(): # TODO: edge case of ImageSeries with external file mode? 
if isinstance(neurodata_object, TimeSeries): + time_series = neurodata_object # for readability + for field_name in ("data", "timestamps"): - if field_name not in neurodata_object.fields: # timestamps is optional + if field_name not in time_series.fields: # timestamps is optional continue - field_value = getattr(neurodata_object, field_name) - if _value_already_written_to_file( - value=field_value, backend_type=backend_type, existing_file=existing_file + candidate_dataset = getattr(time_series, field_name) + if _is_value_already_written_to_file( + candidate_dataset=candidate_dataset, backend_type=backend_type, existing_file=existing_file ): continue # skip - # Currently requiring a ConfigurableDataset to apply only to data wrapped in a GenericDataChunkIterator - # TODO: in follow-up, can maybe be wrapped automatically? - if not isinstance(field_value, GenericDataChunkIterator): + + # Edge case of in-memory ImageSeries with external mode; data is in fields and is empty array + if isinstance(candidate_dataset, np.ndarray) and not candidate_dataset: continue # skip - yield _get_dataset_metadata(neurodata_object=neurodata_object, field_name=field_name) + yield _get_dataset_metadata(neurodata_object=time_series, field_name=field_name) elif isinstance(neurodata_object, DynamicTable): - for column_name in getattr(neurodata_object, "colnames"): - column_value = getattr(neurodata_object, column_name).data - if _value_already_written_to_file( - value=column_value, backend_type=backend_type, existing_file=existing_file + dynamic_table = neurodata_object # for readability + + for column_name in dynamic_table.colnames: + candidate_dataset = dynamic_table[column_name].data # VectorData object + if _is_value_already_written_to_file( + candidate_dataset=candidate_dataset, backend_type=backend_type, existing_file=existing_file ): continue # skip - # Currently requiring a ConfigurableDataset to apply only to data wrapped in a GenericDataChunkIterator - # TODO: in follow-up, can maybe be wrapped automatically? 
- if not isinstance(column_value, GenericDataChunkIterator): - continue # skip - yield _get_dataset_metadata(neurodata_object=neurodata_object[column_name], field_name="data") + yield _get_dataset_metadata(neurodata_object=dynamic_table[column_name], field_name="data") def _get_default_configuration( From 246614acb681d2c64d84c3d8ecba6ab824028068 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 1 Sep 2023 04:49:48 +0000 Subject: [PATCH 12/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/neuroconv/tools/hdmf.py | 5 ++--- src/neuroconv/tools/nwb_helpers/__init__.py | 2 -- .../tools/nwb_helpers/_dataset_and_backend_models.py | 2 +- src/neuroconv/tools/nwb_helpers/_dataset_configuration.py | 6 +++--- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/neuroconv/tools/hdmf.py b/src/neuroconv/tools/hdmf.py index af1707546..3faa8d245 100644 --- a/src/neuroconv/tools/hdmf.py +++ b/src/neuroconv/tools/hdmf.py @@ -2,11 +2,10 @@ import math from typing import Tuple -from pydantic import Field -from typing_extensions import Annotated - import numpy as np from hdmf.data_utils import GenericDataChunkIterator as HDMFGenericDataChunkIterator +from pydantic import Field +from typing_extensions import Annotated class GenericDataChunkIterator(HDMFGenericDataChunkIterator): diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index 80706fceb..9e7fc1599 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -1,6 +1,4 @@ from ._dataset_and_backend_models import ( - HDF5BackendConfiguration, - ZarrBackendConfiguration, DatasetConfiguration, HDF5BackendConfiguration, ZarrBackendConfiguration, diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py index 8eb8dfa98..ab8cfa079 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py @@ -1,5 +1,5 @@ """Collection of helper functions related to configuration of datasets dependent on backend.""" -from typing import Any, Dict, Literal, Tuple, Type, Union, Iterable +from typing import Any, Dict, Iterable, Literal, Tuple, Type, Union import h5py import hdf5plugin diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index 8189743cf..ebf91a340 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -2,15 +2,14 @@ from typing import Iterable, Literal, Union import h5py -import zarr import numpy as np -from hdmf.data_utils import DataIO, GenericDataChunkIterator, DataChunkIterator +import zarr +from hdmf.data_utils import DataChunkIterator, DataIO, GenericDataChunkIterator from hdmf.utils import get_data_shape from hdmf_zarr import NWBZarrIO from pynwb import NWBHDF5IO, NWBFile, TimeSeries from pynwb.base import DynamicTable -from ..hdmf import SliceableDataChunkIterator from ._dataset_and_backend_models import ( ConfigurableDataset, DatasetConfiguration, @@ -19,6 +18,7 @@ ZarrBackendConfiguration, ZarrDatasetConfiguration, ) +from ..hdmf import SliceableDataChunkIterator def _is_value_already_written_to_file( From 5e95d4c4018d17998db1f055ab548634647cc910 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: 
Fri, 1 Sep 2023 14:32:51 -0400 Subject: [PATCH 13/17] extensive refinement of models and workflow; many tests; many debugs --- src/neuroconv/tools/hdmf.py | 2 +- src/neuroconv/tools/nwb_helpers/__init__.py | 11 +- .../_dataset_and_backend_models.py | 54 ++- .../nwb_helpers/_dataset_configuration.py | 158 ++++--- .../test_configurable_dataset.py | 86 ++-- .../test_get_configurable_datasets.py | 140 ------ .../test_get_default_backend_configuration.py | 399 +++++++++--------- ...test_get_default_dataset_configurations.py | 349 +++++++++++++++ ...t_dataset_configurations_appended_files.py | 146 +++++++ 9 files changed, 878 insertions(+), 467 deletions(-) delete mode 100644 tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_configurable_datasets.py create mode 100644 tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations.py create mode 100644 tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations_appended_files.py diff --git a/src/neuroconv/tools/hdmf.py b/src/neuroconv/tools/hdmf.py index 3faa8d245..b82b12e1f 100644 --- a/src/neuroconv/tools/hdmf.py +++ b/src/neuroconv/tools/hdmf.py @@ -10,7 +10,7 @@ class GenericDataChunkIterator(HDMFGenericDataChunkIterator): def _get_default_buffer_shape(self, buffer_gb: float = 1.0) -> Tuple[int]: - self.estimate_default_buffer_shape( + return self.estimate_default_buffer_shape( buffer_gb=buffer_gb, chunk_shape=self.chunk_shape, maxshape=self.maxshape, dtype=self.dtype ) diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index 9e7fc1599..457e7e77b 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -1,12 +1,15 @@ from ._dataset_and_backend_models import ( + DatasetInfo, + BackendConfiguration, DatasetConfiguration, + HDF5DatasetConfiguration, + ZarrDatasetConfiguration, HDF5BackendConfiguration, ZarrBackendConfiguration, + BACKEND_TO_DATASET_CONFIGURATION, + BACKEND_TO_CONFIGURATION, ) -from ._dataset_configuration import ( - get_configurable_datasets, - get_default_backend_configuration, -) +from ._dataset_configuration import get_default_dataset_configurations, get_default_backend_configuration from ._metadata_and_file_helpers import ( add_device_from_metadata, get_default_nwbfile_metadata, diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py index ab8cfa079..38f8d41a1 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py @@ -1,27 +1,42 @@ """Collection of helper functions related to configuration of datasets dependent on backend.""" -from typing import Any, Dict, Iterable, Literal, Tuple, Type, Union +from typing import Any, Dict, Literal, Tuple, Type, Union import h5py import hdf5plugin import psutil import zarr +from hdmf.container import DataIO from hdmf.backends.hdf5 import H5DataIO from hdmf_zarr import ZarrDataIO from nwbinspector.utils import is_module_installed from pydantic import BaseModel, Field, root_validator +class DatasetInfo(BaseModel): + object_id: str + location: str + maxshape: Tuple[int, ...] 
+ dtype: str # Think about how to constrain/specify this more + + class Config: # noqa: D106 + allow_mutation = False + + def __hash__(self): + """To allow instances of this class to be used as keys in dictionaries.""" + return hash((type(self),) + tuple(self.__dict__.values())) + + def __eq__(self, other): + if other.__class__ is self.__class__: + return self.__dict__ == other.__dict__ + return NotImplemented + + class DatasetConfiguration(BaseModel): """A data model for configruing options about an object that will become a HDF5 or Zarr Dataset in the file.""" - object_id: str - object_name: str - parent: str - field: Literal["data", "timestamps"] + dataset_info: DatasetInfo chunk_shape: Tuple[int, ...] buffer_shape: Tuple[int, ...] - maxshape: Tuple[int, ...] - dtype: str # Think about how to constrain/specify this more compression_method: Union[str, None] # Backend configurations should specify Literals; None means no compression compression_options: Union[Dict[str, Any], None] = None @@ -36,6 +51,11 @@ def __str__(self) -> str: ) return string + def __eq__(self, other): + if other.__class__ is self.__class__: + return self.__dict__ == other.__dict__ + return NotImplemented + _available_hdf5_filters = set(h5py.filters.decode) - set(("shuffle", "fletcher32", "scaleoffset")) if is_module_installed(module_name="hdf5plugin"): @@ -116,23 +136,35 @@ def validate_filter_methods_and_options_match(cls, values: Dict[str, Any]): # think about extra validation that msgpack2 compression only ideal for datasets of vlen strings -class HDF5BackendConfiguration(BaseModel): +class BackendConfiguration(BaseModel): + """A model for matching collections of DatasetConfigurations specific to the HDF5 backend.""" + + backend_type: Literal["hdf5", "zarr"] + data_io: Type[DataIO] + dataset_configurations: Dict[str, DatasetConfiguration] # str is location field of DatasetConfiguration + + +class HDF5BackendConfiguration(BackendConfiguration): """A model for matching collections of DatasetConfigurations specific to the HDF5 backend.""" backend_type: Literal["hdf5"] = "hdf5" data_io: Type[H5DataIO] = H5DataIO - dataset_configurations: Iterable[HDF5DatasetConfiguration] + dataset_configurations: Dict[str, HDF5DatasetConfiguration] # str is location field of DatasetConfiguration -class ZarrBackendConfiguration(BaseModel): +class ZarrBackendConfiguration(BackendConfiguration): """A model for matching collections of DatasetConfigurations specific to the Zarr backend.""" backend_type: Literal["zarr"] = "zarr" data_io: Type[ZarrDataIO] = ZarrDataIO - dataset_configurations: Iterable[ZarrDatasetConfiguration] + dataset_configurations: Dict[str, ZarrDatasetConfiguration] # str is location field of DatasetConfiguration number_of_jobs: int = Field( description="Number of jobs to use in parallel during write.", ge=-psutil.cpu_count(), # TODO: should we specify logical=False in cpu_count? 
le=psutil.cpu_count(), default=-2, # -2 translates to 'all CPU except for one' ) + + +BACKEND_TO_DATASET_CONFIGURATION = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration) +BACKEND_TO_CONFIGURATION = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index ebf91a340..036775fc5 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -4,6 +4,7 @@ import h5py import numpy as np import zarr +from hdmf import Container from hdmf.data_utils import DataChunkIterator, DataIO, GenericDataChunkIterator from hdmf.utils import get_data_shape from hdmf_zarr import NWBZarrIO @@ -11,19 +12,29 @@ from pynwb.base import DynamicTable from ._dataset_and_backend_models import ( - ConfigurableDataset, + DatasetInfo, DatasetConfiguration, HDF5BackendConfiguration, HDF5DatasetConfiguration, ZarrBackendConfiguration, ZarrDatasetConfiguration, + BACKEND_TO_DATASET_CONFIGURATION, + BACKEND_TO_CONFIGURATION, ) from ..hdmf import SliceableDataChunkIterator +def _get_mode(io: Union[NWBHDF5IO, NWBZarrIO]) -> str: + """NWBHDF5IO and NWBZarrIO have different ways of storing the mode they used on a path.""" + if isinstance(io, NWBHDF5IO): + return io.mode + elif isinstance(io, NWBZarrIO): + return io._ZarrIO__mode + + def _is_value_already_written_to_file( candidate_dataset: Union[h5py.Dataset, zarr.Array], - backend_type: Literal["hdf5", "zarr"], + backend: Literal["hdf5", "zarr"], existing_file: Union[h5py.File, zarr.Group, None], ) -> bool: """ @@ -33,17 +44,35 @@ def _is_value_already_written_to_file( """ return ( isinstance(candidate_dataset, h5py.Dataset) # If the source data is an HDF5 Dataset - and backend_type == "hdf5" # If working in append mode + and backend == "hdf5" # If working in append mode and candidate_dataset.file == existing_file # If the source HDF5 Dataset is the appending NWBFile ) or ( isinstance(candidate_dataset, zarr.Array) # If the source data is an Zarr Array - and backend_type == "zarr" # If working in append mode + and backend == "zarr" # If working in append mode and candidate_dataset.store == existing_file # If the source Zarr 'file' is the appending NWBFile ) -def _get_dataset_metadata(neurodata_object: Union[TimeSeries, DynamicTable], field_name: str) -> DatasetConfiguration: +def _parse_location_in_memory_nwbfile(current_location: str, neurodata_object: Container) -> str: + parent = neurodata_object.parent + if isinstance(parent, NWBFile): + # Items in defined top-level places like acquisition, intervals, etc. 
do not act as 'containers' + # in the .parent sense; ask if object is in their in-memory dictionaries instead + for outer_field_name, outer_field_value in parent.fields.items(): + if isinstance(outer_field_value, dict) and neurodata_object.name in outer_field_value: + return outer_field_name + "/" + neurodata_object.name + "/" + current_location + return neurodata_object.name + "/" + current_location + return _parse_location_in_memory_nwbfile( + current_location=neurodata_object.name + "/" + current_location, neurodata_object=parent + ) + + +def _get_dataset_metadata( + neurodata_object: Union[TimeSeries, DynamicTable], field_name: str, backend: Literal["hdf5", "zarr"] +) -> Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration]: """Fill in the Dataset model with as many values as can be automatically detected or inferred.""" + DatasetConfigurationClass = BACKEND_TO_DATASET_CONFIGURATION[backend] + candidate_dataset = getattr(neurodata_object, field_name) # For now, skip over datasets already wrapped in DataIO # Could maybe eventually support modifying chunks in place @@ -66,26 +95,33 @@ def _get_dataset_metadata(neurodata_object: Union[TimeSeries, DynamicTable], fie buffer_shape = candidate_dataset.buffer_shape elif dtype != "unknown": # TODO: eventually replace this with staticmethods on hdmf.data_utils.GenericDataChunkIterator - chunk_shape = SliceableDataChunkIterator.estimate_chunk_shape(chunk_mb=10.0, maxshape=maxshape, dtype=dtype) - buffer_shape = SliceableDataChunkIterator.estimate_buffer_shape( - buffer_gb=0.5, chunk_shape=chunk_shape, maxshape=maxshape, dtype=dtype + chunk_shape = SliceableDataChunkIterator.estimate_default_chunk_shape( + chunk_mb=10.0, maxshape=maxshape, dtype=np.dtype(dtype) + ) + buffer_shape = SliceableDataChunkIterator.estimate_default_buffer_shape( + buffer_gb=0.5, chunk_shape=chunk_shape, maxshape=maxshape, dtype=np.dtype(dtype) ) else: pass # TODO: think on this; perhaps zarr's standalone estimator? - return DatasetConfiguration( + dataset_info = DatasetInfo( object_id=neurodata_object.object_id, object_name=neurodata_object.name, - parent=neurodata_object.get_ancestor().name, # or should this be full location relative to root? + location=_parse_location_in_memory_nwbfile(current_location=field_name, neurodata_object=neurodata_object), field=field_name, - chunk_shape=chunk_shape, - buffer_shape=buffer_shape, maxshape=maxshape, dtype=dtype, ) + dataset_configuration = DatasetConfigurationClass( + dataset_info=dataset_info, chunk_shape=chunk_shape, buffer_shape=buffer_shape + ) + return dataset_configuration -def get_configurable_datasets(nwbfile: NWBFile) -> Iterable[DatasetConfiguration]: +def get_default_dataset_configurations( + nwbfile: NWBFile, + backend: Union[None, Literal["hdf5", "zarr"]] = None, # None for auto-detect from append mode, otherwise required +) -> Iterable[DatasetConfiguration]: """ Method for automatically detecting all objects in the file that could be wrapped in a DataIO. @@ -93,23 +129,41 @@ def get_configurable_datasets(nwbfile: NWBFile) -> Iterable[DatasetConfiguration ---------- nwbfile : pynwb.NWBFile An in-memory NWBFile object, either generated from the base class or read from an existing file of any backend. + backend : "hdf5" or "zarr" + Which backend format type you would like to use in configuring each datasets compression methods and options. Yields ------ - ConfigurableDataset + DatasetConfiguration A summary of each detected object that can be wrapped in a DataIO. 
""" - backend_type = None # Used for filtering out datasets that have already been written to disk when appending + if backend is None and nwbfile.read_io is None: + raise ValueError( + "Keyword argument `backend` (either 'hdf5' or 'zarr') must be specified if the `nwbfile` was not " + "read from an existing file!" + ) + if backend is None and nwbfile.read_io is not None and nwbfile.read_io.mode not in ("r+", "a"): + raise ValueError( + "Keyword argument `backend` (either 'hdf5' or 'zarr') must be specified if the `nwbfile` is being appended." + ) + + detected_backend = None existing_file = None - if isinstance(nwbfile.read_io, NWBHDF5IO): - backend_type = "hdf5" + if isinstance(nwbfile.read_io, NWBHDF5IO) and _get_mode(io=nwbfile.read_io) in ("r+", "a"): + detected_backend = "hdf5" existing_file = nwbfile.read_io._file - elif isinstance(nwbfile.read_io, NWBZarrIO): - backend_type = "zarr" + elif isinstance(nwbfile.read_io, NWBZarrIO) and _get_mode(io=nwbfile.read_io) in ("r+", "a"): + detected_backend = "zarr" existing_file = nwbfile.read_io.file.store + backend = backend or detected_backend + + if detected_backend is not None and detected_backend != backend: + raise ValueError( + f"Detected backend '{detected_backend}' for appending file, but specified `backend` " + f"({backend}) does not match! Set `backend=None` or remove the keyword argument to allow it to auto-detect." + ) - for _, neurodata_object in nwbfile.objects.items(): - # TODO: edge case of ImageSeries with external file mode? + for neurodata_object in nwbfile.objects.values(): if isinstance(neurodata_object, TimeSeries): time_series = neurodata_object # for readability @@ -119,73 +173,41 @@ def get_configurable_datasets(nwbfile: NWBFile) -> Iterable[DatasetConfiguration candidate_dataset = getattr(time_series, field_name) if _is_value_already_written_to_file( - candidate_dataset=candidate_dataset, backend_type=backend_type, existing_file=existing_file + candidate_dataset=candidate_dataset, backend=backend, existing_file=existing_file ): continue # skip # Edge case of in-memory ImageSeries with external mode; data is in fields and is empty array - if isinstance(candidate_dataset, np.ndarray) and not candidate_dataset: + if isinstance(candidate_dataset, np.ndarray) and not np.any(candidate_dataset): continue # skip - yield _get_dataset_metadata(neurodata_object=time_series, field_name=field_name) + yield _get_dataset_metadata(neurodata_object=time_series, field_name=field_name, backend=backend) elif isinstance(neurodata_object, DynamicTable): dynamic_table = neurodata_object # for readability for column_name in dynamic_table.colnames: candidate_dataset = dynamic_table[column_name].data # VectorData object if _is_value_already_written_to_file( - candidate_dataset=candidate_dataset, backend_type=backend_type, existing_file=existing_file + candidate_dataset=candidate_dataset, backend=backend, existing_file=existing_file ): continue # skip - yield _get_dataset_metadata(neurodata_object=dynamic_table[column_name], field_name="data") - - -def _get_default_configuration( - nwbfile: NWBFile, backend_type: Literal["hdf5", "zarr"], configurable_dataset: ConfigurableDataset -) -> DatasetConfiguration: - backend_to_dataset_configuration_class = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration) - DatasetConfigurationClass = backend_to_dataset_configuration_class[backend_type] - - neurodata_object = nwbfile.objects[configurable_dataset.object_id] - field_value = getattr(neurodata_object, configurable_dataset.field) - 
iterator = field_value # Currently restricting to values that are already wrapped in GenericDataChunkIterators - # TODO: in follow-up, can maybe be wrapped automatically? - - dataset_configuration = DatasetConfigurationClass( - object_id=configurable_dataset.object_id, - object_name=configurable_dataset.object_name, - parent=configurable_dataset.parent, - field=configurable_dataset.field, - maxshape=configurable_dataset.maxshape, - dtype=configurable_dataset.dtype, - chunk_shape=iterator.chunk_shape, - buffer_shape=iterator.buffer_shape, - # Let the compression and/or filters default to the back-end specific values - ) - - return dataset_configuration + yield _get_dataset_metadata( + neurodata_object=dynamic_table[column_name], field_name="data", backend=backend + ) def get_default_backend_configuration( - nwbfile: NWBFile, backend_type: Literal["hdf5", "zarr"] + nwbfile: NWBFile, backend: Literal["hdf5", "zarr"] ) -> Union[HDF5BackendConfiguration, ZarrBackendConfiguration]: """Fill a default backend configuration to serve as a starting point for further customization.""" - backend_type_to_backend_configuration_classes = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) - - configurable_datasets = get_configurable_datasets(nwbfile=nwbfile) - - dataset_configurations = dict() - for configurable_dataset in configurable_datasets: - dataset_configurations.update( - { - configurable_dataset: _get_default_configuration( - nwbfile=nwbfile, backend_type=backend_type, configurable_dataset=configurable_dataset - ) - } - ) + BackendConfigurationClass = BACKEND_TO_CONFIGURATION[backend] - DatasetConfigurationClass = backend_type_to_backend_configuration_classes[backend_type] - backend_configuration = DatasetConfigurationClass(dataset_configurations=dataset_configurations) + default_dataset_configurations = get_default_dataset_configurations(nwbfile=nwbfile, backend=backend) + dataset_configurations = { + default_dataset_configuration.dataset_info.location: default_dataset_configuration + for default_dataset_configuration in default_dataset_configurations + } + backend_configuration = BackendConfigurationClass(dataset_configurations=dataset_configurations) return backend_configuration diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_configurable_dataset.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_configurable_dataset.py index 90d60eb2f..28ab594b6 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_configurable_dataset.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_configurable_dataset.py @@ -2,46 +2,46 @@ from io import StringIO from unittest.mock import patch -from neuroconv.tools.nwb_helpers import ConfigurableDataset - - -def test_configurable_dataset_print(): - """Test the printout display of a Dataset modellooks nice.""" - test_dataset = ConfigurableDataset( - object_id="abc123", - object_name="TestObject", - parent="TestParent", - field="data", - maxshape=(2, 4), - dtype="int16", - ) - - with patch("sys.stdout", new=StringIO()) as out: - print(test_dataset) - - expected_print = """TestObject of TestParent ------------------------- - data - maxshape: (2, 4) - dtype: int16 -""" - assert out.getvalue() == expected_print - - -def test_configurable_dataset_repr(): - """Test the programmatic repr of a Dataset model is more dataclass-like.""" - test_dataset = ConfigurableDataset( - object_id="abc123", - object_name="TestObject", - 
parent="TestParent", - field="data", - maxshape=(2, 4), - dtype="int16", - ) - - # Important to keep the `repr` unmodified for appearance inside lists of Datasets - expected_repr = ( - "ConfigurableDataset(object_id='abc123', object_name='TestObject', parent='TestParent', " - "field='data', maxshape=(2, 4), dtype='int16')" - ) - assert repr(test_dataset) == expected_repr +# from neuroconv.tools.nwb_helpers import ConfigurableDataset + + +# def test_configurable_dataset_print(): +# """Test the printout display of a Dataset modellooks nice.""" +# test_dataset = ConfigurableDataset( +# object_id="abc123", +# object_name="TestObject", +# parent="TestParent", +# field="data", +# maxshape=(2, 4), +# dtype="int16", +# ) + +# with patch("sys.stdout", new=StringIO()) as out: +# print(test_dataset) + +# expected_print = """TestObject of TestParent +# ------------------------ +# data +# maxshape: (2, 4) +# dtype: int16 +# """ +# assert out.getvalue() == expected_print + + +# def test_configurable_dataset_repr(): +# """Test the programmatic repr of a Dataset model is more dataclass-like.""" +# test_dataset = ConfigurableDataset( +# object_id="abc123", +# object_name="TestObject", +# parent="TestParent", +# field="data", +# maxshape=(2, 4), +# dtype="int16", +# ) + +# # Important to keep the `repr` unmodified for appearance inside lists of Datasets +# expected_repr = ( +# "ConfigurableDataset(object_id='abc123', object_name='TestObject', parent='TestParent', " +# "field='data', maxshape=(2, 4), dtype='int16')" +# ) +# assert repr(test_dataset) == expected_repr diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_configurable_datasets.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_configurable_datasets.py deleted file mode 100644 index db212ffe1..000000000 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_configurable_datasets.py +++ /dev/null @@ -1,140 +0,0 @@ -"""Unit tests for `get_configurable_datasets`.""" -from pathlib import Path - -import numpy as np -import pytest -from hdmf.common import VectorData -from hdmf_zarr import NWBZarrIO -from pynwb import NWBHDF5IO, NWBFile -from pynwb.base import DynamicTable -from pynwb.testing.mock.base import mock_TimeSeries -from pynwb.testing.mock.file import mock_NWBFile - -from neuroconv.tools.hdmf import SliceableDataChunkIterator -from neuroconv.tools.nwb_helpers import ConfigurableDataset, get_configurable_datasets - - -def generate_1d_array() -> NWBFile: - array = np.array([[0.1, 0.2, 0.3]]) - return array - - -def generate_2d_array() -> NWBFile: - array = np.array([[1, 2, 3], [4, 5, 6]]) - return array - - -def generate_nwbfile_with_ConfigurableDatasets() -> NWBFile: - nwbfile = mock_NWBFile() - array = generate_2d_array() - time_series = mock_TimeSeries(data=array) - nwbfile.add_acquisition(time_series) - return nwbfile - - -@pytest.fixture(scope="session") -def hdf5_nwbfile_path(tmpdir_factory): - nwbfile_path = tmpdir_factory.mktemp("data").join("test_hdf5_nwbfile_with_configurable_datasets.nwb.h5") - if not Path(nwbfile_path).exists(): - nwbfile = generate_nwbfile_with_ConfigurableDatasets() - with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io: - io.write(nwbfile) - return str(nwbfile_path) - - -@pytest.fixture(scope="session") -def zarr_nwbfile_path(tmpdir_factory): - nwbfile_path = tmpdir_factory.mktemp("data").join("test_zarr_nwbfile_with_configurable_datasets.nwb.zarr") - if not Path(nwbfile_path).exists(): - nwbfile = 
generate_nwbfile_with_ConfigurableDatasets() - with NWBZarrIO(path=str(nwbfile_path), mode="w") as io: - io.write(nwbfile) - return str(nwbfile_path) - - -def test_simple_time_series(): - array = generate_2d_array() - - nwbfile = mock_NWBFile() - time_series = mock_TimeSeries(name="TimeSeries", data=SliceableDataChunkIterator(data=array)) - nwbfile.add_acquisition(time_series) - - results = list(get_configurable_datasets(nwbfile=nwbfile)) - - assert len(results) == 1 - - result = results[0] - assert isinstance(result, ConfigurableDataset) - assert result.object_name == "TimeSeries" - assert result.field == "data" - assert result.maxshape == array.shape - assert result.dtype == array.dtype - - -def test_simple_dynamic_table(): - array = generate_1d_array() - - nwbfile = mock_NWBFile() - column_length = array.shape[1] - column = VectorData(name="TestColumn", description="", data=SliceableDataChunkIterator(data=array.squeeze())) - dynamic_table = DynamicTable( - name="DynamicTable", - description="", - id=list(range(column_length)), # Need to include ID since the data of the column is not wrapped in an IO - columns=[column], - ) - # dynamic_table.add_column(name="TestColumn", description="", data=SliceableDataChunkIterator(data=array.squeeze())) - nwbfile.add_acquisition(dynamic_table) - - results = list(get_configurable_datasets(nwbfile=nwbfile)) - - assert len(results) == 1 - - result = results[0] - assert isinstance(result, ConfigurableDataset) - assert result.object_name == "TestColumn" - assert result.field == "data" - assert result.maxshape == (column_length,) - assert result.dtype == str(array.dtype) - - -def test_simple_on_appended_hdf5_file(hdf5_nwbfile_path): - array = generate_2d_array() - - with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io: - nwbfile = io.read() - array = generate_2d_array() - new_time_series = mock_TimeSeries(name="NewTimeSeries", data=SliceableDataChunkIterator(data=array)) - nwbfile.add_acquisition(new_time_series) - - results = list(get_configurable_datasets(nwbfile=nwbfile)) - - assert len(results) == 1 - - result = results[0] - assert isinstance(result, ConfigurableDataset) - assert result.object_name == "NewTimeSeries" - assert result.field == "data" - assert result.maxshape == array.shape - assert result.dtype == str(array.dtype) # TODO: add tests for if source specification was np.dtype et al. - - -def test_simple_on_appended_zarr_file(zarr_nwbfile_path): - array = generate_2d_array() - - with NWBZarrIO(path=zarr_nwbfile_path, mode="a") as io: - nwbfile = io.read() - array = generate_2d_array() - new_time_series = mock_TimeSeries(name="NewTimeSeries", data=SliceableDataChunkIterator(data=array)) - nwbfile.add_acquisition(new_time_series) - - results = list(get_configurable_datasets(nwbfile=nwbfile)) - - assert len(results) == 1 - - result = results[0] - assert isinstance(result, ConfigurableDataset) - assert result.object_name == "NewTimeSeries" - assert result.field == "data" - assert result.maxshape == array.shape - assert result.dtype == str(array.dtype) # TODO: add tests for if source specification was np.dtype et al. 
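With `test_get_configurable_datasets.py` deleted here and the remaining suite commented out below while the refactor settles, a replacement test for the reworked API would look roughly like the following sketch. It assumes the mock helpers behave as in the deleted file and that the configuration key follows the `acquisition/<object name>/data` pattern produced by `_parse_location_in_memory_nwbfile`; the test name and asserted values are illustrative, not taken from the patch:

import numpy as np
from pynwb.testing.mock.base import mock_TimeSeries
from pynwb.testing.mock.file import mock_NWBFile

from neuroconv.tools.hdmf import SliceableDataChunkIterator
from neuroconv.tools.nwb_helpers import (
    HDF5BackendConfiguration,
    get_default_backend_configuration,
)


def test_time_series_hdf5_backend_sketch():
    # Wrap the array in an iterator so the default chunk and buffer shapes are taken directly from it
    array = np.array([[1, 2, 3], [4, 5, 6]])
    iterator = SliceableDataChunkIterator(data=array, chunk_shape=(1, 2), buffer_shape=(1, 3))

    nwbfile = mock_NWBFile()
    nwbfile.add_acquisition(mock_TimeSeries(name="TimeSeries", data=iterator))

    backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend="hdf5")
    assert isinstance(backend_configuration, HDF5BackendConfiguration)

    # Dataset configurations are now keyed by the dataset's location within the in-memory NWBFile
    dataset_configuration = backend_configuration.dataset_configurations["acquisition/TimeSeries/data"]
    assert dataset_configuration.chunk_shape == (1, 2)
    assert dataset_configuration.buffer_shape == (1, 3)
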
diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py index 43f636eeb..0da3d0e04 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py @@ -1,200 +1,199 @@ -"""Unit tests for `get_default_backend_configuration`.""" -from pathlib import Path - -import numpy as np -import pytest -from hdmf_zarr import NWBZarrIO -from pynwb import NWBHDF5IO, NWBFile -from pynwb.base import DynamicTable -from pynwb.testing.mock.base import mock_TimeSeries -from pynwb.testing.mock.file import mock_NWBFile - -from neuroconv.tools.hdmf import SliceableDataChunkIterator -from neuroconv.tools.nwb_helpers import ( - ConfigurableDataset, - HDF5BackendConfiguration, - HDF5DatasetConfiguration, - ZarrBackendConfiguration, - ZarrDatasetConfiguration, - get_default_backend_configuration, -) - - -def generate_1d_array() -> NWBFile: - array = np.array([[0.1, 0.2, 0.3]]) - return array - - -def generate_2d_array() -> NWBFile: - array = np.array([[1, 2, 3], [4, 5, 6]]) - return array - - -def generate_nwbfile_with_ConfigurableDatasets() -> NWBFile: - nwbfile = mock_NWBFile() - array = generate_2d_array() - time_series = mock_TimeSeries(data=array) - nwbfile.add_acquisition(time_series) - return nwbfile - - -@pytest.fixture(scope="session") -def hdf5_nwbfile_path(tmpdir_factory): - nwbfile_path = tmpdir_factory.mktemp("data").join("test_hdf5_nwbfile_with_configurable_datasets.nwb.h5") - if not Path(nwbfile_path).exists(): - nwbfile = generate_nwbfile_with_ConfigurableDatasets() - with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io: - io.write(nwbfile) - return str(nwbfile_path) - - -@pytest.fixture(scope="session") -def zarr_nwbfile_path(tmpdir_factory): - nwbfile_path = tmpdir_factory.mktemp("data").join("test_zarr_nwbfile_with_configurable_datasets.nwb.zarr") - if not Path(nwbfile_path).exists(): - nwbfile = generate_nwbfile_with_ConfigurableDatasets() - with NWBZarrIO(path=str(nwbfile_path), mode="w") as io: - io.write(nwbfile) - return str(nwbfile_path) - - -def test_simple_time_series_hdf5_backend(): - array = generate_2d_array() - - nwbfile = mock_NWBFile() - time_series = mock_TimeSeries( - name="TimeSeries", data=SliceableDataChunkIterator(data=array, chunk_shape=(1, 2), buffer_shape=(1, 3)) - ) - nwbfile.add_acquisition(time_series) - - default_backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend_type="hdf5") - - assert isinstance(default_backend_configuration, HDF5BackendConfiguration) - assert len(default_backend_configuration.dataset_configurations) == 1 - - default_configurable_dataset, default_dataset_configuration = next( - iter(default_backend_configuration.dataset_configurations.items()) - ) - assert isinstance(default_configurable_dataset, ConfigurableDataset) - assert default_configurable_dataset.object_name == "TimeSeries" - assert default_configurable_dataset.parent == "root" - assert default_configurable_dataset.field == "data" - assert default_configurable_dataset.maxshape == (2, 3) - assert default_configurable_dataset.dtype == "int32" - - assert isinstance(default_dataset_configuration, HDF5DatasetConfiguration) - assert default_dataset_configuration.object_name == "TimeSeries" - assert 
default_dataset_configuration.parent == "root" - assert default_dataset_configuration.field == "data" - assert default_dataset_configuration.chunk_shape == (1, 2) - assert default_dataset_configuration.buffer_shape == (1, 3) - assert default_dataset_configuration.maxshape == (2, 3) - assert default_dataset_configuration.dtype == "int32" - assert default_dataset_configuration.compression_method == "gzip" - assert default_dataset_configuration.compression_options is None - - -def test_simple_time_series_zarr_backend(): - array = generate_2d_array() - - nwbfile = mock_NWBFile() - time_series = mock_TimeSeries( - name="TimeSeries", data=SliceableDataChunkIterator(data=array, chunk_shape=(1, 2), buffer_shape=(1, 3)) - ) - nwbfile.add_acquisition(time_series) - - default_backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend_type="zarr") - - assert isinstance(default_backend_configuration, ZarrBackendConfiguration) - assert default_backend_configuration.number_of_jobs == -2 - assert len(default_backend_configuration.dataset_configurations) == 1 - - default_configurable_dataset, default_dataset_configuration = next( - iter(default_backend_configuration.dataset_configurations.items()) - ) - assert isinstance(default_configurable_dataset, ConfigurableDataset) - assert default_configurable_dataset.object_name == "TimeSeries" - assert default_configurable_dataset.parent == "root" - assert default_configurable_dataset.field == "data" - assert default_configurable_dataset.maxshape == (2, 3) - assert default_configurable_dataset.dtype == "int32" - - assert isinstance(default_dataset_configuration, ZarrDatasetConfiguration) - assert default_dataset_configuration.object_name == "TimeSeries" - assert default_dataset_configuration.parent == "root" - assert default_dataset_configuration.field == "data" - assert default_dataset_configuration.chunk_shape == (1, 2) - assert default_dataset_configuration.buffer_shape == (1, 3) - assert default_dataset_configuration.maxshape == (2, 3) - assert default_dataset_configuration.dtype == "int32" - assert default_dataset_configuration.compression_method == "gzip" - assert default_dataset_configuration.compression_options is None - assert default_dataset_configuration.filter_methods is None - assert default_dataset_configuration.filter_options is None - - -def test_simple_dynamic_table(): - array = generate_1d_array() - - nwbfile = mock_NWBFile() - column_length = array.shape[1] - dynamic_table = DynamicTable( - name="DynamicTable", - description="", - id=list(range(column_length)), # Need to include ID since the data of the column is not wrapped in an IO - ) - dynamic_table.add_column(name="TestColumn", description="", data=array.squeeze()) - nwbfile.add_acquisition(dynamic_table) - - results = list(get_default_backend_configuration(nwbfile=nwbfile)) - - assert len(results) == 1 - - result = results[0] - assert isinstance(result, ConfigurableDataset) - assert result.object_name == "TestColumn" - assert result.field == "data" - assert result.maxshape == (column_length,) - assert result.dtype == str(array.dtype) - - -def test_simple_on_appended_hdf5_file(hdf5_nwbfile_path): - array = generate_2d_array() - - with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io: - nwbfile = io.read() - array = generate_2d_array() - new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) - nwbfile.add_acquisition(new_time_series) - - results = list(get_default_backend_configuration(nwbfile=nwbfile)) - - assert len(results) == 1 - - result = 
results[0] - assert isinstance(result, ConfigurableDataset) - assert result.object_name == "NewTimeSeries" - assert result.field == "data" - assert result.maxshape == array.shape - assert result.dtype == str(array.dtype) # TODO: add tests for if source specification was np.dtype et al. - - -def test_simple_on_appended_zarr_file(zarr_nwbfile_path): - array = generate_2d_array() - - with NWBZarrIO(path=zarr_nwbfile_path, mode="a") as io: - nwbfile = io.read() - array = generate_2d_array() - new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) - nwbfile.add_acquisition(new_time_series) - - results = list(get_default_backend_configuration(nwbfile=nwbfile)) - - assert len(results) == 1 - - result = results[0] - assert isinstance(result, ConfigurableDataset) - assert result.object_name == "NewTimeSeries" - assert result.field == "data" - assert result.maxshape == array.shape - assert result.dtype == str(array.dtype) # TODO: add tests for if source specification was np.dtype et al. +# """Unit tests for `get_default_backend_configuration`.""" +# from pathlib import Path + +# import numpy as np +# import pytest +# from hdmf_zarr import NWBZarrIO +# from pynwb import NWBHDF5IO, NWBFile +# from pynwb.base import DynamicTable +# from pynwb.testing.mock.base import mock_TimeSeries +# from pynwb.testing.mock.file import mock_NWBFile + +# from neuroconv.tools.hdmf import SliceableDataChunkIterator +# from neuroconv.tools.nwb_helpers import ( +# HDF5BackendConfiguration, +# HDF5DatasetConfiguration, +# ZarrBackendConfiguration, +# ZarrDatasetConfiguration, +# get_default_backend_configuration, +# ) + + +# def generate_1d_array() -> NWBFile: +# array = np.array([[0.1, 0.2, 0.3]]) +# return array + + +# def generate_2d_array() -> NWBFile: +# array = np.array([[1, 2, 3], [4, 5, 6]]) +# return array + + +# def generate_nwbfile_with_ConfigurableDatasets() -> NWBFile: +# nwbfile = mock_NWBFile() +# array = generate_2d_array() +# time_series = mock_TimeSeries(data=array) +# nwbfile.add_acquisition(time_series) +# return nwbfile + + +# @pytest.fixture(scope="session") +# def hdf5_nwbfile_path(tmpdir_factory): +# nwbfile_path = tmpdir_factory.mktemp("data").join("test_hdf5_nwbfile_with_configurable_datasets.nwb.h5") +# if not Path(nwbfile_path).exists(): +# nwbfile = generate_nwbfile_with_ConfigurableDatasets() +# with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io: +# io.write(nwbfile) +# return str(nwbfile_path) + + +# @pytest.fixture(scope="session") +# def zarr_nwbfile_path(tmpdir_factory): +# nwbfile_path = tmpdir_factory.mktemp("data").join("test_zarr_nwbfile_with_configurable_datasets.nwb.zarr") +# if not Path(nwbfile_path).exists(): +# nwbfile = generate_nwbfile_with_ConfigurableDatasets() +# with NWBZarrIO(path=str(nwbfile_path), mode="w") as io: +# io.write(nwbfile) +# return str(nwbfile_path) + + +# def test_simple_time_series_hdf5_backend(): +# array = generate_2d_array() + +# nwbfile = mock_NWBFile() +# time_series = mock_TimeSeries( +# name="TimeSeries", data=SliceableDataChunkIterator(data=array, chunk_shape=(1, 2), buffer_shape=(1, 3)) +# ) +# nwbfile.add_acquisition(time_series) + +# default_backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend_type="hdf5") + +# assert isinstance(default_backend_configuration, HDF5BackendConfiguration) +# assert len(default_backend_configuration.dataset_configurations) == 1 + +# default_configurable_dataset, default_dataset_configuration = next( +# iter(default_backend_configuration.dataset_configurations.items()) +# 
) +# assert isinstance(default_configurable_dataset, ConfigurableDataset) +# assert default_configurable_dataset.object_name == "TimeSeries" +# assert default_configurable_dataset.parent == "root" +# assert default_configurable_dataset.field == "data" +# assert default_configurable_dataset.maxshape == (2, 3) +# assert default_configurable_dataset.dtype == "int32" + +# assert isinstance(default_dataset_configuration, HDF5DatasetConfiguration) +# assert default_dataset_configuration.object_name == "TimeSeries" +# assert default_dataset_configuration.parent == "root" +# assert default_dataset_configuration.field == "data" +# assert default_dataset_configuration.chunk_shape == (1, 2) +# assert default_dataset_configuration.buffer_shape == (1, 3) +# assert default_dataset_configuration.maxshape == (2, 3) +# assert default_dataset_configuration.dtype == "int32" +# assert default_dataset_configuration.compression_method == "gzip" +# assert default_dataset_configuration.compression_options is None + + +# def test_simple_time_series_zarr_backend(): +# array = generate_2d_array() + +# nwbfile = mock_NWBFile() +# time_series = mock_TimeSeries( +# name="TimeSeries", data=SliceableDataChunkIterator(data=array, chunk_shape=(1, 2), buffer_shape=(1, 3)) +# ) +# nwbfile.add_acquisition(time_series) + +# default_backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend_type="zarr") + +# assert isinstance(default_backend_configuration, ZarrBackendConfiguration) +# assert default_backend_configuration.number_of_jobs == -2 +# assert len(default_backend_configuration.dataset_configurations) == 1 + +# default_configurable_dataset, default_dataset_configuration = next( +# iter(default_backend_configuration.dataset_configurations.items()) +# ) +# assert isinstance(default_configurable_dataset, ConfigurableDataset) +# assert default_configurable_dataset.object_name == "TimeSeries" +# assert default_configurable_dataset.parent == "root" +# assert default_configurable_dataset.field == "data" +# assert default_configurable_dataset.maxshape == (2, 3) +# assert default_configurable_dataset.dtype == "int32" + +# assert isinstance(default_dataset_configuration, ZarrDatasetConfiguration) +# assert default_dataset_configuration.object_name == "TimeSeries" +# assert default_dataset_configuration.parent == "root" +# assert default_dataset_configuration.field == "data" +# assert default_dataset_configuration.chunk_shape == (1, 2) +# assert default_dataset_configuration.buffer_shape == (1, 3) +# assert default_dataset_configuration.maxshape == (2, 3) +# assert default_dataset_configuration.dtype == "int32" +# assert default_dataset_configuration.compression_method == "gzip" +# assert default_dataset_configuration.compression_options is None +# assert default_dataset_configuration.filter_methods is None +# assert default_dataset_configuration.filter_options is None + + +# def test_simple_dynamic_table(): +# array = generate_1d_array() + +# nwbfile = mock_NWBFile() +# column_length = array.shape[1] +# dynamic_table = DynamicTable( +# name="DynamicTable", +# description="", +# id=list(range(column_length)), # Need to include ID since the data of the column is not wrapped in an IO +# ) +# dynamic_table.add_column(name="TestColumn", description="", data=array.squeeze()) +# nwbfile.add_acquisition(dynamic_table) + +# results = list(get_default_backend_configuration(nwbfile=nwbfile)) + +# assert len(results) == 1 + +# result = results[0] +# assert isinstance(result, ConfigurableDataset) +# assert 
result.object_name == "TestColumn" +# assert result.field == "data" +# assert result.maxshape == (column_length,) +# assert result.dtype == str(array.dtype) + + +# def test_simple_on_appended_hdf5_file(hdf5_nwbfile_path): +# array = generate_2d_array() + +# with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io: +# nwbfile = io.read() +# array = generate_2d_array() +# new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) +# nwbfile.add_acquisition(new_time_series) + +# results = list(get_default_backend_configuration(nwbfile=nwbfile)) + +# assert len(results) == 1 + +# result = results[0] +# assert isinstance(result, ConfigurableDataset) +# assert result.object_name == "NewTimeSeries" +# assert result.field == "data" +# assert result.maxshape == array.shape +# assert result.dtype == str(array.dtype) # TODO: add tests for if source specification was np.dtype et al. + + +# def test_simple_on_appended_zarr_file(zarr_nwbfile_path): +# array = generate_2d_array() + +# with NWBZarrIO(path=zarr_nwbfile_path, mode="a") as io: +# nwbfile = io.read() +# array = generate_2d_array() +# new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) +# nwbfile.add_acquisition(new_time_series) + +# results = list(get_default_backend_configuration(nwbfile=nwbfile)) + +# assert len(results) == 1 + +# result = results[0] +# assert isinstance(result, ConfigurableDataset) +# assert result.object_name == "NewTimeSeries" +# assert result.field == "data" +# assert result.maxshape == array.shape +# assert result.dtype == str(array.dtype) # TODO: add tests for if source specification was np.dtype et al. diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations.py new file mode 100644 index 000000000..8183e1a75 --- /dev/null +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations.py @@ -0,0 +1,349 @@ +"""Unit tests for `get_default_dataset_configurations`.""" +import numpy as np +from hdmf.data_utils import DataChunkIterator +from hdmf.common import VectorData +from pynwb.base import DynamicTable +from pynwb.image import ImageSeries +from pynwb.testing.mock.base import mock_TimeSeries +from pynwb.testing.mock.file import mock_NWBFile + +from neuroconv.tools.hdmf import SliceableDataChunkIterator +from neuroconv.tools.nwb_helpers import ( + HDF5DatasetConfiguration, + ZarrDatasetConfiguration, + get_default_dataset_configurations, +) + + +def test_unwrapped_time_series_hdf5(): + array = np.array([[1, 2, 3], [4, 5, 6]]) + + nwbfile = mock_NWBFile() + time_series = mock_TimeSeries(name="TestTimeSeries", data=array) + nwbfile.add_acquisition(time_series) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == time_series.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert 
dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + +def test_unwrapped_time_series_zarr(): + array = np.array([[1, 2, 3], [4, 5, 6]]) + + nwbfile = mock_NWBFile() + time_series = mock_TimeSeries(name="TestTimeSeries", data=array) + nwbfile.add_acquisition(time_series) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, ZarrDatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == time_series.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +def test_generic_iterator_wrapped_time_series_hdf5(): + array = np.array([[1, 2, 3], [4, 5, 6]]) + + nwbfile = mock_NWBFile() + time_series = mock_TimeSeries(name="TestTimeSeries", data=SliceableDataChunkIterator(data=array)) + nwbfile.add_acquisition(time_series) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == time_series.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + +def test_classic_iterator_wrapped_simple_time_series_zarr(): + array = np.array([[1, 2, 3], [4, 5, 6]]) + + nwbfile = mock_NWBFile() + time_series = mock_TimeSeries(name="TestTimeSeries", data=DataChunkIterator(data=array)) + nwbfile.add_acquisition(time_series) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, ZarrDatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == time_series.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +def 
test_classic_iterator_wrapped_time_series_hdf5(): + array = np.array([[1, 2, 3], [4, 5, 6]]) + + nwbfile = mock_NWBFile() + time_series = mock_TimeSeries(name="TestTimeSeries", data=DataChunkIterator(data=array)) + nwbfile.add_acquisition(time_series) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == time_series.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + +def test_generic_iterator_wrapped_simple_time_series_zarr(): + array = np.array([[1, 2, 3], [4, 5, 6]]) + + nwbfile = mock_NWBFile() + time_series = mock_TimeSeries(name="TestTimeSeries", data=SliceableDataChunkIterator(data=array)) + nwbfile.add_acquisition(time_series) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, ZarrDatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == time_series.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +def test_external_image_series_hdf5(): + nwbfile = mock_NWBFile() + image_series = ImageSeries(name="TestImageSeries", external_file=[""], rate=1.0) + nwbfile.add_acquisition(image_series) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 0 + + +def test_external_image_series_zarr(): + nwbfile = mock_NWBFile() + image_series = ImageSeries(name="TestImageSeries", external_file=[""], rate=1.0) + nwbfile.add_acquisition(image_series) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + + assert len(dataset_configurations) == 0 + + +def test_unwrapped_dynamic_table_hdf5(): + array = np.array([0.1, 0.2, 0.3]) + + nwbfile = mock_NWBFile() + column = VectorData(name="TestColumn", description="", data=array.squeeze()) + dynamic_table = DynamicTable(name="TestDynamicTable", description="", columns=[column]) + nwbfile.add_acquisition(dynamic_table) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert 
dataset_configuration.dataset_info.object_id == column.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + +def test_unwrapped_dynamic_table_zarr(): + array = np.array([0.1, 0.2, 0.3]) + + nwbfile = mock_NWBFile() + column = VectorData(name="TestColumn", description="", data=array.squeeze()) + dynamic_table = DynamicTable(name="TestDynamicTable", description="", columns=[column]) + nwbfile.add_acquisition(dynamic_table) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, ZarrDatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == column.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +def test_generic_iterator_wrapped_dynamic_table_hdf5(): + array = np.array([0.1, 0.2, 0.3]) + + nwbfile = mock_NWBFile() + column = VectorData(name="TestColumn", description="", data=SliceableDataChunkIterator(data=array.squeeze())) + dynamic_table = DynamicTable( + name="TestDynamicTable", + description="", + id=list(range(array.shape[0])), # Need to include ID since the data of the column is not wrapped in an IO + columns=[column], + ) + nwbfile.add_acquisition(dynamic_table) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == column.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.dataset_info.maxshape == (array.shape[0],) + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + +def test_generic_iterator_wrapped_dynamic_table_zarr(): + array = np.array([0.1, 0.2, 0.3]) + + nwbfile = mock_NWBFile() + column = VectorData(name="TestColumn", description="", data=SliceableDataChunkIterator(data=array.squeeze())) + dynamic_table = DynamicTable( + name="TestDynamicTable", + description="", + id=list(range(array.shape[0])), # Need to include ID since the data of the column is not wrapped in an IO + columns=[column], + ) + 
nwbfile.add_acquisition(dynamic_table) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, ZarrDatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == column.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.dataset_info.maxshape == (array.shape[0],) + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +def test_classic_iterator_wrapped_dynamic_table_hdf5(): + array = np.array([0.1, 0.2, 0.3]) + + nwbfile = mock_NWBFile() + column = VectorData(name="TestColumn", description="", data=DataChunkIterator(data=array.squeeze())) + dynamic_table = DynamicTable( + name="TestDynamicTable", + description="", + id=list(range(array.shape[0])), # Need to include ID since the data of the column is not wrapped in an IO + columns=[column], + ) + nwbfile.add_acquisition(dynamic_table) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == column.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.dataset_info.maxshape == (array.shape[0],) + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + +def test_classic_iterator_wrapped_dynamic_table_zarr(): + array = np.array([0.1, 0.2, 0.3]) + + nwbfile = mock_NWBFile() + column = VectorData(name="TestColumn", description="", data=DataChunkIterator(data=array.squeeze())) + dynamic_table = DynamicTable( + name="TestDynamicTable", + description="", + id=list(range(array.shape[0])), # Need to include ID since the data of the column is not wrapped in an IO + columns=[column], + ) + nwbfile.add_acquisition(dynamic_table) + + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, ZarrDatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == column.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.dataset_info.maxshape == (array.shape[0],) + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert 
dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations_appended_files.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations_appended_files.py new file mode 100644 index 000000000..34c1a6314 --- /dev/null +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations_appended_files.py @@ -0,0 +1,146 @@ +"""Unit tests for `get_default_dataset_configurations` operating on already written files open in append mode.""" +from pathlib import Path + +import pytest +import numpy as np +from hdmf.data_utils import DataChunkIterator +from hdmf.common import VectorData +from hdmf_zarr import NWBZarrIO +from pynwb import NWBFile, NWBHDF5IO +from pynwb.base import DynamicTable +from pynwb.image import ImageSeries +from pynwb.testing.mock.base import mock_TimeSeries +from pynwb.testing.mock.file import mock_NWBFile + +from neuroconv.tools.hdmf import SliceableDataChunkIterator +from neuroconv.tools.nwb_helpers import ( + HDF5DatasetConfiguration, + ZarrDatasetConfiguration, + get_default_dataset_configurations, +) + + +def generate_nwbfile_with_existing_time_series() -> NWBFile: + nwbfile = mock_NWBFile() + array = np.array([[1, 2, 3], [4, 5, 6]]) + time_series = mock_TimeSeries(name="ExistingTimeSeries", data=array) + nwbfile.add_acquisition(time_series) + return nwbfile + + +@pytest.fixture(scope="session") +def hdf5_nwbfile_path(tmpdir_factory): + nwbfile_path = tmpdir_factory.mktemp("data").join("test_hdf5_nwbfile_with_configurable_datasets.nwb.h5") + if not Path(nwbfile_path).exists(): + nwbfile = generate_nwbfile_with_existing_time_series() + with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io: + io.write(nwbfile) + return str(nwbfile_path) + + +@pytest.fixture(scope="session") +def zarr_nwbfile_path(tmpdir_factory): + nwbfile_path = tmpdir_factory.mktemp("data").join("test_zarr_nwbfile_with_configurable_datasets.nwb.zarr") + if not Path(nwbfile_path).exists(): + nwbfile = generate_nwbfile_with_existing_time_series() + with NWBZarrIO(path=str(nwbfile_path), mode="w") as io: + io.write(nwbfile) + return str(nwbfile_path) + + +def test_unwrapped_time_series_hdf5(hdf5_nwbfile_path): + array = np.array([[1, 2, 3], [4, 5, 6]]) + + with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io: + nwbfile = io.read() + new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) + nwbfile.add_acquisition(new_time_series) + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == new_time_series.object_id + assert dataset_configuration.dataset_info.location == "acquisition/NewTimeSeries/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + +def 
test_unwrapped_time_series_zarr(zarr_nwbfile_path): + array = np.array([[1, 2, 3], [4, 5, 6]]) + + with NWBZarrIO(path=zarr_nwbfile_path, mode="a") as io: + nwbfile = io.read() + new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) + nwbfile.add_acquisition(new_time_series) + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, ZarrDatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == new_time_series.object_id + assert dataset_configuration.dataset_info.location == "acquisition/NewTimeSeries/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +def test_unwrapped_dynamic_table_hdf5(hdf5_nwbfile_path): + array = np.array([0.1, 0.2, 0.3]) + + with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io: + nwbfile = io.read() + column = VectorData(name="TestColumn", description="", data=array.squeeze()) + dynamic_table = DynamicTable(name="TestDynamicTable", description="", columns=[column]) + nwbfile.add_acquisition(dynamic_table) + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, HDF5DatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == column.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + +def test_unwrapped_dynamic_table_zarr(zarr_nwbfile_path): + array = np.array([0.1, 0.2, 0.3]) + + with NWBZarrIO(path=zarr_nwbfile_path, mode="a") as io: + nwbfile = io.read() + column = VectorData(name="TestColumn", description="", data=array.squeeze()) + dynamic_table = DynamicTable(name="TestDynamicTable", description="", columns=[column]) + nwbfile.add_acquisition(dynamic_table) + dataset_configurations = list(get_default_dataset_configurations(nwbfile=nwbfile, backend="zarr")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, ZarrDatasetConfiguration) + assert dataset_configuration.dataset_info.object_id == column.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.dataset_info.maxshape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == 
"gzip" + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None From 111797d6d3daa5b16ff12348a6f8aab2a4bfd96d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 1 Sep 2023 18:33:12 +0000 Subject: [PATCH 14/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/neuroconv/tools/nwb_helpers/__init__.py | 15 +++++++++------ .../nwb_helpers/_dataset_and_backend_models.py | 2 +- .../tools/nwb_helpers/_dataset_configuration.py | 6 +++--- .../test_get_default_dataset_configurations.py | 2 +- ...fault_dataset_configurations_appended_files.py | 6 +++--- 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index 457e7e77b..34490b0bb 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -1,15 +1,18 @@ from ._dataset_and_backend_models import ( - DatasetInfo, + BACKEND_TO_CONFIGURATION, + BACKEND_TO_DATASET_CONFIGURATION, BackendConfiguration, DatasetConfiguration, - HDF5DatasetConfiguration, - ZarrDatasetConfiguration, + DatasetInfo, HDF5BackendConfiguration, + HDF5DatasetConfiguration, ZarrBackendConfiguration, - BACKEND_TO_DATASET_CONFIGURATION, - BACKEND_TO_CONFIGURATION, + ZarrDatasetConfiguration, +) +from ._dataset_configuration import ( + get_default_backend_configuration, + get_default_dataset_configurations, ) -from ._dataset_configuration import get_default_dataset_configurations, get_default_backend_configuration from ._metadata_and_file_helpers import ( add_device_from_metadata, get_default_nwbfile_metadata, diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py index 38f8d41a1..1d97f096b 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py @@ -5,8 +5,8 @@ import hdf5plugin import psutil import zarr -from hdmf.container import DataIO from hdmf.backends.hdf5 import H5DataIO +from hdmf.container import DataIO from hdmf_zarr import ZarrDataIO from nwbinspector.utils import is_module_installed from pydantic import BaseModel, Field, root_validator diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index 036775fc5..272d558a3 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -12,14 +12,14 @@ from pynwb.base import DynamicTable from ._dataset_and_backend_models import ( - DatasetInfo, + BACKEND_TO_CONFIGURATION, + BACKEND_TO_DATASET_CONFIGURATION, DatasetConfiguration, + DatasetInfo, HDF5BackendConfiguration, HDF5DatasetConfiguration, ZarrBackendConfiguration, ZarrDatasetConfiguration, - BACKEND_TO_DATASET_CONFIGURATION, - BACKEND_TO_CONFIGURATION, ) from ..hdmf import SliceableDataChunkIterator diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations.py index 8183e1a75..be0c6f0d8 100644 --- 
a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations.py @@ -1,7 +1,7 @@ """Unit tests for `get_default_dataset_configurations`.""" import numpy as np -from hdmf.data_utils import DataChunkIterator from hdmf.common import VectorData +from hdmf.data_utils import DataChunkIterator from pynwb.base import DynamicTable from pynwb.image import ImageSeries from pynwb.testing.mock.base import mock_TimeSeries diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations_appended_files.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations_appended_files.py index 34c1a6314..f3ecaee22 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations_appended_files.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations_appended_files.py @@ -1,12 +1,12 @@ """Unit tests for `get_default_dataset_configurations` operating on already written files open in append mode.""" from pathlib import Path -import pytest import numpy as np -from hdmf.data_utils import DataChunkIterator +import pytest from hdmf.common import VectorData +from hdmf.data_utils import DataChunkIterator from hdmf_zarr import NWBZarrIO -from pynwb import NWBFile, NWBHDF5IO +from pynwb import NWBHDF5IO, NWBFile from pynwb.base import DynamicTable from pynwb.image import ImageSeries from pynwb.testing.mock.base import mock_TimeSeries From f69daf39873bf9da4d34789565aa9ab6e4ceb716 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Fri, 1 Sep 2023 15:39:38 -0400 Subject: [PATCH 15/17] backend config printout and integration tests --- .../_dataset_and_backend_models.py | 50 +- .../test_configurable_dataset.py | 2 +- .../test_get_default_backend_configuration.py | 430 ++++++++++-------- ...t_dataset_configurations_appended_files.py | 13 +- 4 files changed, 286 insertions(+), 209 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py index 38f8d41a1..0533eed7a 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py @@ -139,15 +139,36 @@ def validate_filter_methods_and_options_match(cls, values: Dict[str, Any]): class BackendConfiguration(BaseModel): """A model for matching collections of DatasetConfigurations specific to the HDF5 backend.""" - backend_type: Literal["hdf5", "zarr"] + backend: Literal["hdf5", "zarr"] data_io: Type[DataIO] dataset_configurations: Dict[str, DatasetConfiguration] # str is location field of DatasetConfiguration + def __str__(self) -> str: + """Not overriding __repr__ as this is intended to render only when wrapped in print().""" + string = ( + f"Configurable datasets identified using the {self.backend} backend\n" + f"{'-' * (43 + len(self.backend) + 8)}\n" + ) + + for dataset_configuration in self.dataset_configurations.values(): + dataset_info = dataset_configuration.dataset_info + string += ( + f"{dataset_info.location}\n" + f" maxshape : {dataset_info.maxshape}\n" + f" dtype : {dataset_info.dtype}\n\n" + f" chunk shape : {dataset_configuration.chunk_shape}\n" + f" buffer shape : {dataset_configuration.buffer_shape}\n" + f" 
compression method : {dataset_configuration.compression_method}\n" + f" compression options : {dataset_configuration.compression_options}\n\n\n" + ) + + return string + class HDF5BackendConfiguration(BackendConfiguration): """A model for matching collections of DatasetConfigurations specific to the HDF5 backend.""" - backend_type: Literal["hdf5"] = "hdf5" + backend: Literal["hdf5"] = "hdf5" data_io: Type[H5DataIO] = H5DataIO dataset_configurations: Dict[str, HDF5DatasetConfiguration] # str is location field of DatasetConfiguration @@ -155,7 +176,7 @@ class HDF5BackendConfiguration(BackendConfiguration): class ZarrBackendConfiguration(BackendConfiguration): """A model for matching collections of DatasetConfigurations specific to the Zarr backend.""" - backend_type: Literal["zarr"] = "zarr" + backend: Literal["zarr"] = "zarr" data_io: Type[ZarrDataIO] = ZarrDataIO dataset_configurations: Dict[str, ZarrDatasetConfiguration] # str is location field of DatasetConfiguration number_of_jobs: int = Field( @@ -165,6 +186,29 @@ class ZarrBackendConfiguration(BackendConfiguration): default=-2, # -2 translates to 'all CPU except for one' ) + def __str__(self) -> str: + """Not overriding __repr__ as this is intended to render only when wrapped in print().""" + string = ( + f"Configurable datasets identified using the {self.backend} backend\n" + f"{'-' * (43 + len(self.backend) + 8)}\n" + ) + + for dataset_configuration in self.dataset_configurations.values(): + dataset_info = dataset_configuration.dataset_info + string += ( + f"{dataset_info.location}\n" + f" maxshape : {dataset_info.maxshape}\n" + f" dtype : {dataset_info.dtype}\n\n" + f" chunk shape : {dataset_configuration.chunk_shape}\n" + f" buffer shape : {dataset_configuration.buffer_shape}\n" + f" compression method : {dataset_configuration.compression_method}\n" + f" compression options : {dataset_configuration.compression_options}\n" + f" filter methods : {dataset_configuration.filter_methods}\n" + f" filter options : {dataset_configuration.filter_options}\n\n\n" + ) + + return string + BACKEND_TO_DATASET_CONFIGURATION = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration) BACKEND_TO_CONFIGURATION = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_configurable_dataset.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_configurable_dataset.py index 28ab594b6..4814918fe 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_configurable_dataset.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_configurable_dataset.py @@ -2,7 +2,7 @@ from io import StringIO from unittest.mock import patch -# from neuroconv.tools.nwb_helpers import ConfigurableDataset +from neuroconv.tools.nwb_helpers import ConfigurableDataset # def test_configurable_dataset_print(): diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py index 0da3d0e04..4c0597bba 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py @@ -1,199 +1,231 @@ -# """Unit tests for `get_default_backend_configuration`.""" -# from 
pathlib import Path - -# import numpy as np -# import pytest -# from hdmf_zarr import NWBZarrIO -# from pynwb import NWBHDF5IO, NWBFile -# from pynwb.base import DynamicTable -# from pynwb.testing.mock.base import mock_TimeSeries -# from pynwb.testing.mock.file import mock_NWBFile - -# from neuroconv.tools.hdmf import SliceableDataChunkIterator -# from neuroconv.tools.nwb_helpers import ( -# HDF5BackendConfiguration, -# HDF5DatasetConfiguration, -# ZarrBackendConfiguration, -# ZarrDatasetConfiguration, -# get_default_backend_configuration, -# ) - - -# def generate_1d_array() -> NWBFile: -# array = np.array([[0.1, 0.2, 0.3]]) -# return array - - -# def generate_2d_array() -> NWBFile: -# array = np.array([[1, 2, 3], [4, 5, 6]]) -# return array - - -# def generate_nwbfile_with_ConfigurableDatasets() -> NWBFile: -# nwbfile = mock_NWBFile() -# array = generate_2d_array() -# time_series = mock_TimeSeries(data=array) -# nwbfile.add_acquisition(time_series) -# return nwbfile - - -# @pytest.fixture(scope="session") -# def hdf5_nwbfile_path(tmpdir_factory): -# nwbfile_path = tmpdir_factory.mktemp("data").join("test_hdf5_nwbfile_with_configurable_datasets.nwb.h5") -# if not Path(nwbfile_path).exists(): -# nwbfile = generate_nwbfile_with_ConfigurableDatasets() -# with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io: -# io.write(nwbfile) -# return str(nwbfile_path) - - -# @pytest.fixture(scope="session") -# def zarr_nwbfile_path(tmpdir_factory): -# nwbfile_path = tmpdir_factory.mktemp("data").join("test_zarr_nwbfile_with_configurable_datasets.nwb.zarr") -# if not Path(nwbfile_path).exists(): -# nwbfile = generate_nwbfile_with_ConfigurableDatasets() -# with NWBZarrIO(path=str(nwbfile_path), mode="w") as io: -# io.write(nwbfile) -# return str(nwbfile_path) - - -# def test_simple_time_series_hdf5_backend(): -# array = generate_2d_array() - -# nwbfile = mock_NWBFile() -# time_series = mock_TimeSeries( -# name="TimeSeries", data=SliceableDataChunkIterator(data=array, chunk_shape=(1, 2), buffer_shape=(1, 3)) -# ) -# nwbfile.add_acquisition(time_series) - -# default_backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend_type="hdf5") - -# assert isinstance(default_backend_configuration, HDF5BackendConfiguration) -# assert len(default_backend_configuration.dataset_configurations) == 1 - -# default_configurable_dataset, default_dataset_configuration = next( -# iter(default_backend_configuration.dataset_configurations.items()) -# ) -# assert isinstance(default_configurable_dataset, ConfigurableDataset) -# assert default_configurable_dataset.object_name == "TimeSeries" -# assert default_configurable_dataset.parent == "root" -# assert default_configurable_dataset.field == "data" -# assert default_configurable_dataset.maxshape == (2, 3) -# assert default_configurable_dataset.dtype == "int32" - -# assert isinstance(default_dataset_configuration, HDF5DatasetConfiguration) -# assert default_dataset_configuration.object_name == "TimeSeries" -# assert default_dataset_configuration.parent == "root" -# assert default_dataset_configuration.field == "data" -# assert default_dataset_configuration.chunk_shape == (1, 2) -# assert default_dataset_configuration.buffer_shape == (1, 3) -# assert default_dataset_configuration.maxshape == (2, 3) -# assert default_dataset_configuration.dtype == "int32" -# assert default_dataset_configuration.compression_method == "gzip" -# assert default_dataset_configuration.compression_options is None - - -# def test_simple_time_series_zarr_backend(): -# array = 
generate_2d_array() - -# nwbfile = mock_NWBFile() -# time_series = mock_TimeSeries( -# name="TimeSeries", data=SliceableDataChunkIterator(data=array, chunk_shape=(1, 2), buffer_shape=(1, 3)) -# ) -# nwbfile.add_acquisition(time_series) - -# default_backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend_type="zarr") - -# assert isinstance(default_backend_configuration, ZarrBackendConfiguration) -# assert default_backend_configuration.number_of_jobs == -2 -# assert len(default_backend_configuration.dataset_configurations) == 1 - -# default_configurable_dataset, default_dataset_configuration = next( -# iter(default_backend_configuration.dataset_configurations.items()) -# ) -# assert isinstance(default_configurable_dataset, ConfigurableDataset) -# assert default_configurable_dataset.object_name == "TimeSeries" -# assert default_configurable_dataset.parent == "root" -# assert default_configurable_dataset.field == "data" -# assert default_configurable_dataset.maxshape == (2, 3) -# assert default_configurable_dataset.dtype == "int32" - -# assert isinstance(default_dataset_configuration, ZarrDatasetConfiguration) -# assert default_dataset_configuration.object_name == "TimeSeries" -# assert default_dataset_configuration.parent == "root" -# assert default_dataset_configuration.field == "data" -# assert default_dataset_configuration.chunk_shape == (1, 2) -# assert default_dataset_configuration.buffer_shape == (1, 3) -# assert default_dataset_configuration.maxshape == (2, 3) -# assert default_dataset_configuration.dtype == "int32" -# assert default_dataset_configuration.compression_method == "gzip" -# assert default_dataset_configuration.compression_options is None -# assert default_dataset_configuration.filter_methods is None -# assert default_dataset_configuration.filter_options is None - - -# def test_simple_dynamic_table(): -# array = generate_1d_array() - -# nwbfile = mock_NWBFile() -# column_length = array.shape[1] -# dynamic_table = DynamicTable( -# name="DynamicTable", -# description="", -# id=list(range(column_length)), # Need to include ID since the data of the column is not wrapped in an IO -# ) -# dynamic_table.add_column(name="TestColumn", description="", data=array.squeeze()) -# nwbfile.add_acquisition(dynamic_table) - -# results = list(get_default_backend_configuration(nwbfile=nwbfile)) - -# assert len(results) == 1 - -# result = results[0] -# assert isinstance(result, ConfigurableDataset) -# assert result.object_name == "TestColumn" -# assert result.field == "data" -# assert result.maxshape == (column_length,) -# assert result.dtype == str(array.dtype) - - -# def test_simple_on_appended_hdf5_file(hdf5_nwbfile_path): -# array = generate_2d_array() - -# with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io: -# nwbfile = io.read() -# array = generate_2d_array() -# new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) -# nwbfile.add_acquisition(new_time_series) - -# results = list(get_default_backend_configuration(nwbfile=nwbfile)) - -# assert len(results) == 1 - -# result = results[0] -# assert isinstance(result, ConfigurableDataset) -# assert result.object_name == "NewTimeSeries" -# assert result.field == "data" -# assert result.maxshape == array.shape -# assert result.dtype == str(array.dtype) # TODO: add tests for if source specification was np.dtype et al. 
- - -# def test_simple_on_appended_zarr_file(zarr_nwbfile_path): -# array = generate_2d_array() - -# with NWBZarrIO(path=zarr_nwbfile_path, mode="a") as io: -# nwbfile = io.read() -# array = generate_2d_array() -# new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) -# nwbfile.add_acquisition(new_time_series) - -# results = list(get_default_backend_configuration(nwbfile=nwbfile)) - -# assert len(results) == 1 - -# result = results[0] -# assert isinstance(result, ConfigurableDataset) -# assert result.object_name == "NewTimeSeries" -# assert result.field == "data" -# assert result.maxshape == array.shape -# assert result.dtype == str(array.dtype) # TODO: add tests for if source specification was np.dtype et al. +"""Integration tests for `get_default_backend_configuration`.""" +from pathlib import Path +from io import StringIO +from unittest.mock import patch + +import pytest +import numpy as np +from hdmf_zarr import NWBZarrIO +from pynwb import NWBFile, NWBHDF5IO +from pynwb.testing.mock.base import mock_TimeSeries +from pynwb.testing.mock.file import mock_NWBFile + +from neuroconv.tools.nwb_helpers import ( + get_module, + HDF5BackendConfiguration, + ZarrBackendConfiguration, + get_default_backend_configuration, +) + + +def generate_complex_nwbfile() -> NWBFile: + nwbfile = mock_NWBFile() + + raw_array = np.array([[1, 2, 3], [4, 5, 6]]) + raw_time_series = mock_TimeSeries(name="RawTimeSeries", data=raw_array) + nwbfile.add_acquisition(raw_time_series) + + number_of_trials = 10 + for start_time, stop_time in zip( + np.linspace(start=0.0, stop=10.0, num=number_of_trials), np.linspace(start=1.0, stop=11.0, num=number_of_trials) + ): + nwbfile.add_trial(start_time=start_time, stop_time=stop_time) + + ecephys_module = get_module(nwbfile=nwbfile, name="ecephys") + processed_array = np.array([[7.0, 8.0], [9.0, 10.0], [11.0, 12.0], [13.0, 14.0]]) + processed_time_series = mock_TimeSeries(name="ProcessedTimeSeries", data=processed_array) + ecephys_module.add(processed_time_series) + + return nwbfile + + +@pytest.fixture(scope="session") +def hdf5_nwbfile_path(tmpdir_factory): + nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_backend_configuration_hdf5_nwbfile.nwb.h5") + if not Path(nwbfile_path).exists(): + nwbfile = generate_complex_nwbfile() + with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io: + io.write(nwbfile) + return str(nwbfile_path) + + +@pytest.fixture(scope="session") +def zarr_nwbfile_path(tmpdir_factory): + nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_backend_configuration_hdf5_nwbfile.nwb.zarr") + if not Path(nwbfile_path).exists(): + nwbfile = generate_complex_nwbfile() + with NWBZarrIO(path=str(nwbfile_path), mode="w") as io: + io.write(nwbfile) + return str(nwbfile_path) + + +def test_complex_hdf5(hdf5_nwbfile_path): + with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io: + nwbfile = io.read() + + raw_array = np.array([[11, 21, 31], [41, 51, 61]]) + raw_time_series = mock_TimeSeries(name="NewRawTimeSeries", data=raw_array) + nwbfile.add_acquisition(raw_time_series) + + number_of_epochs = 5 + for start_time, stop_time in zip( + np.linspace(start=0.0, stop=10.0, num=number_of_epochs), + np.linspace(start=1.0, stop=11.0, num=number_of_epochs), + ): + nwbfile.add_epoch(start_time=start_time, stop_time=stop_time) + + ecephys_module = get_module(nwbfile=nwbfile, name="ecephys") + processed_array = np.array([[7.1, 8.1], [9.1, 10.1], [11.1, 12.1], [13.1, 14.1]]) + processed_time_series = mock_TimeSeries(name="NewProcessedTimeSeries", 
data=processed_array) + ecephys_module.add(processed_time_series) + + backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend="hdf5") + + assert isinstance(backend_configuration, HDF5BackendConfiguration) + + dataset_configurations = backend_configuration.dataset_configurations + assert len(dataset_configurations) == 4 + assert "acquisition/NewRawTimeSeries/data" in dataset_configurations + assert "epochs/start_time/data" in dataset_configurations + assert "epochs/stop_time/data" in dataset_configurations + assert "processing/ecephys/NewProcessedTimeSeries/data" in dataset_configurations + + # Best summary test of expected output is the printout + with patch("sys.stdout", new=StringIO()) as stdout: + print(backend_configuration) + + expected_print = """Configurable datasets identified using the hdf5 backend +------------------------------------------------------- +epochs/start_time/data + maxshape : (5,) + dtype : float64 + + chunk shape : (5,) + buffer shape : (5,) + compression method : gzip + compression options : None + + +epochs/stop_time/data + maxshape : (5,) + dtype : float64 + + chunk shape : (5,) + buffer shape : (5,) + compression method : gzip + compression options : None + + +acquisition/NewRawTimeSeries/data + maxshape : (2, 3) + dtype : int32 + + chunk shape : (2, 3) + buffer shape : (2, 3) + compression method : gzip + compression options : None + + +processing/ecephys/NewProcessedTimeSeries/data + maxshape : (4, 2) + dtype : float64 + + chunk shape : (4, 2) + buffer shape : (4, 2) + compression method : gzip + compression options : None + + + +""" + assert stdout.getvalue() == expected_print + + +def test_complex_zarr(zarr_nwbfile_path): + with NWBZarrIO(path=zarr_nwbfile_path, mode="a") as io: + nwbfile = io.read() + + raw_array = np.array([[11, 21, 31], [41, 51, 61]]) + raw_time_series = mock_TimeSeries(name="NewRawTimeSeries", data=raw_array) + nwbfile.add_acquisition(raw_time_series) + + number_of_epochs = 5 + for start_time, stop_time in zip( + np.linspace(start=0.0, stop=10.0, num=number_of_epochs), + np.linspace(start=1.0, stop=11.0, num=number_of_epochs), + ): + nwbfile.add_epoch(start_time=start_time, stop_time=stop_time) + + ecephys_module = get_module(nwbfile=nwbfile, name="ecephys") + processed_array = np.array([[7.1, 8.1], [9.1, 10.1], [11.1, 12.1], [13.1, 14.1]]) + processed_time_series = mock_TimeSeries(name="NewProcessedTimeSeries", data=processed_array) + ecephys_module.add(processed_time_series) + + backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend="zarr") + + assert isinstance(backend_configuration, ZarrBackendConfiguration) + + dataset_configurations = backend_configuration.dataset_configurations + assert len(dataset_configurations) == 4 + assert "acquisition/NewRawTimeSeries/data" in dataset_configurations + assert "epochs/start_time/data" in dataset_configurations + assert "epochs/stop_time/data" in dataset_configurations + assert "processing/ecephys/NewProcessedTimeSeries/data" in dataset_configurations + + # Best summary test of expected output is the printout + with patch("sys.stdout", new=StringIO()) as stdout: + print(backend_configuration) + + expected_print = """Configurable datasets identified using the zarr backend +------------------------------------------------------- +epochs/start_time/data + maxshape : (5,) + dtype : float64 + + chunk shape : (5,) + buffer shape : (5,) + compression method : gzip + compression options : None + filter methods : None + filter options : None + + 
+epochs/stop_time/data + maxshape : (5,) + dtype : float64 + + chunk shape : (5,) + buffer shape : (5,) + compression method : gzip + compression options : None + filter methods : None + filter options : None + + +acquisition/NewRawTimeSeries/data + maxshape : (2, 3) + dtype : int32 + + chunk shape : (2, 3) + buffer shape : (2, 3) + compression method : gzip + compression options : None + filter methods : None + filter options : None + + +processing/ecephys/NewProcessedTimeSeries/data + maxshape : (4, 2) + dtype : float64 + + chunk shape : (4, 2) + buffer shape : (4, 2) + compression method : gzip + compression options : None + filter methods : None + filter options : None + + + +""" + assert stdout.getvalue() == expected_print diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations_appended_files.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations_appended_files.py index 34c1a6314..6131dd769 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations_appended_files.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations_appended_files.py @@ -1,18 +1,19 @@ -"""Unit tests for `get_default_dataset_configurations` operating on already written files open in append mode.""" +""" +Unit tests for `get_default_dataset_configurations` operating on already written files open in append mode. + +Mostly testing that the right objects are skipped from identification as candidates for configuration. +""" from pathlib import Path import pytest import numpy as np -from hdmf.data_utils import DataChunkIterator from hdmf.common import VectorData from hdmf_zarr import NWBZarrIO from pynwb import NWBFile, NWBHDF5IO from pynwb.base import DynamicTable -from pynwb.image import ImageSeries from pynwb.testing.mock.base import mock_TimeSeries from pynwb.testing.mock.file import mock_NWBFile -from neuroconv.tools.hdmf import SliceableDataChunkIterator from neuroconv.tools.nwb_helpers import ( HDF5DatasetConfiguration, ZarrDatasetConfiguration, @@ -30,7 +31,7 @@ def generate_nwbfile_with_existing_time_series() -> NWBFile: @pytest.fixture(scope="session") def hdf5_nwbfile_path(tmpdir_factory): - nwbfile_path = tmpdir_factory.mktemp("data").join("test_hdf5_nwbfile_with_configurable_datasets.nwb.h5") + nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_dataset_configurations_hdf5_nwbfile_.nwb.h5") if not Path(nwbfile_path).exists(): nwbfile = generate_nwbfile_with_existing_time_series() with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io: @@ -40,7 +41,7 @@ def hdf5_nwbfile_path(tmpdir_factory): @pytest.fixture(scope="session") def zarr_nwbfile_path(tmpdir_factory): - nwbfile_path = tmpdir_factory.mktemp("data").join("test_zarr_nwbfile_with_configurable_datasets.nwb.zarr") + nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_dataset_configurations_zarr_nwbfile.nwb.zarr") if not Path(nwbfile_path).exists(): nwbfile = generate_nwbfile_with_existing_time_series() with NWBZarrIO(path=str(nwbfile_path), mode="w") as io: From 23b531c995939471d86ef9c14488ad835b319c7e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 1 Sep 2023 19:40:48 +0000 Subject: [PATCH 16/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- 
.../test_configurable_dataset.py | 1 - .../test_get_default_backend_configuration.py | 8 ++++---- ...t_get_default_dataset_configurations_appended_files.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_configurable_dataset.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_configurable_dataset.py index 4814918fe..93572ff3a 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_configurable_dataset.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_configurable_dataset.py @@ -4,7 +4,6 @@ from neuroconv.tools.nwb_helpers import ConfigurableDataset - # def test_configurable_dataset_print(): # """Test the printout display of a Dataset modellooks nice.""" # test_dataset = ConfigurableDataset( diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py index 4c0597bba..2c15cf94c 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_backend_configuration.py @@ -1,20 +1,20 @@ """Integration tests for `get_default_backend_configuration`.""" -from pathlib import Path from io import StringIO +from pathlib import Path from unittest.mock import patch -import pytest import numpy as np +import pytest from hdmf_zarr import NWBZarrIO -from pynwb import NWBFile, NWBHDF5IO +from pynwb import NWBHDF5IO, NWBFile from pynwb.testing.mock.base import mock_TimeSeries from pynwb.testing.mock.file import mock_NWBFile from neuroconv.tools.nwb_helpers import ( - get_module, HDF5BackendConfiguration, ZarrBackendConfiguration, get_default_backend_configuration, + get_module, ) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations_appended_files.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations_appended_files.py index 9e056b6eb..bac169bc2 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations_appended_files.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_get_default_dataset_configurations_appended_files.py @@ -5,8 +5,8 @@ """ from pathlib import Path -import pytest import numpy as np +import pytest from hdmf.common import VectorData from hdmf_zarr import NWBZarrIO from pynwb import NWBHDF5IO, NWBFile From 721e09ad051683d62033703d4ab74e09bfd87ea6 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Fri, 1 Sep 2023 15:44:48 -0400 Subject: [PATCH 17/17] remove unused equality override --- .../tools/nwb_helpers/_dataset_and_backend_models.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py index e79821142..789510e2a 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_and_backend_models.py @@ -25,11 +25,6 @@ def __hash__(self): """To allow instances of this class to be used as keys in dictionaries.""" return hash((type(self),) + tuple(self.__dict__.values())) - def 
__eq__(self, other): - if other.__class__ is self.__class__: - return self.__dict__ == other.__dict__ - return NotImplemented - class DatasetConfiguration(BaseModel): """A data model for configruing options about an object that will become a HDF5 or Zarr Dataset in the file.""" @@ -51,11 +46,6 @@ def __str__(self) -> str: ) return string - def __eq__(self, other): - if other.__class__ is self.__class__: - return self.__dict__ == other.__dict__ - return NotImplemented - _available_hdf5_filters = set(h5py.filters.decode) - set(("shuffle", "fletcher32", "scaleoffset")) if is_module_installed(module_name="hdf5plugin"):
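
Usage sketch (not part of the patch series above): the tests added in `test_get_default_backend_configuration.py` imply the following minimal workflow. This is a sketch based only on calls actually exercised by `test_complex_hdf5`; treating `dataset_configurations` as subscriptable by location string is an assumption (the tests only check membership with `in` and `len`), and no configure/write step appears because none is shown in these patches.

import numpy as np
from pynwb.testing.mock.base import mock_TimeSeries
from pynwb.testing.mock.file import mock_NWBFile

from neuroconv.tools.nwb_helpers import get_default_backend_configuration

# Build a small in-memory NWBFile, mirroring the fixtures used in the tests above.
nwbfile = mock_NWBFile()
raw_time_series = mock_TimeSeries(name="RawTimeSeries", data=np.array([[1, 2, 3], [4, 5, 6]]))
nwbfile.add_acquisition(raw_time_series)

# Request the default configuration for a chosen backend ("hdf5" or "zarr"),
# as done in test_complex_hdf5 and test_complex_zarr.
backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend="hdf5")

# Configurable datasets are keyed by their location in the file,
# e.g. "acquisition/RawTimeSeries/data" (dict-like access assumed here;
# the tests only assert membership of such keys).
dataset_configuration = backend_configuration.dataset_configurations["acquisition/RawTimeSeries/data"]

# Human-readable summary; the expected_print blocks above show the exact layout.
print(backend_configuration)

In the committed tests the configuration object is only inspected and printed; any helper that applies a tweaked configuration to the NWBFile before writing is outside the scope of the patches shown here, so it is deliberately left out of the sketch.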