From 0e1fe79615d758e5fec2c687fddffba7c1032e65 Mon Sep 17 00:00:00 2001 From: Heberto Mayorquin Date: Tue, 12 Nov 2024 00:44:17 -0600 Subject: [PATCH 1/3] wip 2 --- .../_configuration_models/_base_dataset_io.py | 61 ++++++++++--------- .../_configuration_models/_hdf5_dataset_io.py | 2 +- .../tools/nwb_helpers/_configure_backend.py | 2 +- .../nwb_helpers/_dataset_configuration.py | 12 ++-- .../test_ecephys/test_tools_spikeinterface.py | 47 ++++++++++++++ 5 files changed, 87 insertions(+), 37 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py index 8b4e98436..d50ce77e6 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py @@ -102,7 +102,9 @@ class DatasetIOConfiguration(BaseModel, ABC): ) dataset_name: Literal["data", "timestamps"] = Field(description="The reference name of the dataset.", frozen=True) dtype: InstanceOf[np.dtype] = Field(description="The data type of elements of this dataset.", frozen=True) - full_shape: tuple[int, ...] = Field(description="The maximum shape of the entire dataset.", frozen=True) + full_shape: tuple[Union[int, None], ...] = Field( + description="The maximum shape of the entire dataset.", frozen=False + ) # User specifiable fields chunk_shape: Union[tuple[PositiveInt, ...], None] = Field( @@ -210,21 +212,21 @@ def validate_all_shapes(cls, values: dict[str, Any]) -> dict[str, Any]: f"Some dimensions of the {chunk_shape=} exceed the {buffer_shape=} for dataset at " f"location '{location_in_file}'!" ) - if any(buffer_axis > full_axis for buffer_axis, full_axis in zip(buffer_shape, full_shape)): - raise ValueError( - f"Some dimensions of the {buffer_shape=} exceed the {full_shape=} for dataset at " - f"location '{location_in_file}'!" - ) - - if any( - buffer_axis % chunk_axis != 0 - for chunk_axis, buffer_axis, full_axis in zip(chunk_shape, buffer_shape, full_shape) - if buffer_axis != full_axis - ): - raise ValueError( - f"Some dimensions of the {chunk_shape=} do not evenly divide the {buffer_shape=} for dataset at " - f"location '{location_in_file}'!" - ) + # if any(buffer_axis > full_axis for buffer_axis, full_axis in zip(buffer_shape, full_shape)): + # raise ValueError( + # f"Some dimensions of the {buffer_shape=} exceed the {full_shape=} for dataset at " + # f"location '{location_in_file}'!" + # ) + + # if any( + # buffer_axis % chunk_axis != 0 + # for chunk_axis, buffer_axis, full_axis in zip(chunk_shape, buffer_shape, full_shape) + # if buffer_axis != full_axis + # ): + # raise ValueError( + # f"Some dimensions of the {chunk_shape=} do not evenly divide the {buffer_shape=} for dataset at " + # f"location '{location_in_file}'!" 
+ # ) return values @@ -277,7 +279,12 @@ def from_neurodata_object(cls, neurodata_object: Container, dataset_name: Litera compression_method = "gzip" elif dtype == np.dtype("object"): # Unclear what default chunking/compression should be for compound objects # pandas reads in strings as objects by default: https://pandas.pydata.org/docs/user_guide/text.html - all_elements_are_strings = all([isinstance(element, str) for element in candidate_dataset[:].flat]) + if isinstance(candidate_dataset, np.ndarray): + all_elements_are_strings = all([isinstance(element, str) for element in candidate_dataset[:].flat]) + elif isinstance(candidate_dataset, list): + all_elements_are_strings = all([isinstance(element, str) for element in candidate_dataset]) + else: + all_elements_are_strings = False if all_elements_are_strings: dtype = np.array([element for element in candidate_dataset[:].flat]).dtype chunk_shape = SliceableDataChunkIterator.estimate_default_chunk_shape( @@ -288,18 +295,16 @@ def from_neurodata_object(cls, neurodata_object: Container, dataset_name: Litera ) compression_method = "gzip" else: - raise NotImplementedError( - f"Unable to create a `DatasetIOConfiguration` for the dataset at '{location_in_file}'" - f"for neurodata object '{neurodata_object}' of type '{type(neurodata_object)}'!" + + chunk_shape = full_shape # validate_all_shapes fails if chunk_shape or buffer_shape is None + buffer_shape = full_shape + compression_method = None + import warnings + + warnings.warn( + f"Default chunking and compression options for compound objects are not optimized. " + f"Consider manually specifying DatasetIOConfiguration for dataset at '{location_in_file}'." ) - # TODO: Add support for compound objects with non-string elements - # chunk_shape = full_shape # validate_all_shapes fails if chunk_shape or buffer_shape is None - # buffer_shape = full_shape - # compression_method = None - # warnings.warn( - # f"Default chunking and compression options for compound objects are not optimized. " - # f"Consider manually specifying DatasetIOConfiguration for dataset at '{location_in_file}'." 
- # ) return cls( object_id=neurodata_object.object_id, diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py index 44c7660ab..233c00f70 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py @@ -77,4 +77,4 @@ def get_data_io_kwargs(self) -> dict[str, Any]: compression_opts = list(self.compression_options.values())[0] compression_bundle = dict(compression=self.compression_method, compression_opts=compression_opts) - return dict(chunks=self.chunk_shape, **compression_bundle) + return dict(chunks=self.chunk_shape, maxshape=self.full_shape, **compression_bundle) diff --git a/src/neuroconv/tools/nwb_helpers/_configure_backend.py b/src/neuroconv/tools/nwb_helpers/_configure_backend.py index a67308d43..e9d6c28a2 100644 --- a/src/neuroconv/tools/nwb_helpers/_configure_backend.py +++ b/src/neuroconv/tools/nwb_helpers/_configure_backend.py @@ -27,7 +27,7 @@ def configure_backend( is_ndx_events_installed = is_package_installed(package_name="ndx_events") ndx_events = importlib.import_module("ndx_events") if is_ndx_events_installed else None - # A remapping of the object IDs in the backend configuration might necessary + # A remapping of the object IDs in the backend configuration might be necessary locations_to_remap = backend_configuration.find_locations_requiring_remapping(nwbfile=nwbfile) if any(locations_to_remap): backend_configuration = backend_configuration.build_remapped_backend(locations_to_remap=locations_to_remap) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index f3d8e7560..bfa119267 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -5,7 +5,6 @@ import h5py import numpy as np import zarr -from hdmf import Container from hdmf.data_utils import DataIO from hdmf.utils import get_data_shape from hdmf_zarr import NWBZarrIO @@ -106,8 +105,9 @@ def get_default_dataset_io_configurations( if isinstance(neurodata_object, DynamicTable): dynamic_table = neurodata_object # For readability - for column in dynamic_table.columns: - candidate_dataset = column.data # VectorData object + for column in (*dynamic_table.columns, dynamic_table.id): + dataset_name = "data" + candidate_dataset = getattr(column, dataset_name) # Safer way to access data attribute # noinspection PyTypeChecker if _is_dataset_written_to_file( candidate_dataset=candidate_dataset, backend=backend, existing_file=existing_file @@ -119,16 +119,14 @@ def get_default_dataset_io_configurations( continue # Skip # Skip over columns whose values are links, such as the 'group' of an ElectrodesTable - if any(isinstance(value, Container) for value in candidate_dataset): - continue # Skip + # if any(isinstance(value, Container) for value in candidate_dataset): + # continue # Skip # Skip when columns whose values are a reference type if isinstance(column, TimeSeriesReferenceVectorData): continue # Skip datasets with any zero-length axes - dataset_name = "data" - candidate_dataset = getattr(column, dataset_name) full_shape = get_data_shape(data=candidate_dataset) if any(axis_length == 0 for axis_length in full_shape): continue diff --git a/tests/test_ecephys/test_tools_spikeinterface.py b/tests/test_ecephys/test_tools_spikeinterface.py index 3436a2e70..a896b5f0d 
100644 --- a/tests/test_ecephys/test_tools_spikeinterface.py +++ b/tests/test_ecephys/test_tools_spikeinterface.py @@ -753,6 +753,53 @@ def test_manual_row_adition_before_add_electrodes_function_to_nwbfile(self): self.assertListEqual(list(self.nwbfile.electrodes.id.data), expected_ids) self.assertListEqual(list(self.nwbfile.electrodes["channel_name"].data), expected_names) + def test_add_electrodes_with_previously_written_nwbfile(self): + """Add electrodes to a file that was already written""" + values_dic = self.defaults + + values_dic.update(id=123) + self.nwbfile.add_electrode(**values_dic) + + values_dic.update(id=124) + self.nwbfile.add_electrode(**values_dic) + + from pynwb import NWBHDF5IO + + from neuroconv.tools.nwb_helpers import ( + configure_backend, + get_default_backend_configuration, + ) + + backend_configuration = get_default_backend_configuration(nwbfile=self.nwbfile, backend="hdf5") + dataset_configurations = backend_configuration.dataset_configurations + + electrodes_location_dataset = dataset_configurations["electrodes/location/data"] + electrodes_group_dataset = dataset_configurations["electrodes/group/data"] + electrodes_group_name_dataset = dataset_configurations["electrodes/group_name/data"] + electrodes_id_dataset = dataset_configurations["electrodes/id/data"] + + # Make expandable + electrodes_location_dataset.full_shape = (None,) + electrodes_group_name_dataset.full_shape = (None,) + electrodes_id_dataset.full_shape = (None,) + electrodes_group_dataset.full_shape = (None,) + + configure_backend(nwbfile=self.nwbfile, backend_configuration=backend_configuration) + with NWBHDF5IO("test.nwb", "w") as io: + io.write(self.nwbfile) + + with NWBHDF5IO("test.nwb", "a") as io: + nwbfile_read = io.read() + + add_electrodes_to_nwbfile(recording=self.recording_1, nwbfile=nwbfile_read) + backend_configuration = get_default_backend_configuration(nwbfile=self.nwbfile, backend="hdf5") + dataset_configurations = backend_configuration.dataset_configurations + + expected_ids = [123, 124, 2, 3, 4, 5] + expected_names = ["123", "124", "a", "b", "c", "d"] + self.assertListEqual(list(nwbfile_read.electrodes.id.data), expected_ids) + self.assertListEqual(list(nwbfile_read.electrodes["channel_name"].data), expected_names) + def test_manual_row_adition_after_add_electrodes_function_to_nwbfile(self): """Add some rows to the electrode table after using the add_electrodes_to_nwbfile function""" add_electrodes_to_nwbfile(recording=self.recording_1, nwbfile=self.nwbfile) From 3f27639c230479ecd3f5460d6aebd084989ea3dd Mon Sep 17 00:00:00 2001 From: Heberto Mayorquin Date: Tue, 12 Nov 2024 11:08:28 -0600 Subject: [PATCH 2/3] restore and eliminate maxshape --- .../_configuration_models/_base_dataset_io.py | 30 +++++++++---------- .../_configuration_models/_hdf5_dataset_io.py | 2 +- .../test_ecephys/test_tools_spikeinterface.py | 12 -------- 3 files changed, 16 insertions(+), 28 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py index d50ce77e6..41bf80abb 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py @@ -212,21 +212,21 @@ def validate_all_shapes(cls, values: dict[str, Any]) -> dict[str, Any]: f"Some dimensions of the {chunk_shape=} exceed the {buffer_shape=} for dataset at " f"location '{location_in_file}'!" 
) - # if any(buffer_axis > full_axis for buffer_axis, full_axis in zip(buffer_shape, full_shape)): - # raise ValueError( - # f"Some dimensions of the {buffer_shape=} exceed the {full_shape=} for dataset at " - # f"location '{location_in_file}'!" - # ) - - # if any( - # buffer_axis % chunk_axis != 0 - # for chunk_axis, buffer_axis, full_axis in zip(chunk_shape, buffer_shape, full_shape) - # if buffer_axis != full_axis - # ): - # raise ValueError( - # f"Some dimensions of the {chunk_shape=} do not evenly divide the {buffer_shape=} for dataset at " - # f"location '{location_in_file}'!" - # ) + if any(buffer_axis > full_axis for buffer_axis, full_axis in zip(buffer_shape, full_shape)): + raise ValueError( + f"Some dimensions of the {buffer_shape=} exceed the {full_shape=} for dataset at " + f"location '{location_in_file}'!" + ) + + if any( + buffer_axis % chunk_axis != 0 + for chunk_axis, buffer_axis, full_axis in zip(chunk_shape, buffer_shape, full_shape) + if buffer_axis != full_axis + ): + raise ValueError( + f"Some dimensions of the {chunk_shape=} do not evenly divide the {buffer_shape=} for dataset at " + f"location '{location_in_file}'!" + ) return values diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py index 233c00f70..44c7660ab 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py @@ -77,4 +77,4 @@ def get_data_io_kwargs(self) -> dict[str, Any]: compression_opts = list(self.compression_options.values())[0] compression_bundle = dict(compression=self.compression_method, compression_opts=compression_opts) - return dict(chunks=self.chunk_shape, maxshape=self.full_shape, **compression_bundle) + return dict(chunks=self.chunk_shape, **compression_bundle) diff --git a/tests/test_ecephys/test_tools_spikeinterface.py b/tests/test_ecephys/test_tools_spikeinterface.py index a896b5f0d..9bfdcb554 100644 --- a/tests/test_ecephys/test_tools_spikeinterface.py +++ b/tests/test_ecephys/test_tools_spikeinterface.py @@ -771,18 +771,6 @@ def test_add_electrodes_with_previously_written_nwbfile(self): ) backend_configuration = get_default_backend_configuration(nwbfile=self.nwbfile, backend="hdf5") - dataset_configurations = backend_configuration.dataset_configurations - - electrodes_location_dataset = dataset_configurations["electrodes/location/data"] - electrodes_group_dataset = dataset_configurations["electrodes/group/data"] - electrodes_group_name_dataset = dataset_configurations["electrodes/group_name/data"] - electrodes_id_dataset = dataset_configurations["electrodes/id/data"] - - # Make expandable - electrodes_location_dataset.full_shape = (None,) - electrodes_group_name_dataset.full_shape = (None,) - electrodes_id_dataset.full_shape = (None,) - electrodes_group_dataset.full_shape = (None,) configure_backend(nwbfile=self.nwbfile, backend_configuration=backend_configuration) with NWBHDF5IO("test.nwb", "w") as io: From df9d6f84a70f0840d78c09c42dd6e67afd24ff3c Mon Sep 17 00:00:00 2001 From: Heberto Mayorquin Date: Tue, 12 Nov 2024 12:32:26 -0600 Subject: [PATCH 3/3] stop point to show Ben --- .../nwb_helpers/_configuration_models/_base_dataset_io.py | 3 ++- src/neuroconv/tools/nwb_helpers/_dataset_configuration.py | 2 +- tests/test_ecephys/test_tools_spikeinterface.py | 2 -- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git 
a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py index 41bf80abb..cf545debf 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py @@ -296,7 +296,8 @@ def from_neurodata_object(cls, neurodata_object: Container, dataset_name: Litera compression_method = "gzip" else: - chunk_shape = full_shape # validate_all_shapes fails if chunk_shape or buffer_shape is None + # chunk_shape = (1,) * len(full_shape) # Worse possible chunking + chunk_shape = full_shape buffer_shape = full_shape compression_method = None import warnings diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index bfa119267..ad45d2a32 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -103,8 +103,8 @@ def get_default_dataset_io_configurations( known_dataset_fields = ("data", "timestamps") for neurodata_object in nwbfile.objects.values(): if isinstance(neurodata_object, DynamicTable): - dynamic_table = neurodata_object # For readability + dynamic_table = neurodata_object # For readability for column in (*dynamic_table.columns, dynamic_table.id): dataset_name = "data" candidate_dataset = getattr(column, dataset_name) # Safer way to access data attribute diff --git a/tests/test_ecephys/test_tools_spikeinterface.py b/tests/test_ecephys/test_tools_spikeinterface.py index 9bfdcb554..c95672196 100644 --- a/tests/test_ecephys/test_tools_spikeinterface.py +++ b/tests/test_ecephys/test_tools_spikeinterface.py @@ -780,8 +780,6 @@ def test_add_electrodes_with_previously_written_nwbfile(self): nwbfile_read = io.read() add_electrodes_to_nwbfile(recording=self.recording_1, nwbfile=nwbfile_read) - backend_configuration = get_default_backend_configuration(nwbfile=self.nwbfile, backend="hdf5") - dataset_configurations = backend_configuration.dataset_configurations expected_ids = [123, 124, 2, 3, 4, 5] expected_names = ["123", "124", "a", "b", "c", "d"]
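
The experiment in patch 1 (returning maxshape=self.full_shape from get_data_io_kwargs and setting full_shape=(None,) on the electrode-table dataset configurations in the test) rests on h5py's resizable-dataset mechanism: a chunked dataset created with a None entry in maxshape can be resized along that axis after the file has been written. A minimal sketch of that mechanism, with an illustrative file name and values that are not taken from the patch:

    # Sketch only: shows the h5py behavior the maxshape experiment relies on.
    import h5py
    import numpy as np

    with h5py.File("expandable_example.h5", "w") as file:
        # chunks + maxshape=(None,) make the dataset growable along axis 0
        file.create_dataset("ids", data=np.array([123, 124]), chunks=(2,), maxshape=(None,))

    with h5py.File("expandable_example.h5", "a") as file:
        dataset = file["ids"]
        dataset.resize((dataset.shape[0] + 4,))  # grow along the unlimited axis
        dataset[-4:] = [2, 3, 4, 5]              # append the new rows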
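
At the pynwb level, the kwargs that get_data_io_kwargs returns (chunks, maxshape, compression) are the ones hdmf's H5DataIO forwards to h5py on write, which is what would let add_electrodes_to_nwbfile append rows to a previously written file as in the new test. A hedged sketch of that append-after-write workflow using a plain TimeSeries; the object names (demo_series, demo_expandable.nwb) are hypothetical and not from the patch:

    # Sketch only: wrap data with H5DataIO so the written dataset stays expandable.
    from datetime import datetime, timezone

    import numpy as np
    from hdmf.backends.hdf5.h5_utils import H5DataIO
    from pynwb import NWBFile, NWBHDF5IO, TimeSeries

    nwbfile = NWBFile(
        session_description="expandable dataset demo",
        identifier="demo-0",
        session_start_time=datetime.now(timezone.utc),
    )
    wrapped_data = H5DataIO(data=np.arange(10.0), chunks=(5,), maxshape=(None,), compression="gzip")
    nwbfile.add_acquisition(TimeSeries(name="demo_series", data=wrapped_data, unit="a.u.", rate=1.0))

    with NWBHDF5IO("demo_expandable.nwb", mode="w") as io:
        io.write(nwbfile)

    # Because maxshape was (None,), the dataset can be grown when the file is reopened.
    with NWBHDF5IO("demo_expandable.nwb", mode="a") as io:
        nwbfile_read = io.read()
        dataset = nwbfile_read.acquisition["demo_series"].data  # h5py.Dataset after read
        dataset.resize((dataset.shape[0] + 5,))
        dataset[-5:] = np.arange(5.0)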