From 0e1fe79615d758e5fec2c687fddffba7c1032e65 Mon Sep 17 00:00:00 2001 From: Heberto Mayorquin Date: Tue, 12 Nov 2024 00:44:17 -0600 Subject: [PATCH 1/3] wip 2 --- .../_configuration_models/_base_dataset_io.py | 61 ++++++++++--------- .../_configuration_models/_hdf5_dataset_io.py | 2 +- .../tools/nwb_helpers/_configure_backend.py | 2 +- .../nwb_helpers/_dataset_configuration.py | 12 ++-- .../test_ecephys/test_tools_spikeinterface.py | 47 ++++++++++++++ 5 files changed, 87 insertions(+), 37 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py index 8b4e98436..d50ce77e6 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py @@ -102,7 +102,9 @@ class DatasetIOConfiguration(BaseModel, ABC): ) dataset_name: Literal["data", "timestamps"] = Field(description="The reference name of the dataset.", frozen=True) dtype: InstanceOf[np.dtype] = Field(description="The data type of elements of this dataset.", frozen=True) - full_shape: tuple[int, ...] = Field(description="The maximum shape of the entire dataset.", frozen=True) + full_shape: tuple[Union[int, None], ...] = Field( + description="The maximum shape of the entire dataset.", frozen=False + ) # User specifiable fields chunk_shape: Union[tuple[PositiveInt, ...], None] = Field( @@ -210,21 +212,21 @@ def validate_all_shapes(cls, values: dict[str, Any]) -> dict[str, Any]: f"Some dimensions of the {chunk_shape=} exceed the {buffer_shape=} for dataset at " f"location '{location_in_file}'!" ) - if any(buffer_axis > full_axis for buffer_axis, full_axis in zip(buffer_shape, full_shape)): - raise ValueError( - f"Some dimensions of the {buffer_shape=} exceed the {full_shape=} for dataset at " - f"location '{location_in_file}'!" - ) - - if any( - buffer_axis % chunk_axis != 0 - for chunk_axis, buffer_axis, full_axis in zip(chunk_shape, buffer_shape, full_shape) - if buffer_axis != full_axis - ): - raise ValueError( - f"Some dimensions of the {chunk_shape=} do not evenly divide the {buffer_shape=} for dataset at " - f"location '{location_in_file}'!" - ) + # if any(buffer_axis > full_axis for buffer_axis, full_axis in zip(buffer_shape, full_shape)): + # raise ValueError( + # f"Some dimensions of the {buffer_shape=} exceed the {full_shape=} for dataset at " + # f"location '{location_in_file}'!" + # ) + + # if any( + # buffer_axis % chunk_axis != 0 + # for chunk_axis, buffer_axis, full_axis in zip(chunk_shape, buffer_shape, full_shape) + # if buffer_axis != full_axis + # ): + # raise ValueError( + # f"Some dimensions of the {chunk_shape=} do not evenly divide the {buffer_shape=} for dataset at " + # f"location '{location_in_file}'!" 
+ # ) return values @@ -277,7 +279,12 @@ def from_neurodata_object(cls, neurodata_object: Container, dataset_name: Litera compression_method = "gzip" elif dtype == np.dtype("object"): # Unclear what default chunking/compression should be for compound objects # pandas reads in strings as objects by default: https://pandas.pydata.org/docs/user_guide/text.html - all_elements_are_strings = all([isinstance(element, str) for element in candidate_dataset[:].flat]) + if isinstance(candidate_dataset, np.ndarray): + all_elements_are_strings = all([isinstance(element, str) for element in candidate_dataset[:].flat]) + elif isinstance(candidate_dataset, list): + all_elements_are_strings = all([isinstance(element, str) for element in candidate_dataset]) + else: + all_elements_are_strings = False if all_elements_are_strings: dtype = np.array([element for element in candidate_dataset[:].flat]).dtype chunk_shape = SliceableDataChunkIterator.estimate_default_chunk_shape( @@ -288,18 +295,16 @@ def from_neurodata_object(cls, neurodata_object: Container, dataset_name: Litera ) compression_method = "gzip" else: - raise NotImplementedError( - f"Unable to create a `DatasetIOConfiguration` for the dataset at '{location_in_file}'" - f"for neurodata object '{neurodata_object}' of type '{type(neurodata_object)}'!" + + chunk_shape = full_shape # validate_all_shapes fails if chunk_shape or buffer_shape is None + buffer_shape = full_shape + compression_method = None + import warnings + + warnings.warn( + f"Default chunking and compression options for compound objects are not optimized. " + f"Consider manually specifying DatasetIOConfiguration for dataset at '{location_in_file}'." ) - # TODO: Add support for compound objects with non-string elements - # chunk_shape = full_shape # validate_all_shapes fails if chunk_shape or buffer_shape is None - # buffer_shape = full_shape - # compression_method = None - # warnings.warn( - # f"Default chunking and compression options for compound objects are not optimized. " - # f"Consider manually specifying DatasetIOConfiguration for dataset at '{location_in_file}'." 
- # ) return cls( object_id=neurodata_object.object_id, diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py index 44c7660ab..233c00f70 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py @@ -77,4 +77,4 @@ def get_data_io_kwargs(self) -> dict[str, Any]: compression_opts = list(self.compression_options.values())[0] compression_bundle = dict(compression=self.compression_method, compression_opts=compression_opts) - return dict(chunks=self.chunk_shape, **compression_bundle) + return dict(chunks=self.chunk_shape, maxshape=self.full_shape, **compression_bundle) diff --git a/src/neuroconv/tools/nwb_helpers/_configure_backend.py b/src/neuroconv/tools/nwb_helpers/_configure_backend.py index a67308d43..e9d6c28a2 100644 --- a/src/neuroconv/tools/nwb_helpers/_configure_backend.py +++ b/src/neuroconv/tools/nwb_helpers/_configure_backend.py @@ -27,7 +27,7 @@ def configure_backend( is_ndx_events_installed = is_package_installed(package_name="ndx_events") ndx_events = importlib.import_module("ndx_events") if is_ndx_events_installed else None - # A remapping of the object IDs in the backend configuration might necessary + # A remapping of the object IDs in the backend configuration might be necessary locations_to_remap = backend_configuration.find_locations_requiring_remapping(nwbfile=nwbfile) if any(locations_to_remap): backend_configuration = backend_configuration.build_remapped_backend(locations_to_remap=locations_to_remap) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index f3d8e7560..bfa119267 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -5,7 +5,6 @@ import h5py import numpy as np import zarr -from hdmf import Container from hdmf.data_utils import DataIO from hdmf.utils import get_data_shape from hdmf_zarr import NWBZarrIO @@ -106,8 +105,9 @@ def get_default_dataset_io_configurations( if isinstance(neurodata_object, DynamicTable): dynamic_table = neurodata_object # For readability - for column in dynamic_table.columns: - candidate_dataset = column.data # VectorData object + for column in (*dynamic_table.columns, dynamic_table.id): + dataset_name = "data" + candidate_dataset = getattr(column, dataset_name) # Safer way to access data attribute # noinspection PyTypeChecker if _is_dataset_written_to_file( candidate_dataset=candidate_dataset, backend=backend, existing_file=existing_file @@ -119,16 +119,14 @@ def get_default_dataset_io_configurations( continue # Skip # Skip over columns whose values are links, such as the 'group' of an ElectrodesTable - if any(isinstance(value, Container) for value in candidate_dataset): - continue # Skip + # if any(isinstance(value, Container) for value in candidate_dataset): + # continue # Skip # Skip when columns whose values are a reference type if isinstance(column, TimeSeriesReferenceVectorData): continue # Skip datasets with any zero-length axes - dataset_name = "data" - candidate_dataset = getattr(column, dataset_name) full_shape = get_data_shape(data=candidate_dataset) if any(axis_length == 0 for axis_length in full_shape): continue diff --git a/tests/test_ecephys/test_tools_spikeinterface.py b/tests/test_ecephys/test_tools_spikeinterface.py index 3436a2e70..a896b5f0d 
100644 --- a/tests/test_ecephys/test_tools_spikeinterface.py +++ b/tests/test_ecephys/test_tools_spikeinterface.py @@ -753,6 +753,53 @@ def test_manual_row_adition_before_add_electrodes_function_to_nwbfile(self): self.assertListEqual(list(self.nwbfile.electrodes.id.data), expected_ids) self.assertListEqual(list(self.nwbfile.electrodes["channel_name"].data), expected_names) + def test_add_electrodes_with_previously_written_nwbfile(self): + """Add electrodes to a file that was already written""" + values_dic = self.defaults + + values_dic.update(id=123) + self.nwbfile.add_electrode(**values_dic) + + values_dic.update(id=124) + self.nwbfile.add_electrode(**values_dic) + + from pynwb import NWBHDF5IO + + from neuroconv.tools.nwb_helpers import ( + configure_backend, + get_default_backend_configuration, + ) + + backend_configuration = get_default_backend_configuration(nwbfile=self.nwbfile, backend="hdf5") + dataset_configurations = backend_configuration.dataset_configurations + + electrodes_location_dataset = dataset_configurations["electrodes/location/data"] + electrodes_group_dataset = dataset_configurations["electrodes/group/data"] + electrodes_group_name_dataset = dataset_configurations["electrodes/group_name/data"] + electrodes_id_dataset = dataset_configurations["electrodes/id/data"] + + # Make expandable + electrodes_location_dataset.full_shape = (None,) + electrodes_group_name_dataset.full_shape = (None,) + electrodes_id_dataset.full_shape = (None,) + electrodes_group_dataset.full_shape = (None,) + + configure_backend(nwbfile=self.nwbfile, backend_configuration=backend_configuration) + with NWBHDF5IO("test.nwb", "w") as io: + io.write(self.nwbfile) + + with NWBHDF5IO("test.nwb", "a") as io: + nwbfile_read = io.read() + + add_electrodes_to_nwbfile(recording=self.recording_1, nwbfile=nwbfile_read) + backend_configuration = get_default_backend_configuration(nwbfile=self.nwbfile, backend="hdf5") + dataset_configurations = backend_configuration.dataset_configurations + + expected_ids = [123, 124, 2, 3, 4, 5] + expected_names = ["123", "124", "a", "b", "c", "d"] + self.assertListEqual(list(nwbfile_read.electrodes.id.data), expected_ids) + self.assertListEqual(list(nwbfile_read.electrodes["channel_name"].data), expected_names) + def test_manual_row_adition_after_add_electrodes_function_to_nwbfile(self): """Add some rows to the electrode table after using the add_electrodes_to_nwbfile function""" add_electrodes_to_nwbfile(recording=self.recording_1, nwbfile=self.nwbfile) From 3f27639c230479ecd3f5460d6aebd084989ea3dd Mon Sep 17 00:00:00 2001 From: Heberto Mayorquin Date: Tue, 12 Nov 2024 11:08:28 -0600 Subject: [PATCH 2/3] restore and eliminate maxshape --- .../_configuration_models/_base_dataset_io.py | 30 +++++++++---------- .../_configuration_models/_hdf5_dataset_io.py | 2 +- .../test_ecephys/test_tools_spikeinterface.py | 12 -------- 3 files changed, 16 insertions(+), 28 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py index d50ce77e6..41bf80abb 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py @@ -212,21 +212,21 @@ def validate_all_shapes(cls, values: dict[str, Any]) -> dict[str, Any]: f"Some dimensions of the {chunk_shape=} exceed the {buffer_shape=} for dataset at " f"location '{location_in_file}'!" 
) - # if any(buffer_axis > full_axis for buffer_axis, full_axis in zip(buffer_shape, full_shape)): - # raise ValueError( - # f"Some dimensions of the {buffer_shape=} exceed the {full_shape=} for dataset at " - # f"location '{location_in_file}'!" - # ) - - # if any( - # buffer_axis % chunk_axis != 0 - # for chunk_axis, buffer_axis, full_axis in zip(chunk_shape, buffer_shape, full_shape) - # if buffer_axis != full_axis - # ): - # raise ValueError( - # f"Some dimensions of the {chunk_shape=} do not evenly divide the {buffer_shape=} for dataset at " - # f"location '{location_in_file}'!" - # ) + if any(buffer_axis > full_axis for buffer_axis, full_axis in zip(buffer_shape, full_shape)): + raise ValueError( + f"Some dimensions of the {buffer_shape=} exceed the {full_shape=} for dataset at " + f"location '{location_in_file}'!" + ) + + if any( + buffer_axis % chunk_axis != 0 + for chunk_axis, buffer_axis, full_axis in zip(chunk_shape, buffer_shape, full_shape) + if buffer_axis != full_axis + ): + raise ValueError( + f"Some dimensions of the {chunk_shape=} do not evenly divide the {buffer_shape=} for dataset at " + f"location '{location_in_file}'!" + ) return values diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py index 233c00f70..44c7660ab 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py @@ -77,4 +77,4 @@ def get_data_io_kwargs(self) -> dict[str, Any]: compression_opts = list(self.compression_options.values())[0] compression_bundle = dict(compression=self.compression_method, compression_opts=compression_opts) - return dict(chunks=self.chunk_shape, maxshape=self.full_shape, **compression_bundle) + return dict(chunks=self.chunk_shape, **compression_bundle) diff --git a/tests/test_ecephys/test_tools_spikeinterface.py b/tests/test_ecephys/test_tools_spikeinterface.py index a896b5f0d..9bfdcb554 100644 --- a/tests/test_ecephys/test_tools_spikeinterface.py +++ b/tests/test_ecephys/test_tools_spikeinterface.py @@ -771,18 +771,6 @@ def test_add_electrodes_with_previously_written_nwbfile(self): ) backend_configuration = get_default_backend_configuration(nwbfile=self.nwbfile, backend="hdf5") - dataset_configurations = backend_configuration.dataset_configurations - - electrodes_location_dataset = dataset_configurations["electrodes/location/data"] - electrodes_group_dataset = dataset_configurations["electrodes/group/data"] - electrodes_group_name_dataset = dataset_configurations["electrodes/group_name/data"] - electrodes_id_dataset = dataset_configurations["electrodes/id/data"] - - # Make expandable - electrodes_location_dataset.full_shape = (None,) - electrodes_group_name_dataset.full_shape = (None,) - electrodes_id_dataset.full_shape = (None,) - electrodes_group_dataset.full_shape = (None,) configure_backend(nwbfile=self.nwbfile, backend_configuration=backend_configuration) with NWBHDF5IO("test.nwb", "w") as io: From df9d6f84a70f0840d78c09c42dd6e67afd24ff3c Mon Sep 17 00:00:00 2001 From: Heberto Mayorquin Date: Tue, 12 Nov 2024 12:32:26 -0600 Subject: [PATCH 3/3] stop point to show Ben --- .../nwb_helpers/_configuration_models/_base_dataset_io.py | 3 ++- src/neuroconv/tools/nwb_helpers/_dataset_configuration.py | 2 +- tests/test_ecephys/test_tools_spikeinterface.py | 2 -- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git 
a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py index 41bf80abb..cf545debf 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py @@ -296,7 +296,8 @@ def from_neurodata_object(cls, neurodata_object: Container, dataset_name: Litera compression_method = "gzip" else: - chunk_shape = full_shape # validate_all_shapes fails if chunk_shape or buffer_shape is None + # chunk_shape = (1,) * len(full_shape) # Worse possible chunking + chunk_shape = full_shape buffer_shape = full_shape compression_method = None import warnings diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index bfa119267..ad45d2a32 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -103,8 +103,8 @@ def get_default_dataset_io_configurations( known_dataset_fields = ("data", "timestamps") for neurodata_object in nwbfile.objects.values(): if isinstance(neurodata_object, DynamicTable): - dynamic_table = neurodata_object # For readability + dynamic_table = neurodata_object # For readability for column in (*dynamic_table.columns, dynamic_table.id): dataset_name = "data" candidate_dataset = getattr(column, dataset_name) # Safer way to access data attribute diff --git a/tests/test_ecephys/test_tools_spikeinterface.py b/tests/test_ecephys/test_tools_spikeinterface.py index 9bfdcb554..c95672196 100644 --- a/tests/test_ecephys/test_tools_spikeinterface.py +++ b/tests/test_ecephys/test_tools_spikeinterface.py @@ -780,8 +780,6 @@ def test_add_electrodes_with_previously_written_nwbfile(self): nwbfile_read = io.read() add_electrodes_to_nwbfile(recording=self.recording_1, nwbfile=nwbfile_read) - backend_configuration = get_default_backend_configuration(nwbfile=self.nwbfile, backend="hdf5") - dataset_configurations = backend_configuration.dataset_configurations expected_ids = [123, 124, 2, 3, 4, 5] expected_names = ["123", "124", "a", "b", "c", "d"]
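
The experiment in patch 1 (returning maxshape=self.full_shape from get_data_io_kwargs and setting full_shape=(None,) on the electrode-table dataset configurations in the test) rests on h5py's resizable-dataset mechanism: a chunked dataset created with a None entry in maxshape can be resized along that axis after the file has been written. A minimal sketch of that mechanism, with an illustrative file name and values that are not taken from the patch:

    # Sketch only: shows the h5py behavior the maxshape experiment relies on.
    import h5py
    import numpy as np

    with h5py.File("expandable_example.h5", "w") as file:
        # chunks + maxshape=(None,) make the dataset growable along axis 0
        file.create_dataset("ids", data=np.array([123, 124]), chunks=(2,), maxshape=(None,))

    with h5py.File("expandable_example.h5", "a") as file:
        dataset = file["ids"]
        dataset.resize((dataset.shape[0] + 4,))  # grow along the unlimited axis
        dataset[-4:] = [2, 3, 4, 5]              # append the new rows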
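
At the pynwb level, the kwargs that get_data_io_kwargs returns (chunks, maxshape, compression) are the ones hdmf's H5DataIO forwards to h5py on write, which is what would let add_electrodes_to_nwbfile append rows to a previously written file as in the new test. A hedged sketch of that append-after-write workflow using a plain TimeSeries; the object names (demo_series, demo_expandable.nwb) are hypothetical and not from the patch:

    # Sketch only: wrap data with H5DataIO so the written dataset stays expandable.
    from datetime import datetime, timezone

    import numpy as np
    from hdmf.backends.hdf5.h5_utils import H5DataIO
    from pynwb import NWBFile, NWBHDF5IO, TimeSeries

    nwbfile = NWBFile(
        session_description="expandable dataset demo",
        identifier="demo-0",
        session_start_time=datetime.now(timezone.utc),
    )
    wrapped_data = H5DataIO(data=np.arange(10.0), chunks=(5,), maxshape=(None,), compression="gzip")
    nwbfile.add_acquisition(TimeSeries(name="demo_series", data=wrapped_data, unit="a.u.", rate=1.0))

    with NWBHDF5IO("demo_expandable.nwb", mode="w") as io:
        io.write(nwbfile)

    # Because maxshape was (None,), the dataset can be grown when the file is reopened.
    with NWBHDF5IO("demo_expandable.nwb", mode="a") as io:
        nwbfile_read = io.read()
        dataset = nwbfile_read.acquisition["demo_series"].data  # h5py.Dataset after read
        dataset.resize((dataset.shape[0] + 5,))
        dataset[-5:] = np.arange(5.0)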