From 679418c151cc46eecaf9ca016f4a7721d3e7be38 Mon Sep 17 00:00:00 2001
From: Paul Adkisson <paul.wesley.adkisson@gmail.com>
Date: Fri, 13 Sep 2024 04:28:01 +1000
Subject: [PATCH] Added chunking/compression for string-only objects (#1042)

Co-authored-by: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com>
Co-authored-by: Heberto Mayorquin <h.mayorquin@gmail.com>
---
 CHANGELOG.md                                  |  4 +-
 .../_configuration_models/_base_dataset_io.py | 28 +++++++++++--
 .../test_dataset_io_configuration_model.py    | 42 +++++++++++++++++++
 3 files changed, 69 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 12ed1eaf9..cf4f6bfbb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,6 @@
 # Upcoming
 
-## v.0.6.3
+## v0.6.4
 
 ## Bug Fixes
 * Fixed a setup bug introduced in `v0.6.2` where installation process created a directory instead of a file for test configuration file  [PR #1070](https://github.com/catalystneuro/neuroconv/pull/1070)
@@ -9,10 +9,12 @@
 ## Deprecations
 
 ## Features
+* Added chunking/compression for string-only compound objects. [PR #1042](https://github.com/catalystneuro/neuroconv/pull/1042)
 * Added automated EFS volume creation and mounting to the `submit_aws_job` helper function. [PR #1018](https://github.com/catalystneuro/neuroconv/pull/1018)
 
 ## Improvements
 
+## v0.6.3
 
 
 ## v0.6.2 (September 10, 2024)
diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py
index 8b40e9a9e..9cdb97405 100644
--- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py
+++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py
@@ -277,10 +277,30 @@ def from_neurodata_object(cls, neurodata_object: Container, dataset_name: Litera
             )
             compression_method = "gzip"
         elif dtype == np.dtype("object"):  # Unclear what default chunking/compression should be for compound objects
-            raise NotImplementedError(
-                f"Unable to create a `DatasetIOConfiguration` for the dataset at '{location_in_file}'"
-                f"for neurodata object '{neurodata_object}' of type '{type(neurodata_object)}'!"
-            )
+            # pandas stores strings with `object` dtype by default: https://pandas.pydata.org/docs/user_guide/text.html
+            all_elements_are_strings = all([isinstance(element, str) for element in candidate_dataset[:].flat])
+            if all_elements_are_strings:
+                dtype = np.array([element for element in candidate_dataset[:].flat]).dtype
+                chunk_shape = SliceableDataChunkIterator.estimate_default_chunk_shape(
+                    chunk_mb=10.0, maxshape=full_shape, dtype=dtype
+                )
+                buffer_shape = SliceableDataChunkIterator.estimate_default_buffer_shape(
+                    buffer_gb=0.5, chunk_shape=chunk_shape, maxshape=full_shape, dtype=dtype
+                )
+                compression_method = "gzip"
+            else:
+                raise NotImplementedError(
+                    f"Unable to create a `DatasetIOConfiguration` for the dataset at '{location_in_file}'"
+                    f"for neurodata object '{neurodata_object}' of type '{type(neurodata_object)}'!"
+                )
+                # TODO: Add support for compound objects with non-string elements
+                # chunk_shape = full_shape  # validate_all_shapes fails if chunk_shape or buffer_shape is None
+                # buffer_shape = full_shape
+                # compression_method = None
+                # warnings.warn(
+                #     f"Default chunking and compression options for compound objects are not optimized. "
+                #     f"Consider manually specifying DatasetIOConfiguration for dataset at '{location_in_file}'."
+                # )
 
         return cls(
             object_id=neurodata_object.object_id,
diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_io_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_io_configuration_model.py
index 901a82d63..616c6e9d4 100644
--- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_io_configuration_model.py
+++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_io_configuration_model.py
@@ -2,6 +2,7 @@
 
 import numpy as np
 import pytest
+from pynwb.testing.mock.file import mock_NWBFile
 
 from neuroconv.tools.nwb_helpers import DatasetIOConfiguration
 
@@ -53,3 +54,44 @@ def test_model_json_schema_generator_assertion():
         DatasetIOConfiguration.model_json_schema(schema_generator="anything")
 
     assert "The 'schema_generator' of this method cannot be changed." == str(error_info.value)
+
+
+# TODO: Add support for compound objects with non-string elements
+# def test_from_neurodata_object_dtype_object():
+#     class TestDatasetIOConfiguration(DatasetIOConfiguration):
+#         def get_data_io_kwargs(self):
+#             super().get_data_io_kwargs()
+
+#     nwbfile = mock_NWBFile()
+#     nwbfile.add_trial(start_time=0.0, stop_time=1.0)
+#     nwbfile.add_trial(start_time=1.0, stop_time=2.0)
+#     nwbfile.add_trial(start_time=2.0, stop_time=3.0)
+#     data = np.array(["test", 5, False], dtype=object)
+#     nwbfile.add_trial_column(name="test", description="test column with object dtype", data=data)
+#     neurodata_object = nwbfile.trials.columns[2]
+
+#     dataset_io_configuration = TestDatasetIOConfiguration.from_neurodata_object(neurodata_object, dataset_name="data")
+
+#     assert dataset_io_configuration.chunk_shape == (3,)
+#     assert dataset_io_configuration.buffer_shape == (3,)
+#     assert dataset_io_configuration.compression_method is None
+
+
+def test_from_neurodata_object_dtype_object_all_strings():
+    class TestDatasetIOConfiguration(DatasetIOConfiguration):
+        def get_data_io_kwargs(self):
+            super().get_data_io_kwargs()
+
+    nwbfile = mock_NWBFile()
+    nwbfile.add_trial(start_time=0.0, stop_time=1.0)
+    nwbfile.add_trial(start_time=1.0, stop_time=2.0)
+    nwbfile.add_trial(start_time=2.0, stop_time=3.0)
+    data = np.array(["test", "string", "abc"], dtype=object)
+    nwbfile.add_trial_column(name="test", description="test column with object dtype but all strings", data=data)
+    neurodata_object = nwbfile.trials.columns[2]
+
+    dataset_io_configuration = TestDatasetIOConfiguration.from_neurodata_object(neurodata_object, dataset_name="data")
+
+    assert dataset_io_configuration.chunk_shape == (3,)
+    assert dataset_io_configuration.buffer_shape == (3,)
+    assert dataset_io_configuration.compression_method == "gzip"