From 679418c151cc46eecaf9ca016f4a7721d3e7be38 Mon Sep 17 00:00:00 2001 From: Paul Adkisson Date: Fri, 13 Sep 2024 04:28:01 +1000 Subject: [PATCH] Added chunking/compression for string-only objects (#1042) Co-authored-by: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Co-authored-by: Heberto Mayorquin --- CHANGELOG.md | 4 +- .../_configuration_models/_base_dataset_io.py | 28 +++++++++++-- .../test_dataset_io_configuration_model.py | 42 +++++++++++++++++++ 3 files changed, 69 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 12ed1eaf9..cf4f6bfbb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Upcoming -## v.0.6.3 +## v0.6.4 ## Bug Fixes * Fixed a setup bug introduced in `v0.6.2` where installation process created a directory instead of a file for test configuration file [PR #1070](https://github.com/catalystneuro/neuroconv/pull/1070) @@ -9,10 +9,12 @@ ## Deprecations ## Features +* Added chunking/compression for string-only compound objects [PR #1042](https://github.com/catalystneuro/neuroconv/pull/1042) * Added automated EFS volume creation and mounting to the `submit_aws_job` helper function. 
[PR #1018](https://github.com/catalystneuro/neuroconv/pull/1018) ## Improvements +## v0.6.3 ## v0.6.2 (September 10, 2024) diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py index 8b40e9a9e..9cdb97405 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py @@ -277,10 +277,30 @@ def from_neurodata_object(cls, neurodata_object: Container, dataset_name: Litera ) compression_method = "gzip" elif dtype == np.dtype("object"): # Unclear what default chunking/compression should be for compound objects - raise NotImplementedError( - f"Unable to create a `DatasetIOConfiguration` for the dataset at '{location_in_file}'" - f"for neurodata object '{neurodata_object}' of type '{type(neurodata_object)}'!" - ) + # pandas reads in strings as objects by default: https://pandas.pydata.org/docs/user_guide/text.html + all_elements_are_strings = all([isinstance(element, str) for element in candidate_dataset[:].flat]) + if all_elements_are_strings: + dtype = np.array([element for element in candidate_dataset[:].flat]).dtype + chunk_shape = SliceableDataChunkIterator.estimate_default_chunk_shape( + chunk_mb=10.0, maxshape=full_shape, dtype=dtype + ) + buffer_shape = SliceableDataChunkIterator.estimate_default_buffer_shape( + buffer_gb=0.5, chunk_shape=chunk_shape, maxshape=full_shape, dtype=dtype + ) + compression_method = "gzip" + else: + raise NotImplementedError( + f"Unable to create a `DatasetIOConfiguration` for the dataset at '{location_in_file}'" + f"for neurodata object '{neurodata_object}' of type '{type(neurodata_object)}'!" 
+ ) + # TODO: Add support for compound objects with non-string elements + # chunk_shape = full_shape # validate_all_shapes fails if chunk_shape or buffer_shape is None + # buffer_shape = full_shape + # compression_method = None + # warnings.warn( + # f"Default chunking and compression options for compound objects are not optimized. " + # f"Consider manually specifying DatasetIOConfiguration for dataset at '{location_in_file}'." + # ) return cls( object_id=neurodata_object.object_id, diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_io_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_io_configuration_model.py index 901a82d63..616c6e9d4 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_io_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_io_configuration_model.py @@ -2,6 +2,7 @@ import numpy as np import pytest +from pynwb.testing.mock.file import mock_NWBFile from neuroconv.tools.nwb_helpers import DatasetIOConfiguration @@ -53,3 +54,44 @@ def test_model_json_schema_generator_assertion(): DatasetIOConfiguration.model_json_schema(schema_generator="anything") assert "The 'schema_generator' of this method cannot be changed." 
== str(error_info.value) + + +# TODO: Add support for compound objects with non-string elements +# def test_from_neurodata_object_dtype_object(): +# class TestDatasetIOConfiguration(DatasetIOConfiguration): +# def get_data_io_kwargs(self): +# super().get_data_io_kwargs() + +# nwbfile = mock_NWBFile() +# nwbfile.add_trial(start_time=0.0, stop_time=1.0) +# nwbfile.add_trial(start_time=1.0, stop_time=2.0) +# nwbfile.add_trial(start_time=2.0, stop_time=3.0) +# data = np.array(["test", 5, False], dtype=object) +# nwbfile.add_trial_column(name="test", description="test column with object dtype", data=data) +# neurodata_object = nwbfile.trials.columns[2] + +# dataset_io_configuration = TestDatasetIOConfiguration.from_neurodata_object(neurodata_object, dataset_name="data") + +# assert dataset_io_configuration.chunk_shape == (3,) +# assert dataset_io_configuration.buffer_shape == (3,) +# assert dataset_io_configuration.compression_method is None + + +def test_from_neurodata_object_dtype_object_all_strings(): + class TestDatasetIOConfiguration(DatasetIOConfiguration): + def get_data_io_kwargs(self): + super().get_data_io_kwargs() + + nwbfile = mock_NWBFile() + nwbfile.add_trial(start_time=0.0, stop_time=1.0) + nwbfile.add_trial(start_time=1.0, stop_time=2.0) + nwbfile.add_trial(start_time=2.0, stop_time=3.0) + data = np.array(["test", "string", "abc"], dtype=object) + nwbfile.add_trial_column(name="test", description="test column with object dtype but all strings", data=data) + neurodata_object = nwbfile.trials.columns[2] + + dataset_io_configuration = TestDatasetIOConfiguration.from_neurodata_object(neurodata_object, dataset_name="data") + + assert dataset_io_configuration.chunk_shape == (3,) + assert dataset_io_configuration.buffer_shape == (3,) + assert dataset_io_configuration.compression_method == "gzip"