default object_codec_class -> JSON #173

Closed · wants to merge 5 commits
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# HDMF-ZARR Changelog

## 0.7.0 (Upcoming)

### Enhancements

* Changed the default `object_codec_class` for `ZarrIO` to `numcodecs.JSON`. The previous default (`numcodecs.Pickle`) was not readable outside of Python. Also exposed `object_codec_class` as a parameter of the `NWBZarrIO` constructor. `ZarrIO` still resorts to Pickle for complex cases, such as structured arrays or compound datasets with references.

## 0.6.0 (February 21, 2024)

### Enhancements
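As a quick illustration of the changelog entry above (a minimal sketch; the file name is a placeholder and an hdmf-zarr install with this change is assumed):

    from hdmf_zarr import ZarrIO
    from numcodecs import Pickle

    # Default: object datasets (e.g., references) are now encoded with numcodecs.JSON
    io = ZarrIO('example.zarr', mode='w')
    print(io.object_codec_class.__qualname__)  # 'JSON'
    io.close()

    # Opt back into the old behavior by passing the codec class explicitly
    io = ZarrIO('example.zarr', mode='w', object_codec_class=Pickle)
    print(io.object_codec_class.__qualname__)  # 'Pickle'
    io.close()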
2 changes: 1 addition & 1 deletion docs/source/storage.rst
@@ -249,7 +249,7 @@ Zarr file. The individual object references are defined in the
:py:class:`~hdmf_zarr.backend.ZarrIO` as :py:class:`~hdmf_zarr.utils.ZarrReference` objects created via
the :py:meth:`~hdmf_zarr.backend.ZarrIO.__get_ref` helper function.

By default, :py:class:`~hdmf_zarr.backend.ZarrIO` uses the ``numcodecs.pickles.Pickle`` codec to
By default, :py:class:`~hdmf_zarr.backend.ZarrIO` uses the ``numcodecs.JSON`` codec to
encode object references defined as :py:class:`~hdmf_zarr.utils.ZarrReference` dicts in datasets.
Users may set the codec used to encode objects in Zarr datasets via the ``object_codec_class``
parameter of the :py:func:`~hdmf_zarr.backend.ZarrIO.__init__` constructor of
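The readability difference between the two codecs can be seen by encoding a reference-style dict directly with numcodecs (a sketch; the keys 'source' and 'path' are illustrative of a ZarrReference dict, not its exact schema):

    import numpy as np
    from numcodecs import JSON, Pickle

    refs = np.empty(1, dtype=object)
    refs[0] = {'source': '.', 'path': '/acquisition/timeseries'}

    json_bytes = JSON().encode(refs)      # JSON text, readable from any language
    pickle_bytes = Pickle().encode(refs)  # Python pickle, opaque outside Python

    print(json_bytes.decode('utf-8'))  # human-readable JSON list with dtype/shape metadata
    print(pickle_bytes[:8])            # binary pickle header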
30 changes: 26 additions & 4 deletions src/hdmf_zarr/backend.py
@@ -94,7 +94,7 @@ def can_read(path):
             'default': None},
            {'name': 'object_codec_class', 'type': None,
             'doc': 'Set the numcodec object codec class to be used to encode objects. '
                    'Use numcodecs.pickles.Pickle by default.',
                    'Use numcodecs.JSON by default.',
             'default': None},
            {'name': 'storage_options', 'type': dict,
             'doc': 'Zarr storage options to read remote folders',
@@ -120,8 +120,8 @@ def __init__(self, **kwargs):
        self.__built = dict()
        self._written_builders = WriteStatusTracker()  # track which builders were written (or read) by this IO object
        self.__dci_queue = None  # Will be initialized on call to io.write
        # Codec class to be used. Alternatives include, e.g., numcodecs.JSON
        self.__codec_cls = numcodecs.pickles.Pickle if object_codec_class is None else object_codec_class
        # Codec class to be used. Alternatives include, e.g., numcodecs.pickles.Pickle
        self.__codec_cls = numcodecs.JSON if object_codec_class is None else object_codec_class
        source_path = self.__path
        if isinstance(self.__path, SUPPORTED_ZARR_STORES):
            source_path = self.__path.path
@@ -1050,13 +1050,18 @@ def write_dataset(self, **kwargs):  # noqa: C901
                new_dtype.append((field['name'], self.__resolve_dtype_helper__(field['dtype'])))
            dtype = np.dtype(new_dtype)

            object_codec = self.__codec_cls()
            if not isinstance(object_codec, numcodecs.Pickle):
                warnings.warn(f'Resorting to Pickle codec for dataset {name} of {parent.name}')
                object_codec = numcodecs.Pickle()

            # cast and store compound dataset
            arr = np.array(new_items, dtype=dtype)
            dset = parent.require_dataset(
                name,
                shape=(len(arr),),
                dtype=dtype,
                object_codec=self.__codec_cls(),
                object_codec=object_codec,
                **options['io_settings']
            )
            dset.attrs['zarr_dtype'] = type_str
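The Pickle fallback above exists because numcodecs.JSON cannot serialize structured (np.void) values, which is what compound datasets with references produce. A sketch of the failure mode (illustrative dtype; not code from the PR):

    import numpy as np
    from numcodecs import JSON, Pickle

    compound = np.array([(1, 2.0)], dtype=[('id', 'i4'), ('val', 'f8')])
    data = np.empty(1, dtype=object)
    data[0] = compound[0]  # an np.void scalar

    try:
        JSON().encode(data)  # json.dumps cannot handle np.void
    except TypeError as err:
        print('JSON codec failed:', err)

    # Pickle round-trips arbitrary Python/NumPy objects, hence the fallback
    roundtrip = Pickle().decode(Pickle().encode(data))
    print(roundtrip[0])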
@@ -1268,6 +1273,23 @@ def __list_fill__(self, parent, name, data, options=None):  # noqa: C901
        else:
            data_shape = get_data_shape(data)

        # Check whether the data contains a structured array anywhere.
        # If it does, resort to pickling the data and emit a warning.
        has_structured_array = False
        if dtype == object:
            for c in np.ndindex(data_shape):
                o = data
                for i in c:
                    o = o[i]
                if isinstance(o, np.void) and o.dtype.names is not None:
                    has_structured_array = True
Review comment on lines +1281 to +1286 (Contributor):
This seems pretty slow, iterating through every value in the array to check its type. For structured arrays, wouldn't you just have to check the dtype of the whole array? In any case, shouldn't we break the first time we get a True?

@sneakers-the-rat (Contributor), Mar 15, 2024:
Also, couldn't this switch happen here?

    for substype in dtype.fields.items():
        if np.issubdtype(substype[1][0], np.flexible) or np.issubdtype(substype[1][0], np.object_):
            dtype = object
            io_settings['object_codec'] = self.__codec_cls()
            break

        if has_structured_array:
            object_codec = io_settings.get('object_codec')
            if not isinstance(object_codec, numcodecs.Pickle):
                warnings.warn(f'Resorting to Pickle codec for {name} of {parent.name}.')
                io_settings['object_codec'] = numcodecs.Pickle()

        # Create the dataset
        dset = parent.require_dataset(name, shape=data_shape, dtype=dtype, **io_settings)
        dset.attrs['zarr_dtype'] = type_str
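For reference, the early-exit scan the first reviewer asks about might look like the following — the same logic as the loop in __list_fill__ above, but returning on the first structured value found (a hypothetical refactor, not part of this PR):

    import numpy as np

    def contains_structured_array(data, data_shape):
        """Return True as soon as any element reachable through data_shape
        indexing is a structured scalar (np.void with named fields)."""
        for idx in np.ndindex(data_shape):
            item = data
            for i in idx:
                item = item[i]
            if isinstance(item, np.void) and item.dtype.names is not None:
                return True  # the 'break' the reviewer suggests
        return False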
8 changes: 5 additions & 3 deletions src/hdmf_zarr/nwb.py
@@ -27,9 +27,10 @@ class NWBZarrIO(ZarrIO):
             'doc': 'a path to a namespace, a TypeMap, or a list consisting of paths to namespaces and TypeMaps',
             'default': None})
    def __init__(self, **kwargs):
        path, mode, manager, extensions, load_namespaces, synchronizer, storage_options = \
        path, mode, manager, extensions, load_namespaces, synchronizer, storage_options, object_codec_class = \
            popargs('path', 'mode', 'manager', 'extensions',
                    'load_namespaces', 'synchronizer', 'storage_options', kwargs)
                    'load_namespaces', 'synchronizer', 'storage_options',
                    'object_codec_class', kwargs)
        if load_namespaces:
            if manager is not None:
                warn("loading namespaces from file - ignoring 'manager'")
@@ -53,7 +54,8 @@ def __init__(self, **kwargs):
                         manager=manager,
                         mode=mode,
                         synchronizer=synchronizer,
                         storage_options=storage_options)
                         storage_options=storage_options,
                         object_codec_class=object_codec_class)

    @docval({'name': 'src_io', 'type': HDMFIO, 'doc': 'the HDMFIO object for reading the data to export'},
            {'name': 'nwbfile', 'type': 'NWBFile',
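With the parameter now forwarded, the codec can be chosen at the NWBZarrIO level as well (a sketch; assumes pynwb is installed, `nwbfile` is an NWBFile already built in memory, and the path is a placeholder):

    from numcodecs import Pickle
    from hdmf_zarr.nwb import NWBZarrIO

    # Restore the pre-0.7 behavior for a file that needs pickled objects
    with NWBZarrIO('example.nwb.zarr', mode='w', object_codec_class=Pickle) as io:
        io.write(nwbfile)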
10 changes: 5 additions & 5 deletions tests/unit/base_tests_zarrio.py
Expand Up @@ -17,7 +17,7 @@

# Try to import numcodecs and disable compression tests if it is not available
try:
    from numcodecs import Blosc, Delta, JSON
    from numcodecs import Blosc, Delta, JSON, Pickle
    DISABLE_ZARR_COMPRESSION_TESTS = False
except ImportError:
    DISABLE_ZARR_COMPRESSION_TESTS = True
@@ -491,12 +491,12 @@ def setUp(self):
    # ZarrDataIO general
    #############################################
    def test_set_object_codec(self):
        # Test that the default codec is the Pickle store
        # Test that the default codec is JSON
        tempIO = ZarrIO(self.store, mode='w')
        self.assertEqual(tempIO.object_codec_class.__qualname__, 'Pickle')
        del tempIO  # also calls tempIO.close()
        tempIO = ZarrIO(self.store, mode='w', object_codec_class=JSON)
        self.assertEqual(tempIO.object_codec_class.__qualname__, 'JSON')
        del tempIO  # also calls tempIO.close()
        tempIO = ZarrIO(self.store, mode='w', object_codec_class=Pickle)
        self.assertEqual(tempIO.object_codec_class.__qualname__, 'Pickle')
        tempIO.close()

    def test_synchronizer_constructor_arg_bool(self):