default object_codec_class -> JSON #173

Closed · wants to merge 5 commits
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# HDMF-ZARR Changelog

## 0.7.0 (Upcoming)

### Enhancements

* Changed the default `object_codec_class` for `ZarrIO` to `numcodecs.JSON`. The previous default (`numcodecs.Pickle`) was not readable outside of Python. Also exposed `object_codec_class` as a parameter of the `NWBZarrIO` constructor. `ZarrIO` still resorts to Pickle for complex cases, such as structured arrays or compound datasets with references.

## 0.6.0 (February 21, 2024)

### Enhancements
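As a quick illustration of the changelog entry above (a minimal sketch; the file name is a placeholder and an hdmf-zarr install with this change is assumed):

    from hdmf_zarr import ZarrIO
    from numcodecs import Pickle

    # Default: object datasets (e.g., references) are now encoded with numcodecs.JSON
    io = ZarrIO('example.zarr', mode='w')
    print(io.object_codec_class.__qualname__)  # 'JSON'
    io.close()

    # Opt back into the old behavior by passing the codec class explicitly
    io = ZarrIO('example.zarr', mode='w', object_codec_class=Pickle)
    print(io.object_codec_class.__qualname__)  # 'Pickle'
    io.close()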
2 changes: 1 addition & 1 deletion docs/source/storage.rst
@@ -249,7 +249,7 @@ Zarr file. The individual object references are defined in the
:py:class:`~hdmf_zarr.backend.ZarrIO` as :py:class:`~hdmf_zarr.utils.ZarrReference` objects created via
the :py:meth:`~hdmf_zarr.backend.ZarrIO.__get_ref` helper function.

By default, :py:class:`~hdmf_zarr.backend.ZarrIO` uses the ``numcodecs.pickles.Pickle`` codec to
By default, :py:class:`~hdmf_zarr.backend.ZarrIO` uses the ``numcodecs.JSON`` codec to
encode object references defined as :py:class:`~hdmf_zarr.utils.ZarrReference` dicts in datasets.
Users may set the codec used to encode objects in Zarr datasets via the ``object_codec_class``
parameter of the :py:func:`~hdmf_zarr.backend.ZarrIO.__init__` constructor of
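The readability difference between the two codecs can be seen by encoding a reference-style dict directly with numcodecs (a sketch; the keys 'source' and 'path' are illustrative of a ZarrReference dict, not its exact schema):

    import numpy as np
    from numcodecs import JSON, Pickle

    refs = np.empty(1, dtype=object)
    refs[0] = {'source': '.', 'path': '/acquisition/timeseries'}

    json_bytes = JSON().encode(refs)      # JSON text, readable from any language
    pickle_bytes = Pickle().encode(refs)  # Python pickle, opaque outside Python

    print(json_bytes.decode('utf-8'))  # human-readable JSON list with dtype/shape metadata
    print(pickle_bytes[:8])            # binary pickle header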
30 changes: 26 additions & 4 deletions src/hdmf_zarr/backend.py
@@ -94,7 +94,7 @@ def can_read(path):
             'default': None},
            {'name': 'object_codec_class', 'type': None,
             'doc': 'Set the numcodec object codec class to be used to encode objects. '
                    'Use numcodecs.pickles.Pickle by default.',
                    'Use numcodecs.JSON by default.',
             'default': None},
            {'name': 'storage_options', 'type': dict,
             'doc': 'Zarr storage options to read remote folders',
@@ -120,8 +120,8 @@ def __init__(self, **kwargs):
        self.__built = dict()
        self._written_builders = WriteStatusTracker()  # track which builders were written (or read) by this IO object
        self.__dci_queue = None  # Will be initialized on call to io.write
        # Codec class to be used. Alternatives include, e.g., numcodecs.JSON
        self.__codec_cls = numcodecs.pickles.Pickle if object_codec_class is None else object_codec_class
        # Codec class to be used. Alternatives include, e.g., numcodecs.pickles.Pickle
        self.__codec_cls = numcodecs.JSON if object_codec_class is None else object_codec_class
        source_path = self.__path
        if isinstance(self.__path, SUPPORTED_ZARR_STORES):
            source_path = self.__path.path
@@ -1050,13 +1050,18 @@ def write_dataset(self, **kwargs):  # noqa: C901
                new_dtype.append((field['name'], self.__resolve_dtype_helper__(field['dtype'])))
            dtype = np.dtype(new_dtype)

            object_codec = self.__codec_cls()
            if not isinstance(object_codec, numcodecs.Pickle):
                warnings.warn(f'Resorting to Pickle codec for dataset {name} of {parent.name}')
                object_codec = numcodecs.Pickle()

            # cast and store compound dataset
            arr = np.array(new_items, dtype=dtype)
            dset = parent.require_dataset(
                name,
                shape=(len(arr),),
                dtype=dtype,
                object_codec=self.__codec_cls(),
                object_codec=object_codec,
                **options['io_settings']
            )
            dset.attrs['zarr_dtype'] = type_str
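The Pickle fallback above exists because numcodecs.JSON cannot serialize structured (np.void) values, which is what compound datasets with references produce. A sketch of the failure mode (illustrative dtype; not code from the PR):

    import numpy as np
    from numcodecs import JSON, Pickle

    compound = np.array([(1, 2.0)], dtype=[('id', 'i4'), ('val', 'f8')])
    data = np.empty(1, dtype=object)
    data[0] = compound[0]  # an np.void scalar

    try:
        JSON().encode(data)  # json.dumps cannot handle np.void
    except TypeError as err:
        print('JSON codec failed:', err)

    # Pickle round-trips arbitrary Python/NumPy objects, hence the fallback
    roundtrip = Pickle().decode(Pickle().encode(data))
    print(roundtrip[0])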
@@ -1268,6 +1273,23 @@ def __list_fill__(self, parent, name, data, options=None):  # noqa: C901
        else:
            data_shape = get_data_shape(data)

        # Check whether the data contains a structured array anywhere.
        # If it does, resort to pickling the data and emit a warning.
        has_structured_array = False
        if dtype == object:
            for c in np.ndindex(data_shape):
                o = data
                for i in c:
                    o = o[i]
                if isinstance(o, np.void) and o.dtype.names is not None:
                    has_structured_array = True
Review comment on lines +1281 to +1286 (Contributor):
This seems pretty slow, iterating through every value in the array to check its type. For structured arrays, wouldn't you just have to check the dtype of the whole array? In any case, shouldn't we break the first time we get a True?

@sneakers-the-rat (Contributor), Mar 15, 2024:
Also, couldn't this switch happen here?

    for substype in dtype.fields.items():
        if np.issubdtype(substype[1][0], np.flexible) or np.issubdtype(substype[1][0], np.object_):
            dtype = object
            io_settings['object_codec'] = self.__codec_cls()
            break

        if has_structured_array:
            object_codec = io_settings.get('object_codec')
            if not isinstance(object_codec, numcodecs.Pickle):
                warnings.warn(f'Resorting to Pickle codec for {name} of {parent.name}.')
                io_settings['object_codec'] = numcodecs.Pickle()

        # Create the dataset
        dset = parent.require_dataset(name, shape=data_shape, dtype=dtype, **io_settings)
        dset.attrs['zarr_dtype'] = type_str
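For reference, the early-exit scan the first reviewer asks about might look like the following — the same logic as the loop in __list_fill__ above, but returning on the first structured value found (a hypothetical refactor, not part of this PR):

    import numpy as np

    def contains_structured_array(data, data_shape):
        """Return True as soon as any element reachable through data_shape
        indexing is a structured scalar (np.void with named fields)."""
        for idx in np.ndindex(data_shape):
            item = data
            for i in idx:
                item = item[i]
            if isinstance(item, np.void) and item.dtype.names is not None:
                return True  # the 'break' the reviewer suggests
        return False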
8 changes: 5 additions & 3 deletions src/hdmf_zarr/nwb.py
@@ -27,9 +27,10 @@ class NWBZarrIO(ZarrIO):
             'doc': 'a path to a namespace, a TypeMap, or a list consisting of paths to namespaces and TypeMaps',
             'default': None})
    def __init__(self, **kwargs):
        path, mode, manager, extensions, load_namespaces, synchronizer, storage_options = \
        path, mode, manager, extensions, load_namespaces, synchronizer, storage_options, object_codec_class = \
            popargs('path', 'mode', 'manager', 'extensions',
                    'load_namespaces', 'synchronizer', 'storage_options', kwargs)
                    'load_namespaces', 'synchronizer', 'storage_options',
                    'object_codec_class', kwargs)
        if load_namespaces:
            if manager is not None:
                warn("loading namespaces from file - ignoring 'manager'")
@@ -53,7 +54,8 @@ def __init__(self, **kwargs):
                         manager=manager,
                         mode=mode,
                         synchronizer=synchronizer,
                         storage_options=storage_options)
                         storage_options=storage_options,
                         object_codec_class=object_codec_class)

    @docval({'name': 'src_io', 'type': HDMFIO, 'doc': 'the HDMFIO object for reading the data to export'},
            {'name': 'nwbfile', 'type': 'NWBFile',
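With the parameter now forwarded, the codec can be chosen at the NWBZarrIO level as well (a sketch; assumes pynwb is installed, `nwbfile` is an NWBFile already built in memory, and the path is a placeholder):

    from numcodecs import Pickle
    from hdmf_zarr.nwb import NWBZarrIO

    # Restore the pre-0.7 behavior for a file that needs pickled objects
    with NWBZarrIO('example.nwb.zarr', mode='w', object_codec_class=Pickle) as io:
        io.write(nwbfile)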
10 changes: 5 additions & 5 deletions tests/unit/base_tests_zarrio.py
Expand Up @@ -17,7 +17,7 @@

# Try to import numcodecs and disable compression tests if it is not available
try:
    from numcodecs import Blosc, Delta, JSON
    from numcodecs import Blosc, Delta, JSON, Pickle
    DISABLE_ZARR_COMPRESSION_TESTS = False
except ImportError:
    DISABLE_ZARR_COMPRESSION_TESTS = True
@@ -491,12 +491,12 @@ def setUp(self):
    # ZarrDataIO general
    #############################################
    def test_set_object_codec(self):
        # Test that the default codec is the Pickle store
        # Test that the default codec is JSON
        tempIO = ZarrIO(self.store, mode='w')
        self.assertEqual(tempIO.object_codec_class.__qualname__, 'Pickle')
        del tempIO  # also calls tempIO.close()
        tempIO = ZarrIO(self.store, mode='w', object_codec_class=JSON)
        self.assertEqual(tempIO.object_codec_class.__qualname__, 'JSON')
        del tempIO  # also calls tempIO.close()
        tempIO = ZarrIO(self.store, mode='w', object_codec_class=Pickle)
        self.assertEqual(tempIO.object_codec_class.__qualname__, 'Pickle')
        tempIO.close()

    def test_synchronizer_constructor_arg_bool(self):