From bb3de8b98785a27d60eb2eb16d2023cf07b2fa83 Mon Sep 17 00:00:00 2001 From: rly Date: Tue, 20 Aug 2024 01:27:01 -0700 Subject: [PATCH] Add tests and change zarr array handling --- CHANGELOG.md | 2 +- src/hdmf/backends/hdf5/h5tools.py | 22 +++- src/hdmf/build/objectmapper.py | 20 +++- tests/unit/build_tests/test_convert_dtype.py | 50 +++++++- tests/unit/test_io_hdf5_h5tools.py | 114 ++++++++++++++++++- 5 files changed, 194 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a2f7d80d..8f8171520 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ - Improved "already exists" error message when adding a container to a `MultiContainerInterface`. @rly [#1165](https://github.com/hdmf-dev/hdmf/pull/1165) ### Bug fixes -- Fixed bug when converting string datasets from Zarr to HDF5. @oruebel [#1171](https://github.com/hdmf-dev/hdmf/pull/1171) +- Fixed bug when converting string datasets from Zarr to HDF5. @oruebel @rly [#1171](https://github.com/hdmf-dev/hdmf/pull/1171) ## HDMF 3.14.3 (July 29, 2024) diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index 9b06e80f1..585ee4536 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -22,6 +22,13 @@ from ...utils import docval, getargs, popargs, get_data_shape, get_docval, StrDataset from ..utils import NamespaceToBuilderHelper, WriteStatusTracker +# try: +# from zarr import Array as ZarrArray +# import numcodecs +# ZARR_INSTALLED = True +# except ImportError: +# ZARR_INSTALLED = False + ROOT_NAME = 'root' SPEC_LOC_ATTR = '.specloc' H5_TEXT = special_dtype(vlen=str) @@ -34,6 +41,10 @@ H5PY_3 = h5py.__version__.startswith('3') +# def _is_zarr_array(value): +# return ZARR_INSTALLED and isinstance(value, ZarrArray) + + class HDF5IO(HDMFIO): __ns_spec_path = 'namespace' # path to the namespace dataset within a namespace group @@ -924,10 +935,13 @@ def __resolve_dtype__(cls, dtype, data): # binary # number - # Use text dtype for Zarr datasets of strings. Zarr stores variable length strings - # as objects, so we need to detect this special case here - if hasattr(data, 'attrs') and 'zarr_dtype' in data.attrs and data.attrs['zarr_dtype'] == 'str': - return cls.__dtypes['text'] + # # Use text dtype for Zarr datasets of strings. Zarr stores variable length strings + # # as objects, so we need to detect this special case here + # if _is_zarr_array(data) and data.filters: + # if numcodecs.VLenUTF8() in data.filters: + # return cls.__dtypes['text'] + # elif numcodecs.VLenBytes() in data.filters: + # return cls.__dtypes['ascii'] dtype = cls.__resolve_dtype_helper__(dtype) if dtype is None: diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py index 58dcf19f6..02bfdae49 100644 --- a/src/hdmf/build/objectmapper.py +++ b/src/hdmf/build/objectmapper.py @@ -20,6 +20,16 @@ from ..spec.spec import BaseStorageSpec from ..utils import docval, getargs, ExtenderMeta, get_docval, get_data_shape +try: + from zarr import Array as ZarrArray + ZARR_INSTALLED = True +except ImportError: + ZARR_INSTALLED = False + + +def _is_zarr_array(value): + return ZARR_INSTALLED and isinstance(value, ZarrArray) + _const_arg = '__constructor_arg' @@ -206,17 +216,19 @@ def convert_dtype(cls, spec, value, spec_dtype=None): # noqa: C901 spec_dtype_type = cls.__dtypes[spec_dtype] warning_msg = None # Numpy Array or Zarr array - if (isinstance(value, np.ndarray) or - (hasattr(value, 'astype') and hasattr(value, 'dtype'))): + # NOTE: Numpy < 2.0 has only fixed-length strings. + # Numpy 2.0 introduces variable-length strings (dtype=np.dtypes.StringDType()). + # HDMF does not yet do any special handling of numpy arrays with variable-length strings. + if isinstance(value, np.ndarray) or _is_zarr_array(value): if spec_dtype_type is _unicode: - if hasattr(value, 'attrs') and 'zarr_dtype' in value.attrs: + if _is_zarr_array(value): # Zarr stores strings as objects, so we cannot convert to unicode dtype ret = value else: ret = value.astype('U') ret_dtype = "utf8" elif spec_dtype_type is _ascii: - if hasattr(value, 'attrs') and 'zarr_dtype' in value.attrs: + if _is_zarr_array(value): # Zarr stores strings as objects, so we cannot convert to unicode dtype ret = value else: diff --git a/tests/unit/build_tests/test_convert_dtype.py b/tests/unit/build_tests/test_convert_dtype.py index 8f9e49239..331c42589 100644 --- a/tests/unit/build_tests/test_convert_dtype.py +++ b/tests/unit/build_tests/test_convert_dtype.py @@ -1,12 +1,20 @@ from datetime import datetime, date - import numpy as np +import unittest + from hdmf.backends.hdf5 import H5DataIO from hdmf.build import ObjectMapper from hdmf.data_utils import DataChunkIterator from hdmf.spec import DatasetSpec, RefSpec, DtypeSpec from hdmf.testing import TestCase +try: + import zarr + import numcodecs + SKIP_ZARR_TESTS = False +except ImportError: + SKIP_ZARR_TESTS = True + class TestConvertDtype(TestCase): @@ -551,3 +559,43 @@ def test_isodate_spec(self): self.assertEqual(ret, b'2020-11-10') self.assertIs(type(ret), bytes) self.assertEqual(ret_dtype, 'ascii') + + @unittest.skipIf(SKIP_ZARR_TESTS, "Zarr is not installed") + def test_zarr_array_spec_vlen_utf8(self): + """Test that converting a zarr array with utf8 dtype for a variable length utf8 dtype spec + returns the same object with a utf8 ret_dtype.""" + spec = DatasetSpec('an example dataset', 'text', name='data') + + value = zarr.array(['a', 'b']) # fixed length unicode (dtype =