diff --git a/lindi/LindiH5pyFile/LindiH5pyDataset.py b/lindi/LindiH5pyFile/LindiH5pyDataset.py index de552b0..27afe93 100644 --- a/lindi/LindiH5pyFile/LindiH5pyDataset.py +++ b/lindi/LindiH5pyFile/LindiH5pyDataset.py @@ -78,11 +78,13 @@ def _resolve_references(x: Any): else: for k, v in x.items(): x[k] = _resolve_references(v) + elif isinstance(x, LindiZarrWrapperReference): + return LindiH5pyReference(x) elif isinstance(x, list): for i, v in enumerate(x): x[i] = _resolve_references(v) elif isinstance(x, np.ndarray): - if x.dtype == object: + if x.dtype == object or x.dtype is None: view_1d = x.reshape(-1) for i in range(len(view_1d)): view_1d[i] = _resolve_references(view_1d[i]) diff --git a/lindi/LindiH5pyFile/LindiH5pyFile.py b/lindi/LindiH5pyFile/LindiH5pyFile.py index d1e8bf7..2ddf0d0 100644 --- a/lindi/LindiH5pyFile/LindiH5pyFile.py +++ b/lindi/LindiH5pyFile/LindiH5pyFile.py @@ -4,7 +4,7 @@ from .LindiH5pyGroup import LindiH5pyGroup from .LindiH5pyDataset import LindiH5pyDataset -from ..LindiZarrWrapper import LindiZarrWrapper, LindiZarrWrapperGroup, LindiZarrWrapperDataset +from ..LindiZarrWrapper import LindiZarrWrapper, LindiZarrWrapperGroup, LindiZarrWrapperDataset, LindiZarrWrapperReference from .LindiH5pyAttributes import LindiH5pyAttributes from .LindiH5pyReference import LindiH5pyReference @@ -99,6 +99,10 @@ def __repr__(self): # Group methods def __getitem__(self, name): + if isinstance(name, LindiZarrWrapperReference): + # annoyingly we have to do this because references + # in arrays of compound types will come in as LindiZarrWrapperReference + name = LindiH5pyReference(name) if isinstance(name, LindiH5pyReference): assert isinstance(self._file_object, LindiZarrWrapper) x = self._file_object[name._reference] @@ -117,6 +121,13 @@ def __getitem__(self, name): return LindiH5pyDataset(x, self) else: raise Exception(f"Unexpected type for resolved reference at path {name}: {type(x)}") + # if it contains slashes, it's a path + if isinstance(name, str) and "/" in name: + parts = name.split("/") + x = self._the_group + for part in parts: + x = x[part] + return x return self._the_group[name] def get(self, name, default=None, getclass=False, getlink=False): diff --git a/lindi/LindiH5pyFile/LindiH5pyLink.py b/lindi/LindiH5pyFile/LindiH5pyLink.py index 0fa2f7a..f50576f 100644 --- a/lindi/LindiH5pyFile/LindiH5pyLink.py +++ b/lindi/LindiH5pyFile/LindiH5pyLink.py @@ -1,9 +1,12 @@ -class LindiH5pyHardLink: +import h5py + + +class LindiH5pyHardLink(h5py.HardLink): def __init__(self): pass -class LindiH5pySoftLink: +class LindiH5pySoftLink(h5py.SoftLink): def __init__(self, path: str): self._path = path diff --git a/lindi/LindiZarrWrapper/LindiZarrWrapperReference.py b/lindi/LindiZarrWrapper/LindiZarrWrapperReference.py index b84314f..3d56069 100644 --- a/lindi/LindiZarrWrapper/LindiZarrWrapperReference.py +++ b/lindi/LindiZarrWrapper/LindiZarrWrapperReference.py @@ -1,4 +1,9 @@ -class LindiZarrWrapperReference: +import h5py + + +# We need h5py.Reference as a base class so that type checking will be okay for +# arrays of compound types that contain references +class LindiZarrWrapperReference(h5py.Reference): def __init__(self, obj: dict): self._object_id = obj["object_id"] self._path = obj["path"] diff --git a/tests/test_core.py b/tests/test_core.py index 5882b32..ac2073e 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,267 +1,240 @@ import numpy as np import h5py import tempfile -from lindi import LindiH5ZarrStore, LindiZarrWrapper, LindiZarrWrapperDataset, LindiZarrWrapperGroup, LindiZarrWrapperReference -import pytest +import lindi +from lindi import ( + LindiH5ZarrStore, + LindiZarrWrapper, + LindiZarrWrapperDataset +) -def test_scalar_datasets(): - for val in ["abc", b"abc", 1, 3.6]: - print(f"Testing scalar {val} of type {type(val)}") - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - ds = f.create_dataset("X", data=val) - ds.attrs["foo"] = "bar" - with LindiH5ZarrStore.from_file( - filename, url=filename - ) as store: # set url so that a reference file system can be created - rfs = store.to_reference_file_system() - client = LindiZarrWrapper.from_reference_file_system(rfs) - h5f = h5py.File(filename, "r") - X1 = h5f["X"] - assert isinstance(X1, h5py.Dataset) - X2 = client["X"] - assert isinstance(X2, LindiZarrWrapperDataset) - if not _check_equal(X1[()], X2[()]): - print(f"WARNING: {X1} ({type(X1)}) != {X2} ({type(X2)})") - raise ValueError("Scalar datasets are not equal") - assert '.zgroup' in store - assert '.zarray' not in rfs['refs'] - assert '.zarray' not in store - assert '.zattrs' in store # it's in the store but not in the ref file system -- see notes in LindiH5ZarrStore source code - assert '.zattrs' not in rfs['refs'] - assert 'X/.zgroup' not in store - assert 'X/.zattrs' in store # foo is set to bar - assert store['X/.zattrs'] == rfs['refs']['X/.zattrs'].encode() - assert 'X/.zarray' in rfs['refs'] - assert store['X/.zarray'] == rfs['refs']['X/.zarray'].encode() - - -def test_numpy_arrays(): - array_1 = ("1", np.arange(60).reshape(3, 20), (3, 7)) - array_2 = ("2", np.arange(60).reshape(3, 20), None) - array_boolean = ("3", np.array([[True, False, True], [False, True, False]]), None) - for label, array, chunks in [array_1, array_2, array_boolean]: - print(f"Testing numpy array {label}") - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - f.create_dataset("X", data=array, chunks=chunks) - with LindiH5ZarrStore.from_file( - filename, url=filename - ) as store: # set url so that a reference file system can be created - rfs = store.to_reference_file_system() - client = LindiZarrWrapper.from_reference_file_system(rfs) - h5f = h5py.File(filename, "r") - X1 = h5f["X"] - assert isinstance(X1, h5py.Dataset) - X2 = client["X"] - assert isinstance(X2, LindiZarrWrapperDataset) - - assert X1.shape == X2.shape - assert X1.dtype == X2.dtype - assert X1.size == X2.size - assert X1.nbytes == X2.nbytes - assert len(X1) == len(X2) - - # iterate over the first axis - count = 0 - for aa in X2: - assert _check_equal(aa[:], X1[count][:]) - count += 1 - - if not _check_equal(X1[:], X2[:]): - print("WARNING. Arrays are not equal") - print(X1[:]) - print(X2[:]) - raise ValueError("Arrays are not equal") - - -def test_numpy_array_of_strings(): - print("Testing numpy array of strings") +def test_variety(): with tempfile.TemporaryDirectory() as tmpdir: filename = f"{tmpdir}/test.h5" with h5py.File(filename, "w") as f: - f.create_dataset("X", data=["abc", "def", "ghi"]) + f.create_dataset("dataset1", data=[1, 2, 3]) + f.create_group("group1") + f.attrs["int1"] = 1 + f.attrs["float1"] = 3.14 + f.attrs["str1"] = "abc" + f.attrs["bytes1"] = b"def" + f.attrs["list1"] = [1, 2, 3] + f.attrs["tuple1"] = (3, 4, 5) + f.attrs["array1"] = np.arange(10) + f.attrs["dataset1_ref"] = f["dataset1"].ref + f.attrs["group1_ref"] = f["group1"].ref + f["dataset1"].attrs["test_attr1"] = "attribute-of-dataset1" + f["group1"].attrs["test_attr2"] = "attribute-of-group1" h5f = h5py.File(filename, "r") with LindiH5ZarrStore.from_file(filename, url=filename) as store: rfs = store.to_reference_file_system() - client = LindiZarrWrapper.from_reference_file_system(rfs) - X1 = h5f["X"] - assert isinstance(X1, h5py.Dataset) - X2 = client["X"] - assert isinstance(X2, LindiZarrWrapperDataset) - if not _check_equal(X1[:], X2[:]): - print("WARNING. Arrays are not equal") - print(X1[:]) - print(X2[:]) - raise ValueError("Arrays are not equal") - - -def test_compound_dtype(): - print("Testing compound dtype") + h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs) + assert h5f_2.attrs["int1"] == h5f.attrs["int1"] + assert h5f_2.attrs["float1"] == h5f.attrs["float1"] + assert h5f_2.attrs["str1"] == h5f.attrs["str1"] + assert h5f_2.attrs["bytes1"] == h5f.attrs["bytes1"] + assert _lists_are_equal(h5f_2.attrs["list1"], h5f.attrs["list1"]) + assert _lists_are_equal(h5f_2.attrs["tuple1"], h5f.attrs["tuple1"]) + assert _arrays_are_equal(np.array(h5f_2.attrs["array1"]), h5f.attrs["array1"]) + assert h5f_2["dataset1"].attrs["test_attr1"] == h5f["dataset1"].attrs["test_attr1"] + assert _arrays_are_equal(h5f_2["dataset1"][()], h5f["dataset1"][()]) # type: ignore + assert h5f_2["group1"].attrs["test_attr2"] == h5f["group1"].attrs["test_attr2"] + target_1 = h5f[h5f.attrs["dataset1_ref"]] + target_2 = h5f_2[h5f_2.attrs["dataset1_ref"]] + assert target_1.attrs["test_attr1"] == target_2.attrs["test_attr1"] + target_1 = h5f[h5f.attrs["group1_ref"]] + target_2 = h5f_2[h5f_2.attrs["group1_ref"]] + assert target_1.attrs["test_attr2"] == target_2.attrs["test_attr2"] + + +def test_soft_links(): + with tempfile.TemporaryDirectory() as tmpdir: + filename = f"{tmpdir}/test.h5" + with h5py.File(filename, "w") as f: + g = f.create_group('group_target') + g.attrs['foo'] = 'bar' + g.create_dataset('dataset1', data=[5, 6, 7]) + f['soft_link'] = h5py.SoftLink('/group_target') + h5f = h5py.File(filename, "r") + with LindiH5ZarrStore.from_file(filename, url=filename) as store: + rfs = store.to_reference_file_system() + h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs) + g1 = h5f['group_target'] + g2 = h5f_2['group_target'] + assert g1.attrs['foo'] == g2.attrs['foo'] + h1 = h5f['soft_link'] + h2 = h5f_2['soft_link'] + assert h1.attrs['foo'] == h2.attrs['foo'] + # this is tricky: it seems that with h5py, the name of the soft link + # is the source name. So the following assertion will fail. + # assert h1.name == h2.name + k1 = h5f.get('soft_link', getlink=True) + k2 = h5f_2.get('soft_link', getlink=True) + assert isinstance(k1, h5py.SoftLink) + assert isinstance(k2, h5py.SoftLink) + ds1 = h5f['soft_link']['dataset1'] # type: ignore + assert isinstance(ds1, h5py.Dataset) + ds2 = h5f_2['soft_link']['dataset1'] + assert isinstance(ds2, h5py.Dataset) + assert _arrays_are_equal(ds1[()], ds2[()]) + ds1 = h5f['soft_link/dataset1'] + assert isinstance(ds1, h5py.Dataset) + ds2 = h5f_2['soft_link/dataset1'] + assert isinstance(ds2, h5py.Dataset) + assert _arrays_are_equal(ds1[()], ds2[()]) + ds1 = h5f['group_target/dataset1'] + assert isinstance(ds1, h5py.Dataset) + ds2 = h5f_2['group_target/dataset1'] + assert isinstance(ds2, h5py.Dataset) + assert _arrays_are_equal(ds1[()], ds2[()]) + + +def test_arrays_of_compound_dtype(): with tempfile.TemporaryDirectory() as tmpdir: filename = f"{tmpdir}/test.h5" with h5py.File(filename, "w") as f: dt = np.dtype([("x", "i4"), ("y", "f8")]) - f.create_dataset("X", data=[(1, 3.14), (2, 6.28)], dtype=dt) + f.create_dataset("dataset1", data=[(1, 3.14), (2, 6.28)], dtype=dt) + dt = np.dtype([("a", "i4"), ("b", "f8"), ("c", "S10")]) + f.create_dataset("dataset2", data=[(1, 3.14, "abc"), (2, 6.28, "def")], dtype=dt) h5f = h5py.File(filename, "r") - store = LindiH5ZarrStore.from_file(filename, url=filename) - rfs = store.to_reference_file_system() - client = LindiZarrWrapper.from_reference_file_system(rfs) - X1 = h5f["X"] - assert isinstance(X1, h5py.Dataset) - X2 = client["X"] - assert isinstance(X2, LindiZarrWrapperDataset) - assert X1.shape == X2.shape - assert X1.dtype == X2.dtype - assert X1.size == X2.size - # assert X1.nbytes == X2.nbytes # nbytes are not going to match because the internal representation is different - assert len(X1) == len(X2) - if not _check_equal(X1['x'][:], X2['x'][:]): - print("WARNING. Arrays for x are not equal") - print(X1['x'][:]) - print(X2['x'][:]) - raise ValueError("Arrays are not equal") - if not _check_equal(X1['y'][:], X2['y'][:]): - print("WARNING. Arrays for y are not equal") - print(X1['y'][:]) - print(X2['y'][:]) - raise ValueError("Arrays are not equal") - store.close() - - -def test_attributes(): - print("Testing attributes") + with LindiH5ZarrStore.from_file(filename, url=filename) as store: + rfs = store.to_reference_file_system() + h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs) + ds1_1 = h5f['dataset1'] + assert isinstance(ds1_1, h5py.Dataset) + ds1_2 = h5f_2['dataset1'] + assert isinstance(ds1_2, h5py.Dataset) + assert ds1_1.dtype == ds1_2.dtype + assert _arrays_are_equal(ds1_1['x'][()], ds1_2['x'][()]) # type: ignore + assert _arrays_are_equal(ds1_1['y'][()], ds1_2['y'][()]) # type: ignore + ds2_1 = h5f['dataset2'] + assert isinstance(ds2_1, h5py.Dataset) + ds2_2 = h5f_2['dataset2'] + assert isinstance(ds2_2, h5py.Dataset) + assert ds2_1.dtype == ds2_2.dtype + assert _arrays_are_equal(ds2_1['a'][()], ds2_2['a'][()]) # type: ignore + assert _arrays_are_equal(ds2_1['b'][()], ds2_2['b'][()]) # type: ignore + assert _arrays_are_equal(ds2_1['c'][()], ds2_2['c'][()]) # type: ignore + + +def test_arrays_of_compound_dtype_with_references(): with tempfile.TemporaryDirectory() as tmpdir: filename = f"{tmpdir}/test.h5" with h5py.File(filename, "w") as f: - f.create_dataset("X", data=[1, 2, 3]) - f["X"].attrs["foo"] = "bar" - f["X"].attrs["baz"] = 3.14 - f["X"].attrs["qux"] = [1, 2, 3] - f["X"].attrs["corge"] = np.int32(5) - f.create_group("group") - f["group"].attrs["foo"] = "bar2" - f["group"].attrs["baz"] = 3.15 + dt = np.dtype([("x", "i4"), ("y", h5py.special_dtype(ref=h5py.Reference))]) + Y_ds = f.create_dataset("Y", data=[1, 2, 3]) + f.create_dataset("dataset1", data=[(1, Y_ds.ref), (2, Y_ds.ref)], dtype=dt) h5f = h5py.File(filename, "r") with LindiH5ZarrStore.from_file(filename, url=filename) as store: rfs = store.to_reference_file_system() - client = LindiZarrWrapper.from_reference_file_system(rfs) - - X1 = h5f["X"] - assert isinstance(X1, h5py.Dataset) - X2 = client["X"] - assert isinstance(X2, LindiZarrWrapperDataset) - - with pytest.raises(KeyError): - X2.attrs["a"] = 1 # cannot set attributes on read-only object - with pytest.raises(KeyError): - X2.attrs["b"] # non-existent attribute - with pytest.raises(KeyError): - del X2.attrs["foo"] # cannot delete attributes on read-only object - - for k, v in X2.attrs.items(): - if not _check_equal(v, X1.attrs[k]): - print(f"WARNING: {k} attribute mismatch") - print(f" h5: {X1.attrs[k]} ({type(X1.attrs[k])})") - print(f" zarr: {v} ({type(v)})") - raise ValueError("Attribute mismatch") - for k, v in X1.attrs.items(): - if not _check_equal(v, X2.attrs[k]): - print(f"WARNING: {k} attribute mismatch") - print(f" h5: {v} ({type(v)})") - print(f" zarr: {X2.attrs[k]} ({type(X2.attrs[k])})") - raise ValueError("Attribute mismatch") - for k in X2.attrs: - assert k in X1.attrs - assert len(X2.attrs) == len(X1.attrs) - assert str(X2.attrs) # for coverage - assert repr(X2.attrs) # for coverage - - group1 = h5f["group"] - assert isinstance(group1, h5py.Group) - group2 = client["group"] - assert isinstance(group2, LindiZarrWrapperGroup) - - for k, v in group2.attrs.items(): - if not _check_equal(v, group1.attrs[k]): - print(f"WARNING: {k} attribute mismatch") - print(f" h5: {group1.attrs[k]} ({type(group1.attrs[k])})") - print(f" zarr: {v} ({type(v)})") - raise ValueError("Attribute mismatch") - for k, v in group1.attrs.items(): - if not _check_equal(v, group2.attrs[k]): - print(f"WARNING: {k} attribute mismatch") - print(f" h5: {v} ({type(v)})") - print(f" zarr: {group2.attrs[k]} ({type(group2.attrs[k])})") - raise ValueError("Attribute mismatch") + h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs) + ds1_1 = h5f['dataset1'] + assert isinstance(ds1_1, h5py.Dataset) + ds1_2 = h5f_2['dataset1'] + assert isinstance(ds1_2, h5py.Dataset) + assert ds1_1.dtype == ds1_2.dtype + assert _arrays_are_equal(ds1_1['x'][()], ds1_2['x'][()]) # type: ignore + ref1 = ds1_1['y'][0] + ref2 = ds1_2['y'][0] + assert isinstance(ref1, h5py.Reference) + assert isinstance(ref2, h5py.Reference) + target1 = h5f[ref1] + assert isinstance(target1, h5py.Dataset) + target2 = h5f_2[ref2] + assert isinstance(target2, h5py.Dataset) + assert _arrays_are_equal(target1[()], target2[()]) -def test_nan_inf_attr(): - print("Testing NaN, Inf, and -Inf attributes") +def test_scalar_arrays(): with tempfile.TemporaryDirectory() as tmpdir: filename = f"{tmpdir}/test.h5" with h5py.File(filename, "w") as f: - f.create_dataset("X", data=[1, 2, 3]) - f["X"].attrs["nan"] = np.nan - f["X"].attrs["inf"] = np.inf - f["X"].attrs["ninf"] = -np.inf + f.create_dataset("X", data=1) + f.create_dataset("Y", data=3.14) + f.create_dataset("Z", data="abc") + f.create_dataset("W", data=b"def") h5f = h5py.File(filename, "r") with LindiH5ZarrStore.from_file(filename, url=filename) as store: rfs = store.to_reference_file_system() - client = LindiZarrWrapper.from_reference_file_system(rfs) - - X1 = h5f["X"] + h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs) + X1 = h5f['X'] assert isinstance(X1, h5py.Dataset) - X2 = client["X"] - assert isinstance(X2, LindiZarrWrapperDataset) - - assert X2.attrs["nan"] == 'NaN' - assert X2.attrs["inf"] == 'Infinity' - assert X2.attrs["ninf"] == '-Infinity' - - -def test_reference_attributes(): - print("Testing reference attributes") + X2 = h5f_2['X'] + assert isinstance(X2, h5py.Dataset) + assert X1[()] == X2[()] + Y1 = h5f['Y'] + assert isinstance(Y1, h5py.Dataset) + Y2 = h5f_2['Y'] + assert isinstance(Y2, h5py.Dataset) + assert Y1[()] == Y2[()] + Z1 = h5f['Z'] + assert isinstance(Z1, h5py.Dataset) + Z2 = h5f_2['Z'] + assert isinstance(Z2, h5py.Dataset) + # Note that encode is needed because Z1[()] is a bytes + assert Z1[()] == Z2[()].encode() # type: ignore + W1 = h5f['W'] + assert isinstance(W1, h5py.Dataset) + W2 = h5f_2['W'] + assert isinstance(W2, h5py.Dataset) + # Note that encode is needed because W2[()] is a str + assert W1[()] == W2[()].encode() # type: ignore + + +def test_arrays_of_strings(): with tempfile.TemporaryDirectory() as tmpdir: filename = f"{tmpdir}/test.h5" with h5py.File(filename, "w") as f: - X_ds = f.create_dataset("X", data=[1, 2, 3]) - Y_ds = f.create_dataset("Y", data=[4, 5, 6]) - X_ds.attrs["ref"] = Y_ds.ref + f.create_dataset("X", data=["abc", "def", "ghi"]) h5f = h5py.File(filename, "r") with LindiH5ZarrStore.from_file(filename, url=filename) as store: rfs = store.to_reference_file_system() - client = LindiZarrWrapper.from_reference_file_system(rfs) - - X1 = h5f["X"] + h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs) + X1 = h5f['X'] assert isinstance(X1, h5py.Dataset) - X2 = client["X"] - assert isinstance(X2, LindiZarrWrapperDataset) + X2 = h5f_2['X'] + assert isinstance(X2, h5py.Dataset) + assert _lists_are_equal(X1[:].tolist(), [x.encode() for x in X2[:]]) # type: ignore - ref1 = X1.attrs["ref"] - assert isinstance(ref1, h5py.Reference) - ref2 = X2.attrs["ref"] - assert isinstance(ref2, LindiZarrWrapperReference) - target1 = h5f[ref1] - assert isinstance(target1, h5py.Dataset) - target2 = client[ref2] - assert isinstance(target2, LindiZarrWrapperDataset) +def test_numpy_arrays(): + array_1 = ("1", np.arange(60).reshape(3, 20), (3, 7)) + array_2 = ("2", np.arange(60).reshape(3, 20), None) + array_boolean = ("3", np.array([[True, False, True], [False, True, False]]), None) + for label, array, chunks in [array_1, array_2, array_boolean]: + print(f"Testing numpy array {label}") + with tempfile.TemporaryDirectory() as tmpdir: + filename = f"{tmpdir}/test.h5" + with h5py.File(filename, "w") as f: + f.create_dataset("X", data=array, chunks=chunks) + with LindiH5ZarrStore.from_file( + filename, url=filename + ) as store: # set url so that a reference file system can be created + rfs = store.to_reference_file_system() + client = LindiZarrWrapper.from_reference_file_system(rfs) + h5f = h5py.File(filename, "r") + X1 = h5f["X"] + assert isinstance(X1, h5py.Dataset) + X2 = client["X"] + assert isinstance(X2, LindiZarrWrapperDataset) - assert _check_equal(target1[:], target2[:]) + assert X1.shape == X2.shape + assert X1.dtype == X2.dtype + assert X1.size == X2.size + assert X1.nbytes == X2.nbytes + assert len(X1) == len(X2) -def test_reference_in_compound_dtype(): - print("Testing reference in dataset with compound dtype") +def test_nan_inf_attributes(): with tempfile.TemporaryDirectory() as tmpdir: filename = f"{tmpdir}/test.h5" with h5py.File(filename, "w") as f: - compound_dtype = np.dtype([("x", "i4"), ("y", h5py.special_dtype(ref=h5py.Reference))]) - Y_ds = f.create_dataset("Y", data=[1, 2, 3]) - f.create_dataset("X", data=[(1, Y_ds.ref), (2, Y_ds.ref)], dtype=compound_dtype) + f.create_dataset("X", data=[1, 2, 3]) + f["X"].attrs["nan"] = np.nan + f["X"].attrs["inf"] = np.inf + f["X"].attrs["ninf"] = -np.inf h5f = h5py.File(filename, "r") with LindiH5ZarrStore.from_file(filename, url=filename) as store: rfs = store.to_reference_file_system() @@ -272,71 +245,31 @@ def test_reference_in_compound_dtype(): X2 = client["X"] assert isinstance(X2, LindiZarrWrapperDataset) - assert _check_equal(X1["x"][:], X2["x"][:]) - ref1 = X1["y"][0] - assert isinstance(ref1, h5py.Reference) - ref2 = X2["y"][0] - assert isinstance(ref2, LindiZarrWrapperReference) - - target1 = h5f[ref1] - assert isinstance(target1, h5py.Dataset) - target2 = client[ref2] - assert isinstance(target2, LindiZarrWrapperDataset) - - assert _check_equal(target1[:], target2[:]) - + assert X2.attrs["nan"] == "NaN" + assert X2.attrs["inf"] == "Infinity" + assert X2.attrs["ninf"] == "-Infinity" -def _check_equal(a, b): - # allow comparison of bytes and strings - if isinstance(a, str): - a = a.encode() - if isinstance(b, str): - b = b.encode() - # allow comparison of numpy scalars with python scalars - if np.issubdtype(type(a), np.floating): - a = float(a) - if np.issubdtype(type(b), np.floating): - b = float(b) - if np.issubdtype(type(a), np.integer): - a = int(a) - if np.issubdtype(type(b), np.integer): - b = int(b) - - # allow comparison of numpy arrays to python lists - if isinstance(a, list): - a = np.array(a) - if isinstance(b, list): - b = np.array(b) - - if type(a) != type(b): # noqa: E721 +def _lists_are_equal(a, b): + if len(a) != len(b): return False - - if isinstance(a, np.ndarray): - assert isinstance(b, np.ndarray) - return _check_arrays_equal(a, b) - - # test for NaNs (we need to use np.isnan because NaN != NaN in python) - if isinstance(a, float) and isinstance(b, float): - if np.isnan(a) and np.isnan(b): - return True - - return a == b + for aa, bb in zip(a, b): + if aa != bb: + return False + return True -def _check_arrays_equal(a: np.ndarray, b: np.ndarray): - # If it's an array of strings, we convert to an array of bytes - if a.dtype == object: - # need to modify all the entries - a = np.array([x.encode() if type(x) is str else x for x in a.ravel()]).reshape( - a.shape - ) - if b.dtype == object: - b = np.array([x.encode() if type(x) is str else x for x in b.ravel()]).reshape( - b.shape - ) +def _arrays_are_equal(a, b): + if a.shape != b.shape: + return False + if a.dtype != b.dtype: + return False # if this is numeric data we need to use allclose so that we can handle NaNs if np.issubdtype(a.dtype, np.number): return np.allclose(a, b, equal_nan=True) else: return np.array_equal(a, b) + + +if __name__ == '__main__': + test_arrays_of_compound_dtype_with_references()