From c5ce88f4a99e0c95dc9e69177b1a913e6fdb4be8 Mon Sep 17 00:00:00 2001
From: Jeremy Magland
Date: Wed, 20 Mar 2024 16:21:04 -0400
Subject: [PATCH] add tests

---
 .vscode/tasks/quick_test.sh             |   2 +-
 examples/example1.py                    |   2 +-
 lindi/LindiH5pyFile/LindiH5pyDataset.py |  20 ++--
 lindi/LindiH5pyFile/LindiH5pyFile.py    |  18 ++--
 lindi/LindiH5pyFile/LindiH5pyGroup.py   |   2 +-
 tests/test_core.py                      | 117 ++++++++++++++----
 tests/test_external_array_link.py       |  27 ++++++
 tests/test_remote_data.py               |  45 +++++++++
 tests/test_store.py                     |  58 ++++++++++++
 9 files changed, 227 insertions(+), 64 deletions(-)
 create mode 100644 tests/test_external_array_link.py
 create mode 100644 tests/test_remote_data.py
 create mode 100644 tests/test_store.py

diff --git a/.vscode/tasks/quick_test.sh b/.vscode/tasks/quick_test.sh
index 173dd78..5e8873e 100755
--- a/.vscode/tasks/quick_test.sh
+++ b/.vscode/tasks/quick_test.sh
@@ -8,4 +8,4 @@
 flake8 .
 # pyright
 cd ..
-pytest --cov=lindi --cov-report=xml --cov-report=term -m "not slow" tests/
+pytest --cov=lindi --cov-report=xml --cov-report=term -m "not slow and not network" tests/
diff --git a/examples/example1.py b/examples/example1.py
index 413ef15..f5c1b6b 100644
--- a/examples/example1.py
+++ b/examples/example1.py
@@ -21,4 +21,4 @@
 # Open using pynwb
 with pynwb.NWBHDF5IO(file=client, mode="r") as io:
     nwbfile = io.read()
-    print(nwbfile)
\ No newline at end of file
+    print(nwbfile)
diff --git a/lindi/LindiH5pyFile/LindiH5pyDataset.py b/lindi/LindiH5pyFile/LindiH5pyDataset.py
index 8e06667..f2479f3 100644
--- a/lindi/LindiH5pyFile/LindiH5pyDataset.py
+++ b/lindi/LindiH5pyFile/LindiH5pyDataset.py
@@ -17,13 +17,18 @@ def __init__(self, _h5py_dataset_id):
         self._h5py_dataset_id = _h5py_dataset_id
 
 
+# This is a global list of external hdf5 clients, which are used by
+# possibly multiple LindiH5pyFile objects. The key is the URL of the
+# external hdf5 file, and the value is the h5py.File object.
+# TODO: figure out how to close these clients
+_external_hdf5_clients: Dict[str, h5py.File] = {}
+
+
 class LindiH5pyDataset(h5py.Dataset):
     def __init__(self, _dataset_object: Union[h5py.Dataset, zarr.Array], _file: "LindiH5pyFile"):
         self._dataset_object = _dataset_object
         self._file = _file
 
-        self._external_hdf5_clients: Dict[str, h5py.File] = {}
-
         # See if we have the _COMPOUND_DTYPE attribute, which signifies that
         # this is a compound dtype
         if isinstance(_dataset_object, zarr.Array):
@@ -176,10 +181,13 @@ def _get_item_for_zarr(self, zarr_array: zarr.Array, selection: Any):
         return zarr_array[selection]
 
     def _get_external_hdf5_client(self, url: str) -> h5py.File:
-        if url not in self._external_hdf5_clients:
-            remf = remfile.File(url)
-            self._external_hdf5_clients[url] = h5py.File(remf, "r")
-        return self._external_hdf5_clients[url]
+        if url not in _external_hdf5_clients:
+            if url.startswith("http://") or url.startswith("https://"):
+                ff = remfile.File(url)
+            else:
+                ff = open(url, "rb")  # this never gets closed
+            _external_hdf5_clients[url] = h5py.File(ff, "r")
+        return _external_hdf5_clients[url]
 
 
 def _resolve_references(x: Any):
diff --git a/lindi/LindiH5pyFile/LindiH5pyFile.py b/lindi/LindiH5pyFile/LindiH5pyFile.py
index 8c1f4f9..ea18529 100644
--- a/lindi/LindiH5pyFile/LindiH5pyFile.py
+++ b/lindi/LindiH5pyFile/LindiH5pyFile.py
@@ -94,15 +94,15 @@ def filename(self):
     def driver(self):
         raise Exception("Getting driver is not allowed")
 
-    @property
-    def mode(self):
-        if isinstance(self._file_object, h5py.File):
-            return self._file_object.mode
-        elif isinstance(self._file_object, zarr.Group):
-            # hard-coded to read-only
-            return "r"
-        else:
-            raise Exception(f"Unhandled type: {type(self._file_object)}")
+    # @property
+    # def mode(self):
+    #     if isinstance(self._file_object, h5py.File):
+    #         return self._file_object.mode
+    #     elif isinstance(self._file_object, zarr.Group):
+    #         # hard-coded to read-only
+    #         return "r"
+    #     else:
+    #         raise Exception(f"Unhandled type: {type(self._file_object)}")
 
     @property
     def libver(self):
diff --git a/lindi/LindiH5pyFile/LindiH5pyGroup.py b/lindi/LindiH5pyFile/LindiH5pyGroup.py
index 8678f71..48b3d90 100644
--- a/lindi/LindiH5pyFile/LindiH5pyGroup.py
+++ b/lindi/LindiH5pyFile/LindiH5pyGroup.py
@@ -8,7 +8,7 @@
 
 
 if TYPE_CHECKING:
-    from .LindiH5pyFile import LindiH5pyFile
+    from .LindiH5pyFile import LindiH5pyFile  # pragma: no cover
 
 
 class LindiH5pyGroupId:
diff --git a/tests/test_core.py b/tests/test_core.py
index 1743b17..1012b69 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -1,3 +1,4 @@
+import pytest
 import numpy as np
 import h5py
 import tempfile
@@ -23,25 +24,27 @@ def test_variety():
             f["dataset1"].attrs["test_attr1"] = "attribute-of-dataset1"
             f["group1"].attrs["test_attr2"] = "attribute-of-group1"
         h5f = h5py.File(filename, "r")
+        h5f_wrapped = lindi.LindiH5pyFile.from_h5py_file(h5f)
         with LindiH5ZarrStore.from_file(filename, url=filename) as store:
             rfs = store.to_reference_file_system()
-            h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs)
-            assert h5f_2.attrs["int1"] == h5f.attrs["int1"]
-            assert h5f_2.attrs["float1"] == h5f.attrs["float1"]
-            assert h5f_2.attrs["str1"] == h5f.attrs["str1"]
-            assert h5f_2.attrs["bytes1"] == h5f.attrs["bytes1"]
-            assert _lists_are_equal(h5f_2.attrs["list1"], h5f.attrs["list1"])
-            assert _lists_are_equal(h5f_2.attrs["tuple1"], h5f.attrs["tuple1"])
-            assert _arrays_are_equal(np.array(h5f_2.attrs["array1"]), h5f.attrs["array1"])
-            assert h5f_2["dataset1"].attrs["test_attr1"] == h5f["dataset1"].attrs["test_attr1"]  # type: ignore
-            assert _arrays_are_equal(h5f_2["dataset1"][()], h5f["dataset1"][()])  # type: ignore
-            assert h5f_2["group1"].attrs["test_attr2"] == h5f["group1"].attrs["test_attr2"]  # type: ignore
-            target_1 = h5f[h5f.attrs["dataset1_ref"]]
-            target_2 = h5f_2[h5f_2.attrs["dataset1_ref"]]
-            assert target_1.attrs["test_attr1"] == target_2.attrs["test_attr1"]  # type: ignore
-            target_1 = h5f[h5f.attrs["group1_ref"]]
-            target_2 = h5f_2[h5f_2.attrs["group1_ref"]]
-            assert target_1.attrs["test_attr2"] == target_2.attrs["test_attr2"]  # type: ignore
+            h5f_rfs = lindi.LindiH5pyFile.from_reference_file_system(rfs)
+            for h5f_2 in [h5f_rfs, h5f_wrapped]:
+                assert h5f_2.attrs["int1"] == h5f.attrs["int1"]
+                assert h5f_2.attrs["float1"] == h5f.attrs["float1"]
+                assert h5f_2.attrs["str1"] == h5f.attrs["str1"]
+                assert h5f_2.attrs["bytes1"] == h5f.attrs["bytes1"]
+                assert _lists_are_equal(h5f_2.attrs["list1"], h5f.attrs["list1"])
+                assert _lists_are_equal(h5f_2.attrs["tuple1"], h5f.attrs["tuple1"])
+                assert _arrays_are_equal(np.array(h5f_2.attrs["array1"]), h5f.attrs["array1"])
+                assert h5f_2["dataset1"].attrs["test_attr1"] == h5f["dataset1"].attrs["test_attr1"]  # type: ignore
+                assert _arrays_are_equal(h5f_2["dataset1"][()], h5f["dataset1"][()])  # type: ignore
+                assert h5f_2["group1"].attrs["test_attr2"] == h5f["group1"].attrs["test_attr2"]  # type: ignore
+                target_1 = h5f[h5f.attrs["dataset1_ref"]]
+                target_2 = h5f_2[h5f_2.attrs["dataset1_ref"]]
+                assert target_1.attrs["test_attr1"] == target_2.attrs["test_attr1"]  # type: ignore
+                target_1 = h5f[h5f.attrs["group1_ref"]]
+                target_2 = h5f_2[h5f_2.attrs["group1_ref"]]
+                assert target_1.attrs["test_attr2"] == target_2.attrs["test_attr2"]  # type: ignore
 
 
 def test_soft_links():
@@ -53,37 +56,45 @@ def test_soft_links():
             g.create_dataset('dataset1', data=[5, 6, 7])
             f['soft_link'] = h5py.SoftLink('/group_target')
         h5f = h5py.File(filename, "r")
+        h5f_wrapped = lindi.LindiH5pyFile.from_h5py_file(h5f)
         with LindiH5ZarrStore.from_file(filename, url=filename) as store:
             rfs = store.to_reference_file_system()
-            h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs)
-            g1 = h5f['group_target']
-            g2 = h5f_2['group_target']
-            assert g1.attrs['foo'] == g2.attrs['foo']  # type: ignore
-            h1 = h5f['soft_link']
-            h2 = h5f_2['soft_link']
-            assert h1.attrs['foo'] == h2.attrs['foo']  # type: ignore
-            # this is tricky: it seems that with h5py, the name of the soft link
-            # is the source name. So the following assertion will fail.
-            # assert h1.name == h2.name
-            k1 = h5f.get('soft_link', getlink=True)
-            k2 = h5f_2.get('soft_link', getlink=True)
-            assert isinstance(k1, h5py.SoftLink)
-            assert isinstance(k2, h5py.SoftLink)
-            ds1 = h5f['soft_link']['dataset1']  # type: ignore
-            assert isinstance(ds1, h5py.Dataset)
-            ds2 = h5f_2['soft_link']['dataset1']  # type: ignore
-            assert isinstance(ds2, h5py.Dataset)
-            assert _arrays_are_equal(ds1[()], ds2[()])
-            ds1 = h5f['soft_link/dataset1']
-            assert isinstance(ds1, h5py.Dataset)
-            ds2 = h5f_2['soft_link/dataset1']
-            assert isinstance(ds2, h5py.Dataset)
-            assert _arrays_are_equal(ds1[()], ds2[()])
-            ds1 = h5f['group_target/dataset1']
-            assert isinstance(ds1, h5py.Dataset)
-            ds2 = h5f_2['group_target/dataset1']
-            assert isinstance(ds2, h5py.Dataset)
-            assert _arrays_are_equal(ds1[()], ds2[()])
+            h5f_rfs = lindi.LindiH5pyFile.from_reference_file_system(rfs)
+            for h5f_2 in [h5f_rfs, h5f_wrapped]:
+                g1 = h5f['group_target']
+                assert isinstance(g1, h5py.Group)
+                g2 = h5f_2['group_target']
+                assert isinstance(g2, h5py.Group)
+                assert g1.attrs['foo'] == g2.attrs['foo']  # type: ignore
+                with pytest.raises(TypeError):
+                    g1[np.array([0, 1, 2])]
+                h1 = h5f['soft_link']
+                assert isinstance(h1, h5py.Group)
+                h2 = h5f_2['soft_link']
+                assert isinstance(h2, h5py.Group)
+                assert h1.attrs['foo'] == h2.attrs['foo']  # type: ignore
+                # this is tricky: it seems that with h5py, the name of the soft link
+                # is the source name. So the following assertion will fail.
+                # assert h1.name == h2.name
+                k1 = h5f.get('soft_link', getlink=True)
+                k2 = h5f_2.get('soft_link', getlink=True)
+                assert isinstance(k1, h5py.SoftLink)
+                assert isinstance(k2, h5py.SoftLink)
+                ds1 = h5f['soft_link']['dataset1']  # type: ignore
+                assert isinstance(ds1, h5py.Dataset)
+                ds2 = h5f_2['soft_link']['dataset1']  # type: ignore
+                assert isinstance(ds2, h5py.Dataset)
+                assert _arrays_are_equal(ds1[()], ds2[()])
+                ds1 = h5f['soft_link/dataset1']
+                assert isinstance(ds1, h5py.Dataset)
+                ds2 = h5f_2['soft_link/dataset1']
+                assert isinstance(ds2, h5py.Dataset)
+                assert _arrays_are_equal(ds1[()], ds2[()])
+                ds1 = h5f['group_target/dataset1']
+                assert isinstance(ds1, h5py.Dataset)
+                ds2 = h5f_2['group_target/dataset1']
+                assert isinstance(ds2, h5py.Dataset)
+                assert _arrays_are_equal(ds1[()], ds2[()])
 
 
 def test_arrays_of_compound_dtype():
@@ -246,6 +257,20 @@ def test_nan_inf_attributes():
             assert X2.attrs["ninf"] == "-Infinity"
 
 
+def test_reference_file_system_to_file():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        filename = f"{tmpdir}/test.h5"
+        with h5py.File(filename, "w") as f:
+            f.create_dataset("X", data=[1, 2, 3])
+        with LindiH5ZarrStore.from_file(filename, url=filename) as store:
+            rfs_fname = f'{tmpdir}/test.zarr.json'
+            store.to_file(rfs_fname)
+            client = lindi.LindiH5pyFile.from_reference_file_system(rfs_fname)
+            X = client["X"]
+            assert isinstance(X, lindi.LindiH5pyDataset)
+            assert _lists_are_equal(X[()], [1, 2, 3])
+
+
 def _lists_are_equal(a, b):
     if len(a) != len(b):
         return False
diff --git a/tests/test_external_array_link.py b/tests/test_external_array_link.py
new file mode 100644
index 0000000..57e689b
--- /dev/null
+++ b/tests/test_external_array_link.py
@@ -0,0 +1,27 @@
+import tempfile
+import numpy as np
+import h5py
+import lindi
+
+
+def test_external_array_link():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        filename = f"{tmpdir}/test.h5"
+        X = np.random.randn(50, 12)
+        with h5py.File(filename, "w") as f:
+            f.create_dataset("dataset1", data=X, chunks=(10, 6))
+        with lindi.LindiH5ZarrStore.from_file(
+            filename,
+            url=filename,
+            opts=lindi.LindiH5ZarrStoreOpts(
+                num_dataset_chunks_threshold=4
+            )
+        ) as store:
+            rfs = store.to_reference_file_system()
+            client = lindi.LindiH5pyFile.from_reference_file_system(rfs)
+            X2 = client["dataset1"][:]  # type: ignore
+            assert np.array_equal(X, X2)
+
+
+if __name__ == "__main__":
+    test_external_array_link()
diff --git a/tests/test_remote_data.py b/tests/test_remote_data.py
new file mode 100644
index 0000000..27af554
--- /dev/null
+++ b/tests/test_remote_data.py
@@ -0,0 +1,45 @@
+import json
+import pytest
+import lindi
+
+
+@pytest.mark.network
+def test_remote_data_1():
+    import pynwb
+
+    # Define the URL for a remote NWB file
+    h5_url = "https://api.dandiarchive.org/api/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/download/"
+
+    # Create a read-only Zarr store as a wrapper for the h5 file
+    store = lindi.LindiH5ZarrStore.from_file(h5_url)
+
+    # Generate a reference file system
+    rfs = store.to_reference_file_system()
+
+    # Save it to a file for later use
+    with open("example.zarr.json", "w") as f:
+        json.dump(rfs, f, indent=2)
+
+    # Create an h5py-like client from the reference file system
+    client = lindi.LindiH5pyFile.from_reference_file_system(rfs)
+
+    # Open using pynwb
+    with pynwb.NWBHDF5IO(file=client, mode="r") as io:
+        nwbfile = io.read()
+        print(nwbfile)
+
+
+@pytest.mark.network
+def test_remote_data_2():
+    import pynwb
+
+    # Define the URL for a remote .zarr.json file
+    url = 'https://kerchunk.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'
+
+    # Load the h5py-like client from the reference file system
+    client = lindi.LindiH5pyFile.from_reference_file_system(url)
+
+    # Open using pynwb
+    with pynwb.NWBHDF5IO(file=client, mode="r") as io:
+        nwbfile = io.read()
+        print(nwbfile)
diff --git a/tests/test_store.py b/tests/test_store.py
new file mode 100644
index 0000000..e7688f5
--- /dev/null
+++ b/tests/test_store.py
@@ -0,0 +1,58 @@
+import h5py
+import tempfile
+import lindi
+
+
+def test_store():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        filename = f"{tmpdir}/test.h5"
+        with h5py.File(filename, "w") as f:
+            f.create_dataset("dataset1", data=[1, 2, 3])
+            group1 = f.create_group("group1")
+            group1.create_group("group2")
+            group1.create_dataset("dataset2", data=[4, 5, 6])
+        with lindi.LindiH5ZarrStore.from_file(filename, url=filename) as store:
+            store.to_file(f"{tmpdir}/test.zarr.json")  # for coverage
+            a = store.listdir('')
+            assert _lists_are_equal(a, ['dataset1', 'group1'], ordered=False)
+            b = store.listdir('group1')
+            assert _lists_are_equal(b, ['group2', 'dataset2'], ordered=False)
+            c = store.listdir('group1/group2')
+            assert _lists_are_equal(c, [], ordered=False)
+            assert '.zattrs' in store
+            assert '.zgroup' in store
+            assert 'dataset1' not in store
+            assert 'dataset1/.zattrs' in store
+            assert 'dataset1/.zarray' in store
+            assert 'dataset1/.zgroup' not in store
+            assert 'dataset1/0' in store
+            assert 'group1' not in store
+            assert 'group1/.zattrs' in store
+            assert 'group1/.zgroup' in store
+            assert 'group1/.zarray' not in store
+            assert 'group1/group2' not in store
+            assert 'group1/group2/.zattrs' in store
+            assert 'group1/group2/.zgroup' in store
+            assert 'group1/group2/.zarray' not in store
+            assert 'group1/dataset2' not in store
+            assert 'group1/dataset2/.zattrs' in store
+            assert 'group1/dataset2/.zarray' in store
+            assert 'group1/dataset2/.zgroup' not in store
+            assert 'group1/dataset2/0' in store
+            client = lindi.LindiH5pyFile.from_zarr_store(store)
+            X = client["dataset1"][:]  # type: ignore
+            assert _lists_are_equal(X, [1, 2, 3], ordered=True)
+            Y = client["group1/dataset2"][:]  # type: ignore
+            assert _lists_are_equal(Y, [4, 5, 6], ordered=True)
+
+
+def _lists_are_equal(a, b, ordered: bool):
+    if ordered:
+        if len(a) != len(b):
+            return False
+        for i in range(len(a)):
+            if a[i] != b[i]:
+                return False
+        return True
+    else:
+        return set(a) == set(b)
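
Note on the test markers: quick_test.sh now deselects tests marked "slow" or "network", and tests/test_remote_data.py introduces @pytest.mark.network. If these custom marks are not already declared in the project's pytest configuration (not shown in this patch), a minimal conftest.py sketch along the following lines would register them so pytest does not warn about unknown marks. The file placement and the marker description strings below are illustrative assumptions, not part of this patch:

    # conftest.py -- sketch; assumes the "slow" and "network" marks are not
    # already registered elsewhere (e.g. in pyproject.toml or pytest.ini)


    def pytest_configure(config):
        # Declare the custom marks used by the test suite so that
        # @pytest.mark.network and @pytest.mark.slow do not trigger
        # PytestUnknownMarkWarning and "-m" expressions select them cleanly.
        config.addinivalue_line("markers", "network: test downloads data over the network")
        config.addinivalue_line("markers", "slow: test takes a long time to run")

With the marks registered, pytest --cov=lindi -m "not slow and not network" tests/ matches the invocation in .vscode/tasks/quick_test.sh, while pytest -m network tests/ runs only the remote-data tests.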