Skip to content

Commit

Permalink
add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
magland committed Mar 20, 2024
1 parent 09777bd commit c5ce88f
Show file tree
Hide file tree
Showing 9 changed files with 227 additions and 64 deletions.
2 changes: 1 addition & 1 deletion .vscode/tasks/quick_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ flake8 .
# pyright
cd ..

pytest --cov=lindi --cov-report=xml --cov-report=term -m "not slow" tests/
pytest --cov=lindi --cov-report=xml --cov-report=term -m "not slow and not network" tests/
2 changes: 1 addition & 1 deletion examples/example1.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@
# Open using pynwb
with pynwb.NWBHDF5IO(file=client, mode="r") as io:
nwbfile = io.read()
print(nwbfile)
print(nwbfile)
20 changes: 14 additions & 6 deletions lindi/LindiH5pyFile/LindiH5pyDataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,18 @@ def __init__(self, _h5py_dataset_id):
self._h5py_dataset_id = _h5py_dataset_id


# This is a global list of external hdf5 clients, which are used by
# possibly multiple LindiH5pyFile objects. The key is the URL of the
# external hdf5 file, and the value is the h5py.File object.
# TODO: figure out how to close these clients
_external_hdf5_clients: Dict[str, h5py.File] = {}


class LindiH5pyDataset(h5py.Dataset):
def __init__(self, _dataset_object: Union[h5py.Dataset, zarr.Array], _file: "LindiH5pyFile"):
self._dataset_object = _dataset_object
self._file = _file

self._external_hdf5_clients: Dict[str, h5py.File] = {}

# See if we have the _COMPOUND_DTYPE attribute, which signifies that
# this is a compound dtype
if isinstance(_dataset_object, zarr.Array):
Expand Down Expand Up @@ -176,10 +181,13 @@ def _get_item_for_zarr(self, zarr_array: zarr.Array, selection: Any):
return zarr_array[selection]

def _get_external_hdf5_client(self, url: str) -> h5py.File:
    """Return an open, read-only h5py.File for the external HDF5 resource at *url*.

    Clients are cached in the module-level ``_external_hdf5_clients`` dict so
    that multiple datasets (possibly across LindiH5pyFile objects) share a
    single open handle per URL.

    Parameters
    ----------
    url : str
        Either an http(s) URL (read lazily via ``remfile``) or a local file
        path.

    Returns
    -------
    h5py.File
        The cached, read-only h5py file object for *url*.
    """
    if url not in _external_hdf5_clients:
        if url.startswith(("http://", "https://")):
            ff = remfile.File(url)
        else:
            # NOTE: this local file handle is deliberately left open for the
            # lifetime of the cache entry (see the module-level TODO about
            # closing these clients).
            ff = open(url, "rb")
        _external_hdf5_clients[url] = h5py.File(ff, "r")
    return _external_hdf5_clients[url]


def _resolve_references(x: Any):
Expand Down
18 changes: 9 additions & 9 deletions lindi/LindiH5pyFile/LindiH5pyFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,15 +94,15 @@ def filename(self):
def driver(self):
raise Exception("Getting driver is not allowed")

@property
def mode(self):
if isinstance(self._file_object, h5py.File):
return self._file_object.mode
elif isinstance(self._file_object, zarr.Group):
# hard-coded to read-only
return "r"
else:
raise Exception(f"Unhandled type: {type(self._file_object)}")
# @property
# def mode(self):
# if isinstance(self._file_object, h5py.File):
# return self._file_object.mode
# elif isinstance(self._file_object, zarr.Group):
# # hard-coded to read-only
# return "r"
# else:
# raise Exception(f"Unhandled type: {type(self._file_object)}")

@property
def libver(self):
Expand Down
2 changes: 1 addition & 1 deletion lindi/LindiH5pyFile/LindiH5pyGroup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


if TYPE_CHECKING:
from .LindiH5pyFile import LindiH5pyFile
from .LindiH5pyFile import LindiH5pyFile # pragma: no cover


class LindiH5pyGroupId:
Expand Down
117 changes: 71 additions & 46 deletions tests/test_core.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pytest
import numpy as np
import h5py
import tempfile
Expand All @@ -23,25 +24,27 @@ def test_variety():
f["dataset1"].attrs["test_attr1"] = "attribute-of-dataset1"
f["group1"].attrs["test_attr2"] = "attribute-of-group1"
h5f = h5py.File(filename, "r")
h5f_wrapped = lindi.LindiH5pyFile.from_h5py_file(h5f)
with LindiH5ZarrStore.from_file(filename, url=filename) as store:
rfs = store.to_reference_file_system()
h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs)
assert h5f_2.attrs["int1"] == h5f.attrs["int1"]
assert h5f_2.attrs["float1"] == h5f.attrs["float1"]
assert h5f_2.attrs["str1"] == h5f.attrs["str1"]
assert h5f_2.attrs["bytes1"] == h5f.attrs["bytes1"]
assert _lists_are_equal(h5f_2.attrs["list1"], h5f.attrs["list1"])
assert _lists_are_equal(h5f_2.attrs["tuple1"], h5f.attrs["tuple1"])
assert _arrays_are_equal(np.array(h5f_2.attrs["array1"]), h5f.attrs["array1"])
assert h5f_2["dataset1"].attrs["test_attr1"] == h5f["dataset1"].attrs["test_attr1"] # type: ignore
assert _arrays_are_equal(h5f_2["dataset1"][()], h5f["dataset1"][()]) # type: ignore
assert h5f_2["group1"].attrs["test_attr2"] == h5f["group1"].attrs["test_attr2"] # type: ignore
target_1 = h5f[h5f.attrs["dataset1_ref"]]
target_2 = h5f_2[h5f_2.attrs["dataset1_ref"]]
assert target_1.attrs["test_attr1"] == target_2.attrs["test_attr1"] # type: ignore
target_1 = h5f[h5f.attrs["group1_ref"]]
target_2 = h5f_2[h5f_2.attrs["group1_ref"]]
assert target_1.attrs["test_attr2"] == target_2.attrs["test_attr2"] # type: ignore
h5f_rfs = lindi.LindiH5pyFile.from_reference_file_system(rfs)
for h5f_2 in [h5f_rfs, h5f_wrapped]:
assert h5f_2.attrs["int1"] == h5f.attrs["int1"]
assert h5f_2.attrs["float1"] == h5f.attrs["float1"]
assert h5f_2.attrs["str1"] == h5f.attrs["str1"]
assert h5f_2.attrs["bytes1"] == h5f.attrs["bytes1"]
assert _lists_are_equal(h5f_2.attrs["list1"], h5f.attrs["list1"])
assert _lists_are_equal(h5f_2.attrs["tuple1"], h5f.attrs["tuple1"])
assert _arrays_are_equal(np.array(h5f_2.attrs["array1"]), h5f.attrs["array1"])
assert h5f_2["dataset1"].attrs["test_attr1"] == h5f["dataset1"].attrs["test_attr1"] # type: ignore
assert _arrays_are_equal(h5f_2["dataset1"][()], h5f["dataset1"][()]) # type: ignore
assert h5f_2["group1"].attrs["test_attr2"] == h5f["group1"].attrs["test_attr2"] # type: ignore
target_1 = h5f[h5f.attrs["dataset1_ref"]]
target_2 = h5f_2[h5f_2.attrs["dataset1_ref"]]
assert target_1.attrs["test_attr1"] == target_2.attrs["test_attr1"] # type: ignore
target_1 = h5f[h5f.attrs["group1_ref"]]
target_2 = h5f_2[h5f_2.attrs["group1_ref"]]
assert target_1.attrs["test_attr2"] == target_2.attrs["test_attr2"] # type: ignore


def test_soft_links():
Expand All @@ -53,37 +56,45 @@ def test_soft_links():
g.create_dataset('dataset1', data=[5, 6, 7])
f['soft_link'] = h5py.SoftLink('/group_target')
h5f = h5py.File(filename, "r")
h5f_wrapped = lindi.LindiH5pyFile.from_h5py_file(h5f)
with LindiH5ZarrStore.from_file(filename, url=filename) as store:
rfs = store.to_reference_file_system()
h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs)
g1 = h5f['group_target']
g2 = h5f_2['group_target']
assert g1.attrs['foo'] == g2.attrs['foo'] # type: ignore
h1 = h5f['soft_link']
h2 = h5f_2['soft_link']
assert h1.attrs['foo'] == h2.attrs['foo'] # type: ignore
# this is tricky: it seems that with h5py, the name of the soft link
# is the source name. So the following assertion will fail.
# assert h1.name == h2.name
k1 = h5f.get('soft_link', getlink=True)
k2 = h5f_2.get('soft_link', getlink=True)
assert isinstance(k1, h5py.SoftLink)
assert isinstance(k2, h5py.SoftLink)
ds1 = h5f['soft_link']['dataset1'] # type: ignore
assert isinstance(ds1, h5py.Dataset)
ds2 = h5f_2['soft_link']['dataset1'] # type: ignore
assert isinstance(ds2, h5py.Dataset)
assert _arrays_are_equal(ds1[()], ds2[()])
ds1 = h5f['soft_link/dataset1']
assert isinstance(ds1, h5py.Dataset)
ds2 = h5f_2['soft_link/dataset1']
assert isinstance(ds2, h5py.Dataset)
assert _arrays_are_equal(ds1[()], ds2[()])
ds1 = h5f['group_target/dataset1']
assert isinstance(ds1, h5py.Dataset)
ds2 = h5f_2['group_target/dataset1']
assert isinstance(ds2, h5py.Dataset)
assert _arrays_are_equal(ds1[()], ds2[()])
h5f_rfs = lindi.LindiH5pyFile.from_reference_file_system(rfs)
for h5f_2 in [h5f_rfs, h5f_wrapped]:
g1 = h5f['group_target']
assert isinstance(g1, h5py.Group)
g2 = h5f_2['group_target']
assert isinstance(g2, h5py.Group)
assert g1.attrs['foo'] == g2.attrs['foo'] # type: ignore
with pytest.raises(TypeError):
g1[np.array([0, 1, 2])]
h1 = h5f['soft_link']
assert isinstance(h1, h5py.Group)
h2 = h5f_2['soft_link']
assert isinstance(h2, h5py.Group)
assert h1.attrs['foo'] == h2.attrs['foo'] # type: ignore
# this is tricky: it seems that with h5py, the name of the soft link
# is the source name. So the following assertion will fail.
# assert h1.name == h2.name
k1 = h5f.get('soft_link', getlink=True)
k2 = h5f_2.get('soft_link', getlink=True)
assert isinstance(k1, h5py.SoftLink)
assert isinstance(k2, h5py.SoftLink)
ds1 = h5f['soft_link']['dataset1'] # type: ignore
assert isinstance(ds1, h5py.Dataset)
ds2 = h5f_2['soft_link']['dataset1'] # type: ignore
assert isinstance(ds2, h5py.Dataset)
assert _arrays_are_equal(ds1[()], ds2[()])
ds1 = h5f['soft_link/dataset1']
assert isinstance(ds1, h5py.Dataset)
ds2 = h5f_2['soft_link/dataset1']
assert isinstance(ds2, h5py.Dataset)
assert _arrays_are_equal(ds1[()], ds2[()])
ds1 = h5f['group_target/dataset1']
assert isinstance(ds1, h5py.Dataset)
ds2 = h5f_2['group_target/dataset1']
assert isinstance(ds2, h5py.Dataset)
assert _arrays_are_equal(ds1[()], ds2[()])


def test_arrays_of_compound_dtype():
Expand Down Expand Up @@ -246,6 +257,20 @@ def test_nan_inf_attributes():
assert X2.attrs["ninf"] == "-Infinity"


def test_reference_file_system_to_file():
    # Round-trip: serialize the rfs to a .zarr.json file, then build a client
    # from that file path rather than from the in-memory dict.
    with tempfile.TemporaryDirectory() as tmpdir:
        h5_fname = f"{tmpdir}/test.h5"
        with h5py.File(h5_fname, "w") as f:
            f.create_dataset("X", data=[1, 2, 3])
        with LindiH5ZarrStore.from_file(h5_fname, url=h5_fname) as store:
            rfs_fname = f'{tmpdir}/test.zarr.json'
            store.to_file(rfs_fname)
            client = lindi.LindiH5pyFile.from_reference_file_system(rfs_fname)
            X = client["X"]
            assert isinstance(X, lindi.LindiH5pyDataset)
            assert _lists_are_equal(X[()], [1, 2, 3])


def _lists_are_equal(a, b):
if len(a) != len(b):
return False
Expand Down
27 changes: 27 additions & 0 deletions tests/test_external_array_link.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import tempfile
import numpy as np
import h5py
import lindi


def test_external_array_link():
    # With a tiny num_dataset_chunks_threshold, the 50x12 dataset (chunked
    # 10x6 -> 10 chunks) exceeds the threshold, exercising the external
    # array link path; the data must still round-trip exactly.
    with tempfile.TemporaryDirectory() as tmpdir:
        h5_fname = f"{tmpdir}/test.h5"
        data = np.random.randn(50, 12)
        with h5py.File(h5_fname, "w") as f:
            f.create_dataset("dataset1", data=data, chunks=(10, 6))
        opts = lindi.LindiH5ZarrStoreOpts(num_dataset_chunks_threshold=4)
        with lindi.LindiH5ZarrStore.from_file(h5_fname, url=h5_fname, opts=opts) as store:
            rfs = store.to_reference_file_system()
            client = lindi.LindiH5pyFile.from_reference_file_system(rfs)
            loaded = client["dataset1"][:]  # type: ignore
            assert np.array_equal(data, loaded)


if __name__ == "__main__":
    test_external_array_link()
45 changes: 45 additions & 0 deletions tests/test_remote_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import json
import tempfile

import pytest

import lindi


@pytest.mark.network
def test_remote_data_1():
    """Read a remote NWB/HDF5 file through a LindiH5ZarrStore-backed client.

    Network test (marked ``network``): streams metadata from the DANDI
    archive, builds a reference file system, and verifies that pynwb can
    open the resulting h5py-like client.
    """
    import pynwb

    # Define the URL for a remote NWB file
    h5_url = "https://api.dandiarchive.org/api/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/download/"

    # Create a read-only Zarr store as a wrapper for the h5 file
    store = lindi.LindiH5ZarrStore.from_file(h5_url)

    # Generate a reference file system
    rfs = store.to_reference_file_system()

    # Exercise JSON serialization of the rfs, but write inside a temporary
    # directory so the test does not leave example.zarr.json behind in the
    # working tree (the original wrote into the current directory).
    with tempfile.TemporaryDirectory() as tmpdir:
        with open(f"{tmpdir}/example.zarr.json", "w") as f:
            json.dump(rfs, f, indent=2)

        # Create an h5py-like client from the reference file system
        client = lindi.LindiH5pyFile.from_reference_file_system(rfs)

        # Open using pynwb
        with pynwb.NWBHDF5IO(file=client, mode="r") as io:
            nwbfile = io.read()
            print(nwbfile)


@pytest.mark.network
def test_remote_data_2():
    """Load a client from a remotely hosted .zarr.json reference file system."""
    import pynwb

    # A pre-built reference file system for a DANDI asset, hosted remotely
    url = 'https://kerchunk.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'

    # Build the h5py-like client directly from the remote rfs URL
    client = lindi.LindiH5pyFile.from_reference_file_system(url)

    # Verify that pynwb can read the file through the client
    with pynwb.NWBHDF5IO(file=client, mode="r") as io:
        nwbfile = io.read()
        print(nwbfile)
58 changes: 58 additions & 0 deletions tests/test_store.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import h5py
import tempfile
import lindi


def test_store():
    """Exercise LindiH5ZarrStore's listdir / __contains__ / client round-trip."""
    with tempfile.TemporaryDirectory() as tmpdir:
        h5_fname = f"{tmpdir}/test.h5"
        with h5py.File(h5_fname, "w") as f:
            f.create_dataset("dataset1", data=[1, 2, 3])
            group1 = f.create_group("group1")
            group1.create_group("group2")
            group1.create_dataset("dataset2", data=[4, 5, 6])
        with lindi.LindiH5ZarrStore.from_file(h5_fname, url=h5_fname) as store:
            store.to_file(f"{tmpdir}/test.zarr.json")  # for coverage
            # Directory listings at each level of the hierarchy
            for path, expected in [
                ('', ['dataset1', 'group1']),
                ('group1', ['group2', 'dataset2']),
                ('group1/group2', []),
            ]:
                assert _lists_are_equal(store.listdir(path), expected, ordered=False)
            # Keys that must resolve in the store: zarr metadata for the root,
            # both groups, both datasets, and the first chunk of each dataset.
            present = [
                '.zattrs',
                '.zgroup',
                'dataset1/.zattrs',
                'dataset1/.zarray',
                'dataset1/0',
                'group1/.zattrs',
                'group1/.zgroup',
                'group1/group2/.zattrs',
                'group1/group2/.zgroup',
                'group1/dataset2/.zattrs',
                'group1/dataset2/.zarray',
                'group1/dataset2/0',
            ]
            # Keys that must NOT resolve: bare object paths and metadata of
            # the wrong node kind (.zgroup on a dataset, .zarray on a group).
            absent = [
                'dataset1',
                'dataset1/.zgroup',
                'group1',
                'group1/.zarray',
                'group1/group2',
                'group1/group2/.zarray',
                'group1/dataset2',
                'group1/dataset2/.zgroup',
            ]
            for key in present:
                assert key in store
            for key in absent:
                assert key not in store
            # Data read back through an h5py-like client must match exactly
            client = lindi.LindiH5pyFile.from_zarr_store(store)
            X = client["dataset1"][:]  # type: ignore
            assert _lists_are_equal(X, [1, 2, 3], ordered=True)
            Y = client["group1/dataset2"][:]  # type: ignore
            assert _lists_are_equal(Y, [4, 5, 6], ordered=True)


def _lists_are_equal(a, b, ordered: bool):
if ordered:
if len(a) != len(b):
return False
for i in range(len(a)):
if a[i] != b[i]:
return False
return True
else:
return set(a) == set(b)

0 comments on commit c5ce88f

Please sign in to comment.