From bdf10d64dd15e0975a215e687c52466b601e9b32 Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Fri, 20 Sep 2024 14:10:48 -0400 Subject: [PATCH 1/5] expand tests --- examples/amend_remote_nwb_as_lindi_tar.py | 2 +- lindi/LindiH5ZarrStore/LindiH5ZarrStore.py | 2 +- lindi/LindiH5pyFile/LindiH5pyFile.py | 77 +-- .../writers/LindiH5pyGroupWriter.py | 1 + lindi/__init__.py | 1 + .../create_zarr_dataset_from_h5_data.py | 2 +- lindi/tar/lindi_tar.py | 35 +- tests/__init__.py | 0 tests/test_copy.py | 6 +- tests/test_core.py | 601 ------------------ tests/test_examples.py | 137 ++++ tests/test_lindi_h5py_file.py | 369 +++++++++++ tests/test_lindi_tar.py | 78 +++ tests/test_load_000409.py | 28 + tests/test_remote_data.py | 99 --- tests/test_store.py | 2 +- tests/test_zarr_write.py | 4 +- tests/utils.py | 24 +- 18 files changed, 700 insertions(+), 768 deletions(-) create mode 100644 tests/__init__.py delete mode 100644 tests/test_core.py create mode 100644 tests/test_examples.py create mode 100644 tests/test_lindi_h5py_file.py create mode 100644 tests/test_lindi_tar.py create mode 100644 tests/test_load_000409.py delete mode 100644 tests/test_remote_data.py diff --git a/examples/amend_remote_nwb_as_lindi_tar.py b/examples/amend_remote_nwb_as_lindi_tar.py index d4c7ebf..0a6efaf 100644 --- a/examples/amend_remote_nwb_as_lindi_tar.py +++ b/examples/amend_remote_nwb_as_lindi_tar.py @@ -21,7 +21,7 @@ rate=1., unit='s' ) - ts = nwbfile.processing['behavior'].add(timeseries_test) # type: ignore + nwbfile.processing['behavior'].add(timeseries_test) # type: ignore io.write(nwbfile) # type: ignore # Later on, you can read the file again diff --git a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py index 6ff58a2..bb532c2 100644 --- a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py +++ b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py @@ -54,7 +54,7 @@ def __init__(self, h5_item, *, contiguous_dataset_max_chunk_size: Union[int, Non nn = contiguous_dataset_max_chunk_size // size0 if nn == 0: # The chunk size should not be zero - nn = 1 + nn = 1 # pragma: no cover self._split_chunk_shape = (nn,) + h5_item.shape[1:] if h5_item.chunks is not None: zero_chunk_coords = (0,) * h5_item.ndim diff --git a/lindi/LindiH5pyFile/LindiH5pyFile.py b/lindi/LindiH5pyFile/LindiH5pyFile.py index 6d7ecc7..2989f41 100644 --- a/lindi/LindiH5pyFile/LindiH5pyFile.py +++ b/lindi/LindiH5pyFile/LindiH5pyFile.py @@ -26,7 +26,7 @@ class LindiH5pyFile(h5py.File): - def __init__(self, _zarr_group: zarr.Group, *, _zarr_store: Union[ZarrStore, None] = None, _mode: LindiFileMode = "r", _local_cache: Union[LocalCache, None] = None, _source_url_or_path: Union[str, None] = None, _source_tar_file: Union[LindiTarFile, None] = None, _close_source_tar_file_on_close: bool = False): + def __init__(self, _zarr_group: zarr.Group, *, _zarr_store: ZarrStore, _mode: LindiFileMode = "r", _local_cache: Union[LocalCache, None] = None, _source_url_or_path: Union[str, None] = None, _source_tar_file: Union[LindiTarFile, None] = None, _close_source_tar_file_on_close: bool = False): """ Do not use this constructor directly. Instead, use: from_lindi_file, from_h5py_file, from_reference_file_system, from_zarr_store, or @@ -92,7 +92,7 @@ def from_hdf5_file( """ from ..LindiH5ZarrStore.LindiH5ZarrStore import LindiH5ZarrStore # avoid circular import if mode != "r": - raise Exception("Opening hdf5 file in write mode is not supported") + raise ValueError("Opening hdf5 file in write mode is not supported") zarr_store = LindiH5ZarrStore.from_file(url_or_path, local_cache=local_cache, opts=zarr_store_opts, url=url) return LindiH5pyFile.from_zarr_store( zarr_store=zarr_store, @@ -101,7 +101,7 @@ def from_hdf5_file( ) @staticmethod - def from_reference_file_system(rfs: Union[dict, str, None], *, mode: LindiFileMode = "r", local_cache: Union[LocalCache, None] = None, _source_url_or_path: Union[str, None] = None, _source_tar_file: Union[LindiTarFile, None] = None, _close_source_tar_file_on_close: bool = False): + def from_reference_file_system(rfs: Union[dict, str], *, mode: LindiFileMode = "r", local_cache: Union[LocalCache, None] = None, _source_url_or_path: Union[str, None] = None, _source_tar_file: Union[LindiTarFile, None] = None, _close_source_tar_file_on_close: bool = False): """ Create a LindiH5pyFile from a reference file system. @@ -122,20 +122,11 @@ def from_reference_file_system(rfs: Union[dict, str, None], *, mode: LindiFileMo _close_source_tar_file_on_close : bool, optional Internal use only """ - if rfs is None: - rfs = { - "refs": { - '.zgroup': { - 'zarr_format': 2 - } - }, - } - if isinstance(rfs, str): if _source_url_or_path is not None: - raise Exception("_source_file_path is not None even though rfs is a string") + raise Exception("_source_file_path is not None even though rfs is a string") # pragma: no cover if _source_tar_file is not None: - raise Exception("_source_tar_file is not None even though rfs is a string") + raise Exception("_source_tar_file is not None even though rfs is a string") # pragma: no cover rfs_is_url = rfs.startswith("http://") or rfs.startswith("https://") if rfs_is_url: data, tar_file = _load_rfs_from_url(rfs) @@ -153,11 +144,11 @@ def from_reference_file_system(rfs: Union[dict, str, None], *, mode: LindiFileMo if mode == "r": # Readonly, file must exist (default) if not os.path.exists(rfs): - raise Exception(f"File does not exist: {rfs}") + raise FileNotFoundError(f"File does not exist: {rfs}") elif mode == "r+": # Read/write, file must exist if not os.path.exists(rfs): - raise Exception(f"File does not exist: {rfs}") + raise FileNotFoundError(f"File does not exist: {rfs}") elif mode == "w": # Create file, truncate if exists need_to_create_empty_file = True @@ -165,14 +156,16 @@ def from_reference_file_system(rfs: Union[dict, str, None], *, mode: LindiFileMo elif mode in ["w-", "x"]: # Create file, fail if exists if os.path.exists(rfs): - raise Exception(f"File already exists: {rfs}") + raise ValueError(f"File already exists: {rfs}") need_to_create_empty_file = True + # Now that we have already checked for existence, let's just change mode to 'w' + mode = 'w' elif mode == "a": # Read/write if exists, create otherwise if not os.path.exists(rfs): need_to_create_empty_file = True else: - raise Exception(f"Unhandled mode: {mode}") + raise Exception(f"Unhandled mode: {mode}") # pragma: no cover if need_to_create_empty_file: is_tar = rfs.endswith(".tar") is_dir = rfs.endswith(".d") @@ -207,7 +200,7 @@ def from_reference_file_system(rfs: Union[dict, str, None], *, mode: LindiFileMo _close_source_tar_file_on_close=_close_source_tar_file_on_close ) else: - raise Exception(f"Unhandled type for rfs: {type(rfs)}") + raise Exception(f"Unhandled type for rfs: {type(rfs)}") # pragma: no cover @staticmethod def from_zarr_store(zarr_store: ZarrStore, mode: LindiFileMode = "r", local_cache: Union[LocalCache, None] = None, _source_url_or_path: Union[str, None] = None, _source_tar_file: Union[LindiTarFile, None] = None, _close_source_tar_file_on_close: bool = False): @@ -230,7 +223,7 @@ def from_zarr_store(zarr_store: ZarrStore, mode: LindiFileMode = "r", local_cach return LindiH5pyFile.from_zarr_group(zarr_group, _zarr_store=zarr_store, mode=mode, local_cache=local_cache, _source_url_or_path=_source_url_or_path, _source_tar_file=_source_tar_file, _close_source_tar_file_on_close=_close_source_tar_file_on_close) @staticmethod - def from_zarr_group(zarr_group: zarr.Group, *, mode: LindiFileMode = "r", _zarr_store: Union[ZarrStore, None] = None, local_cache: Union[LocalCache, None] = None, _source_url_or_path: Union[str, None] = None, _source_tar_file: Union[LindiTarFile, None] = None, _close_source_tar_file_on_close: bool = False): + def from_zarr_group(zarr_group: zarr.Group, *, mode: LindiFileMode = "r", _zarr_store: ZarrStore, local_cache: Union[LocalCache, None] = None, _source_url_or_path: Union[str, None] = None, _source_tar_file: Union[LindiTarFile, None] = None, _close_source_tar_file_on_close: bool = False): """ Create a LindiH5pyFile from a zarr group. @@ -255,15 +248,13 @@ def to_reference_file_system(self): Export the internal in-memory representation to a reference file system. """ from ..LindiH5ZarrStore.LindiH5ZarrStore import LindiH5ZarrStore # avoid circular import - if self._zarr_store is None: - raise Exception("Cannot convert to reference file system without zarr store") zarr_store = self._zarr_store if isinstance(zarr_store, LindiTarStore): zarr_store = zarr_store._base_store if isinstance(zarr_store, LindiH5ZarrStore): return zarr_store.to_reference_file_system() if not isinstance(zarr_store, LindiReferenceFileSystemStore): - raise Exception(f"Cannot create reference file system when zarr store has type {type(self._zarr_store)}") + raise Exception(f"Cannot create reference file system when zarr store has type {type(self._zarr_store)}") # pragma: no cover rfs = zarr_store.rfs rfs_copy = json.loads(json.dumps(rfs)) LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts_in_rfs(rfs_copy) @@ -277,19 +268,19 @@ def write_lindi_file(self, filename: str, *, generation_metadata: Union[dict, No Parameters ---------- filename : str - The filename to write to. It must end with '.lindi.json' or '.lindi.tar'. + The filename (or directory) to write to. It must end with '.lindi.json', '.lindi.tar', or '.lindi.d'. generation_metadata : Union[dict, None], optional The optional generation metadata to include in the reference file system, by default None. This information dict is simply set to the 'generationMetadata' key in the reference file system. """ - if not filename.endswith(".lindi.json") and not filename.endswith(".lindi.tar"): - raise Exception("Filename must end with '.lindi.json' or '.lindi.tar'") + if not filename.endswith(".lindi.json") and not filename.endswith(".lindi.tar") and not filename.endswith(".lindi.d"): + raise ValueError("Filename must end with '.lindi.json' or '.lindi.tar'") rfs = self.to_reference_file_system() if self._source_tar_file: source_is_remote = self._source_url_or_path is not None and (self._source_url_or_path.startswith("http://") or self._source_url_or_path.startswith("https://")) if not source_is_remote: - raise Exception("Cannot write to lindi file if the source is a local lindi tar file because it would not be able to resolve the local references within the tar file.") + raise ValueError("Cannot write to lindi file if the source is a local lindi tar file because it would not be able to resolve the local references within the tar file.") assert self._source_url_or_path is not None _update_internal_references_to_remote_tar_file(rfs, self._source_url_or_path, self._source_tar_file) if generation_metadata is not None: @@ -301,7 +292,7 @@ def write_lindi_file(self, filename: str, *, generation_metadata: Union[dict, No elif filename.endswith(".d"): LindiTarFile.create(filename, rfs=rfs, dir_representation=True) else: - raise Exception("Unhandled file extension") + raise Exception("Unhandled file extension") # pragma: no cover @property def attrs(self): # type: ignore @@ -336,8 +327,8 @@ def swmr_mode(self, value): # type: ignore def close(self): if not self._is_open: - print('Warning: LINDI file already closed.') - return + print('Warning: LINDI file already closed.') # pragma: no cover + return # pragma: no cover self.flush() if self._close_source_tar_file_on_close and self._source_tar_file: self._source_tar_file.close() @@ -345,11 +336,11 @@ def close(self): def flush(self): if not self._is_open: - return + return # pragma: no cover if self._mode != 'r' and self._source_url_or_path is not None: is_url = self._source_url_or_path.startswith("http://") or self._source_url_or_path.startswith("https://") if is_url: - raise Exception("Cannot write to URL") + raise Exception("Cannot write to URL") # pragma: no cover rfs = self.to_reference_file_system() if self._source_tar_file: self._source_tar_file.write_rfs(rfs) @@ -394,7 +385,7 @@ def copy(self, source, dest, name=None, raise Exception("name must be provided for copy") src_item = self._get_item(source) if not isinstance(src_item, (h5py.Group, h5py.Dataset)): - raise Exception(f"Unexpected type for source in copy: {type(src_item)}") + raise Exception(f"Unexpected type for source in copy: {type(src_item)}") # pragma: no cover _recursive_copy(src_item, dest, name=name) def __delitem__(self, name): @@ -413,14 +404,14 @@ def _get_item(self, name, getlink=False, default=None): raise Exception("Getting link is not allowed for references") zarr_group = self._zarr_group if name._source != '.': - raise Exception(f'For now, source of reference must be ".", got "{name._source}"') + raise Exception(f'For now, source of reference must be ".", got "{name._source}"') # pragma: no cover if name._source_object_id is not None: - if name._source_object_id != zarr_group.attrs.get("object_id"): - raise Exception(f'Mismatch in source object_id: "{name._source_object_id}" and "{zarr_group.attrs.get("object_id")}"') + if name._source_object_id != zarr_group.attrs.get("object_id"): # pragma: no cover + raise Exception(f'Mismatch in source object_id: "{name._source_object_id}" and "{zarr_group.attrs.get("object_id")}"') # pragma: no cover target = self[name._path] if name._object_id is not None: - if name._object_id != target.attrs.get("object_id"): - raise Exception(f'Mismatch in object_id: "{name._object_id}" and "{target.attrs.get("object_id")}"') + if name._object_id != target.attrs.get("object_id"): # pragma: no cover + raise Exception(f'Mismatch in object_id: "{name._object_id}" and "{target.attrs.get("object_id")}"') # pragma: no cover return target # if it contains slashes, it's a path if isinstance(name, str) and "/" in name: @@ -477,24 +468,24 @@ def ref(self): # write def create_group(self, name, track_order=None): if self._mode == 'r': - raise Exception("Cannot create group in read-only mode") + raise ValueError("Cannot create group in read-only mode") if track_order is not None: raise Exception("track_order is not supported (I don't know what it is)") return self._the_group.create_group(name) def require_group(self, name): if self._mode == 'r': - raise Exception("Cannot require group in read-only mode") + raise ValueError("Cannot require group in read-only mode") return self._the_group.require_group(name) def create_dataset(self, name, shape=None, dtype=None, data=None, **kwds): if self._mode == 'r': - raise Exception("Cannot create dataset in read-only mode") + raise ValueError("Cannot create dataset in read-only mode") return self._the_group.create_dataset(name, shape=shape, dtype=dtype, data=data, **kwds) def require_dataset(self, name, shape, dtype, exact=False, **kwds): if self._mode == 'r': - raise Exception("Cannot require dataset in read-only mode") + raise ValueError("Cannot require dataset in read-only mode") return self._the_group.require_dataset(name, shape, dtype, exact=exact, **kwds) @@ -522,7 +513,7 @@ def _recursive_copy(src_item: Union[h5py.Group, h5py.Dataset], dest: h5py.File, # data because we can copy the reference. if isinstance(src_item.file, LindiH5pyFile) and isinstance(dest, LindiH5pyFile): if src_item.name is None: - raise Exception("src_item.name is None") + raise Exception("src_item.name is None") # pragma: no cover src_item_name = _without_initial_slash(src_item.name) src_zarr_store = src_item.file._zarr_store dst_zarr_store = dest._zarr_store diff --git a/lindi/LindiH5pyFile/writers/LindiH5pyGroupWriter.py b/lindi/LindiH5pyFile/writers/LindiH5pyGroupWriter.py index 0111603..068614f 100644 --- a/lindi/LindiH5pyFile/writers/LindiH5pyGroupWriter.py +++ b/lindi/LindiH5pyFile/writers/LindiH5pyGroupWriter.py @@ -38,6 +38,7 @@ def create_group(self, name, track_order=None): def require_group(self, name): if name in self.p: ret = self.p[name] + from ..LindiH5pyGroup import LindiH5pyGroup # avoid circular import if not isinstance(ret, LindiH5pyGroup): raise Exception(f'Expected a group at {name} but got {type(ret)}') return ret diff --git a/lindi/__init__.py b/lindi/__init__.py index 6e5e7f6..1b3a3ea 100644 --- a/lindi/__init__.py +++ b/lindi/__init__.py @@ -2,3 +2,4 @@ from .LindiH5pyFile import LindiH5pyFile, LindiH5pyGroup, LindiH5pyDataset, LindiH5pyHardLink, LindiH5pySoftLink # noqa: F401 from .LocalCache.LocalCache import LocalCache, ChunkTooLargeError # noqa: F401 from .LindiRemfile.additional_url_resolvers import add_additional_url_resolver # noqa: F401 +from .LindiH5pyFile.LindiH5pyReference import LindiH5pyReference # noqa: F401 diff --git a/lindi/conversion/create_zarr_dataset_from_h5_data.py b/lindi/conversion/create_zarr_dataset_from_h5_data.py index 0d0b500..6998cd6 100644 --- a/lindi/conversion/create_zarr_dataset_from_h5_data.py +++ b/lindi/conversion/create_zarr_dataset_from_h5_data.py @@ -266,7 +266,7 @@ def _get_default_chunks(shape: Tuple, dtype: Any) -> Tuple: shape_prod_0 = np.prod(shape[1:]) optimal_chunk_size_bytes = 1024 * 1024 * 20 # 20 MB optimal_chunk_size = optimal_chunk_size_bytes // (dtype_size * shape_prod_0) - if optimal_chunk_size <= shape[0]: + if optimal_chunk_size > shape[0]: return shape if optimal_chunk_size < 1: return (1,) + shape[1:] diff --git a/lindi/tar/lindi_tar.py b/lindi/tar/lindi_tar.py index 2398786..88d9922 100644 --- a/lindi/tar/lindi_tar.py +++ b/lindi/tar/lindi_tar.py @@ -10,6 +10,18 @@ INITIAL_LINDI_JSON_SIZE = 1024 * 8 +# for tests +def _test_set( + tar_entry_json_size: int, + initial_tar_index_json_size: int, + initial_lindi_json_size: int +): + global TAR_ENTRY_JSON_SIZE, INITIAL_TAR_INDEX_JSON_SIZE, INITIAL_LINDI_JSON_SIZE + TAR_ENTRY_JSON_SIZE = tar_entry_json_size + INITIAL_TAR_INDEX_JSON_SIZE = initial_tar_index_json_size + INITIAL_LINDI_JSON_SIZE = initial_lindi_json_size + + class LindiTarFile: def __init__(self, tar_path_or_url: str, dir_representation=False): self._tar_path_or_url = tar_path_or_url @@ -93,14 +105,18 @@ def overwrite_file_content(self, file_name: str, data: bytes): self._file.seek(info['d']) self._file.write(data) else: - # for safety: - file_parts = file_name.split("/") - for part in file_parts[:-1]: - if part.startswith('..'): - raise ValueError(f"Invalid path: {file_name}") - fname = self._tar_path_or_url + "/" + file_name - with open(fname, "wb") as f: - f.write(data) + # Actually not ever used. The file is just replaced. + raise Exception('Overwriting file content in a directory representation is not supported') # pragma: no cover + + # But if we did do it, it would look like this: + # # for safety: + # file_parts = file_name.split("/") + # for part in file_parts[:-1]: + # if part.startswith('..'): + # raise ValueError(f"Invalid path: {file_name}") + # fname = self._tar_path_or_url + "/" + file_name + # with open(fname, "wb") as f: + # f.write(data) def trash_file(self, file_name: str): if self._is_remote: @@ -160,8 +176,7 @@ def write_rfs(self, rfs: dict): rfs_json = _pad_bytes_to_leave_room_for_growth(rfs_json, INITIAL_LINDI_JSON_SIZE) self.write_file("lindi.json", rfs_json) else: - with open(self._tar_path_or_url + "/lindi.json", "wb") as f: - f.write(rfs_json.encode()) + self.write_file("lindi.json", rfs_json.encode()) def get_file_byte_range(self, file_name: str) -> tuple: if self._dir_representation: diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_copy.py b/tests/test_copy.py index 95af5a9..9525eac 100644 --- a/tests/test_copy.py +++ b/tests/test_copy.py @@ -3,7 +3,7 @@ import pytest import lindi from lindi import LindiH5ZarrStore -from utils import arrays_are_equal, assert_groups_equal +from .utils import arrays_are_equal, assert_groups_equal def test_copy_dataset(): @@ -59,13 +59,13 @@ def test_copy_group(): h5f.copy("X", h5f_2, "Z") h5f_2.copy("X", h5f_2, "Z") assert "Z" in h5f_2 - assert_groups_equal(h5f["X"], h5f_2["Z"]) # type: ignore + assert_groups_equal(h5f["X"], h5f_2["Z"], skip_large_datasets=False) # type: ignore rfs_copy = store.to_reference_file_system() h5f_3 = lindi.LindiH5pyFile.from_reference_file_system(rfs_copy, mode="r+") assert "Z" not in h5f_3 h5f_2.copy("X", h5f_3, "Z") assert "Z" in h5f_3 - assert_groups_equal(h5f["X"], h5f_3["Z"]) # type: ignore + assert_groups_equal(h5f["X"], h5f_3["Z"], skip_large_datasets=False) # type: ignore if __name__ == '__main__': diff --git a/tests/test_core.py b/tests/test_core.py deleted file mode 100644 index b0532a6..0000000 --- a/tests/test_core.py +++ /dev/null @@ -1,601 +0,0 @@ -import json -import pytest -import numpy as np -import h5py -import tempfile -import lindi -from lindi import LindiH5ZarrStore -from utils import arrays_are_equal, lists_are_equal - - -def test_variety(): - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - f.create_dataset("dataset1", data=[1, 2, 3]) - f.create_group("group1") - f.attrs["int1"] = 1 - f.attrs["float1"] = 3.14 - f.attrs["str1"] = "abc" - f.attrs["bytes1"] = b"def" - f.attrs["list1"] = [1, 2, 3] - f.attrs["tuple1"] = (3, 4, 5) - f.attrs["array1"] = np.arange(10) - f.attrs["dataset1_ref"] = f["dataset1"].ref - f.attrs["group1_ref"] = f["group1"].ref - f["dataset1"].attrs["test_attr1"] = "attribute-of-dataset1" - f["group1"].attrs["test_attr2"] = "attribute-of-group1" - h5f = h5py.File(filename, "r") - with LindiH5ZarrStore.from_file(filename, url=filename) as store: - rfs = store.to_reference_file_system() - h5f_rfs = lindi.LindiH5pyFile.from_reference_file_system(rfs) - assert h5f_rfs.attrs["int1"] == h5f.attrs["int1"] - assert h5f_rfs.attrs["float1"] == h5f.attrs["float1"] - assert h5f_rfs.attrs["str1"] == h5f.attrs["str1"] - assert h5f_rfs.attrs["bytes1"] == h5f.attrs["bytes1"] - assert lists_are_equal(h5f_rfs.attrs["list1"], h5f.attrs["list1"]) - assert lists_are_equal(h5f_rfs.attrs["tuple1"], h5f.attrs["tuple1"]) - assert arrays_are_equal(np.array(h5f_rfs.attrs["array1"]), h5f.attrs["array1"]) - assert h5f_rfs["dataset1"].attrs["test_attr1"] == h5f["dataset1"].attrs["test_attr1"] # type: ignore - assert h5f_rfs["dataset1"].id # type: ignore - assert arrays_are_equal(h5f_rfs["dataset1"][()], h5f["dataset1"][()]) # type: ignore - assert h5f_rfs["group1"].attrs["test_attr2"] == h5f["group1"].attrs["test_attr2"] # type: ignore - target_1 = h5f[h5f.attrs["dataset1_ref"]] - target_2 = h5f_rfs[h5f_rfs.attrs["dataset1_ref"]] - assert target_1.attrs["test_attr1"] == target_2.attrs["test_attr1"] # type: ignore - target_1 = h5f[h5f.attrs["group1_ref"]] - target_2 = h5f_rfs[h5f_rfs.attrs["group1_ref"]] - assert target_1.attrs["test_attr2"] == target_2.attrs["test_attr2"] # type: ignore - - -def test_group_soft_links(): - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - g = f.create_group('group_target') - g.attrs['foo'] = 'bar' - g.create_dataset('dataset1', data=[5, 6, 7]) - f['soft_link'] = h5py.SoftLink('/group_target') - h5f = h5py.File(filename, "r") - with LindiH5ZarrStore.from_file(filename, url=filename) as store: - rfs = store.to_reference_file_system() - h5f_rfs = lindi.LindiH5pyFile.from_reference_file_system(rfs) - g1 = h5f['group_target'] - assert isinstance(g1, h5py.Group) - g2 = h5f_rfs['group_target'] - assert isinstance(g2, h5py.Group) - assert g1.attrs['foo'] == g2.attrs['foo'] # type: ignore - with pytest.raises(TypeError): - g1[np.array([0, 1, 2])] - h1 = h5f['soft_link'] - assert isinstance(h1, h5py.Group) - h2 = h5f_rfs['soft_link'] - assert isinstance(h2, h5py.Group) - assert h1.attrs['foo'] == h2.attrs['foo'] # type: ignore - # this is tricky: it seems that with h5py, the name of the soft link - # is the source name. So the following assertion will fail. - # assert h1.name == h2.name - k1 = h5f.get('soft_link', getlink=True) - k2 = h5f_rfs.get('soft_link', getlink=True) - assert isinstance(k1, h5py.SoftLink) - assert isinstance(k2, h5py.SoftLink) - ds1 = h5f['soft_link']['dataset1'] # type: ignore - assert isinstance(ds1, h5py.Dataset) - ds2 = h5f_rfs['soft_link']['dataset1'] # type: ignore - assert isinstance(ds2, h5py.Dataset) - assert arrays_are_equal(ds1[()], ds2[()]) - ds1 = h5f['soft_link/dataset1'] - assert isinstance(ds1, h5py.Dataset) - ds2 = h5f_rfs['soft_link/dataset1'] - assert isinstance(ds2, h5py.Dataset) - assert arrays_are_equal(ds1[()], ds2[()]) - ds1 = h5f['group_target/dataset1'] - assert isinstance(ds1, h5py.Dataset) - ds2 = h5f_rfs['group_target/dataset1'] - assert isinstance(ds2, h5py.Dataset) - assert arrays_are_equal(ds1[()], ds2[()]) - - -def test_dataset_soft_links(): - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - g = f.create_group('group_target') - d = g.create_dataset('dataset1', data=[5, 6, 7]) - d.attrs['foo'] = 'bar' - f['dataset_soft_link'] = h5py.SoftLink('/group_target/dataset1') - h5f = h5py.File(filename, "r") - with LindiH5ZarrStore.from_file(filename, url=filename) as store: - rfs = store.to_reference_file_system() - h5f_rfs = lindi.LindiH5pyFile.from_reference_file_system(rfs) - h1 = h5f['dataset_soft_link'] - assert isinstance(h1, h5py.Dataset) - h2 = h5f_rfs['dataset_soft_link'] - assert isinstance(h2, h5py.Dataset) - assert arrays_are_equal(h1[()], h2[()]) - assert h1.attrs['foo'] == h2.attrs['foo'] # type: ignore - # this is tricky: it seems that with h5py, the name of the soft link - # is the source name. So the following assertion will fail. - # assert h1.name == h2.name - k1 = h5f.get('dataset_soft_link', getlink=True) - k2 = h5f_rfs.get('dataset_soft_link', getlink=True) - assert isinstance(k1, h5py.SoftLink) - assert isinstance(k2, h5py.SoftLink) - - -def test_arrays_of_compound_dtype(): - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - dt = np.dtype([("x", "i4"), ("y", "f8")]) - dataset1 = f.create_dataset("dataset1", data=[(1, 3.14), (2, 6.28)], dtype=dt) - dt = np.dtype([("a", "i4"), ("b", "f8"), ("c", "S10")]) - dataset2 = f.create_dataset("dataset2", data=[(1, 3.14, "abc"), (2, 6.28, "def")], dtype=dt) - # how about references! - dt = np.dtype([("a", "i4"), ("b", "f8"), ("c", h5py.special_dtype(ref=h5py.Reference))]) - f.create_dataset("dataset3", data=[(1, 3.14, dataset1.ref), (2, 6.28, dataset2.ref)], dtype=dt) - h5f = h5py.File(filename, "r") - with LindiH5ZarrStore.from_file(filename, url=filename) as store: - rfs = store.to_reference_file_system() - h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs) - ds1_1 = h5f['dataset1'] - assert isinstance(ds1_1, h5py.Dataset) - ds1_2 = h5f_2['dataset1'] - assert isinstance(ds1_2, h5py.Dataset) - assert ds1_1.dtype == ds1_2.dtype - assert arrays_are_equal(ds1_1['x'][()], ds1_2['x'][()]) # type: ignore - assert arrays_are_equal(ds1_1['y'][()], ds1_2['y'][()]) # type: ignore - ds2_1 = h5f['dataset2'] - assert isinstance(ds2_1, h5py.Dataset) - ds2_2 = h5f_2['dataset2'] - assert isinstance(ds2_2, h5py.Dataset) - assert ds2_1.dtype == ds2_2.dtype - assert arrays_are_equal(ds2_1['a'][()], ds2_2['a'][()]) # type: ignore - assert arrays_are_equal(ds2_1['b'][()], ds2_2['b'][()]) # type: ignore - assert arrays_are_equal(ds2_1['c'][()], ds2_2['c'][()]) # type: ignore - ds3_1 = h5f['dataset3'] - assert isinstance(ds3_1, h5py.Dataset) - ds3_2 = h5f_2['dataset3'] - assert isinstance(ds3_2, h5py.Dataset) - assert ds3_1.dtype == ds3_2.dtype - assert ds3_1.dtype['c'] == ds3_2.dtype['c'] - assert ds3_2.dtype['c'] == h5py.special_dtype(ref=h5py.Reference) - target1 = h5f[ds3_1['c'][0]] - assert isinstance(target1, h5py.Dataset) - target2 = h5f_2[ds3_2['c'][0]] - assert isinstance(target2, h5py.Dataset) - - -def test_arrays_of_compound_dtype_with_references(): - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - dt = np.dtype([("x", "i4"), ("y", h5py.special_dtype(ref=h5py.Reference))]) - Y_ds = f.create_dataset("Y", data=[1, 2, 3]) - f.create_dataset("dataset1", data=[(1, Y_ds.ref), (2, Y_ds.ref)], dtype=dt) - h5f = h5py.File(filename, "r") - with LindiH5ZarrStore.from_file(filename, url=filename) as store: - rfs = store.to_reference_file_system() - h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs) - ds1_1 = h5f['dataset1'] - assert isinstance(ds1_1, h5py.Dataset) - ds1_2 = h5f_2['dataset1'] - assert isinstance(ds1_2, h5py.Dataset) - assert ds1_1.dtype == ds1_2.dtype - assert arrays_are_equal(ds1_1['x'][()], ds1_2['x'][()]) # type: ignore - ref1 = ds1_1['y'][0] - ref2 = ds1_2['y'][0] - assert isinstance(ref1, h5py.Reference) - assert isinstance(ref2, h5py.Reference) - target1 = h5f[ref1] - assert isinstance(target1, h5py.Dataset) - target2 = h5f_2[ref2] - assert isinstance(target2, h5py.Dataset) - assert arrays_are_equal(target1[()], target2[()]) - - -def test_scalar_arrays(): - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - f.create_dataset("X", data=1) - f.create_dataset("Y", data=3.14) - f.create_dataset("Z", data="abc") - f.create_dataset("W", data=b"def") - h5f = h5py.File(filename, "r") - with LindiH5ZarrStore.from_file(filename, url=filename) as store: - rfs = store.to_reference_file_system() - h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs) - X1 = h5f['X'] - assert isinstance(X1, h5py.Dataset) - X2 = h5f_2['X'] - assert isinstance(X2, h5py.Dataset) - assert X1[()] == X2[()] - assert X2.size == 1 - Y1 = h5f['Y'] - assert isinstance(Y1, h5py.Dataset) - Y2 = h5f_2['Y'] - assert isinstance(Y2, h5py.Dataset) - assert Y1[()] == Y2[()] - Z1 = h5f['Z'] - assert isinstance(Z1, h5py.Dataset) - Z2 = h5f_2['Z'] - assert isinstance(Z2, h5py.Dataset) - # Note that encode is needed because Z1[()] is a bytes - assert Z1[()] == Z2[()].encode() # type: ignore - W1 = h5f['W'] - assert isinstance(W1, h5py.Dataset) - W2 = h5f_2['W'] - assert isinstance(W2, h5py.Dataset) - # Note that encode is needed because W2[()] is a str - assert W1[()] == W2[()].encode() # type: ignore - - -def test_arrays_of_strings(): - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - f.create_dataset("X", data=["abc", "def", "ghi"]) - h5f = h5py.File(filename, "r") - with LindiH5ZarrStore.from_file(filename, url=filename) as store: - rfs = store.to_reference_file_system() - h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs) - X1 = h5f['X'] - assert isinstance(X1, h5py.Dataset) - X2 = h5f_2['X'] - assert isinstance(X2, h5py.Dataset) - assert lists_are_equal(X1[:].tolist(), [x.encode() for x in X2[:]]) # type: ignore - - -def test_numpy_arrays(): - array_1 = ("1", np.arange(60).reshape(3, 20), (3, 7)) - array_2 = ("2", np.arange(60).reshape(3, 20), None) - array_boolean = ("3", np.array([[True, False, True], [False, True, False]]), None) - for label, array, chunks in [array_1, array_2, array_boolean]: - print(f"Testing numpy array {label}") - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - f.create_dataset("X", data=array, chunks=chunks) - with LindiH5ZarrStore.from_file( - filename, url=filename - ) as store: # set url so that a reference file system can be created - rfs = store.to_reference_file_system() - client = lindi.LindiH5pyFile.from_reference_file_system(rfs) - h5f = h5py.File(filename, "r") - X1 = h5f["X"] - assert isinstance(X1, h5py.Dataset) - X2 = client["X"] - assert isinstance(X2, lindi.LindiH5pyDataset) - - assert X1.shape == X2.shape - assert X1.dtype == X2.dtype - assert X1.size == X2.size - assert X1.nbytes == X2.nbytes - assert len(X1) == len(X2) - - -def test_nan_inf_attributes(): - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - f.create_dataset("X", data=[1, 2, 3]) - f["X"].attrs["nan"] = np.nan - f["X"].attrs["inf"] = np.inf - f["X"].attrs["ninf"] = -np.inf - f["X"].attrs['float_list'] = [np.nan, np.inf, -np.inf, 23] - h5f = h5py.File(filename, "r") - with LindiH5ZarrStore.from_file(filename, url=filename) as store: - rfs = store.to_reference_file_system() - client = lindi.LindiH5pyFile.from_reference_file_system(rfs) - - X1 = h5f["X"] - assert isinstance(X1, h5py.Dataset) - X2 = client["X"] - assert isinstance(X2, lindi.LindiH5pyDataset) - - nanval = X1.attrs["nan"] - assert isinstance(nanval, float) and np.isnan(nanval) - assert X1.attrs["inf"] == np.inf - assert X1.attrs["ninf"] == -np.inf - assert lists_are_equal(X1.attrs['float_list'], [np.nan, np.inf, -np.inf, 23]) - - nanval = X2.attrs["nan"] - assert isinstance(nanval, float) and np.isnan(nanval) - assert X2.attrs["inf"] == np.inf - assert X2.attrs["ninf"] == -np.inf - assert lists_are_equal(X2.attrs['float_list'], [np.nan, np.inf, -np.inf, 23]) - - for test_string in ["NaN", "Infinity", "-Infinity", "Not-illegal"]: - filename = f"{tmpdir}/illegal_string.h5" - with h5py.File(filename, "w") as f: - f.create_dataset("X", data=[1, 2, 3]) - f["X"].attrs["test_string"] = test_string - with LindiH5ZarrStore.from_file(filename, url=filename) as store: - if test_string in ["NaN", "Infinity", "-Infinity"]: - with pytest.raises(Exception): - rfs = store.to_reference_file_system() - else: - rfs = store.to_reference_file_system() - client = lindi.LindiH5pyFile.from_reference_file_system(rfs) - assert client["X"].attrs["test_string"] == test_string # type: ignore - - -def test_reference_file_system_to_file(): - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - f.create_dataset("X", data=[1, 2, 3]) - with LindiH5ZarrStore.from_file(filename, url=filename) as store: - rfs_fname = f'{tmpdir}/test.lindi.json' - store.write_reference_file_system(rfs_fname) - client = lindi.LindiH5pyFile.from_reference_file_system(rfs_fname) - X = client["X"] - assert isinstance(X, lindi.LindiH5pyDataset) - assert lists_are_equal(X[()], [1, 2, 3]) - - -def test_lindi_reference_file_system_store(): - from lindi.LindiH5pyFile.LindiReferenceFileSystemStore import LindiReferenceFileSystemStore - - # test for invalid rfs - rfs = {"rfs_misspelled": {"a": "a"}} # misspelled - with pytest.raises(Exception): - store = LindiReferenceFileSystemStore(rfs) - rfs = {"refs": {"a": 1}} # invalid value - with pytest.raises(Exception): - store = LindiReferenceFileSystemStore(rfs) - rfs = {"refs": {"a": ["a", 1]}} # invalid list - with pytest.raises(Exception): - store = LindiReferenceFileSystemStore(rfs) - rfs = {"refs": {"a": ["a", 1, 2, 3]}} # invalid list - with pytest.raises(Exception): - store = LindiReferenceFileSystemStore(rfs) - rfs = {"refs": {"a": [1, 2, 3]}} # invalid list - with pytest.raises(Exception): - store = LindiReferenceFileSystemStore(rfs) - rfs = {"refs": {"a": ['a', 'a', 2]}} # invalid list - with pytest.raises(Exception): - store = LindiReferenceFileSystemStore(rfs) - rfs = {"refs": {"a": ['a', 1, 'a']}} # invalid list - with pytest.raises(Exception): - store = LindiReferenceFileSystemStore(rfs) - rfs = {"refs": {"a": "base64:abc+++"}} # invalid base64 - store = LindiReferenceFileSystemStore(rfs) - with pytest.raises(Exception): - store["a"] - with pytest.raises(Exception): - store[{}] # invalid key type # type: ignore - rfs = {"refs": {"a": 83}} # invalid value - with pytest.raises(Exception): - store = LindiReferenceFileSystemStore(rfs) - rfs = {"refs": {"a": {"test": 1}}} - store = LindiReferenceFileSystemStore(rfs) - assert json.loads(store["a"]) == {"test": 1} - rfs = {"refs": {".zattrs": "{\"test\": 2}"}} - store = LindiReferenceFileSystemStore(rfs) - assert json.loads(store[".zattrs"]) == {"test": 2} - rfs = {"refs": {".zattrs": "{\"test\": 3}"}} - LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts_in_rfs(rfs) - assert isinstance(rfs["refs"][".zattrs"], dict) - store = LindiReferenceFileSystemStore(rfs) - assert json.loads(store[".zattrs"]) == {"test": 3} - rfs = {"refs": {".zattrs_xxx": "{\"test\": 5}"}} - LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts_in_rfs(rfs) - assert isinstance(rfs["refs"][".zattrs_xxx"], str) - rfs = {"refs": {"0": ["http://example.com", 0, 1000]}} - LindiReferenceFileSystemStore.use_templates_in_rfs(rfs) - assert 'templates' not in rfs - assert rfs['refs']['0'] == ['http://example.com', 0, 1000] - with tempfile.TemporaryDirectory() as tmpdir: - with open(f"{tmpdir}/file1.txt", "wb") as f: - f.write(b"a" * 1000) - f.write(b"b" * 1000) - f.write(b"c" * 1000) - f.write(b"d" * 1000) - f.write(b"e" * 1000) - with open(f"{tmpdir}/file2.txt", "wb") as f: - f.write(b"f" * 1000) - f.write(b"g" * 1000) - f.write(b"h" * 1000) - f.write(b"i" * 1000) - f.write(b"j" * 1000) - rfs = {"refs": { - "0": [f"{tmpdir}/file1.txt", 0, 1000], - "1": [f"{tmpdir}/file1.txt", 1000, 1000], - "2": [f"{tmpdir}/file1.txt", 2000, 1000], - "3": [f"{tmpdir}/file1.txt", 3000, 1000], - "4": [f"{tmpdir}/file1.txt", 4000, 1000], - "5": [f"{tmpdir}/file2.txt", 0, 1000], - "6": [f"{tmpdir}/file2.txt", 1000, 1000], - "7": [f"{tmpdir}/file2.txt", 2000, 1000], - "8": [f"{tmpdir}/file2.txt", 3000, 1000], - "9": [f"{tmpdir}/file2.txt", 4000, 1000], - }} - LindiReferenceFileSystemStore.use_templates_in_rfs(rfs) - assert 'templates' in rfs - assert rfs['templates']['u1'] == f"{tmpdir}/file1.txt" - assert rfs['templates']['u2'] == f"{tmpdir}/file2.txt" - assert rfs['refs']['0'] == ['{{u1}}', 0, 1000] - assert rfs['refs']['1'] == ['{{u1}}', 1000, 1000] - assert rfs['refs']['2'] == ['{{u1}}', 2000, 1000] - assert rfs['refs']['3'] == ['{{u1}}', 3000, 1000] - assert rfs['refs']['4'] == ['{{u1}}', 4000, 1000] - assert rfs['refs']['5'] == ['{{u2}}', 0, 1000] - assert rfs['refs']['6'] == ['{{u2}}', 1000, 1000] - assert rfs['refs']['7'] == ['{{u2}}', 2000, 1000] - assert rfs['refs']['8'] == ['{{u2}}', 3000, 1000] - assert rfs['refs']['9'] == ['{{u2}}', 4000, 1000] - store = LindiReferenceFileSystemStore(rfs) - assert store['0'] == b"a" * 1000 - assert store['1'] == b"b" * 1000 - assert store['2'] == b"c" * 1000 - assert store['3'] == b"d" * 1000 - assert store['4'] == b"e" * 1000 - assert store['5'] == b"f" * 1000 - assert store['6'] == b"g" * 1000 - assert store['7'] == b"h" * 1000 - assert store['8'] == b"i" * 1000 - assert store['9'] == b"j" * 1000 - - rfs = {"refs": {"a": "abc"}} - store = LindiReferenceFileSystemStore(rfs) - assert store.is_readable() - assert store.is_writeable() - assert store.is_listable() - assert not store.is_erasable() - assert len(store) == 1 - assert "a" in store - assert "b" not in store - assert store["a"] == b"abc" - - -def test_lindi_h5py_reference(): - from lindi.LindiH5pyFile.LindiH5pyReference import LindiH5pyReference - obj = { - "object_id": "object_id", - "path": "path", - "source": "source", - "source_object_id": "source_object_id", - } - ref = LindiH5pyReference(obj) - assert repr(ref) == "LindiH5pyReference(object_id, path)" - assert str(ref) == "LindiH5pyReference(object_id, path)" - assert ref._object_id == "object_id" - assert ref._path == "path" - assert ref._source == "source" - assert ref._source_object_id == "source_object_id" - assert ref.__class__.__name__ == "LindiH5pyReference" - assert isinstance(ref, h5py.h5r.Reference) - assert isinstance(ref, LindiH5pyReference) - - -def test_lindi_h5_zarr_store(): - # Test that exceptions are raised as expected - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - f.create_dataset("dataset1", data=[1, 2, 3]) - f.create_group("group1") - f.create_dataset("scalar_dataset", data=42) - # Store is closed - store = LindiH5ZarrStore.from_file(filename) - store.close() - store_is_closed_msg = "Store is closed" - with pytest.raises(Exception, match=store_is_closed_msg): - if 'dataset1/.zarray' in store: - pass - with pytest.raises(Exception, match=store_is_closed_msg): - store["dataset1/.zarray"] - with pytest.raises(Exception, match=store_is_closed_msg): - store["dataset1/.zattrs"] - with pytest.raises(Exception, match=store_is_closed_msg): - store["group1/.zgroup"] - with pytest.raises(Exception, match=store_is_closed_msg): - store["group1/.zattrs"] - with pytest.raises(Exception, match=store_is_closed_msg): - store["dataset1/0"] - with pytest.raises(Exception, match=store_is_closed_msg): - store.listdir() - with pytest.raises(Exception, match=store_is_closed_msg): - store.to_reference_file_system() - with pytest.raises(Exception, match=store_is_closed_msg): - store.write_reference_file_system("test.lindi.json") - with pytest.raises(Exception, match=store_is_closed_msg): - store._get_chunk_file_bytes_data("dataset1", "0") - - # Nonexistent item - store = LindiH5ZarrStore.from_file(filename) - assert 'nonexistent/.zattrs' not in store - with pytest.raises(KeyError): - store["nonexistent/.zattrs"] - assert 'nonexistent/.zgroup' not in store - with pytest.raises(Exception, match="Item nonexistent is not a group"): - store["nonexistent/.zgroup"] - assert 'nonexistent/.zarray' not in store - with pytest.raises(Exception, match="Item nonexistent is not a dataset"): - store["nonexistent/.zarray"] - assert 'nonexistent/0' not in store - with pytest.raises(Exception, match="Item nonexistent is not a dataset"): - store["nonexistent/0"] - - # Key error - store = LindiH5ZarrStore.from_file(filename, url='.') - with pytest.raises(KeyError): - store[''] - assert '' not in store - with pytest.raises(KeyError): - store["nonexistent/.zattrs"] - - # URL is not set - store = LindiH5ZarrStore.from_file(filename, url=None) - with pytest.raises(Exception, match="You must specify a url to create a reference file system"): - store.to_reference_file_system() - - # External links not supported - with h5py.File(f'{tmpdir}/external.h5', 'w') as f: - grp = f.create_group('group1') - grp.attrs['attr1'] = 'value1' - with h5py.File(filename, "a") as f: - f["external_link"] = h5py.ExternalLink(f'{tmpdir}/external.h5', 'group1') - store = LindiH5ZarrStore.from_file(filename, url=filename) - with pytest.raises(Exception, match="External links not supported: external_link"): - print(store["external_link/.zattrs"]) - - store = LindiH5ZarrStore.from_file(filename, url=filename) - with pytest.raises(Exception, match="Setting items is not allowed"): - store["dataset1/.zattrs"] = b"{}" - with pytest.raises(Exception, match="Deleting items is not allowed"): - del store["dataset1/.zattrs"] - with pytest.raises(Exception, match="Not implemented"): - iter(store) - with pytest.raises(Exception, match="Not implemented"): - len(store) - - store = LindiH5ZarrStore.from_file(filename, url=filename) - assert 'dataset1/0.0' not in store - assert 'dataset1/1' not in store - assert 'scalar_dataset/0' in store - assert 'scalar_dataset/1' not in store - - -def test_numpy_array_of_byte_strings(): - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - f.create_dataset("X", data=np.array([b"abc", b"def", b"ghi"])) - h5f = h5py.File(filename, "r") - with LindiH5ZarrStore.from_file(filename, url=filename) as store: - rfs = store.to_reference_file_system() - h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs) - X1 = h5f['X'] - assert isinstance(X1, h5py.Dataset) - X2 = h5f_2['X'] - assert isinstance(X2, h5py.Dataset) - assert lists_are_equal(X1[:].tolist(), X2[:].tolist()) # type: ignore - - -def test_dataset_zero_shape(): - with tempfile.TemporaryDirectory() as tmpdir: - filename = f"{tmpdir}/test.h5" - with h5py.File(filename, "w") as f: - f.create_dataset("X1D", data=np.array([], dtype=np.int32), shape=(0,)) # NOTE this is not a scalar - f.create_dataset("X3D", data=np.array([], dtype=np.int32), shape=(0,0,0)) - h5f = h5py.File(filename, "r") - with LindiH5ZarrStore.from_file(filename, url=filename) as store: - rfs = store.to_reference_file_system() - h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs) - X1 = h5f['X1D'] - assert isinstance(X1, h5py.Dataset) - X2 = h5f_2['X1D'] - assert isinstance(X2, h5py.Dataset) - assert arrays_are_equal(X1[:], X2[:]) - X1 = h5f['X3D'] - assert isinstance(X1, h5py.Dataset) - X2 = h5f_2['X3D'] - assert isinstance(X2, h5py.Dataset) - assert arrays_are_equal(X1[:], X2[:]) - - -if __name__ == '__main__': - pass diff --git a/tests/test_examples.py b/tests/test_examples.py new file mode 100644 index 0000000..aafc5de --- /dev/null +++ b/tests/test_examples.py @@ -0,0 +1,137 @@ +import os +import tempfile +import pytest +import numpy as np +import lindi + + +def test_create_and_read_lindi_json(): + with tempfile.TemporaryDirectory() as tmpdir: + fname = f'{tmpdir}/example.lindi.json' + # Create a new lindi.json file + with lindi.LindiH5pyFile.from_lindi_file(fname, mode='w') as f: + f.attrs['attr1'] = 'value1' + f.attrs['attr2'] = 7 + ds = f.create_dataset('dataset1', shape=(10,), dtype='f') + ds[...] = 12 + + # Later read the file + with lindi.LindiH5pyFile.from_lindi_file(fname, mode='r') as f: + assert f.attrs['attr1'] == 'value1' + assert f.attrs['attr2'] == 7 + ds = f['dataset1'] + assert isinstance(ds, lindi.LindiH5pyDataset) + assert ds.shape == (10,) + for i in range(10): + assert ds[i] == 12 + + +def test_create_and_read_lindi_tar(): + with tempfile.TemporaryDirectory() as tmpdir: + fname = f'{tmpdir}/example.lindi.tar' + # Create a new lindi.json file + with lindi.LindiH5pyFile.from_lindi_file(fname, mode='w') as f: + f.attrs['attr1'] = 'value1' + f.attrs['attr2'] = 7 + ds = f.create_dataset('dataset1', shape=(10,), dtype='f') + ds[...] = 12 + + # Later read the file + with lindi.LindiH5pyFile.from_lindi_file(fname, mode='r') as f: + assert f.attrs['attr1'] == 'value1' + assert f.attrs['attr2'] == 7 + ds = f['dataset1'] + assert isinstance(ds, lindi.LindiH5pyDataset) + assert ds.shape == (10,) + for i in range(10): + assert ds[i] == 12 + + +def test_create_and_read_lindi_dir(): + with tempfile.TemporaryDirectory() as tmpdir: + dirname = f'{tmpdir}/example.lindi.d' + # Create a new lindi.json file + with lindi.LindiH5pyFile.from_lindi_file(dirname, mode='w') as f: + f.attrs['attr1'] = 'value1' + f.attrs['attr2'] = 7 + ds = f.create_dataset('dataset1', shape=(10,), dtype='f') + ds[...] = 12 + + # verify that it's a directory + assert os.path.isdir(dirname) + + # Later read the file + with lindi.LindiH5pyFile.from_lindi_file(dirname, mode='r') as f: + assert f.attrs['attr1'] == 'value1' + assert f.attrs['attr2'] == 7 + ds = f['dataset1'] + assert isinstance(ds, lindi.LindiH5pyDataset) + assert ds.shape == (10,) + for i in range(10): + assert ds[i] == 12 + + +@pytest.mark.network +def test_represent_remote_nwb_as_lindi_json(): + with tempfile.TemporaryDirectory() as tmpdir: + fname = f'{tmpdir}/example.nwb.lindi.json' + # Define the URL for a remote NWB file + h5_url = "https://api.dandiarchive.org/api/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/download/" + + # Load as LINDI and view using pynwb + f = lindi.LindiH5pyFile.from_hdf5_file(h5_url) + + # Save as LINDI JSON + f.write_lindi_file(fname) + f.flush() + + # Later, read directly from the LINDI JSON file + g = lindi.LindiH5pyFile.from_lindi_file(fname) + + # Later, read directly from the LINDI JSON file + for k, v in f.attrs.items(): + v2 = g.attrs[k] + if isinstance(v, lindi.LindiH5pyReference): + assert isinstance(v2, lindi.LindiH5pyReference) + else: + assert v == v2 + + f.close() + g.close() + + +@pytest.mark.network +def test_amend_remote_nwb_as_lindi_tar(): + import pynwb + from pynwb.file import TimeSeries + with tempfile.TemporaryDirectory() as tmpdir: + fname = f'{tmpdir}/example.nwb.lindi.tar' + # Load the remote NWB file from DANDI + h5_url = "https://api.dandiarchive.org/api/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/download/" + f = lindi.LindiH5pyFile.from_hdf5_file(h5_url) + + # Write to a local .lindi.tar file + f.write_lindi_file(fname) + f.close() + + # Open with pynwb and add new data + g = lindi.LindiH5pyFile.from_lindi_file(fname, mode='r+') + with pynwb.NWBHDF5IO(file=g, mode="a") as io: + nwbfile = io.read() + timeseries_test = TimeSeries( + name="test", + data=np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]), + rate=1., + unit='s' + ) + nwbfile.processing['behavior'].add(timeseries_test) # type: ignore + io.write(nwbfile) # type: ignore + + # Later on, you can read the file again + h = lindi.LindiH5pyFile.from_lindi_file(fname) + with pynwb.NWBHDF5IO(file=h, mode="r") as io: + nwbfile = io.read() + test_timeseries = nwbfile.processing['behavior']['test'] # type: ignore + assert test_timeseries.data.shape == (9,) + for i in range(9): + assert test_timeseries.data[i] == i + 1 diff --git a/tests/test_lindi_h5py_file.py b/tests/test_lindi_h5py_file.py new file mode 100644 index 0000000..032f8e6 --- /dev/null +++ b/tests/test_lindi_h5py_file.py @@ -0,0 +1,369 @@ +import tempfile +import os +import pytest +import h5py +import lindi +from .utils import assert_h5py_files_equal + + +def test_1(): + with tempfile.TemporaryDirectory() as tmpdir: + h5_fname = f'{tmpdir}/test.h5' + lindi_json_fname = f'{tmpdir}/test.lindi.json' + lindi_tar_fname = f'{tmpdir}/test.lindi.tar' + lindi_d_fname = f'{tmpdir}/test.lindi.d' + + create_example_h5_file(h5_fname) + + with lindi.LindiH5pyFile.from_hdf5_file(h5_fname, url=h5_fname) as f: + f.write_lindi_file(lindi_json_fname) + + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname) as f: + f.write_lindi_file(lindi_tar_fname) + + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname) as f: + f.write_lindi_file(lindi_d_fname) + + assert os.path.isdir(lindi_d_fname) + + with h5py.File(h5_fname, 'r') as h5f: + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname) as f: + assert_h5py_files_equal(h5f, f, skip_large_datasets=False) + + with lindi.LindiH5pyFile.from_lindi_file(lindi_tar_fname) as f: + assert_h5py_files_equal(h5f, f, skip_large_datasets=False) + + with lindi.LindiH5pyFile.from_lindi_file(lindi_d_fname) as f: + assert_h5py_files_equal(h5f, f, skip_large_datasets=False) + + +def test_fail_open_hdf5_in_write_mode(): + with tempfile.TemporaryDirectory() as tmpdir: + h5_fname = f'{tmpdir}/test.h5' + create_example_h5_file(h5_fname) + with lindi.LindiH5pyFile.from_hdf5_file(h5_fname, url=h5_fname, mode='r'): + pass + with pytest.raises(ValueError): + with lindi.LindiH5pyFile.from_hdf5_file(h5_fname, url=h5_fname, mode='w'): + pass + + +def test_create_new_lindi_json_file(): + with tempfile.TemporaryDirectory() as tmpdir: + lindi_json_fname = f'{tmpdir}/test.lindi.json' + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname, mode='w') as f: + f.attrs['attr1'] = 'value1' + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname) as f: + assert f.attrs['attr1'] == 'value1' + + +@pytest.mark.network +def test_load_remote_lindi_json_file(): + # https://neurosift.app/?p=/nwb&url=https://api.dandiarchive.org/api/assets/c04f6b30-82bf-40e1-9210-34f0bcd8be24/download/&dandisetId=000409&dandisetVersion=draft + url_lindi_json = 'https://lindi.neurosift.org/dandi/dandisets/000409/assets/c04f6b30-82bf-40e1-9210-34f0bcd8be24/nwb.lindi.json' + url_hdf5 = 'https://api.dandiarchive.org/api/assets/c04f6b30-82bf-40e1-9210-34f0bcd8be24/download/' + f1 = lindi.LindiH5pyFile.from_lindi_file(url_lindi_json) + f2 = lindi.LindiH5pyFile.from_hdf5_file(url_hdf5) + assert_h5py_files_equal(f1, f2, skip_large_datasets=True) + + +def test_fail_open_non_existing_file_for_reading(): + with tempfile.TemporaryDirectory() as tmpdir: + with pytest.raises(FileNotFoundError): + with lindi.LindiH5pyFile.from_lindi_file(f'{tmpdir}/non_existing_file.lindi.json', mode='r'): + pass + with pytest.raises(FileNotFoundError): + with lindi.LindiH5pyFile.from_lindi_file(f'{tmpdir}/non_existing_file.lindi.json', mode='r+'): + pass + + +def test_fail_open_existing_file_for_new_writing(): + with tempfile.TemporaryDirectory() as tmpdir: + with open(f'{tmpdir}/existing_file.lindi.json', 'w') as f: + f.write('test') + with pytest.raises(ValueError): + with lindi.LindiH5pyFile.from_lindi_file(f'{tmpdir}/existing_file.lindi.json', mode='w-'): + pass + + +def test_create_lindi_json_file_in_x_mode(): + # w- and x are equivalent + for mode in ['w-', 'x']: + with tempfile.TemporaryDirectory() as tmpdir: + lindi_json_fname = f'{tmpdir}/test.lindi.json' + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname, mode=mode) as f: # type: ignore + f.attrs['attr1'] = 'value1' + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname) as f: + assert f.attrs['attr1'] == 'value1' + + +def test_append_to_lindi_json_file(): + with tempfile.TemporaryDirectory() as tmpdir: + lindi_json_fname = f'{tmpdir}/test.lindi.json' + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname, mode='a') as f: + f.attrs['attr1'] = 'value1' + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname, mode='a') as f: + f.attrs['attr2'] = 2 + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname) as f: + assert f.attrs['attr1'] == 'value1' + assert f.attrs['attr2'] == 2 + + +def test_rfs_for_lindi_tar_file(): + with tempfile.TemporaryDirectory() as tmpdir: + lindi_tar_fname = f'{tmpdir}/test.lindi.tar' + with lindi.LindiH5pyFile.from_lindi_file(lindi_tar_fname, mode='w') as f: + f.attrs['attr1'] = 'value1' + with lindi.LindiH5pyFile.from_lindi_file(lindi_tar_fname) as f: + assert f.attrs['attr1'] == 'value1' + rfs = f.to_reference_file_system() + assert rfs['refs']['.zattrs']['attr1'] == 'value1' + + +def test_fail_write_lindi_invalid_extension(): + with tempfile.TemporaryDirectory() as tmpdir: + lindi_fname = f'{tmpdir}/test.lindi.json' + with lindi.LindiH5pyFile.from_lindi_file(lindi_fname, mode='w') as f: + f.attrs['attr1'] = 'value1' + with pytest.raises(ValueError): + with lindi.LindiH5pyFile.from_lindi_file(lindi_fname) as f: + f.write_lindi_file(f'{tmpdir}/test.lindi.invalid_extension') + + +def test_fail_write_lindi_json_from_local_lindi_tar(): + with tempfile.TemporaryDirectory() as tmpdir: + lindi_tar_fname = f'{tmpdir}/test.lindi.tar' + with lindi.LindiH5pyFile.from_lindi_file(lindi_tar_fname, mode='w') as f: + f.attrs['attr1'] = 'value1' + with pytest.raises(ValueError): + with lindi.LindiH5pyFile.from_lindi_file(lindi_tar_fname) as f: + f.write_lindi_file(f'{tmpdir}/test.lindi.json') + + +@pytest.mark.network +def test_create_local_lindi_json_from_remote_lindi_tar(): + # This example will probably disappear in the future + # and will need to be replaced with another example + # https://neurosift.app/?p=/nwb&url=https://tempory.net/f/dendro/f/hello_world_service/hello_neurosift/spike_sorting_post_processing/FKu2zK3TAsehGtJJXjQa/output/post.nwb.lindi.tar&dandisetId=215561&dandisetVersion=draft&st=lindi + url = 'https://tempory.net/f/dendro/f/hello_world_service/hello_neurosift/spike_sorting_post_processing/FKu2zK3TAsehGtJJXjQa/output/post.nwb.lindi.tar' + with tempfile.TemporaryDirectory() as tmpdir: + with lindi.LindiH5pyFile.from_lindi_file(url) as f: + f.write_lindi_file(f'{tmpdir}/test.lindi.json') + f1 = lindi.LindiH5pyFile.from_lindi_file(url) + f2 = lindi.LindiH5pyFile.from_lindi_file(f'{tmpdir}/test.lindi.json') + assert_h5py_files_equal(f1, f2, skip_large_datasets=True) + + +def test_write_lindi_json_with_generation_metadata(): + with tempfile.TemporaryDirectory() as tmpdir: + lindi_json_fname = f'{tmpdir}/test.lindi.json' + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname, mode='w') as f: + f.attrs['attr1'] = 'value1' + f = lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname) + f.write_lindi_file(f'{tmpdir}/test.lindi.json', generation_metadata={'test': 1}) + g = lindi.LindiH5pyFile.from_lindi_file(f'{tmpdir}/test.lindi.json') + rfs = g.to_reference_file_system() + assert rfs['generationMetadata']['test'] == 1 + + +def test_misc_coverage(): + with tempfile.TemporaryDirectory() as tmpdir: + lindi_json_fname = f'{tmpdir}/test.lindi.json' + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname, mode='w') as f: + f.attrs['attr1'] = 'value1' + f = lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname) + assert f.filename == '' + with pytest.raises(Exception): + f.driver + assert f.mode == 'r' + with pytest.raises(Exception): + f.libver + with pytest.raises(Exception): + f.userblock_size + with pytest.raises(Exception): + f.meta_block_size + with pytest.raises(Exception): + f.swmr_mode(1) + assert isinstance(str(f), str) + assert isinstance(repr(f), str) + assert f.__bool__() is True + assert f.__hash__() + assert f.id + assert f.file + assert f.name + # cannot get ref on readonly object + with pytest.raises(ValueError): + f.ref + + +def test_delete_group(): + with tempfile.TemporaryDirectory() as tmpdir: + lindi_json_fname = f'{tmpdir}/test.lindi.json' + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname, mode='w') as f: + with pytest.raises(Exception): + f.create_group('group1', track_order=True) + f.require_group('group1') + f.create_group('group2') + f.require_group('group2') + with pytest.raises(Exception): + f.create_group('group2') + f.create_group('group3') + del f['group2'] + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname) as f: + assert 'group1' in f + assert 'group2' not in f + assert 'group3' in f + + +def test_copy_lindi_to_lindi(): + with tempfile.TemporaryDirectory() as tmpdir: + lindi_json_fname = f'{tmpdir}/test.lindi.json' + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname, mode='w') as f: + f.create_group('group1') + group1 = f['group1'] + assert isinstance(group1, lindi.LindiH5pyGroup) + group1.attrs['attr1'] = 'value1' + group2 = group1.create_group('group2') + group2.attrs['attr2'] = 2 + group2.create_dataset('dataset1', data=[1, 2, 3]) + group2.require_dataset('dataset1', shape=(3,), dtype=int) + f.copy('group1', f, 'group3') + f = lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname) + assert 'group1' in f + assert 'group2' in f['group1'] # type: ignore + assert 'group3' in f + assert 'group2' in f['group3'] # type: ignore + assert f['group1'].attrs['attr1'] == 'value1' # type: ignore + assert f['group3'].attrs['attr1'] == 'value1' # type: ignore + assert f['group3']['group2'].attrs['attr2'] == 2 # type: ignore + ds = f['group3']['group2']['dataset1'] # type: ignore + assert isinstance(ds, lindi.LindiH5pyDataset) + assert ds.shape == (3,) + + +def test_copy_lindi_to_hdf5(): + with tempfile.TemporaryDirectory() as tmpdir: + lindi_json_fname = f'{tmpdir}/test.lindi.json' + h5_fname = f'{tmpdir}/test.h5' + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname, mode='w') as lindi_f: + lindi_f.create_group('group1') + group1 = lindi_f['group1'] + assert isinstance(group1, lindi.LindiH5pyGroup) + group1.attrs['attr1'] = 'value1' + group2 = group1.create_group('group2') + group2.attrs['attr2'] = 2 + ds = group2.create_dataset('dataset1', data=[1, 2, 3]) + f = lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname, mode='r') + with h5py.File(h5_fname, 'w') as h5f: + with pytest.raises(Exception): + f.copy('group1', h5f, 'group1_copy', shallow=True) + with pytest.raises(Exception): + f.copy('group1', h5f, 'group1_copy', expand_soft=True) + with pytest.raises(Exception): + f.copy('group1', h5f, 'group1_copy', expand_external=True) + with pytest.raises(Exception): + f.copy('group1', h5f, 'group1_copy', expand_refs=True) + with pytest.raises(Exception): + f.copy('group1', h5f, 'group1_copy', without_attrs=True) + with pytest.raises(Exception): + f.copy('group1', h5f, None) + f.copy('group1', h5f, 'group1_copy') + with h5py.File(h5_fname, 'r') as h5f: + assert 'group1_copy' in h5f + assert 'group2' in h5f['group1_copy'] # type: ignore + assert h5f['group1_copy'].attrs['attr1'] == 'value1' + assert h5f['group1_copy']['group2'].attrs['attr2'] == 2 # type: ignore + ds = h5f['group1_copy']['group2']['dataset1'] # type: ignore + assert isinstance(ds, h5py.Dataset) + assert ds.shape == (3,) + + +def test_soft_link(): + with tempfile.TemporaryDirectory() as tmpdir: + h5_fname = f'{tmpdir}/test.h5' + with h5py.File(h5_fname, 'w') as h5f: + group1 = h5f.create_group('group1') + group1.attrs['attr1'] = 'value1' + h5f['group_sl'] = h5py.SoftLink('group1') + f = lindi.LindiH5pyFile.from_hdf5_file(h5_fname, url=h5_fname) + f.write_lindi_file(f'{tmpdir}/test.lindi.json') + f.close() + g = lindi.LindiH5pyFile.from_lindi_file(f'{tmpdir}/test.lindi.json') + assert 'group_sl' in g + aa = g.get('group_sl', getlink=True) + assert isinstance(aa, h5py.SoftLink) + bb = g['group_sl'] + assert isinstance(bb, lindi.LindiH5pyGroup) + assert bb.attrs['attr1'] == 'value1' + + +def test_reference(): + with tempfile.TemporaryDirectory() as tmpdir: + h5_fname = f'{tmpdir}/test.h5' + with h5py.File(h5_fname, 'w') as h5f: + group1 = h5f.create_group('group1') + dataset1 = group1.create_dataset('dataset1', data=[1, 2, 3]) + dataset1.attrs['attr1'] = 'value1' + h5f.attrs['ref1'] = dataset1.ref + f = lindi.LindiH5pyFile.from_hdf5_file(h5_fname, url=h5_fname) + for k, _ in f.items(): + if k != 'group1': + raise Exception(f'Unexpected key: {k}') + for k in f: + if k != 'group1': + raise Exception(f'Unexpected key: {k}') + f.write_lindi_file(f'{tmpdir}/test.lindi.json') + f.close() + g = lindi.LindiH5pyFile.from_lindi_file(f'{tmpdir}/test.lindi.json') + ref1 = g.attrs['ref1'] + assert isinstance(ref1, h5py.Reference) + with pytest.raises(Exception): + g.get(ref1, getlink=True) + with pytest.raises(Exception): + g.get(ref1, getclass=True) + b = g[ref1] + assert isinstance(b, lindi.LindiH5pyDataset) + assert b.attrs['attr1'] == 'value1' + + +def test_fail_attempt_write_in_read_only_mode(): + with tempfile.TemporaryDirectory() as tmpdir: + lindi_json_fname = f'{tmpdir}/test.lindi.json' + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname, mode='w') as f: + f.attrs['attr1'] = 'value1' + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname, mode='r') as f: + with pytest.raises(ValueError): + f.attrs['attr2'] = 2 + with pytest.raises(ValueError): + f.create_group('group1') + with pytest.raises(ValueError): + f.create_dataset('dataset1', data=[1, 2, 3]) + with pytest.raises(ValueError): + f.require_group('group1') + with pytest.raises(ValueError): + f.require_dataset('dataset1', shape=(3,), dtype=int) + + +def test_create_dataset(): + with tempfile.TemporaryDirectory() as tmpdir: + lindi_json_fname = f'{tmpdir}/test.lindi.json' + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname, mode='w') as f: + f.create_dataset('dataset1', data=[1, 2, 3]) + f.require_dataset('dataset1', shape=(3,), dtype=int) + with lindi.LindiH5pyFile.from_lindi_file(lindi_json_fname) as f: + assert 'dataset1' in f + ds = f['dataset1'] + assert isinstance(ds, lindi.LindiH5pyDataset) + assert ds.shape == (3,) + + +def create_example_h5_file(fname): + with h5py.File(fname, 'w') as f: + f.attrs['attr1'] = 'value1' + f.attrs['attr2'] = 2 + f.create_dataset('dataset1', data=[1, 2, 3]) + f.create_group('group1') + f.create_group('group2') + group1 = f['group1'] + assert isinstance(group1, h5py.Group) + group1.create_dataset('dataset2', data=[4, 5, 6]) diff --git a/tests/test_lindi_tar.py b/tests/test_lindi_tar.py new file mode 100644 index 0000000..ec361f6 --- /dev/null +++ b/tests/test_lindi_tar.py @@ -0,0 +1,78 @@ +import tempfile +import pytest +import lindi + + +def test_write_growing_lindi_tar(): + from lindi.tar.lindi_tar import _test_set, LindiTarFile + _test_set( + tar_entry_json_size=1024, + initial_tar_index_json_size=1024, + initial_lindi_json_size=1024 + ) + for extension in ['tar', 'd']: + with tempfile.TemporaryDirectory() as tmpdir: + fname = f'{tmpdir}/example.lindi.{extension}' + with lindi.LindiH5pyFile.from_lindi_file(fname, mode='w') as f: + f.attrs['attr1'] = 'value1' + + for j in range(4): + with lindi.LindiH5pyFile.from_lindi_file(fname, mode='a') as f: + for i in range(20): + # inline - to grow the lindi.json + f.create_dataset(f'small_dataset_{j}_{i}', data=[i] * 10) + f.flush() + for i in range(20): + # blob - to grow the index.json + f.create_dataset(f'big_dataset_{j}_{i}', data=[i] * 100000) + f.flush() + + with lindi.LindiH5pyFile.from_lindi_file(fname, mode='r') as f: + assert f.attrs['attr1'] == 'value1' + for i in range(20): + ds = f[f'small_dataset_{j}_{i}'] + assert isinstance(ds, lindi.LindiH5pyDataset) + assert ds.shape == (10,) + for i in range(20): + ds = f[f'big_dataset_{j}_{i}'] + assert isinstance(ds, lindi.LindiH5pyDataset) + assert ds.shape == (100000,) + + with LindiTarFile(fname, dir_representation=extension == 'd') as f: + if extension == 'd': + with pytest.raises(ValueError): + f.get_file_info('lindi.json') + assert f.read_file('lindi.json') + f.trash_file('lindi.json') + with pytest.raises(FileNotFoundError): + f.read_file('lindi.json') + with pytest.raises(ValueError): + f.get_file_byte_range('lindi.json') + else: + a = f.get_file_info('lindi.json') + assert isinstance(a, dict) + assert f.get_file_byte_range('lindi.json') + + +@pytest.mark.network +def test_load_remote_lindi_tar(): + # This example will probably disappear in the future + # and will need to be replaced with another example + # https://neurosift.app/?p=/nwb&url=https://tempory.net/f/dendro/f/hello_world_service/hello_neurosift/spike_sorting_post_processing/FKu2zK3TAsehGtJJXjQa/output/post.nwb.lindi.tar&dandisetId=215561&dandisetVersion=draft&st=lindi + url = 'https://tempory.net/f/dendro/f/hello_world_service/hello_neurosift/spike_sorting_post_processing/FKu2zK3TAsehGtJJXjQa/output/post.nwb.lindi.tar' + + from lindi.tar.lindi_tar import LindiTarFile + with LindiTarFile(url) as f: + a = f.get_file_info('lindi.json') + assert isinstance(a, dict) + with pytest.raises(ValueError): + # cannot overwrite for remote file + f.overwrite_file_content('lindi.json', b'xxx') + with pytest.raises(ValueError): + # cannot trash file for remote file + f.trash_file('lindi.json') + + +if __name__ == '__main__': + test_write_growing_lindi_tar() + test_load_remote_lindi_tar() diff --git a/tests/test_load_000409.py b/tests/test_load_000409.py new file mode 100644 index 0000000..172ff8b --- /dev/null +++ b/tests/test_load_000409.py @@ -0,0 +1,28 @@ +import tempfile +import pytest +import lindi + + +@pytest.mark.network +def test_load_000409(): + import pynwb + # https://neurosift.app/?p=/nwb&url=https://api.dandiarchive.org/api/assets/c04f6b30-82bf-40e1-9210-34f0bcd8be24/download/&dandisetId=000409&dandisetVersion=draft + url = 'https://api.dandiarchive.org/api/assets/c04f6b30-82bf-40e1-9210-34f0bcd8be24/download/' + with tempfile.TemporaryDirectory() as tmpdir: + fname = f'{tmpdir}/example.nwb.lindi.json' + with lindi.LindiH5pyFile.from_hdf5_file(url) as f: + f.write_lindi_file(fname) + f = lindi.LindiH5pyFile.from_lindi_file(fname, mode='r') + with pynwb.NWBHDF5IO(file=f, mode='r') as io: + nwb = io.read() + print(nwb) + X = nwb.acquisition['ElectricalSeriesAp'] # type: ignore + a = X.data[:1000] + assert a.shape == (1000, X.data.shape[1]) + units = nwb.units # type: ignore + units_ids = units['id'] + assert len(units_ids) == 590 + + +if __name__ == '__main__': + test_load_000409() diff --git a/tests/test_remote_data.py b/tests/test_remote_data.py deleted file mode 100644 index 41f80e6..0000000 --- a/tests/test_remote_data.py +++ /dev/null @@ -1,99 +0,0 @@ -import json -import pytest -import lindi -from utils import arrays_are_equal - - -@pytest.mark.network -def test_remote_data_1(): - import pynwb - - # Define the URL for a remote NWB file - h5_url = "https://api.dandiarchive.org/api/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/download/" - - # Create a read-only Zarr store as a wrapper for the h5 file - store = lindi.LindiH5ZarrStore.from_file(h5_url) - - # Generate a reference file system - rfs = store.to_reference_file_system() - - # Save it to a file for later use - with open("example.nwb.lindi.json", "w") as f: - json.dump(rfs, f, indent=2) - - # Create an h5py-like client from the reference file system - client = lindi.LindiH5pyFile.from_reference_file_system(rfs) - - # Open using pynwb - with pynwb.NWBHDF5IO(file=client, mode="r") as io: - nwbfile = io.read() - print(nwbfile) - - -@pytest.mark.network -def test_remote_data_2(): - import pynwb - - # Define the URL for a remote .nwb.lindi.json file - url = 'https://lindi.neurosift.org/dandi/dandisets/000939/assets/56d875d6-a705-48d3-944c-53394a389c85/nwb.lindi.json' - - # Load the h5py-like client from the reference file system - client = lindi.LindiH5pyFile.from_reference_file_system(url) - - # Open using pynwb - with pynwb.NWBHDF5IO(file=client, mode="r") as io: - nwbfile = io.read() - print(nwbfile) - - -@pytest.mark.network -def test_remote_data_rfs_copy(): - # Test that we can copy datasets and groups from one reference file system to another - # and the data itself is not copied, only the references. - url = 'https://lindi.neurosift.org/dandi/dandisets/000939/assets/56d875d6-a705-48d3-944c-53394a389c85/nwb.lindi.json' - - client = lindi.LindiH5pyFile.from_reference_file_system(url) - - rfs2 = {'refs': { - '.zgroup': '{"zarr_format": 2}', - }} - client2 = lindi.LindiH5pyFile.from_reference_file_system(rfs2) - - # This first dataset is a 2D array with chunks - ds = client['processing/behavior/Position/position/data'] - assert isinstance(ds, lindi.LindiH5pyDataset) - assert ds.shape == (360867, 2) - - client.copy('processing/behavior/Position/position/data', client2, 'copied_data1') - aa = rfs2['refs']['copied_data1/.zarray'] - assert isinstance(aa, str) or isinstance(aa, dict) - assert 'copied_data1/0.0' in rfs2['refs'] - bb = rfs2['refs']['copied_data1/0.0'] - assert isinstance(bb, list) # make sure it is a reference, not the actual data - - ds2 = client2['copied_data1'] - assert isinstance(ds2, lindi.LindiH5pyDataset) - assert arrays_are_equal(ds[()], ds2[()]) # make sure the data is the same - - # This next dataset has an _EXTERNAL_ARRAY_LINK which means it has a pointer - # to a dataset in a remote h5py - # https://neurosift.app/?p=/nwb&dandisetId=000409&dandisetVersion=draft&url=https://api.dandiarchive.org/api/assets/ab3998c2-3540-4bda-8b03-3f3795fa602d/download/ - url_b = 'https://lindi.neurosift.org/dandi/dandisets/000409/assets/ab3998c2-3540-4bda-8b03-3f3795fa602d/nwb.lindi.json' - client_b = lindi.LindiH5pyFile.from_reference_file_system(url_b) - - ds = client_b['acquisition/ElectricalSeriesAp/data'] - assert isinstance(ds, lindi.LindiH5pyDataset) - assert ds.shape == (109281892, 384) - - client_b.copy('acquisition/ElectricalSeriesAp/data', client2, 'copied_data2') - aa = rfs2['refs']['copied_data2/.zarray'] - assert isinstance(aa, str) or isinstance(aa, dict) - assert 'copied_data2/0.0' not in rfs2['refs'] # make sure the chunks were not copied - - ds2 = client2['copied_data2'] - assert isinstance(ds2, lindi.LindiH5pyDataset) - assert arrays_are_equal(ds[100000:100010], ds2[100000:100010]) - - -if __name__ == "__main__": - test_remote_data_rfs_copy() diff --git a/tests/test_store.py b/tests/test_store.py index 07d59cf..cea8985 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -1,7 +1,7 @@ import h5py import tempfile import lindi -from utils import lists_are_equal +from .utils import lists_are_equal def test_store(): diff --git a/tests/test_zarr_write.py b/tests/test_zarr_write.py index d5380a4..b511b8a 100644 --- a/tests/test_zarr_write.py +++ b/tests/test_zarr_write.py @@ -5,7 +5,7 @@ import lindi import pytest import numcodecs -from utils import assert_groups_equal, arrays_are_equal +from .utils import assert_groups_equal, arrays_are_equal def test_zarr_write(): @@ -121,7 +121,7 @@ def compare_example_h5_data(h5f: h5py.File, tmpdir: str): with h5py.File(f'{tmpdir}/for_comparison.h5', 'w') as h5f2: write_example_h5_data(h5f2) with h5py.File(f'{tmpdir}/for_comparison.h5', 'r') as h5f2: - assert_groups_equal(h5f, h5f2) + assert_groups_equal(h5f, h5f2, skip_large_datasets=False) if __name__ == '__main__': diff --git a/tests/utils.py b/tests/utils.py index eb07b38..6089cf2 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,7 +4,11 @@ from lindi.conversion.attr_conversion import h5_to_zarr_attr -def assert_groups_equal(h5f: h5py.Group, h5f2: h5py.Group): +def assert_h5py_files_equal(h5f1: h5py.File, h5f2: h5py.File, *, skip_large_datasets: bool): + assert_groups_equal(h5f1, h5f2, skip_large_datasets=skip_large_datasets) + + +def assert_groups_equal(h5f: h5py.Group, h5f2: h5py.Group, *, skip_large_datasets: bool): print(f'Comparing groups: {h5f.name}') assert_attrs_equal(h5f, h5f2) for k in h5f.keys(): @@ -12,10 +16,10 @@ def assert_groups_equal(h5f: h5py.Group, h5f2: h5py.Group): X2 = h5f2[k] if isinstance(X1, h5py.Group): assert isinstance(X2, h5py.Group) - assert_groups_equal(X1, X2) + assert_groups_equal(X1, X2, skip_large_datasets=skip_large_datasets) elif isinstance(X1, h5py.Dataset): assert isinstance(X2, h5py.Dataset) - assert_datasets_equal(X1, X2) + assert_datasets_equal(X1, X2, skip_large_datasets=skip_large_datasets) else: raise Exception(f'Unexpected type: {type(X1)}') @@ -24,19 +28,27 @@ def assert_groups_equal(h5f: h5py.Group, h5f2: h5py.Group): raise Exception(f'Key {k} not found in h5f') -def assert_datasets_equal(h5d1: h5py.Dataset, h5d2: h5py.Dataset): +def assert_datasets_equal(h5d1: h5py.Dataset, h5d2: h5py.Dataset, *, skip_large_datasets: bool): print(f'Comparing datasets: {h5d1.name}') assert h5d1.shape == h5d2.shape, f'h5d1.shape: {h5d1.shape}, h5d2.shape: {h5d2.shape}' assert h5d1.dtype == h5d2.dtype, f'h5d1.dtype: {h5d1.dtype}, h5d2.dtype: {h5d2.dtype}' + if skip_large_datasets and np.prod(h5d1.shape) > 1000: + print(f'Skipping large dataset: {h5d1.name}') + return if h5d1.dtype.kind == 'V': for name in h5d1.dtype.names: data1 = h5d1[name][()] data2 = h5d2[name][()] - assert arrays_are_equal(data1, data2), f'data1: {data1}, data2: {data2}' + if not arrays_are_equal(data1, data2): + raise Exception(f'Arrays are not equal for field {name}') + elif h5d1.dtype.kind == 'O': + # skip object arrays + pass else: data1 = h5d1[()] data2 = h5d2[()] - assert arrays_are_equal(data1, data2), f'data1: {data1}, data2: {data2}' + if not arrays_are_equal(data1, data2): + raise Exception(f'Arrays are not equal for dataset {h5d1.name} with dtype {h5d1.dtype}') def arrays_are_equal(a, b): From 2e7845a19b156364e70e5e746bf0c9df3f87774c Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Fri, 20 Sep 2024 14:15:16 -0400 Subject: [PATCH 2/5] codespell opts --- .github/workflows/codespell.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml index dd0eb8e..48ea624 100644 --- a/.github/workflows/codespell.yml +++ b/.github/workflows/codespell.yml @@ -21,3 +21,5 @@ jobs: uses: actions/checkout@v4 - name: Codespell uses: codespell-project/actions-codespell@v2 + with: + args: -L tempory From d348360dfd1c4f77e88a7225159e87845f4e82b7 Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Sat, 21 Sep 2024 06:57:28 -0400 Subject: [PATCH 3/5] spellcheck ignore words --- .github/workflows/codespell.yml | 2 -- pyproject.toml | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml index 48ea624..dd0eb8e 100644 --- a/.github/workflows/codespell.yml +++ b/.github/workflows/codespell.yml @@ -21,5 +21,3 @@ jobs: uses: actions/checkout@v4 - name: Codespell uses: codespell-project/actions-codespell@v2 - with: - args: -L tempory diff --git a/pyproject.toml b/pyproject.toml index ff95166..fabfa1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,4 +33,4 @@ build-backend = "poetry.core.masonry.api" skip = '.git' check-hidden = true # ignore-regex = '' -# ignore-words-list = '' +ignore-words-list = 'tempory' From 0533a668b61a41edba21110e713b4e7d3a4f0f63 Mon Sep 17 00:00:00 2001 From: rly Date: Tue, 24 Sep 2024 12:42:02 -0700 Subject: [PATCH 4/5] Use __all__ in lindi/__init__.py --- lindi/LindiH5pyFile/__init__.py | 2 ++ lindi/__init__.py | 30 +++++++++++++++++++++++++----- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/lindi/LindiH5pyFile/__init__.py b/lindi/LindiH5pyFile/__init__.py index 4d95cea..586d22b 100644 --- a/lindi/LindiH5pyFile/__init__.py +++ b/lindi/LindiH5pyFile/__init__.py @@ -2,6 +2,7 @@ from .LindiH5pyDataset import LindiH5pyDataset from .LindiH5pyGroup import LindiH5pyGroup from .LindiH5pyLink import LindiH5pySoftLink, LindiH5pyHardLink +from .LindiH5pyReference import LindiH5pyReference __all__ = [ "LindiH5pyFile", @@ -9,4 +10,5 @@ "LindiH5pyGroup", "LindiH5pySoftLink", "LindiH5pyHardLink", + "LindiH5pyReference", ] diff --git a/lindi/__init__.py b/lindi/__init__.py index 1b3a3ea..8a692e3 100644 --- a/lindi/__init__.py +++ b/lindi/__init__.py @@ -1,5 +1,25 @@ -from .LindiH5ZarrStore import LindiH5ZarrStore, LindiH5ZarrStoreOpts # noqa: F401 -from .LindiH5pyFile import LindiH5pyFile, LindiH5pyGroup, LindiH5pyDataset, LindiH5pyHardLink, LindiH5pySoftLink # noqa: F401 -from .LocalCache.LocalCache import LocalCache, ChunkTooLargeError # noqa: F401 -from .LindiRemfile.additional_url_resolvers import add_additional_url_resolver # noqa: F401 -from .LindiH5pyFile.LindiH5pyReference import LindiH5pyReference # noqa: F401 +from .LindiH5ZarrStore import LindiH5ZarrStore, LindiH5ZarrStoreOpts +from .LindiH5pyFile import ( + LindiH5pyFile, + LindiH5pyGroup, + LindiH5pyDataset, + LindiH5pyHardLink, + LindiH5pySoftLink, + LindiH5pyReference, +) +from .LocalCache.LocalCache import LocalCache, ChunkTooLargeError +from .LindiRemfile.additional_url_resolvers import add_additional_url_resolver + +__all__ = [ + "LindiH5ZarrStore", + "LindiH5ZarrStoreOpts", + "LindiH5pyFile", + "LindiH5pyGroup", + "LindiH5pyDataset", + "LindiH5pyHardLink", + "LindiH5pySoftLink", + "LindiH5pyReference", + "LocalCache", + "ChunkTooLargeError", + "add_additional_url_resolver", +] From b5b281f2ad542d821e67b9a46502225a412915a3 Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Tue, 24 Sep 2024 20:33:57 -0400 Subject: [PATCH 5/5] Update lindi/LindiH5pyFile/LindiH5pyFile.py Co-authored-by: Ryan Ly --- lindi/LindiH5pyFile/LindiH5pyFile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lindi/LindiH5pyFile/LindiH5pyFile.py b/lindi/LindiH5pyFile/LindiH5pyFile.py index 2989f41..7dcd3ec 100644 --- a/lindi/LindiH5pyFile/LindiH5pyFile.py +++ b/lindi/LindiH5pyFile/LindiH5pyFile.py @@ -275,7 +275,7 @@ def write_lindi_file(self, filename: str, *, generation_metadata: Union[dict, No 'generationMetadata' key in the reference file system. """ if not filename.endswith(".lindi.json") and not filename.endswith(".lindi.tar") and not filename.endswith(".lindi.d"): - raise ValueError("Filename must end with '.lindi.json' or '.lindi.tar'") + raise ValueError("Filename must end with '.lindi.json', '.lindi.tar', or '.lindi.d'.") rfs = self.to_reference_file_system() if self._source_tar_file: source_is_remote = self._source_url_or_path is not None and (self._source_url_or_path.startswith("http://") or self._source_url_or_path.startswith("https://"))