From 0b35daed749bab8b118310b8b2fa003e6dca7d68 Mon Sep 17 00:00:00 2001
From: Jeremy Magland
Date: Fri, 19 Apr 2024 10:28:38 -0500
Subject: [PATCH] to_file -> write_reference_file_system
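
Rename LindiH5ZarrStore.to_file to write_reference_file_system, drop the
file_type parameter, and require that the output file name end with
.lindi.json. Also add a test for the staging area upload workflow.

A minimal sketch of the renamed API (the file names below are
hypothetical stand-ins, not files in this repo):

    import lindi

    # "example.h5" stands in for an existing local HDF5 file
    with lindi.LindiH5ZarrStore.from_file("example.h5", url="example.h5") as store:
        store.write_reference_file_system("example.lindi.json")

    # load the reference file system back
    client = lindi.LindiH5pyFile.from_reference_file_system("example.lindi.json")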
---
 lindi/LindiH5ZarrStore/LindiH5ZarrStore.py | 11 ++--
 tests/test_core.py                         | 12 ++--
 tests/test_staging_area.py                 | 68 ++++++++++++++++++++++
 tests/test_store.py                        |  2 +-
 4 files changed, 79 insertions(+), 14 deletions(-)
 create mode 100644 tests/test_staging_area.py

diff --git a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
index 9858248..589b56d 100644
--- a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
+++ b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
@@ -1,6 +1,6 @@
 import json
 import base64
-from typing import Union, List, IO, Any, Dict, Literal
+from typing import Union, List, IO, Any, Dict
 from dataclasses import dataclass
 import numpy as np
 import zarr
@@ -460,16 +460,17 @@ def listdir(self, path: str = "") -> List[str]:
         else:
             return []
 
-    def to_file(self, file_name: str, *, file_type: Literal["zarr.json"] = "zarr.json"):
+    def write_reference_file_system(self, output_file_name: str):
         """Write a reference file system corresponding to this store to a file.
 
         This can then be loaded using LindiH5pyFile.from_reference_file_system(file_name)
         """
-        if file_type != "zarr.json":
-            raise Exception(f"Unsupported file type: {file_type}")
+
+        if not output_file_name.endswith(".lindi.json"):
+            raise Exception("The output file name must end with .lindi.json")
 
         ret = self.to_reference_file_system()
-        with open(file_name, "w") as f:
+        with open(output_file_name, "w") as f:
             json.dump(ret, f, indent=2)
 
     def to_reference_file_system(self) -> dict:
diff --git a/tests/test_core.py b/tests/test_core.py
index 20a0d46..f57f9d7 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -304,8 +304,8 @@ def test_reference_file_system_to_file():
         with h5py.File(filename, "w") as f:
             f.create_dataset("X", data=[1, 2, 3])
         with LindiH5ZarrStore.from_file(filename, url=filename) as store:
-            rfs_fname = f'{tmpdir}/test.zarr.json'
-            store.to_file(rfs_fname)
+            rfs_fname = f'{tmpdir}/test.lindi.json'
+            store.write_reference_file_system(rfs_fname)
             client = lindi.LindiH5pyFile.from_reference_file_system(rfs_fname)
             X = client["X"]
             assert isinstance(X, lindi.LindiH5pyDataset)
@@ -423,7 +423,7 @@ def test_lindi_h5_zarr_store():
     with pytest.raises(Exception, match=store_is_closed_msg):
         store.to_reference_file_system()
     with pytest.raises(Exception, match=store_is_closed_msg):
-        store.to_file("test.json")
+        store.write_reference_file_system("test.lindi.json")
     with pytest.raises(Exception, match=store_is_closed_msg):
         store._get_chunk_file_bytes_data("dataset1", "0")
 
@@ -443,17 +443,13 @@
         store["nonexistent/0"]
 
     # Key error
-    store = LindiH5ZarrStore.from_file(filename)
+    store = LindiH5ZarrStore.from_file(filename, url='.')
     with pytest.raises(KeyError):
         store['']
     assert '' not in store
     with pytest.raises(KeyError):
         store["nonexistent/.zattrs"]
 
-    # Unsupported file type
-    with pytest.raises(Exception, match="Unsupported file type: zarr"):
-        store.to_file("test.json", file_type="zarr")  # type: ignore
-
     # URL is not set
     store = LindiH5ZarrStore.from_file(filename, url=None)
     with pytest.raises(Exception, match="You must specify a url to create a reference file system"):
diff --git a/tests/test_staging_area.py b/tests/test_staging_area.py
new file mode 100644
index 0000000..e79f58b
--- /dev/null
+++ b/tests/test_staging_area.py
@@ -0,0 +1,68 @@
+import tempfile
+import os
+import numpy as np
+import lindi
+import shutil
+
+
+def test_staging_area():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        staging_area = lindi.StagingArea.create(tmpdir + '/staging_area')
+        empty_rfs = {'refs': {'.zgroup': {'zarr_format': 2}}}
+        client = lindi.LindiH5pyFile.from_reference_file_system(empty_rfs, mode='r+', staging_area=staging_area)
+        X = np.random.randn(1000, 1000).astype(np.float32)
+        client.create_dataset('large_array', data=X, chunks=(400, 400))
+        total_size = _get_total_size_of_directory(tmpdir)
+        assert total_size >= X.nbytes * 0.5, f'{total_size} < {X.nbytes} * 0.5'  # factor 0.5 accounts for compression
+        rfs = client.to_reference_file_system()
+        client2 = lindi.LindiH5pyFile.from_reference_file_system(rfs, mode='r')
+        assert isinstance(client2, lindi.LindiH5pyFile)
+        X1 = client['large_array']
+        assert isinstance(X1, lindi.LindiH5pyDataset)
+        X2 = client2['large_array']
+        assert isinstance(X2, lindi.LindiH5pyDataset)
+        assert np.allclose(X1[:], X2[:])
+
+        upload_dir = f'{tmpdir}/upload_dir'
+        os.makedirs(upload_dir, exist_ok=True)
+        output_fname = f'{tmpdir}/output.lindi.json'
+
+        def on_upload_blob(fname: str):
+            random_fname = f'{upload_dir}/{_random_string(10)}'
+            shutil.copy(fname, random_fname)
+            return random_fname
+
+        def on_upload_main(fname: str):
+            shutil.copy(fname, output_fname)
+            return output_fname
+
+        assert client.staging_store
+        client.staging_store.upload(
+            on_upload_blob=on_upload_blob,
+            on_upload_main=on_upload_main,
+            consolidate_chunks=True
+        )
+
+        client3 = lindi.LindiH5pyFile.from_reference_file_system(output_fname, mode='r')
+        X3 = client3['large_array']
+        assert isinstance(X3, lindi.LindiH5pyDataset)
+        assert np.allclose(X1[:], X3[:])
+
+
+def _get_total_size_of_directory(directory):
+    total_size = 0
+    for dirpath, dirnames, filenames in os.walk(directory):
+        for f in filenames:
+            fp = os.path.join(dirpath, f)
+            total_size += os.path.getsize(fp)
+    return total_size
+
+
+def _random_string(n):
+    import random
+    import string
+    return ''.join(random.choices(string.ascii_uppercase + string.digits, k=n))
+
+
+if __name__ == '__main__':
+    test_staging_area()
diff --git a/tests/test_store.py b/tests/test_store.py
index c2ed53b..07d59cf 100644
--- a/tests/test_store.py
+++ b/tests/test_store.py
@@ -13,7 +13,7 @@ def test_store():
         group1.create_group("group2")
         group1.create_dataset("dataset2", data=[4, 5, 6])
     with lindi.LindiH5ZarrStore.from_file(filename, url=filename) as store:
-        store.to_file(f"{tmpdir}/test.zarr.json")  # for coverage
+        store.write_reference_file_system(f"{tmpdir}/test.lindi.json")  # for coverage
         a = store.listdir('')
         assert _lists_are_equal_as_sets(a, ['dataset1', 'group1'])
         b = store.listdir('group1')