From 0b35daed749bab8b118310b8b2fa003e6dca7d68 Mon Sep 17 00:00:00 2001
From: Jeremy Magland
Date: Fri, 19 Apr 2024 10:28:38 -0500
Subject: [PATCH] to_file -> write_reference_file_system
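
Rename LindiH5ZarrStore.to_file to write_reference_file_system, drop the
file_type parameter, and require that the output file name end with
.lindi.json. Also add a test for the staging area upload workflow.

A minimal sketch of the renamed API (the file names below are
hypothetical stand-ins, not files in this repo):

    import lindi

    # "example.h5" stands in for an existing local HDF5 file
    with lindi.LindiH5ZarrStore.from_file("example.h5", url="example.h5") as store:
        store.write_reference_file_system("example.lindi.json")

    # load the reference file system back
    client = lindi.LindiH5pyFile.from_reference_file_system("example.lindi.json")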
---
 lindi/LindiH5ZarrStore/LindiH5ZarrStore.py | 11 ++--
 tests/test_core.py                         | 12 ++--
 tests/test_staging_area.py                 | 68 ++++++++++++++++++++++
 tests/test_store.py                        |  2 +-
 4 files changed, 79 insertions(+), 14 deletions(-)
 create mode 100644 tests/test_staging_area.py

diff --git a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
index 9858248..589b56d 100644
--- a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
+++ b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
@@ -1,6 +1,6 @@
 import json
 import base64
-from typing import Union, List, IO, Any, Dict, Literal
+from typing import Union, List, IO, Any, Dict
 from dataclasses import dataclass
 import numpy as np
 import zarr
@@ -460,16 +460,17 @@ def listdir(self, path: str = "") -> List[str]:
         else:
             return []
 
-    def to_file(self, file_name: str, *, file_type: Literal["zarr.json"] = "zarr.json"):
+    def write_reference_file_system(self, output_file_name: str):
         """Write a reference file system corresponding to this store to a file.
 
         This can then be loaded using LindiH5pyFile.from_reference_file_system(file_name)
         """
-        if file_type != "zarr.json":
-            raise Exception(f"Unsupported file type: {file_type}")
+
+        if not output_file_name.endswith(".lindi.json"):
+            raise Exception("The output file name must end with .lindi.json")
 
         ret = self.to_reference_file_system()
-        with open(file_name, "w") as f:
+        with open(output_file_name, "w") as f:
             json.dump(ret, f, indent=2)
 
     def to_reference_file_system(self) -> dict:
diff --git a/tests/test_core.py b/tests/test_core.py
index 20a0d46..f57f9d7 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -304,8 +304,8 @@ def test_reference_file_system_to_file():
         with h5py.File(filename, "w") as f:
             f.create_dataset("X", data=[1, 2, 3])
         with LindiH5ZarrStore.from_file(filename, url=filename) as store:
-            rfs_fname = f'{tmpdir}/test.zarr.json'
-            store.to_file(rfs_fname)
+            rfs_fname = f'{tmpdir}/test.lindi.json'
+            store.write_reference_file_system(rfs_fname)
             client = lindi.LindiH5pyFile.from_reference_file_system(rfs_fname)
             X = client["X"]
             assert isinstance(X, lindi.LindiH5pyDataset)
@@ -423,7 +423,7 @@ def test_lindi_h5_zarr_store():
     with pytest.raises(Exception, match=store_is_closed_msg):
         store.to_reference_file_system()
     with pytest.raises(Exception, match=store_is_closed_msg):
-        store.to_file("test.json")
+        store.write_reference_file_system("test.lindi.json")
     with pytest.raises(Exception, match=store_is_closed_msg):
         store._get_chunk_file_bytes_data("dataset1", "0")
 
@@ -443,17 +443,13 @@
         store["nonexistent/0"]
 
     # Key error
-    store = LindiH5ZarrStore.from_file(filename)
+    store = LindiH5ZarrStore.from_file(filename, url='.')
     with pytest.raises(KeyError):
         store['']
     assert '' not in store
     with pytest.raises(KeyError):
         store["nonexistent/.zattrs"]
 
-    # Unsupported file type
-    with pytest.raises(Exception, match="Unsupported file type: zarr"):
-        store.to_file("test.json", file_type="zarr")  # type: ignore
-
     # URL is not set
     store = LindiH5ZarrStore.from_file(filename, url=None)
     with pytest.raises(Exception, match="You must specify a url to create a reference file system"):
diff --git a/tests/test_staging_area.py b/tests/test_staging_area.py
new file mode 100644
index 0000000..e79f58b
--- /dev/null
+++ b/tests/test_staging_area.py
@@ -0,0 +1,68 @@
+import tempfile
+import os
+import numpy as np
+import lindi
+import shutil
+
+
+def test_staging_area():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        staging_area = lindi.StagingArea.create(tmpdir + '/staging_area')
+        empty_rfs = {'refs': {'.zgroup': {'zarr_format': 2}}}
+        client = lindi.LindiH5pyFile.from_reference_file_system(empty_rfs, mode='r+', staging_area=staging_area)
+        X = np.random.randn(1000, 1000).astype(np.float32)
+        client.create_dataset('large_array', data=X, chunks=(400, 400))
+        total_size = _get_total_size_of_directory(tmpdir)
+        assert total_size >= X.nbytes * 0.5, f'{total_size} < {X.nbytes} * 0.5'  # factor 0.5 accounts for compression
+        rfs = client.to_reference_file_system()
+        client2 = lindi.LindiH5pyFile.from_reference_file_system(rfs, mode='r')
+        assert isinstance(client2, lindi.LindiH5pyFile)
+        X1 = client['large_array']
+        assert isinstance(X1, lindi.LindiH5pyDataset)
+        X2 = client2['large_array']
+        assert isinstance(X2, lindi.LindiH5pyDataset)
+        assert np.allclose(X1[:], X2[:])
+
+        upload_dir = f'{tmpdir}/upload_dir'
+        os.makedirs(upload_dir, exist_ok=True)
+        output_fname = f'{tmpdir}/output.lindi.json'
+
+        def on_upload_blob(fname: str):
+            random_fname = f'{upload_dir}/{_random_string(10)}'
+            shutil.copy(fname, random_fname)
+            return random_fname
+
+        def on_upload_main(fname: str):
+            shutil.copy(fname, output_fname)
+            return output_fname
+
+        assert client.staging_store
+        client.staging_store.upload(
+            on_upload_blob=on_upload_blob,
+            on_upload_main=on_upload_main,
+            consolidate_chunks=True
+        )
+
+        client3 = lindi.LindiH5pyFile.from_reference_file_system(output_fname, mode='r')
+        X3 = client3['large_array']
+        assert isinstance(X3, lindi.LindiH5pyDataset)
+        assert np.allclose(X1[:], X3[:])
+
+
+def _get_total_size_of_directory(directory):
+    total_size = 0
+    for dirpath, dirnames, filenames in os.walk(directory):
+        for f in filenames:
+            fp = os.path.join(dirpath, f)
+            total_size += os.path.getsize(fp)
+    return total_size
+
+
+def _random_string(n):
+    import random
+    import string
+    return ''.join(random.choices(string.ascii_uppercase + string.digits, k=n))
+
+
+if __name__ == '__main__':
+    test_staging_area()
diff --git a/tests/test_store.py b/tests/test_store.py
index c2ed53b..07d59cf 100644
--- a/tests/test_store.py
+++ b/tests/test_store.py
@@ -13,7 +13,7 @@ def test_store():
         group1.create_group("group2")
         group1.create_dataset("dataset2", data=[4, 5, 6])
     with lindi.LindiH5ZarrStore.from_file(filename, url=filename) as store:
-        store.to_file(f"{tmpdir}/test.zarr.json")  # for coverage
+        store.write_reference_file_system(f"{tmpdir}/test.lindi.json")  # for coverage
         a = store.listdir('')
         assert _lists_are_equal_as_sets(a, ['dataset1', 'group1'])
         b = store.listdir('group1')