diff --git a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py index d934a51..e40fcef 100644 --- a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py +++ b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py @@ -20,7 +20,7 @@ from ..conversion.h5_filters_to_codecs import h5_filters_to_codecs from ..conversion.create_zarr_dataset_from_h5_data import create_zarr_dataset_from_h5_data from ..LindiH5pyFile.LindiReferenceFileSystemStore import LindiReferenceFileSystemStore -from ..LocalCache.LocalCache import LocalCache +from ..LocalCache.LocalCache import ChunkTooLargeError, LocalCache from ..LindiRemfile.LindiRemfile import LindiRemfile from .LindiH5ZarrStoreOpts import LindiH5ZarrStoreOpts @@ -346,12 +346,15 @@ def _get_chunk_file_bytes(self, key_parent: str, key_name: str): buf = _read_bytes(self._file, byte_offset, byte_count) if self._local_cache is not None: assert self._url is not None, "Unexpected: url is None but local_cache is not None" - self._local_cache.put_remote_chunk( - url=self._url, - offset=byte_offset, - size=byte_count, - data=buf - ) + try: + self._local_cache.put_remote_chunk( + url=self._url, + offset=byte_offset, + size=byte_count, + data=buf + ) + except ChunkTooLargeError: + print(f"Warning: Unable to store chunk of size {byte_count} in local cache in LindiH5ZarrStore (key: {key_parent}/{key_name})") return buf def _get_chunk_file_bytes_data(self, key_parent: str, key_name: str): diff --git a/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py b/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py index 8fea9f8..48b2801 100644 --- a/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py +++ b/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py @@ -4,7 +4,7 @@ import requests from zarr.storage import Store as ZarrStore -from ..LocalCache.LocalCache import LocalCache +from ..LocalCache.LocalCache import ChunkTooLargeError, LocalCache class LindiReferenceFileSystemStore(ZarrStore): @@ -141,7 +141,10 @@ def __getitem__(self, key: str): return x val = _read_bytes_from_url_or_path(url, offset, length) if self.local_cache is not None: - self.local_cache.put_remote_chunk(url=url, offset=offset, size=length, data=val) + try: + self.local_cache.put_remote_chunk(url=url, offset=offset, size=length, data=val) + except ChunkTooLargeError: + print(f'Warning: unable to cache chunk of size {length} on LocalCache (key: {key})') return val else: # should not happen given checks in __init__, but self.rfs is mutable diff --git a/lindi/LindiRemfile/LindiRemfile.py b/lindi/LindiRemfile/LindiRemfile.py index db3e7ec..f19eb95 100644 --- a/lindi/LindiRemfile/LindiRemfile.py +++ b/lindi/LindiRemfile/LindiRemfile.py @@ -217,10 +217,11 @@ def _load_chunk(self, chunk_index: int) -> bytes: if self._local_cache is None: self._memory_chunks[chunk_index] = x if self._local_cache is not None: + size = min(self._min_chunk_size, self.length - chunk_index * self._min_chunk_size) self._local_cache.put_remote_chunk( url=self._url, offset=chunk_index * self._min_chunk_size, - size=min(self._min_chunk_size, self.length - chunk_index * self._min_chunk_size), + size=size, data=x ) self._memory_chunk_indices.append(chunk_index) diff --git a/lindi/LocalCache/LocalCache.py b/lindi/LocalCache/LocalCache.py index a244eb7..f675f30 100644 --- a/lindi/LocalCache/LocalCache.py +++ b/lindi/LocalCache/LocalCache.py @@ -21,6 +21,10 @@ def put_remote_chunk(self, *, url: str, offset: int, size: int, data: bytes): self._sqlite_client.put_remote_chunk(url=url, offset=offset, size=size, data=data) +class ChunkTooLargeError(Exception): + pass + + class LocalCacheSQLiteClient: def __init__(self, *, db_fname: str): import sqlite3 @@ -58,6 +62,11 @@ def get_remote_chunk(self, *, url: str, offset: int, size: int): return row[0] def put_remote_chunk(self, *, url: str, offset: int, size: int, data: bytes): + if size >= 1000 * 1000 * 900: + # This is a sqlite limitation/configuration + # https://www.sqlite.org/limits.html + # For some reason 1000 * 1000 * 1000 seems to be too large, whereas 1000 * 1000 * 900 is fine + raise ChunkTooLargeError("Cannot store blobs larger than 900 MB in LocalCache") self._cursor.execute( """ INSERT OR REPLACE INTO remote_chunks (url, offset, size, data) VALUES (?, ?, ?, ?) diff --git a/lindi/__init__.py b/lindi/__init__.py index 628432e..f300066 100644 --- a/lindi/__init__.py +++ b/lindi/__init__.py @@ -1,5 +1,5 @@ from .LindiH5ZarrStore import LindiH5ZarrStore, LindiH5ZarrStoreOpts # noqa: F401 from .LindiH5pyFile import LindiH5pyFile, LindiH5pyGroup, LindiH5pyDataset, LindiH5pyHardLink, LindiH5pySoftLink # noqa: F401 from .LindiStagingStore import LindiStagingStore, StagingArea # noqa: F401 -from .LocalCache.LocalCache import LocalCache # noqa: F401 +from .LocalCache.LocalCache import LocalCache, ChunkTooLargeError # noqa: F401 from .File.File import File # noqa: F401 diff --git a/tests/test_local_cache.py b/tests/test_local_cache.py index 4f7644e..7fdb857 100644 --- a/tests/test_local_cache.py +++ b/tests/test_local_cache.py @@ -45,5 +45,31 @@ def test_remote_data_1(): assert elapsed_1 < elapsed_0 * 0.3 # type: ignore +def test_put_local_cache(): + with tempfile.TemporaryDirectory() as tmpdir: + local_cache = lindi.LocalCache(cache_dir=tmpdir + '/local_cache') + data = b'x' * (1000 * 1000 * 900 - 1) + local_cache.put_remote_chunk( + url='dummy', + offset=0, + size=len(data), + data=data + ) + data2 = local_cache.get_remote_chunk( + url='dummy', + offset=0, + size=len(data) + ) + assert data == data2 + data_too_large = b'x' * (1000 * 1000 * 900) + with pytest.raises(lindi.ChunkTooLargeError): + local_cache.put_remote_chunk( + url='dummy', + offset=0, + size=len(data_too_large), + data=data_too_large + ) + + if __name__ == "__main__": - test_remote_data_1() + test_put_local_cache()